From 200f091c95bbc4b8660636bd345805c45d6eced7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 28 Sep 2024 14:08:31 -0700 Subject: coredump: Do not lock during 'comm' reporting The 'comm' member will always be NUL terminated, and this is not fast-path, so we can just perform a direct memcpy during a coredump instead of potentially deadlocking while holding the task struct lock. Reported-by: Vegard Nossum Closes: https://lore.kernel.org/all/d122ece6-3606-49de-ae4d-8da88846bef2@oracle.com Fixes: c114e9948c2b ("coredump: Standartize and fix logging") Tested-by: Vegard Nossum Link: https://lore.kernel.org/r/20240928210830.work.307-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/coredump.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 45e598fe3476..77e6e195d1d6 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -52,8 +52,8 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); #define __COREDUMP_PRINTK(Level, Format, ...) \ do { \ char comm[TASK_COMM_LEN]; \ - \ - get_task_comm(comm, current); \ + /* This will always be NUL terminated. */ \ + memcpy(comm, current->comm, sizeof(comm)); \ printk_ratelimited(Level "coredump: %d(%*pE): " Format "\n", \ task_tgid_vnr(current), (int)strlen(comm), comm, ##__VA_ARGS__); \ } while (0) \ -- cgit v1.2.3 From b9c44b91476b67327a521568a854babecc4070ab Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:07 -0700 Subject: perf/core: Save raw sample data conditionally based on sample type Currently, space for raw sample data is always allocated within sample records for both BPF output and tracepoint events. This leads to unused space in sample records when raw sample data is not requested. This patch enforces checking sample type of an event in perf_sample_save_raw_data(). 
So raw sample data will only be saved if explicitly requested, reducing overhead when it is not needed. Fixes: 0a9081cf0a11 ("perf/core: Add perf_sample_save_raw_data() helper") Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-2-yabinc@google.com --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cb99ec8c9e96..f7c0a3f2f502 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1287,12 +1287,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, + struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; + if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) + return; + do { sum += frag->size; if (perf_raw_frag_last(frag)) -- cgit v1.2.3 From f226805bc5f60adf03783d8e4cbfe303ccecd64e Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:08 -0700 Subject: perf/core: Check sample_type in perf_sample_save_callchain Check sample_type in perf_sample_save_callchain() to prevent saving callchain data when it isn't required. 
Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-3-yabinc@google.com --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f7c0a3f2f502..3ac202d971fb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, { int size = 1; + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) + return; + data->callchain = perf_callchain(event, regs); size += data->callchain->nr; -- cgit v1.2.3 From faac6f105ef169e2e5678c14e1ffebf2a7d780b6 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:09 -0700 Subject: perf/core: Check sample_type in perf_sample_save_brstack Check sample_type in perf_sample_save_brstack() to prevent saving branch stack data when it isn't required. 
Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-4-yabinc@google.com --- include/linux/perf_event.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3ac202d971fb..bf831b1485ff 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1320,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_RAW; } +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, @@ -1327,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, { int size = sizeof(u64); /* nr */ + if (!has_branch_stack(event)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) + return; + if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); @@ -1716,11 +1726,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif -static inline bool has_branch_stack(struct perf_event *event) -{ - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -} - static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; -- cgit v1.2.3 From 2815a56e4b7252a836969f5674ee356ea1ce482c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 14 Nov 2024 10:26:17 -0500 Subject: x86/mm/tlb: Add tracepoint for TLB flush IPI to stale CPU Add a tracepoint when we send a TLB flush IPI to a CPU that used to be in the mm_cpumask, but 
isn't any more. Suggested-by: Dave Hansen Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241114152723.1294686-3-riel@surriel.com --- include/linux/mm_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..6b6f05404304 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1335,6 +1335,7 @@ enum tlb_flush_reason { TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, + TLB_REMOTE_WRONG_CPU, NR_TLB_FLUSH_REASONS, }; -- cgit v1.2.3 From 0a499a7e9819e7a0980408f18df68160a0b55f2e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:26 -0800 Subject: lib/crc32: drop leading underscores from __crc32c_le_base Remove the leading underscores from __crc32c_le_base(). This is in preparation for adding crc32c_le_arch() and eventually renaming __crc32c_le() to crc32c_le(). Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 87f788c0d607..5b07fc9081c4 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -39,7 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); -u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len); /** * __crc32c_le_combine - Combine two crc32c check values into one. 
For two -- cgit v1.2.3 From d36cebe03c3ae4ea1fde20cfc797fab8729c3ab5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:27 -0800 Subject: lib/crc32: improve support for arch-specific overrides Currently the CRC32 library functions are defined as weak symbols, and the arm64 and riscv architectures override them. This method of arch-specific overrides has the limitation that it only works when both the base and arch code is built-in. Also, it makes the arch-specific code be silently not used if it is accidentally built with lib-y instead of obj-y; unfortunately the RISC-V code does this. This commit reorganizes the code to have explicit *_arch() functions that are called when they are enabled, similar to how some of the crypto library code works (e.g. chacha_crypt() calls chacha_crypt_arch()). Make the existing kconfig choice for the CRC32 implementation also control whether the arch-optimized implementation (if one is available) is enabled or not. Make it enabled by default if CRC32 is also enabled. The result is that arch-optimized CRC32 library functions will be included automatically when appropriate, but it is now possible to disable them. They can also now be built as a loadable module if the CRC32 library functions happen to be used only by loadable modules, in which case the arch and base CRC32 modules will be automatically loaded via direct symbol dependency when appropriate. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 5b07fc9081c4..58c632533b08 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -8,10 +8,34 @@ #include #include -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len); +u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len); +u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len); +u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len); +u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len); +u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len); + +static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC32_ARCH)) + return crc32_le_arch(crc, p, len); + return crc32_le_base(crc, p, len); +} + +static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC32_ARCH)) + return crc32_be_arch(crc, p, len); + return crc32_be_base(crc, p, len); +} + +/* TODO: leading underscores should be dropped once callers have been updated */ +static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC32_ARCH)) + return crc32c_le_arch(crc, p, len); + return crc32c_le_base(crc, p, len); +} /** * crc32_le_combine - Combine two crc32 check values into one. 
For two @@ -38,9 +62,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) return crc32_le_shift(crc1, len2) ^ crc2; } -u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len); - /** * __crc32c_le_combine - Combine two crc32c check values into one. For two * sequences of bytes, seq1 and seq2 with lengths len1 -- cgit v1.2.3 From b5ae12e0ee099e4c458f7814f0317f4e2cbf105e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:28 -0800 Subject: lib/crc32: expose whether the lib is really optimized at runtime Make the CRC32 library export a function crc32_optimizations() which returns flags that indicate which CRC32 functions are actually executing optimized code at runtime. This will be used to determine whether the crc32[c]-$arch shash algorithms should be registered in the crypto API. btrfs could also start using these flags instead of the hack that it currently uses where it parses the crypto_shash_driver_name. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 58c632533b08..e9bd40056687 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -37,6 +37,21 @@ static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len) return crc32c_le_base(crc, p, len); } +/* + * crc32_optimizations() returns flags that indicate which CRC32 library + * functions are using architecture-specific optimizations. Unlike + * IS_ENABLED(CONFIG_CRC32_ARCH) it takes into account the different CRC32 + * variants and also whether any needed CPU features are available at runtime. 
+ */ +#define CRC32_LE_OPTIMIZATION BIT(0) /* crc32_le() is optimized */ +#define CRC32_BE_OPTIMIZATION BIT(1) /* crc32_be() is optimized */ +#define CRC32C_OPTIMIZATION BIT(2) /* __crc32c_le() is optimized */ +#if IS_ENABLED(CONFIG_CRC32_ARCH) +u32 crc32_optimizations(void); +#else +static inline u32 crc32_optimizations(void) { return 0; } +#endif + /** * crc32_le_combine - Combine two crc32 check values into one. For two * sequences of bytes, seq1 and seq2 with lengths len1 -- cgit v1.2.3 From 38a9a5121c3bcf2ed857430a92e493568b247c35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:40 -0800 Subject: lib/crc32: make crc32c() go directly to lib Now that the lower level __crc32c_le() library function is optimized for each architecture, make crc32c() just call that instead of taking an inefficient and error-prone detour through the shash API. Note: a future cleanup should make crc32c_le() be the actual library function instead of __crc32c_le(). That will require updating callers of __crc32c_le() to use crc32c_le() instead, and updating callers of crc32c_le() that expect a 'const void *' arg to expect 'const u8 *' instead. Similarly, a future cleanup should remove LIBCRC32C by making everyone who is selecting it just select CRC32 directly instead. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-16-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32c.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index 357ae4611a45..47eb78003c26 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -2,9 +2,12 @@ #ifndef _LINUX_CRC32C_H #define _LINUX_CRC32C_H -#include +#include -extern u32 crc32c(u32 crc, const void *address, unsigned int length); +static inline u32 crc32c(u32 crc, const void *address, unsigned int length) +{ + return __crc32c_le(crc, address, length); +} /* This macro exists for backwards-compatibility. */ #define crc32c_le crc32c -- cgit v1.2.3 From dd348f054b24a3f57cbcdc2c8e7ebc22c62eb72f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:42 -0800 Subject: jbd2: switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Reviewed-by: Ard Biesheuvel Reviewed-by: Darrick J. Wong Acked-by: Theodore Ts'o Link: https://lore.kernel.org/r/20241202010844.144356-18-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/jbd2.h | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50f7ea8714bf..561025b4f3d9 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #endif #define journal_oom_retry 1 @@ -1241,13 +1241,6 @@ struct journal_s */ void *j_private; - /** - * @j_chksum_driver: - * - * Reference to checksum algorithm driver via cryptoapi. 
- */ - struct crypto_shash *j_chksum_driver; - /** * @j_csum_seed: * @@ -1750,10 +1743,7 @@ static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { - WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) && - journal->j_chksum_driver == NULL); - - return journal->j_chksum_driver != NULL; + return jbd2_journal_has_csum_v2or3_feature(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) @@ -1790,27 +1780,10 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -/* JBD uses a CRC32 checksum */ -#define JBD_MAX_CHECKSUM_SIZE 4 - static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { - DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx, - DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE, - sizeof(*((struct shash_desc *)0)->__ctx))); - int err; - - BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > - JBD_MAX_CHECKSUM_SIZE); - - desc->tfm = journal->j_chksum_driver; - *(u32 *)desc->__ctx = crc; - - err = crypto_shash_update(desc, address, length); - BUG_ON(err); - - return *(u32 *)desc->__ctx; + return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ -- cgit v1.2.3 From be3c45b070cba3be4dd248b38d4798e3e2859451 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:20:45 -0800 Subject: lib/crc-t10dif: stop wrapping the crypto API In preparation for making the CRC-T10DIF library directly optimized for each architecture, like what has been done for CRC32, get rid of the weird layering where crc_t10dif_update() calls into the crypto API. Instead, move crc_t10dif_generic() into the crc-t10dif library module, and make crc_t10dif_update() just call crc_t10dif_generic(). Acceleration will be reintroduced via crc_t10dif_arch() in the following patches. 
Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241202012056.209768-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc-t10dif.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index 6bb0c0bf357b..206ba2305483 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -6,11 +6,17 @@ #define CRC_T10DIF_DIGEST_SIZE 2 #define CRC_T10DIF_BLOCK_SIZE 1 -#define CRC_T10DIF_STRING "crct10dif" -extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, - size_t len); -extern __u16 crc_t10dif(unsigned char const *, size_t); -extern __u16 crc_t10dif_update(__u16 crc, unsigned char const *, size_t); +u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len); + +static inline u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len) +{ + return crc_t10dif_generic(crc, p, len); +} + +static inline u16 crc_t10dif(const u8 *p, size_t len) +{ + return crc_t10dif_update(0, p, len); +} #endif -- cgit v1.2.3 From 0961c3bcefa64d5f0999e2b703391862c733bb52 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:20:46 -0800 Subject: lib/crc-t10dif: add support for arch overrides Following what was done for CRC32, add support for architecture-specific override of the CRC-T10DIF library. This will allow the CRC-T10DIF library functions to access architecture-optimized code directly. Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20241202012056.209768-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc-t10dif.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index 206ba2305483..16787c1cee21 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -7,10 +7,13 @@ #define CRC_T10DIF_DIGEST_SIZE 2 #define CRC_T10DIF_BLOCK_SIZE 1 +u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len); u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len); static inline u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len) { + if (IS_ENABLED(CONFIG_CRC_T10DIF_ARCH)) + return crc_t10dif_arch(crc, p, len); return crc_t10dif_generic(crc, p, len); } @@ -19,4 +22,13 @@ static inline u16 crc_t10dif(const u8 *p, size_t len) return crc_t10dif_update(0, p, len); } +#if IS_ENABLED(CONFIG_CRC_T10DIF_ARCH) +bool crc_t10dif_is_optimized(void); +#else +static inline bool crc_t10dif_is_optimized(void) +{ + return false; +} +#endif + #endif -- cgit v1.2.3 From 0a670e151a71434765de69590944e18c08ee08cf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:09:57 +0100 Subject: tree-wide: s/override_creds()/override_creds_light(get_new_cred())/g Convert all callers from override_creds() to override_creds_light(get_new_cred()) in preparation of making override_creds() not take a separate reference at all. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-1-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index e4a3155fe409..b0bc1fea9ca0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -211,9 +211,10 @@ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) * Get a reference on the specified set of new credentials. The caller must * release the reference. */ -static inline struct cred *get_new_cred(struct cred *cred) +static inline struct cred *get_new_cred(const struct cred *cred) { - return get_new_cred_many(cred, 1); + struct cred *nonconst_cred = (struct cred *) cred; + return get_new_cred_many(nonconst_cred, 1); } /** -- cgit v1.2.3 From 95c54bc81791c210b131f2b1013942487e74896f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:09:58 +0100 Subject: cred: return old creds from revert_creds_light() So we can easily convert revert_creds() callers over to drop the reference count explicitly. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-2-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index b0bc1fea9ca0..57cf0256ea29 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -185,9 +185,12 @@ static inline const struct cred *override_creds_light(const struct cred *overrid return old; } -static inline void revert_creds_light(const struct cred *revert_cred) +static inline const struct cred *revert_creds_light(const struct cred *revert_cred) { + const struct cred *override_cred = current->cred; + rcu_assign_pointer(current->cred, revert_cred); + return override_cred; } /** -- cgit v1.2.3 From a51a1d6bcaa345cc88e738cad468083c4e13aa3b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:00 +0100 Subject: cred: remove old {override,revert}_creds() helpers They are now unused. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-4-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 57cf0256ea29..80dcc18ef6e4 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -155,8 +155,6 @@ extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); -extern const struct cred *override_creds(const struct cred *); -extern void revert_creds(const struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); @@ -172,11 +170,6 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) cred->cap_inheritable)); } -/* - * Override creds without bumping reference count. Caller must ensure - * reference remains valid or has taken reference. Almost always not the - * interface you want. Use override_creds()/revert_creds() instead. - */ static inline const struct cred *override_creds_light(const struct cred *override_cred) { const struct cred *old = current->cred; -- cgit v1.2.3 From 6771e004b40962402d0e973fc7d2e0e61364fdfb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:01 +0100 Subject: tree-wide: s/override_creds_light()/override_creds()/g Rename all calls to override_creds_light() back to override_creds().
Link: https://lore.kernel.org/r/20241125-work-cred-v2-5-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 80dcc18ef6e4..a073e6163c4e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -170,7 +170,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) cred->cap_inheritable)); } -static inline const struct cred *override_creds_light(const struct cred *override_cred) +static inline const struct cred *override_creds(const struct cred *override_cred) { const struct cred *old = current->cred; -- cgit v1.2.3 From 51c0bcf0973a3836adfc46f30f876f412478e376 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:02 +0100 Subject: tree-wide: s/revert_creds_light()/revert_creds()/g Rename all calls to revert_creds_light() back to revert_creds(). 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-6-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index a073e6163c4e..a7df1c759ef0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -178,7 +178,7 @@ static inline const struct cred *override_creds(const struct cred *override_cred return old; } -static inline const struct cred *revert_creds_light(const struct cred *revert_cred) +static inline const struct cred *revert_creds(const struct cred *revert_cred) { const struct cred *override_cred = current->cred; -- cgit v1.2.3 From 6efbb80490a545cfd9f87ebd9225879d8cdbed93 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:25 +0100 Subject: cred: remove unused get_new_cred() This helper is not used anymore so remove it. Link: https://lore.kernel.org/r/20241125-work-cred-v2-29-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index a7df1c759ef0..360f5fd3854b 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -200,19 +200,6 @@ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) return cred; } -/** - * get_new_cred - Get a reference on a new set of credentials - * @cred: The new credentials to reference - * - * Get a reference on the specified set of new credentials. The caller must - * release the reference. 
- */ -static inline struct cred *get_new_cred(const struct cred *cred) -{ - struct cred *nonconst_cred = (struct cred *) cred; - return get_new_cred_many(nonconst_cred, 1); -} - /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference -- cgit v1.2.3 From a6babf4cbeaaa1c97a205382cdc958571f668ea8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 26 Nov 2024 14:22:16 +0100 Subject: cred: fold get_new_cred_many() into get_cred_many() There's no need for this to be a separate helper. Link: https://lore.kernel.org/r/20241126-zaunpfahl-wovon-c3979b990a63@brauner Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 360f5fd3854b..0c3c4b16b469 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -186,20 +186,6 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred) return override_cred; } -/** - * get_new_cred_many - Get references on a new set of credentials - * @cred: The new credentials to reference - * @nr: Number of references to acquire - * - * Get references on the specified set of new credentials. The caller must - * release all acquired references. 
- */ -static inline struct cred *get_new_cred_many(struct cred *cred, int nr) -{ - atomic_long_add(nr, &cred->usage); - return cred; -} - /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference @@ -220,7 +206,8 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) if (!cred) return cred; nonconst_cred->non_rcu = 0; - return get_new_cred_many(nonconst_cred, nr); + atomic_long_add(nr, &nonconst_cred->usage); + return cred; } /* -- cgit v1.2.3 From 7863dcc72d0f4b13a641065670426435448b3d80 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 22 Nov 2024 14:24:58 +0100 Subject: pid: allow pid_max to be set per pid namespace The pid_max sysctl is a global value. For a long time the default value has been 65535 and during the pidfd discussions Linus proposed to bump pid_max by default (cf. [1]). Based on this discussion systemd started bumping pid_max to 2^22. So all new systems now run with a very high pid_max limit with some distros having also backported that change. The decision to bump pid_max is obviously correct. It just doesn't make a lot of sense nowadays to enforce such a low pid number. There's sufficient tooling to make selecting specific processes without typing really large pid numbers available. In any case, there are workloads that have expectations about how large pid numbers they accept. Either for historical reasons or architectural reasons. One concrete example is the 32-bit version of Android's bionic libc which requires pid numbers less than 65536. There are workloads where it is run in a 32-bit container on a 64-bit kernel. If the host has a pid_max value greater than 65535 the libc will abort thread creation because of size assumptions of pthread_mutex_t. That's a fairly specific use-case however, in general specific workloads that are moved into containers running on a host with a new kernel and a new systemd can run into issues with large pid_max values.
Obviously making assumptions about the size of the allocated pid is suboptimal but we have userspace that does it. Of course, giving containers the ability to restrict the number of processes in their respective pid namespace independent of the global limit through pid_max is something desirable in itself and comes in handy in general. Independent of motivating use-cases the existence of pid namespaces makes this also a good semantical extension and there have been prior proposals pushing in a similar direction. The trick here is to minimize the risk of regressions which I think is doable. The fact that pid namespaces are hierarchical will help us here. What we mostly care about is that when the host sets a low pid_max limit, say (crazy number) 100 that no descendant pid namespace can allocate a higher pid number in its namespace. Since pid allocation is hierarchical this can be ensured by checking each pid allocation against the pid namespace's pid_max limit. This means if the allocation in the descendant pid namespace succeeds, the ancestor pid namespace can reject it. If the ancestor pid namespace has a higher limit than the descendant pid namespace the descendant pid namespace will reject the pid allocation. The ancestor pid namespace will obviously not care about this. All in all this means pid_max continues to enforce a system wide limit on the number of processes but allows pid namespaces sufficient leeway in handling workloads with assumptions about pid values and allows containers to restrict the number of processes in a pid namespace through the pid_max interface.
[1]: https://lore.kernel.org/linux-api/CAHk-=wiZ40LVjnXSi9iHLE_-ZBsWFGCgdmNiYZUXn1-V5YBg2g@mail.gmail.com - rebased from 5.14-rc1 - a few fixes (missing ns_free_inum on error path, missing initialization, etc) - permission check changes in pid_table_root_permissions - unsigned int pid_max -> int pid_max (keep pid_max type as it was) - add READ_ONCE in alloc_pid() as suggested by Christian - rebased from 6.7 and take into account: * sysctl: treewide: drop unused argument ctl_table_root::set_ownership(table) * sysctl: treewide: constify ctl_table_header::ctl_table_arg * pidfd: add pidfs * tracing: Move saved_cmdline code into trace_sched_switch.c Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20241122132459.135120-2-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 --- include/linux/pid_namespace.h | 10 +++++++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..c800cbee584b 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -106,9 +106,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); -extern int pid_max; -extern int pid_max_min, pid_max_max; - /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. 
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f9f9931e02d6..7c67a5811199 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -30,6 +30,7 @@ struct pid_namespace { struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; + int pid_max; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; @@ -38,9 +39,14 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + struct work_struct work; +#ifdef CONFIG_SYSCTL + struct ctl_table_set set; + struct ctl_table_header *sysctls; +#if defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; @@ -117,6 +123,8 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); +int register_pidns_sysctls(struct pid_namespace *pidns); +void unregister_pidns_sysctls(struct pid_namespace *pidns); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { -- cgit v1.2.3 From 96450ead16527cbef559b5bd046182e731228f95 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin() to open a read critical section of the given seqcount_t if the counter is even. This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Hildenbrand Reviewed-by: Liam R.
Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization -- cgit v1.2.3 From eb449bd96954b1c1e491d19066cfd2a010f0aa47 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also changed to be unsigned to match the type of mm_lock_seq.sequence.
Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com --- include/linux/mm.h | 12 +++++------ include/linux/mm_types.h | 7 ++++-- include/linux/mmap_lock.h | 55 +++++++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..ca59d165f1f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -710,7 +710,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -727,7 +727,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -742,7 +742,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -750,7 +750,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. 
*/ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -761,7 +761,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -779,7 +779,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..97e2f4fe1d6c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -697,7 +697,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -891,6 +891,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -899,7 +902,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. 
*/ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,39 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) +static inline void mm_lock_seqcount_init(struct mm_struct *mm) { - mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). 
- */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); } + #else -static inline void vma_end_write_all(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +111,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +121,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. 
+ */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); -- cgit v1.2.3 From 03a001b156d2da186a5618de242750d06bf81e2d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. 
+ */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { -- cgit v1.2.3 From 2116b349e29a2e9ba17ea2e45b31234e4b350793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:52 +0100 Subject: objtool: Generic annotation infrastructure Avoid endless .discard.foo sections for each annotation, create a single .discard.annotate_insn section that takes an annotation type along with the instruction. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094310.932794537@infradead.org --- include/linux/objtool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b3b8d3dab52d..d98531ecc687 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -57,6 +57,13 @@ ".long 998b\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ + ".long 911b - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + #else /* __ASSEMBLY__ */ /* @@ -146,6 +153,14 @@ .popsection .endm +.macro ANNOTATE type:req +.Lhere_\@: + .pushsection .discard.annotate_insn,"M",@progbits,8 + .long .Lhere_\@ - . 
+ .long \type + .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -155,6 +170,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) +#define ASM_ANNOTATE(type) #define ANNOTATE_NOENDBR #define ASM_REACHABLE #else @@ -167,6 +183,8 @@ .endm .macro REACHABLE .endm +.macro ANNOTATE type:req +.endm #endif #endif /* CONFIG_OBJTOOL */ -- cgit v1.2.3 From 22c3d58079688b697f36d670616e463cbb14d058 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:53 +0100 Subject: objtool: Convert ANNOTATE_NOENDBR to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.042140333@infradead.org --- include/linux/objtool.h | 17 ++++------------- include/linux/objtool_types.h | 5 +++++ 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index d98531ecc687..b5e9c0ab4048 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -45,12 +45,6 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #endif -#define ANNOTATE_NOENDBR \ - "986: \n\t" \ - ".pushsection .discard.noendbr\n\t" \ - ".long 986b\n\t" \ - ".popsection\n\t" - #define ASM_REACHABLE \ "998:\n\t" \ ".pushsection .discard.reachable\n\t" \ @@ -64,6 +58,8 @@ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) + #else /* __ASSEMBLY__ */ /* @@ -122,13 +118,6 @@ #endif .endm -.macro ANNOTATE_NOENDBR -.Lhere_\@: - .pushsection .discard.noendbr - .long .Lhere_\@ - .popsection -.endm - /* * Use objtool to validate the entry requirement that all code paths do * VALIDATE_UNRET_END before RET. 
@@ -161,6 +150,8 @@ .popsection .endm +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 453a4f4ef39d..4884f8cf8429 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -54,4 +54,9 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_SAVE 6 #define UNWIND_HINT_TYPE_RESTORE 7 +/* + * Annotate types + */ +#define ANNOTYPE_NOENDBR 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From bf5febebd99fddfc6226a94e937d38a8d470b24e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:54 +0100 Subject: objtool: Convert ANNOTATE_RETPOLINE_SAFE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.145275669@infradead.org --- include/linux/objtool_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 4884f8cf8429..1b348361ad1d 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -58,5 +58,6 @@ struct unwind_hint { * Annotate types */ #define ANNOTYPE_NOENDBR 1 +#define ANNOTYPE_RETPOLINE_SAFE 2 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 317f2a64618c528539d17fe6957a64106087fbd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:55 +0100 Subject: objtool: Convert instrumentation_{begin,end}() to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.245980207@infradead.org --- include/linux/instrumentation.h | 11 +++++------ include/linux/objtool.h | 12 +++++++++--- include/linux/objtool_types.h | 2 ++ 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index bc7babe91b2e..c8f866cf02d8 100644 
--- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -4,14 +4,14 @@ #ifdef CONFIG_NOINSTR_VALIDATION +#include #include /* Begin/end of an instrumentation safe region */ #define __instrumentation_begin(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - ".pushsection .discard.instr_begin\n\t" \ - ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ + __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_BEGIN)\ + : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -48,9 +48,8 @@ */ #define __instrumentation_end(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - ".pushsection .discard.instr_end\n\t" \ - ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ + __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_END) \ + : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) #else /* !CONFIG_NOINSTR_VALIDATION */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b5e9c0ab4048..89c67cd7eebe 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -51,13 +51,18 @@ ".long 998b\n\t" \ ".popsection\n\t" -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ +#define __ASM_BREF(label) label ## b + +#define __ASM_ANNOTATE(label, type) \ ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long 911b - .\n\t" \ + ".long " __stringify(label) " - .\n\t" \ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + __ASM_ANNOTATE(911b, type) + #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) #else /* __ASSEMBLY__ */ @@ -161,6 +166,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) +#define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) #define ANNOTATE_NOENDBR #define ASM_REACHABLE diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 
1b348361ad1d..d4d68dd36f7a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -59,5 +59,7 @@ struct unwind_hint { */ #define ANNOTYPE_NOENDBR 1 #define ANNOTYPE_RETPOLINE_SAFE 2 +#define ANNOTYPE_INSTR_BEGIN 3 +#define ANNOTYPE_INSTR_END 4 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 18aa6118a1689b4d73c5ebbd917ae3f20c9c0db1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:56 +0100 Subject: objtool: Convert VALIDATE_UNRET_BEGIN to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.358508242@infradead.org --- include/linux/objtool.h | 9 +++------ include/linux/objtool_types.h | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 89c67cd7eebe..5f0bf8052dc7 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -130,15 +130,12 @@ * NOTE: The macro must be used at the beginning of a global symbol, otherwise * it will be ignored. */ -.macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) -.Lhere_\@: - .pushsection .discard.validate_unret - .long .Lhere_\@ - . 
- .popsection +#define VALIDATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#else +#define VALIDATE_UNRET_BEGIN #endif -.endm .macro REACHABLE .Lhere_\@: diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index d4d68dd36f7a..16236a56364b 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -61,5 +61,6 @@ struct unwind_hint { #define ANNOTYPE_RETPOLINE_SAFE 2 #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 +#define ANNOTYPE_UNRET_BEGIN 5 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From f0cd57c35a75f152d3b31b9be3f7f413b96a6d3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:57 +0100 Subject: objtool: Convert ANNOTATE_IGNORE_ALTERNATIVE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.465691316@infradead.org --- include/linux/objtool_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 16236a56364b..eab15dbe1cb7 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -62,5 +62,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 +#define ANNOTYPE_IGNORE_ALTS 6 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 112765ca1cb9353e71b4f5af4e6e6c4a69c28d99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:58 +0100 Subject: objtool: Convert ANNOTATE_INTRA_FUNCTION_CALL to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.584892071@infradead.org --- include/linux/objtool.h | 16 ++++++---------- include/linux/objtool_types.h | 1 + 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 5f0bf8052dc7..42287c1e32ce 100644 --- 
a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -67,16 +67,6 @@ #else /* __ASSEMBLY__ */ -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL \ - 999: \ - .pushsection .discard.intra_function_calls; \ - .long 999b; \ - .popsection; - /* * In asm, there are two kinds of code: normal C-type callable functions and * the rest. The normal callable functions can be called by other code, and @@ -154,6 +144,12 @@ #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index eab15dbe1cb7..23d6fb6d04c7 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -63,5 +63,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 +#define ANNOTYPE_INTRA_FUNCTION_CALL 7 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From bb8170067470cc7af28e4386e600b1e0a6a8956a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:00 +0100 Subject: objtool: Collect more annotations in objtool.h Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.786598147@infradead.org --- include/linux/instrumentation.h | 4 +-- include/linux/objtool.h | 80 +++++++++++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index c8f866cf02d8..bf675a8aef8a 100644 
--- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -10,7 +10,7 @@ /* Begin/end of an instrumentation safe region */ #define __instrumentation_begin(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_BEGIN)\ + ANNOTATE_INSTR_BEGIN(__ASM_BREF(c)) \ : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -48,7 +48,7 @@ */ #define __instrumentation_end(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_END) \ + ANNOTATE_INSTR_END(__ASM_BREF(c)) \ : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 42287c1e32ce..fd487d466bb2 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -63,8 +63,6 @@ "911:\n\t" \ __ASM_ANNOTATE(911b, type) -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) - #else /* __ASSEMBLY__ */ /* @@ -113,19 +111,6 @@ #endif .endm -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. - */ -#if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) -#define VALIDATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#else -#define VALIDATE_UNRET_BEGIN -#endif .macro REACHABLE .Lhere_\@: @@ -142,14 +127,6 @@ .popsection .endm -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR - -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. 
- */ -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -161,16 +138,12 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) -#define ANNOTATE_NOENDBR #define ASM_REACHABLE #else -#define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE_NOENDBR -.endm .macro REACHABLE .endm .macro ANNOTATE type:req @@ -179,4 +152,57 @@ #endif /* CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. + */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. 
+ */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) + +#else +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#endif + +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) +#define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN +#else +#define VALIDATE_UNRET_BEGIN +#endif + #endif /* _LINUX_OBJTOOL_H */ -- cgit v1.2.3 From c837de3810982cd41cd70e5170da1931439f025c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:01 +0100 Subject: unreachable: Unify Since barrier_before_unreachable() is empty for !GCC it is trivial to unify the two definitions. Less is more. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.924381359@infradead.org --- include/linux/compiler-gcc.h | 12 ------------ include/linux/compiler.h | 10 +++++++--- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d0ed9583743f..c9b58188ec61 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -52,18 +52,6 @@ */ #define barrier_before_unreachable() asm volatile("") -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. 
- */ -#define unreachable() \ - do { \ - annotate_unreachable(); \ - barrier_before_unreachable(); \ - __builtin_unreachable(); \ - } while (0) - #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..7be80897a62f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -141,12 +141,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ -#ifndef unreachable -# define unreachable() do { \ +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. + */ +#define unreachable() do { \ annotate_unreachable(); \ + barrier_before_unreachable(); \ __builtin_unreachable(); \ } while (0) -#endif /* * KENTRY - kernel entry point -- cgit v1.2.3 From 06e24745985c8dd0da18337503afcf2f2fdbdff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:04 +0100 Subject: objtool: Remove annotate_{,un}reachable() There are no users of annotate_reachable() left. And the annotate_unreachable() usage in unreachable() is plain wrong; it will hide dangerous fall-through code-gen. Remove both. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.235637588@infradead.org --- include/linux/compiler.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7be80897a62f..3d9a0e483e51 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -109,35 +109,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL -/* - * These macros help objtool understand GCC code flow for unreachable code. 
- * The __COUNTER__ based labels are a hack to make each instance of the macros - * unique, to convince GCC not to merge duplicate inline asm statements. - */ -#define __stringify_label(n) #n - -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - -#define __annotate_unreachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_unreachable() __annotate_unreachable(__COUNTER__) - /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") - #else /* !CONFIG_OBJTOOL */ -#define annotate_reachable() -#define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ @@ -147,7 +121,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * control elsewhere. 
*/ #define unreachable() do { \ - annotate_unreachable(); \ barrier_before_unreachable(); \ __builtin_unreachable(); \ } while (0) -- cgit v1.2.3 From e7a174fb43d24adca066e82d1cb9fdee092d48d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:05 +0100 Subject: objtool: Convert {.UN}REACHABLE to ANNOTATE Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.353431347@infradead.org --- include/linux/objtool.h | 18 +++++++----------- include/linux/objtool_types.h | 1 + 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index fd487d466bb2..e3cb13583fba 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -111,14 +111,6 @@ #endif .endm - -.macro REACHABLE -.Lhere_\@: - .pushsection .discard.reachable - .long .Lhere_\@ - .popsection -.endm - .macro ANNOTATE type:req .Lhere_\@: .pushsection .discard.annotate_insn,"M",@progbits,8 @@ -138,14 +130,11 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) -#define ASM_REACHABLE #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro REACHABLE -.endm .macro ANNOTATE type:req .endm #endif @@ -187,6 +176,12 @@ * it will be ignored. */ #define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used directly after an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. 
+ */ +#define ANNOTATE_REACHABLE ASM_ANNOTATE(ANNOTYPE_REACHABLE) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -196,6 +191,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS #define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE #endif #if defined(CONFIG_NOINSTR_VALIDATION) && \ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 23d6fb6d04c7..df5d9fa84dba 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 +#define ANNOTYPE_REACHABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 87116ae6da034242baf06e799f9f0e2a8ee6a796 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:06 +0100 Subject: objtool: Fix ANNOTATE_REACHABLE to be a normal annotation Currently REACHABLE is weird for being on the instruction after the instruction it modifies. Since all REACHABLE annotations have an explicit instruction, flip them around. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.494176035@infradead.org --- include/linux/objtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index e3cb13583fba..c722a921165b 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -177,11 +177,11 @@ */ #define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) /* - * This should be used directly after an instruction that is considered + * This should be used to refer to an instruction that is considered * terminating, like a noreturn CALL or UD2 when we know they are not -- eg * WARN using UD2. 
*/ -#define ANNOTATE_REACHABLE ASM_ANNOTATE(ANNOTYPE_REACHABLE) +#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR -- cgit v1.2.3 From ae5c677729e99b8cb3e6252aaa9b72a92985d203 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:50 -0400 Subject: sched/core: Remove HK_TYPE_SCHED The HK_TYPE_SCHED housekeeping type is defined but not set anywhere. So any code that tries to use HK_TYPE_SCHED is essentially dead code. So remove HK_TYPE_SCHED and any code that uses it. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-2-longman@redhat.com --- include/linux/sched/isolation.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 2b461129d1fa..499d5e480882 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -10,7 +10,6 @@ enum hk_type { HK_TYPE_TIMER, HK_TYPE_RCU, HK_TYPE_MISC, - HK_TYPE_SCHED, HK_TYPE_TICK, HK_TYPE_DOMAIN, HK_TYPE_WQ, -- cgit v1.2.3 From 6010d245ddc9f463bbf0311ac49073a78f444755 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:52 -0400 Subject: sched/isolation: Consolidate housekeeping cpumasks that are always identical The housekeeping cpumasks are only set by two boot commandline parameters: "nohz_full" and "isolcpus". When there is more than one of "nohz_full" or "isolcpus", the extra ones must have the same CPU list or the setup will fail partially. The HK_TYPE_DOMAIN and HK_TYPE_MANAGED_IRQ types are settable by "isolcpus" only and their settings can be independent of the other types. The other housekeeping types are all set by "nohz_full" or "isolcpus=nohz" without a way to set them individually. So they all have identical cpumasks.
There is actually no point in having different cpumasks for these "nohz_full" only housekeeping types. Consolidate these types to use the same cpumask by aliasing them to the same value. If there is a need to set any of them independently in the future, we can break them out to their own cpumasks again. With this change, the number of cpumasks in the housekeeping structure drops from 9 to 3. Other than that, there should be no other functional change. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-4-longman@redhat.com --- include/linux/sched/isolation.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 499d5e480882..d8501f4709b5 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,15 +7,21 @@ #include enum hk_type { - HK_TYPE_TIMER, - HK_TYPE_RCU, - HK_TYPE_MISC, - HK_TYPE_TICK, HK_TYPE_DOMAIN, - HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, - HK_TYPE_KTHREAD, - HK_TYPE_MAX + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. + */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION -- cgit v1.2.3 From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call return in ffa_setup_partitions(). 
This could potentially result in a race where the partition's properties are accessed while probing struct ffa_device before it is set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before calling device_register() in ffa_device_register(). Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d0..74169dd0f659 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } -- cgit v1.2.3 From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject:
linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093..346251bf1026 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. 
@@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. */ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. 
* @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will -- cgit v1.2.3 From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- include/linux/platform_data/amd_qdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h index 576d952f97ed..967a6ef31cf9 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map *device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ -- cgit v1.2.3 From d9b4067aef50aa7a13d1a9fbb4e35bdea6804ff3 Mon Sep 17 00:00:00 2001 From: Duan Chenghao Date: Thu, 24 Oct 2024 10:40:38 +0800 Subject: USB: Fix the issue of task recovery failure caused by USB status when S4 wakes up When a device is inserted into the USB port and an S4 wakeup is initiated, after the USB-hub initialization is completed, it will automatically enter suspend mode. 
Upon detecting a device on the USB port, it will proceed with resume and set the hcd to the HCD_FLAG_WAKEUP_PENDING state. During the S4 wakeup process, peripherals are put into suspend mode, followed by task recovery. However, upon detecting that the hcd is in the HCD_FLAG_WAKEUP_PENDING state, it will return an EBUSY status, causing the S4 suspend to fail and subsequent task recovery to not proceed. - [ 27.594598][ 1] PM: pci_pm_freeze(): hcd_pci_suspend+0x0/0x28 returns -16 [ 27.594601][ 1] PM: dpm_run_callback(): pci_pm_freeze+0x0/0x100 returns -16 [ 27.603420][ 1] ehci-pci 0000:00:04.1: pci_pm_freeze+0x0/0x100 returned 0 after 3 usecs [ 27.612233][ 1] ehci-pci 0000:00:05.1: pci_pm_freeze+0x0/0x100 returned -16 after 17223 usecs [ 27.810067][ 1] PM: Device 0000:00:05.1 failed to quiesce async: error -16 [ 27.816988][ 1] PM: quiesce of devices aborted after 1833.282 msecs [ 27.823302][ 1] PM: start quiesce of devices aborted after 1839.975 msecs ...... [ 31.303172][ 1] PM: recover of devices complete after 3473.039 msecs [ 31.309818][ 1] PM: Failed to load hibernation image, recovering. [ 31.348188][ 1] PM: Basic memory bitmaps freed [ 31.352686][ 1] OOM killer enabled. [ 31.356232][ 1] Restarting tasks ... done. [ 31.360609][ 1] PM: resume from hibernation failed (0) [ 31.365800][ 1] PM: Hibernation image not present or could not be loaded. The "do_wakeup" is determined based on whether the controller's power/wakeup attribute is set. The current issue necessitates considering the type of suspend that is occurring. If the suspend type is either PM_EVENT_FREEZE or PM_EVENT_QUIESCE, then "do_wakeup" should be set to false. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410151722.rfjtknRz-lkp@intel.com/ Signed-off-by: Alan Stern Signed-off-by: Duan Chenghao Link: https://lore.kernel.org/r/20241024024038.26157-1-duanchenghao@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/pm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index e7f0260f15ad..08c37b83fea8 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -570,7 +570,8 @@ const struct dev_pm_ops name = { \ { .event = PM_EVENT_AUTO_RESUME, }) #define PMSG_IS_AUTO(msg) (((msg).event & PM_EVENT_AUTO) != 0) - +#define PMSG_NO_WAKEUP(msg) (((msg).event & \ + (PM_EVENT_FREEZE | PM_EVENT_QUIESCE)) != 0) /* * Device run-time power management status. * -- cgit v1.2.3 From d8d936c51388442f769a81e512b505dcf87c6a51 Mon Sep 17 00:00:00 2001 From: Dingyan Li <18500469033@163.com> Date: Wed, 30 Oct 2024 16:38:58 +0800 Subject: usb: storage: add a macro for the upper limit of max LUN The meaning of this value is already used in several places, but with constant values and comments to explain it separately. It's better to have a central place to do this then use the macro in those places for better readability. 
Signed-off-by: Dingyan Li <18500469033@163.com> Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20241030083858.46907-1-18500469033@163.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/storage.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/storage.h b/include/linux/usb/storage.h index 8539956bc2be..51be3bb8fccb 100644 --- a/include/linux/usb/storage.h +++ b/include/linux/usb/storage.h @@ -82,4 +82,12 @@ struct bulk_cs_wrap { #define US_BULK_RESET_REQUEST 0xff #define US_BULK_GET_MAX_LUN 0xfe +/* + * If 4 LUNs are supported then the LUNs would be + * numbered from 0 to 3, and the return value for + * US_BULK_GET_MAX_LUN request would be 3. The valid + * LUN field is 4 bits wide, the upper limit is 0x0f. + */ +#define US_BULK_MAX_LUN_LIMIT 0x0f + #endif -- cgit v1.2.3 From 535a07698b8b3e6f305673102d297262cae2360a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Dec 2024 05:09:22 +0200 Subject: serial: 8250_pci: Share WCH IDs with parport_serial driver parport_serial driver uses subset of WCH IDs that are present in 8250_pci. Share them via pci_ids.h and switch parport_serial to use defined constants. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204031114.1029882-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d2402bf4aea2..de5deb1a0118 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2593,6 +2593,11 @@ #define PCI_VENDOR_ID_REDHAT 0x1b36 +#define PCI_VENDOR_ID_WCHIC 0x1c00 +#define PCI_DEVICE_ID_WCHIC_CH382_0S1P 0x3050 +#define PCI_DEVICE_ID_WCHIC_CH382_2S1P 0x3250 +#define PCI_DEVICE_ID_WCHIC_CH382_2S 0x3253 + #define PCI_VENDOR_ID_SILICOM_DENMARK 0x1c2c #define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS 0x1c36 @@ -2647,6 +2652,12 @@ #define PCI_VENDOR_ID_AKS 0x416c #define PCI_DEVICE_ID_AKS_ALADDINCARD 0x0100 +#define PCI_VENDOR_ID_WCHCN 0x4348 +#define PCI_DEVICE_ID_WCHCN_CH353_4S 0x3453 +#define PCI_DEVICE_ID_WCHCN_CH353_2S1PF 0x5046 +#define PCI_DEVICE_ID_WCHCN_CH353_1S1P 0x5053 +#define PCI_DEVICE_ID_WCHCN_CH353_2S1P 0x7053 + #define PCI_VENDOR_ID_ACCESSIO 0x494f #define PCI_DEVICE_ID_ACCESSIO_WDG_CSM 0x22c0 -- cgit v1.2.3 From 6fba89813ccf333d2bc4d5caea04cd5f3c39eb50 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:54 -0700 Subject: lsm: ensure the correct LSM context releaser Add a new lsm_context data structure to hold all the information about a "security context", including the string, its size and which LSM allocated the string. The allocation information is necessary because LSMs have different policies regarding the lifecycle of these strings. SELinux allocates and destroys them on each use, whereas Smack provides a pointer to an entry in a list that never goes away. Update security_release_secctx() to use the lsm_context instead of a (char *, len) pair. Change its callers to do likewise. 
The LSMs supporting this hook have had comments added to remind the developer that there is more work to be done. The BPF security module provides all LSM hooks. While there has yet to be a known instance of a BPF configuration that uses security contexts, the possibility is real. In the existing implementation there is potential for multiple frees in that case. Cc: linux-integrity@vger.kernel.org Cc: netdev@vger.kernel.org Cc: audit@vger.kernel.org Cc: netfilter-devel@vger.kernel.org To: Pablo Neira Ayuso Cc: linux-nfs@vger.kernel.org Cc: Todd Kjos Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index eb2937599cb0..c13df23132eb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -300,7 +300,7 @@ LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) -LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) +LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) diff --git a/include/linux/security.h b/include/linux/security.h index cbdba435b798..68e56935716b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -225,6 +225,37 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif +/* + * A "security context" is the text representation of + * the information 
used by LSMs. + * This structure contains the string, its length, and which LSM + * it is useful for. + */ +struct lsm_context { + char *context; /* Provided by the module */ + u32 len; + int id; /* Identifies the module */ +}; + +/** + * lsmcontext_init - initialize an lsmcontext structure. + * @cp: Pointer to the context to initialize + * @context: Initial context, or NULL + * @size: Size of context, or 0 + * @id: Which LSM provided the context + * + * Fill in the lsmcontext from the provided information. + * This is a scaffolding function that will be removed when + * lsm_context integration is complete. + */ +static inline void lsmcontext_init(struct lsm_context *cp, char *context, + u32 size, int id) +{ + cp->id = id; + cp->context = context; + cp->len = size; +} + /* * Values used in the task_security_ops calls */ @@ -556,7 +587,7 @@ int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); -void security_release_secctx(char *secdata, u32 seclen); +void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); @@ -1545,7 +1576,7 @@ static inline int security_secctx_to_secid(const char *secdata, return -EOPNOTSUPP; } -static inline void security_release_secctx(char *secdata, u32 seclen) +static inline void security_release_secctx(struct lsm_context *cp) { } -- cgit v1.2.3 From 1995edc5f9089ecb8b77a34f21e4abd8f887b856 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:03:54 -0800 Subject: bpf: Consolidate locks and reference state in verifier state Currently, state for RCU read locks and preemption is in 
bpf_verifier_state, while locks and pointer reference state remains in bpf_func_state. There is no particular reason to keep the latter in bpf_func_state. Additionally, it is copied into a new frame's state and copied back to the caller frame's state every time the verifier processes a pseudo call instruction. This is a bit wasteful, given this state is global for a given verification state / path. Move all resource and reference related state in bpf_verifier_state structure in this patch, in preparation for introducing new reference state types in the future. Since we switch print_verifier_state and friends to print using vstate, we now need to explicitly pass in the verifier state from the caller along with the bpf_func_state, so modify the prototype and callers to do so. To ensure func state matches the verifier state when we're printing data, take in frame number instead of bpf_func_state pointer and avoid inconsistencies induced by the caller. Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f4290c179bee..03e351c43fa8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -315,9 +315,6 @@ struct bpf_func_state { u32 callback_depth; /* The following fields should be last. See copy_func_state() */ - int acquired_refs; - int active_locks; - struct bpf_reference_state *refs; /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory.
* stack[0] represents bytes [*(r10-8)..*(r10-1)] @@ -370,6 +367,8 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; struct bpf_verifier_state *parent; + /* Acquired reference states */ + struct bpf_reference_state *refs; /* * 'branches' field is the number of branches left to explore: * 0 - all possible paths from this state reached bpf_exit or @@ -419,9 +418,12 @@ struct bpf_verifier_state { u32 insn_idx; u32 curframe; - bool speculative; + u32 acquired_refs; + u32 active_locks; + u32 active_preempt_locks; bool active_rcu_lock; - u32 active_preempt_lock; + + bool speculative; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. @@ -979,8 +981,9 @@ const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); const char *iter_state_str(enum bpf_iter_state state); -void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state, bool print_all); -void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); +void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, + u32 frameno, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, + u32 frameno); #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From c8e2ee1f3df05dc4caa746c062c6b5791c745c79 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:03:57 -0800 Subject: bpf: Introduce support for bpf_local_irq_{save,restore} Teach the verifier about IRQ-disabled sections through the introduction of two new kfuncs, bpf_local_irq_save, to save IRQ state and disable them, and bpf_local_irq_restore, to restore IRQ state and enable them back again. 
For the purposes of tracking the saved IRQ state, the verifier is taught about a new special object on the stack of type STACK_IRQ_FLAG. This is an 8-byte value which saves the IRQ flags which are to be passed back to the IRQ restore kfunc. Renumber the enums for REF_TYPE_* to simplify the check in find_lock_state; filtering out non-lock types as they grow will become cumbersome and is unnecessary. To track a dynamic number of IRQ-disabled regions and their associated saved states, a new resource type REF_TYPE_IRQ is introduced, with its state management functions: acquire_irq_state and release_irq_state, taking advantage of the refactoring and clean ups made in earlier commits. One notable requirement of the kernel's IRQ save and restore API is that they cannot happen out of order. For this purpose, when releasing a reference we keep track of the prev_id we saw with REF_TYPE_IRQ. Since reference states are inserted in increasing order of the index, this is used to remember the ordering of acquisitions of IRQ saved states, so that we maintain a logical stack in acquisition order of resource identities, and can enforce LIFO ordering when restoring IRQ state. The top of the stack is maintained using bpf_verifier_state's active_irq_id. To maintain the stack property when releasing reference states, we need to modify release_reference_state to shift the remaining array left using memmove instead of swapping the deleted element with the last one, which might break the ordering. A selftest to test this subtle behavior is added in later patches. The logic to detect initialized and uninitialized irq flag slots, marking and unmarking is similar to how it's done for iterators. No additional checks are needed in refsafe for REF_TYPE_IRQ, apart from the usual check_id satisfiability check on the ref[i].id. We have to perform the same check_ids check on state->active_irq_id as well.
To ensure we don't get assigned REF_TYPE_PTR by default after acquire_reference_state, if someone forgets to assign the type, let's also renumber the enum ref_state_type. This way any unassigned types get caught by refsafe's default switch statement, don't assume REF_TYPE_PTR by default. The kfuncs themselves are plain wrappers over local_irq_save and local_irq_restore macros. Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 03e351c43fa8..de09ac3067ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -233,6 +233,7 @@ enum bpf_stack_slot_type { */ STACK_DYNPTR, STACK_ITER, + STACK_IRQ_FLAG, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ @@ -254,8 +255,9 @@ struct bpf_reference_state { * default to pointer reference on zero initialization of a state. */ enum ref_state_type { - REF_TYPE_PTR = 0, - REF_TYPE_LOCK, + REF_TYPE_PTR = 1, + REF_TYPE_IRQ = 2, + REF_TYPE_LOCK = 3, } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). @@ -421,6 +423,7 @@ struct bpf_verifier_state { u32 acquired_refs; u32 active_locks; u32 active_preempt_locks; + u32 active_irq_id; bool active_rcu_lock; bool speculative; -- cgit v1.2.3 From 2d470c778120d3cdb8d8ab250329ca85f49f12b1 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:55 -0700 Subject: lsm: replace context+len with lsm_context Replace the (secctx,seclen) pointer pair with a single lsm_context pointer to allow return of the LSM identifier along with the context and context length. This allows security_release_secctx() to know how to release the context. 
Callers have been modified to use or save the returned data from the new structure. security_secid_to_secctx() and security_lsmprop_to_secctx() will now return the length value on success instead of 0. Cc: netdev@vger.kernel.org Cc: audit@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: Todd Kjos Signed-off-by: Casey Schaufler [PM: subject tweak, kdoc fix, signedness fix from Dan Carpenter] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 5 ++--- include/linux/security.h | 9 ++++----- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index c13df23132eb..01e5a8e09bba 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -295,10 +295,9 @@ LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) -LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, - u32 *seclen) +LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, struct lsm_context *cp) LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop, - char **secdata, u32 *seclen) + struct lsm_context *cp) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) diff --git a/include/linux/security.h b/include/linux/security.h index 68e56935716b..58518bbc00a6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -584,8 +584,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); -int
security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); -int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); +int security_secid_to_secctx(u32 secid, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1557,14 +1557,13 @@ static inline int security_ismaclabel(const char *name) return 0; } -static inline int security_secid_to_secctx(u32 secid, char **secdata, - u32 *seclen) +static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) { return -EOPNOTSUPP; } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - char **secdata, u32 *seclen) + struct lsm_context *cp) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 76ecf306ae5da84ef8f48c7a2608736e6866440c Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:56 -0700 Subject: lsm: use lsm_context in security_inode_getsecctx Change the security_inode_getsecctx() interface to fill a lsm_context structure instead of data and length pointers. This provides the information about which LSM created the context so that security_release_secctx() can use the correct hook. 
Cc: linux-nfs@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/security.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 01e5a8e09bba..69e1076448c6 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -303,8 +303,8 @@ LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) -LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx, - u32 *ctxlen) +LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, + struct lsm_context *cp) #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, diff --git a/include/linux/security.h b/include/linux/security.h index 58518bbc00a6..29f8100bc7c8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -591,7 +591,7 @@ void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); -int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); +int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp); int security_locked_down(enum lockdown_reason what); int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags); @@ -1591,7 +1591,8 @@ static inline int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 { return -EOPNOTSUPP; } -static inline 
int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +static inline int security_inode_getsecctx(struct inode *inode, + struct lsm_context *cp) { return -EOPNOTSUPP; } -- cgit v1.2.3 From b530104f50e86db6f187d39fed5821b3cca755ee Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:57 -0700 Subject: lsm: lsm_context in security_dentry_init_security Replace the (secctx,seclen) pointer pair with a single lsm_context pointer to allow return of the LSM identifier along with the context and context length. This allows security_release_secctx() to know how to release the context. Callers have been modified to use or save the returned data from the new structure. Cc: ceph-devel@vger.kernel.org Cc: linux-nfs@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 26 +++----------------------- 2 files changed, 4 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 69e1076448c6..e2f1ce37c41e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -83,7 +83,7 @@ LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, - void **ctx, u32 *ctxlen) + struct lsm_context *cp) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) diff --git a/include/linux/security.h b/include/linux/security.h index 29f8100bc7c8..980b6c207cad 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -237,25 +237,6 @@ struct lsm_context { int id; /* Identifies the module */ }; -/** - * lsmcontext_init - initialize an lsmcontext structure. 
- * @cp: Pointer to the context to initialize - * @context: Initial context, or NULL - * @size: Size of context, or 0 - * @id: Which LSM provided the context - * - * Fill in the lsmcontext from the provided information. - * This is a scaffolding function that will be removed when - * lsm_context integration is complete. - */ -static inline void lsmcontext_init(struct lsm_context *cp, char *context, - u32 size, int id) -{ - cp->id = id; - cp->context = context; - cp->len = size; -} - /* * Values used in the task_security_ops calls */ @@ -409,8 +390,8 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, - const char **xattr_name, void **ctx, - u32 *ctxlen); + const char **xattr_name, + struct lsm_context *lsmcxt); int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, @@ -883,8 +864,7 @@ static inline int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, - void **ctx, - u32 *ctxlen) + struct lsm_context *lsmcxt) { return -EOPNOTSUPP; } -- cgit v1.2.3 From ed638918f4df39daa458435f0825b487c1f192c8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Oct 2024 11:07:53 -0700 Subject: scsi: Rename .slave_alloc() and .slave_destroy() Rename .slave_alloc() into .sdev_init() and .slave_destroy() into .sdev_destroy(). The new names make it clear that these are actions on SCSI devices. Make this change in the SCSI core, SCSI drivers and also in the ATA drivers. No functionality has been changed. This patch has been created as follows: * Change the text "slave_alloc" into "sdev_init" in all source files except those in drivers/net/ and Documentation/. 
* Change the text "slave_destroy" into "sdev_destroy" in all source files except those in drivers/net/ and Documentation/. * Rename lpfc_no_slave() into lpfc_no_sdev(). * Manually adjust whitespace where necessary to restore vertical alignment (dc395x driver and include/linux/libata.h). Acked-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241022180839.2712439-2-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/linux/libata.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index c1a85d46eba6..fb5e13b8d89f 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1199,10 +1199,10 @@ extern int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); -extern int ata_scsi_slave_alloc(struct scsi_device *sdev); +extern int ata_scsi_sdev_init(struct scsi_device *sdev); int ata_scsi_device_configure(struct scsi_device *sdev, struct queue_limits *lim); -extern void ata_scsi_slave_destroy(struct scsi_device *sdev); +extern void ata_scsi_sdev_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); extern int ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev, @@ -1458,8 +1458,8 @@ extern const struct attribute_group *ata_common_sdev_groups[]; .this_id = ATA_SHT_THIS_ID, \ .emulated = ATA_SHT_EMULATED, \ .proc_name = drv_name, \ - .slave_alloc = ata_scsi_slave_alloc, \ - .slave_destroy = ata_scsi_slave_destroy, \ + .sdev_init = ata_scsi_sdev_init, \ + .sdev_destroy = ata_scsi_sdev_destroy, \ .bios_param = ata_std_bios_param, \ .unlock_native_capacity = ata_scsi_unlock_native_capacity,\ .max_sectors = ATA_MAX_SECTORS_LBA48 -- cgit v1.2.3 From 47c2e30afcec52968e50db01f92dda7d373042cb Mon Sep 17 00:00:00 2001 From: Bart 
Van Assche Date: Tue, 22 Oct 2024 11:07:54 -0700 Subject: scsi: Rename .device_configure() into .sdev_configure() Improve naming consistency with the .sdev_prep() and .sdev_destroy() methods by renaming .device_configure() into .sdev_configure(). Cc: Christoph Hellwig Acked-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241022180839.2712439-3-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/linux/libata.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index fb5e13b8d89f..7717f06a548d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1200,8 +1200,7 @@ extern int ata_std_bios_param(struct scsi_device *sdev, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_sdev_init(struct scsi_device *sdev); -int ata_scsi_device_configure(struct scsi_device *sdev, - struct queue_limits *lim); +int ata_scsi_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim); extern void ata_scsi_sdev_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); @@ -1301,8 +1300,8 @@ extern struct ata_port *ata_port_alloc(struct ata_host *host); extern void ata_port_free(struct ata_port *ap); extern int ata_tport_add(struct device *parent, struct ata_port *ap); extern void ata_tport_delete(struct ata_port *ap); -int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim, - struct ata_port *ap); +int ata_sas_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim, + struct ata_port *ap); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); extern void ata_tf_to_fis(const struct ata_taskfile *tf, u8 pmp, int is_cmd, u8 *fis); @@ -1468,13 +1467,13 @@ extern const struct attribute_group *ata_common_sdev_groups[]; 
__ATA_BASE_SHT(drv_name), \ .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ - .device_configure = ata_scsi_device_configure + .sdev_configure = ata_scsi_sdev_configure #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd) \ __ATA_BASE_SHT(drv_name), \ .can_queue = drv_qd, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ - .device_configure = ata_scsi_device_configure + .sdev_configure = ata_scsi_sdev_configure #define ATA_BASE_SHT(drv_name) \ ATA_SUBBASE_SHT(drv_name), \ -- cgit v1.2.3 From a61b19f4a6586590a9ae6baf2ac4a25a852e547f Mon Sep 17 00:00:00 2001 From: Maksym Kutsevol Date: Mon, 2 Dec 2024 11:55:07 -0800 Subject: netpoll: Make netpoll_send_udp return status instead of void netpoll_send_udp can now report whether the send was successful. It will allow client code to be aware of the send status. Possible return values are the result of __netpoll_send_skb (cast to int) and -ENOMEM. This doesn't cover the case when TX was not successful instantaneously and was scheduled for later; __netpoll_send_skb returns success in that case.
Signed-off-by: Maksym Kutsevol Link: https://patch.msgid.link/20241202-netcons-add-udp-send-fail-statistics-to-netconsole-v5-1-70e82239f922@kutsevol.com Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b34301650c47..f91e50a76efd 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -57,7 +57,7 @@ static inline void netpoll_poll_disable(struct net_device *dev) { return; } static inline void netpoll_poll_enable(struct net_device *dev) { return; } #endif -void netpoll_send_udp(struct netpoll *np, const char *msg, int len); +int netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); int __netpoll_setup(struct netpoll *np, struct net_device *ndev); -- cgit v1.2.3 From b4c7698dd95f253c6958d8c6ac219098009bf28a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:02 +0000 Subject: net: phy: add phy_inband_caps() Add a method to query the PHY's in-band capabilities for a PHY interface mode. Where the interface mode does not have in-band capability, or the PHY driver has not been updated to return this information, then phy_inband_caps() should return zero. Otherwise, PHY drivers will return a value consisting of the following flags: LINK_INBAND_DISABLE indicates that the hardware does not support in-band signalling, or can have in-band signalling configured via software to be disabled. LINK_INBAND_ENABLE indicates that the hardware will use in-band signalling, or can have in-band signalling configured via software to be enabled. LINK_INBAND_BYPASS indicates that the hardware has the ability to bypass in-band signalling when enabled after a timeout if the link partner does not respond to its in-band signalling. 
This reports the PHY capabilities for the particular interface mode, not the current configuration. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUre-006ITz-KF@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 563c46205685..ccb93d892da9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -817,6 +817,24 @@ struct phy_tdr_config { }; #define PHY_PAIR_ALL -1 +/** + * enum link_inband_signalling - in-band signalling modes that are supported + * + * @LINK_INBAND_DISABLE: in-band signalling can be disabled + * @LINK_INBAND_ENABLE: in-band signalling can be enabled without bypass + * @LINK_INBAND_BYPASS: in-band signalling can be enabled with bypass + * + * The possible and required bits can only be used if the valid bit is set. + * If possible is clear, that means inband signalling can not be used. + * Required is only valid when possible is set, and means that inband + * signalling must be used. + */ +enum link_inband_signalling { + LINK_INBAND_DISABLE = BIT(0), + LINK_INBAND_ENABLE = BIT(1), + LINK_INBAND_BYPASS = BIT(2), +}; + /** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. @@ -956,6 +974,14 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @inband_caps: query whether in-band is supported for the given PHY + * interface mode. Returns a bitmask of bits defined by enum + * link_inband_signalling. + */ + unsigned int (*inband_caps)(struct phy_device *phydev, + phy_interface_t interface); + /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. 
This is used by phy consumers to determine @@ -1818,6 +1844,8 @@ int phy_config_aneg(struct phy_device *phydev); int _phy_start_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); +unsigned int phy_inband_caps(struct phy_device *phydev, + phy_interface_t interface); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); -- cgit v1.2.3 From 5d58a890c02770ba8d790b1f3c6e8c0e20514dc2 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:18 +0000 Subject: net: phy: add phy_config_inband() Add a method to configure the PHY's in-band mode. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUru-006IUI-08@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index ccb93d892da9..61a1bc81f597 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -982,6 +982,11 @@ struct phy_driver { unsigned int (*inband_caps)(struct phy_device *phydev, phy_interface_t interface); + /** + * @config_inband: configure in-band mode for the PHY + */ + int (*config_inband)(struct phy_device *phydev, unsigned int modes); + /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. 
This is used by phy consumers to determine @@ -1846,6 +1851,7 @@ int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); unsigned int phy_inband_caps(struct phy_device *phydev, phy_interface_t interface); +int phy_config_inband(struct phy_device *phydev, unsigned int modes); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); -- cgit v1.2.3 From df874f9e52c340cc6f0a0014a97b778f67d46849 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:28 +0000 Subject: net: phylink: add pcs_inband_caps() method Add a pcs_inband_caps() method to query the PCS for its inband link capabilities, and use this to determine whether link modes used with optical SFPs can be supported. When a PCS does not provide a method, we allow inband negotiation to be either on or off, making this a no-op until the pcs_inband_caps() method is implemented by a PCS driver. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUs4-006IUU-7K@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5c01048860c4..5462cc6a37dc 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -419,6 +419,7 @@ struct phylink_pcs { /** * struct phylink_pcs_ops - MAC PCS operations structure. * @pcs_validate: validate the link configuration. + * @pcs_inband_caps: query inband support for interface mode. * @pcs_enable: enable the PCS. * @pcs_disable: disable the PCS. 
* @pcs_pre_config: pre-mac_config method (for errata) @@ -434,6 +435,8 @@ struct phylink_pcs { struct phylink_pcs_ops { int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); + unsigned int (*pcs_inband_caps)(struct phylink_pcs *pcs, + phy_interface_t interface); int (*pcs_enable)(struct phylink_pcs *pcs); void (*pcs_disable)(struct phylink_pcs *pcs); void (*pcs_pre_config)(struct phylink_pcs *pcs, @@ -470,6 +473,20 @@ struct phylink_pcs_ops { int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); +/** + * pcs_inband_caps - query PCS in-band capabilities for interface mode. + * @pcs: a pointer to a &struct phylink_pcs. + * @interface: interface mode to be queried + * + * Returns zero if it is unknown what in-band signalling is supported by the + * PHY (e.g. because the PHY driver doesn't implement the method.) Otherwise, + * returns a bit mask of the LINK_INBAND_* values from + * &enum link_inband_signalling to describe which inband modes are supported + * for this interface mode. + */ +unsigned int pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface); + /** * pcs_enable() - enable the PCS. * @pcs: a pointer to a &struct phylink_pcs. -- cgit v1.2.3 From e05feab22fd7dabcd6d272c4e2401ec1acdfdb9b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 3 Dec 2024 15:45:37 +0200 Subject: RDMA/mlx5: Enforce same type port association for multiport RoCE Different core device types such as PFs and VFs shouldn't be affiliated together since they have different capabilities, fix that by enforcing type check before doing the affiliation. 
Fixes: 32f69e4be269 ("{net, IB}/mlx5: Manage port association for multiport RoCE") Reviewed-by: Mark Bloch Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/88699500f690dff1c1852c1ddb71f8a1cc8b956e.1733233480.git.leonro@nvidia.com Reviewed-by: Mateusz Polchlopek Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..4f9e6f6dbaab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1202,6 +1202,12 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } +static inline bool mlx5_core_same_coredev_type(const struct mlx5_core_dev *dev1, + const struct mlx5_core_dev *dev2) +{ + return dev1->coredev_type == dev2->coredev_type; +} + static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; -- cgit v1.2.3 From f9a5b34f9251cf530fecf08ef039be64ead8c459 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 5 Dec 2024 00:09:21 +0200 Subject: net/mlx5: ifc: Reorganize mlx5_ifc_flow_table_context_bits The nested union at the end is not in the same style as the rest of the code, so un-nest it to make the style uniformly applied again. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..f3650f989e68 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6324,6 +6324,20 @@ struct mlx5_ifc_modify_other_hca_cap_in_bits { struct mlx5_ifc_other_hca_cap_bits other_capability; }; +struct mlx5_ifc_sw_owner_icm_root_params_bits { + u8 sw_owner_icm_root_1[0x40]; + + u8 sw_owner_icm_root_0[0x40]; +}; + +struct mlx5_ifc_rtc_params_bits { + u8 rtc_id_0[0x20]; + + u8 rtc_id_1[0x20]; + + u8 reserved_at_40[0x40]; +}; + struct mlx5_ifc_flow_table_context_bits { u8 reformat_en[0x1]; u8 decap_en[0x1]; @@ -6342,20 +6356,10 @@ struct mlx5_ifc_flow_table_context_bits { u8 lag_master_next_table_id[0x18]; u8 reserved_at_60[0x60]; - union { - struct { - u8 sw_owner_icm_root_1[0x40]; - - u8 sw_owner_icm_root_0[0x40]; - } sws; - struct { - u8 rtc_id_0[0x20]; - u8 rtc_id_1[0x20]; - - u8 reserved_at_100[0x40]; - - } hws; + union { + struct mlx5_ifc_sw_owner_icm_root_params_bits sws; + struct mlx5_ifc_rtc_params_bits hws; }; }; -- cgit v1.2.3 From e799ac9dd3c485a7cda3586f2a12784b030b9df0 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 5 Dec 2024 00:09:22 +0200 Subject: net/mlx5: Add ConnectX-8 device to ifc In preparation for ConnectX-8 SWS support, add enum for the new device type. 
Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3650f989e68..bd9b1833408e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1590,6 +1590,7 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_5 = 0, MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, MLX5_STEERING_FORMAT_CONNECTX_7 = 2, + MLX5_STEERING_FORMAT_CONNECTX_8 = 3, }; struct mlx5_ifc_cmd_hca_cap_bits { -- cgit v1.2.3 From 03713108e0cccf325bb71941edd9ed6122142907 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 5 Dec 2024 00:09:23 +0200 Subject: net/mlx5: Add support for new scheduling elements Introduce new scheduling elements in the E-Switch QoS hierarchy to enhance traffic management capabilities. This patch adds support for: - Rate Limit scheduling elements: Enables bandwidth limitation across multiple nodes without a shared ancestor, providing a mechanism for more granular control of bandwidth allocation. - Traffic Class Transmit Scheduling Arbiter (TSAR): Introduces the infrastructure for creating Traffic Class TSARs, allowing hierarchical arbitration based on traffic classes. - Traffic Class Arbiter TSAR: Adds support for a TSAR capable of managing arbitration between multiple traffic classes, enabling improved bandwidth prioritization and traffic management. No functional changes are introduced in this patch. 
Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-4-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bd9b1833408e..8b202521b774 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1103,7 +1103,8 @@ struct mlx5_ifc_qos_cap_bits { u8 packet_pacing_min_rate[0x20]; - u8 reserved_at_80[0x10]; + u8 reserved_at_80[0xb]; + u8 log_esw_max_rate_limit[0x5]; u8 packet_pacing_rate_table_size[0x10]; u8 esw_element_type[0x10]; @@ -4104,6 +4105,7 @@ enum { SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2, SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3, SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP = 0x4, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT = 0x5, }; enum { @@ -4112,22 +4114,26 @@ enum { ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, + ELEMENT_TYPE_CAP_MASK_RATE_LIMIT = 1 << 5, }; enum { TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, + TSAR_ELEMENT_TSAR_TYPE_TC_ARB = 0x3, }; enum { TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, TSAR_TYPE_CAP_MASK_ETS = 1 << 2, + TSAR_TYPE_CAP_MASK_TC_ARB = 1 << 3, }; struct mlx5_ifc_tsar_element_bits { - u8 reserved_at_0[0x8]; + u8 traffic_class[0x4]; + u8 reserved_at_4[0x4]; u8 tsar_type[0x8]; u8 reserved_at_10[0x10]; }; @@ -4164,7 +4170,9 @@ struct mlx5_ifc_scheduling_context_bits { u8 max_average_bw[0x20]; - u8 reserved_at_e0[0x120]; + u8 max_bw_obj_id[0x20]; + + u8 reserved_at_100[0x100]; }; struct mlx5_ifc_rqtc_bits { -- cgit v1.2.3 From f09ed834a946f9c77088d53af4d4806974728d7b Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 5 
Dec 2024 00:09:24 +0200 Subject: net/mlx5: qos: Add ifc support for cross-esw scheduling This adds the capability bit and the vport element fields related to cross-esw scheduling. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-5-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8b202521b774..5451ff1d4356 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1095,7 +1095,9 @@ struct mlx5_ifc_qos_cap_bits { u8 log_esw_max_sched_depth[0x4]; u8 reserved_at_10[0x10]; - u8 reserved_at_20[0xb]; + u8 reserved_at_20[0x9]; + u8 esw_cross_esw_sched[0x1]; + u8 reserved_at_2a[0x1]; u8 log_max_qos_nic_queue_group[0x5]; u8 reserved_at_30[0x10]; @@ -4139,13 +4141,16 @@ struct mlx5_ifc_tsar_element_bits { }; struct mlx5_ifc_vport_element_bits { - u8 reserved_at_0[0x10]; + u8 reserved_at_0[0x4]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; struct mlx5_ifc_vport_tc_element_bits { u8 traffic_class[0x4]; - u8 reserved_at_4[0xc]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; -- cgit v1.2.3 From 6d9d6ab3a82af50e36e13e7bc8e2d1b970e39f79 Mon Sep 17 00:00:00 2001 From: Takahiro Kuwano Date: Tue, 3 Dec 2024 11:46:49 +0900 Subject: mtd: spinand: Introduce a way to avoid raw access SkyHigh spinand device has ECC enable bit in configuration register but it must be always enabled. If ECC is disabled, read and write ops results in undetermined state. For such devices, a way to avoid raw access is needed. Introduce SPINAND_NO_RAW_ACCESS flag to advertise the device does not support raw access. In such devices, the on-die ECC engine ops returns error to I/O request in raw mode. 
Checking and marking BBM need to be handled as a special case, by adding a fallback mechanism that tries to read/write OOB with ECC enabled. Signed-off-by: Takahiro Kuwano Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 702e5fb13dae..5cf11005b41a 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -314,6 +314,7 @@ struct spinand_ecc_info { #define SPINAND_HAS_CR_FEAT_BIT BIT(1) #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) +#define SPINAND_NO_RAW_ACCESS BIT(4) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From 1a50e3612de9187857f55ee14a573f7f8e7d4ebc Mon Sep 17 00:00:00 2001 From: Takahiro Kuwano Date: Tue, 3 Dec 2024 11:46:50 +0900 Subject: mtd: spinand: Add support for SkyHigh S35ML-3 family SkyHigh S35ML01G300, S35ML01G301, S35ML02G300, and S35ML04G300 are a 1Gb, 2Gb, and 4Gb SLC SPI NAND flash family. This family of devices has on-die ECC whose parity bits are stored in a hidden area. In this family the on-die ECC cannot be disabled so raw access needs to be prevented. 
Link: https://www.skyhighmemory.com/download/SPI_S35ML01_04G3_002_19205.pdf?v=P Co-developed-by: KR Kim Signed-off-by: KR Kim Signed-off-by: Takahiro Kuwano Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 5cf11005b41a..cbbcd44ac225 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -268,6 +268,7 @@ extern const struct spinand_manufacturer gigadevice_spinand_manufacturer; extern const struct spinand_manufacturer macronix_spinand_manufacturer; extern const struct spinand_manufacturer micron_spinand_manufacturer; extern const struct spinand_manufacturer paragon_spinand_manufacturer; +extern const struct spinand_manufacturer skyhigh_spinand_manufacturer; extern const struct spinand_manufacturer toshiba_spinand_manufacturer; extern const struct spinand_manufacturer winbond_spinand_manufacturer; extern const struct spinand_manufacturer xtx_spinand_manufacturer; -- cgit v1.2.3 From 7cd1107f48e2a246c6a628c2381e1b8aafa4675a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:25 +0100 Subject: bpf, xdp: constify some bpf_prog * function arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In lots of places, bpf_prog pointer is used only for tracing or other stuff that doesn't modify the structure itself. Same for net_device. Address at least some of them and add `const` attributes there. The object code didn't change, but that may prevent unwanted data modifications and also allow more helpers to have const arguments. 
Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Jakub Kicinski --- include/linux/bpf.h | 12 ++++++------ include/linux/filter.h | 9 +++++---- include/linux/netdevice.h | 6 +++--- include/linux/skbuff.h | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c..ec3acb16359e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2591,10 +2591,10 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog); + const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress); + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -2864,15 +2864,15 @@ struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { return 0; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 3a21947f2fd4..9a5d23ae3855 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1179,17 +1179,18 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, * This does not appear to be a real limitation for existing software. 
*/ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *prog); + struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *prog); + const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, - struct bpf_prog *prog); + const struct bpf_prog *prog); void xdp_do_flush(void); -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ecc686409161..ecca21387a68 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3958,9 +3958,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog); -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); + const struct bpf_prog *xdp_prog); +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 58009fa66102..95452d1a07fc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3627,7 +3627,7 @@ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog); + const struct bpf_prog 
*prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment -- cgit v1.2.3 From e77d9aee951341119be16a991fcfc76d1154d22a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 3 Dec 2024 18:37:29 +0100 Subject: xdp: register system page pool as an XDP memory model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make the system page pool usable as a source for allocating XDP frames, we need to register it with xdp_reg_mem_model(), so that page return works correctly. This is done in preparation for using the system page_pool to convert XDP_PASS XSk frames to skbs; for the same reason, make the per-cpu variable non-static so we can access it from other source files as well (but w/o exporting). Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-7-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ecca21387a68..d1a8d98b132c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3322,6 +3322,7 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DECLARE_PER_CPU(struct page_pool *, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) -- cgit v1.2.3 From 8f1c716090a7ed20fea802b63b37758169d59b81 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 10:42:10 +0000 Subject: net: phy: remove genphy_c45_eee_is_active()'s is_enabled arg All callers to genphy_c45_eee_is_active() now pass NULL as the is_enabled argument, which means we never use the value computed in this function. Remove the argument and clean up this function. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tJ9JC-006LIt-Ne@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 61a1bc81f597..bb157136351e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1991,7 +1991,7 @@ int genphy_c45_plca_set_cfg(struct phy_device *phydev, int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, - unsigned long *lp, bool *is_enabled); + unsigned long *lp); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, -- cgit v1.2.3 From 18eabadd73ae60023ab05e376246bd725fb0c113 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 4 Dec 2024 13:11:21 +0100 Subject: vrf: Make pcpu_dstats update functions available to other modules. Currently vrf is the only module that uses NETDEV_PCPU_STAT_DSTATS. In order to make this kind of statistics available to other modules, we need to define the update functions in netdevice.h. Therefore, let's define dev_dstats_*() functions for RX and TX packet updates (packets, bytes and drops). Use these new functions in vrf.c instead of vrf_rx_stats() and the other manual counter updates. While there, update the type of the "len" variables to "unsigned int", so that they're aligned with both skb->len and the new dstats update functions. 
Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/d7a552ee382c79f4854e7fcc224cf176cd21150d.1733313925.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a8d98b132c..135105441681 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2854,6 +2854,46 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) u64_stats_update_end(&lstats->syncp); } +static inline void dev_dstats_rx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_packets); + u64_stats_add(&dstats->rx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_rx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_drops); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_packets); + u64_stats_add(&dstats->tx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_drops); + u64_stats_update_end(&dstats->syncp); +} + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ -- cgit v1.2.3 From cb3e9a446763da8850c8280be009d95f61e11a94 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 22 Nov 2024 11:42:48 
-0600 Subject: iio: adc: ad_sigma_delta: add tab to align irq_line Align the irq_line field in struct ad_sigma_delta with the other fields. Signed-off-by: David Lechner Link: https://patch.msgid.link/20241122-iio-adc-ad_signal_delta-fix-align-v1-1-d0a071d2dc83@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index f8c1d2505940..1851f8fed3a4 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -96,7 +96,7 @@ struct ad_sigma_delta { unsigned int active_slots; unsigned int current_slot; unsigned int num_slots; - int irq_line; + int irq_line; bool status_appended; /* map slots to channels in order to know what to expect from devices */ unsigned int *slots; -- cgit v1.2.3 From 0c39208bc3af6f7afcbcdb675ad8fbe6b3022865 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 25 Nov 2024 15:35:08 +0200 Subject: iio: imu: adis: Remove documented not used elements This patch removes elements from adis.h that are documented but not used anymore. Signed-off-by: Robert Budai Link: https://patch.msgid.link/20241125133520.24328-2-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index e6a75356567a..4bb98d9731de 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -99,7 +99,6 @@ struct adis_data { * @spi: Reference to SPI device which owns this ADIS IIO device * @trig: IIO trigger object data * @data: ADIS chip variant specific data - * @burst: ADIS burst transfer information * @burst_extra_len: Burst extra length. Should only be used by devices that can * dynamically change their burst mode length. 
* @state_lock: Lock used by the device to protect state -- cgit v1.2.3 From e895f2edfe4820b671c700ccc17be35b1de3d295 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:19 +0100 Subject: iio: core: fix doc reference to iio_push_to_buffers_with_ts_unaligned Use the right name of the function, which is defined in drivers/iio/industrialio-buffer.c Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-11-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index a89e7e43e441..4247497f3f8b 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -28,7 +28,7 @@ * @groupcounter: index of next attribute group * @legacy_scan_el_group: attribute group for legacy scan elements attribute group * @legacy_buffer_group: attribute group for legacy buffer attributes group - * @bounce_buffer: for devices that call iio_push_to_buffers_with_timestamp_unaligned() + * @bounce_buffer: for devices that call iio_push_to_buffers_with_ts_unaligned() * @bounce_buffer_size: size of currently allocate bounce buffer * @scan_index_timestamp: cache of the index to the timestamp * @clock_id: timestamping clock posix identifier -- cgit v1.2.3 From ebe559609d7829b52c6642b581860760984faf9d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2024 10:30:14 -0500 Subject: fs: get rid of __FMODE_NONOTIFY kludge All it takes to get rid of the __FMODE_NONOTIFY kludge is switching fanotify from anon_inode_getfd() to anon_inode_getfile_fmode() and adding a dentry_open_nonotify() helper to be used by fanotify on the other path. That's it - no more weird shit in OPEN_FMODE(), etc. 
Signed-off-by: Al Viro Link: https://lore.kernel.org/linux-fsdevel/20241113043003.GH3387508@ZenIV/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/d1231137e7b661a382459e79a764259509a4115d.1731684329.git.josef@toxicpanda.com --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..93c2b720271e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2751,6 +2751,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); @@ -3707,11 +3709,9 @@ struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & __FMODE_NONOTIFY))) +#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { -- cgit v1.2.3 From e419ddeabe7edd89650a19f411f928eea12b35b1 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Mon, 13 Nov 2023 23:32:09 +1000 Subject: m68k: Use kernel's generic muldi3 libgcc function Use the kernels own generic lib/muldi3.c implementation of muldi3 for 68K machines. Some 68K CPUs support 64bit multiplies so move the arch specific umul_ppmm() macro into a header file that is included by lib/muldi3.c. That way it can take advantage of the single instruction when available. 
There does not appear to be any existing mechanism for the generic lib/muldi3.c code to pick up an external arch definition of umul_ppmm(). Create an arch specific libgcc.h that can optionally be included by the system include/linux/libgcc.h to allow for this. Somewhat oddly there is also a similar definition of umul_ppmm() in the non-architecture code in lib/crypto/mpi/longlong.h for a wide range of machines. Its presence ends up complicating the include setup and means not being able to use something like compiler.h instead. Actually there are a few other defines of umul_ppmm() macros spread around in various architectures, but not directly usable for the m68k case. Signed-off-by: Greg Ungerer Link: https://lore.kernel.org/20231113133209.1367286-1-gerg@linux-m68k.org Reviewed-by: Geert Uytterhoeven Reviewed-by: Arnd Bergmann Signed-off-by: Geert Uytterhoeven --- include/linux/libgcc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libgcc.h b/include/linux/libgcc.h index fc388da6a027..0d68f9d6a6a7 100644 --- a/include/linux/libgcc.h +++ b/include/linux/libgcc.h @@ -34,4 +34,8 @@ long long notrace __lshrdi3(long long u, word_type b); long long notrace __muldi3(long long u, long long v); word_type notrace __ucmpdi2(unsigned long long a, unsigned long long b); +#ifdef CONFIG_HAVE_ARCH_LIBGCC_H +#include +#endif + #endif /* __ASM_LIBGCC_H */ -- cgit v1.2.3 From 017b76fb8e5b6066f6791e7ad2387deb2c9c9a14 Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Thu, 5 Dec 2024 11:51:12 -0500 Subject: regulator: pca9450: Add PMIC pca9452 support Add the PMIC pca9452 support, which adds ldo3 compared with pca9451a. 
Signed-off-by: Joy Zou Signed-off-by: Frank Li Link: https://patch.msgid.link/20241205-pca9450-v1-4-aab448b74e78@nxp.com Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 243633c8dceb..b427b5873de1 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -10,6 +10,7 @@ enum pca9450_chip_type { PCA9450_TYPE_PCA9450A = 0, PCA9450_TYPE_PCA9450BC, PCA9450_TYPE_PCA9451A, + PCA9450_TYPE_PCA9452, PCA9450_TYPE_AMOUNT, }; -- cgit v1.2.3 From 2ff913ab3f472321ac1931b663314edd6c211a0c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:14 -0800 Subject: uprobes: Simplify session consumer tracking In practice, each return_instance will typically contain either zero or one return_consumer, depending on whether it has any uprobe session consumer attached or not. It's highly unlikely that more than one uprobe session consumers will be attached to any given uprobe, so there is no need to optimize for that case. But the way we currently do memory allocation and accounting is by pre-allocating the space for 4 session consumers in contiguous block of memory next to struct return_instance fixed part. This is unnecessarily wasteful. This patch changes this to keep struct return_instance fixed-sized with one pre-allocated return_consumer, while (in a highly unlikely scenario) allowing for more session consumers in a separate dynamically allocated and reallocated array. We also simplify accounting a bit by not maintaining a separate temporary capacity for consumers array, and, instead, relying on krealloc() to be a no-op if underlying memory can accommodate a slightly bigger allocation (but again, it's very uncommon scenario to even have to do this reallocation). 
All this gets rid of ri_size(), simplifies push_consumer() and removes confusing ri->consumers_cnt re-assignment, while keeping this singular preallocated consumer logic contained within a few simple preexisting helpers. Having fixed-sized struct return_instance simplifies and speeds up return_instance reuse that we ultimately add later in this patch set, see follow up patches. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-2-andrii@kernel.org --- include/linux/uprobes.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e0a4c2082245..1d449978558d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -154,12 +154,18 @@ struct return_instance { unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ - int consumers_cnt; + int cons_cnt; /* total number of session consumers */ struct return_instance *next; /* keep as stack */ struct rcu_head rcu; - struct return_consumer consumers[] __counted_by(consumers_cnt); + /* singular pre-allocated return_consumer instance for common case */ + struct return_consumer consumer; + /* + * extra return_consumer instances for rare cases of multiple session consumers, + * contains (cons_cnt - 1) elements + */ + struct return_consumer *extra_consumers; } ____cacheline_aligned; enum rp_check { -- cgit v1.2.3 From 8622e45b5da17e777e0e45f16296072494452318 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:17 -0800 Subject: uprobes: Reuse return_instances between multiple uretprobes within task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of constantly allocating and freeing very short-lived struct return_instance, reuse it 
as much as possible within current task. For that, store a linked list of reusable return_instances within current->utask. The only complication is that ri_timer() might be still processing such return_instance. And so while the main uretprobe processing logic might be already done with return_instance and would be OK to immediately reuse it for the next uretprobe instance, it's not correct to unconditionally reuse it just like that. Instead we make sure that ri_timer() can't possibly be processing it by using seqcount_t, with ri_timer() being "a writer", while free_ret_instance() being "a reader". If, after we unlink return instance from utask->return_instances list, we know that ri_timer() hasn't gotten to processing utask->return_instances yet, then we can be sure that immediate return_instance reuse is OK, and so we put it onto utask->ri_pool for future (potentially, almost immediate) reuse. This change shows improvements both in single CPU performance (by avoiding relatively expensive kmalloc/free combo) and in terms of multi-CPU scalability, where you can see that per-CPU throughput doesn't decline as steeply with increased number of CPUs (which were previously attributed to kmalloc()/free() through profiling): BASELINE (latest perf/core) =========================== uretprobe-nop ( 1 cpus): 1.898 ± 0.002M/s ( 1.898M/s/cpu) uretprobe-nop ( 2 cpus): 3.574 ± 0.011M/s ( 1.787M/s/cpu) uretprobe-nop ( 3 cpus): 5.279 ± 0.066M/s ( 1.760M/s/cpu) uretprobe-nop ( 4 cpus): 6.824 ± 0.047M/s ( 1.706M/s/cpu) uretprobe-nop ( 5 cpus): 8.339 ± 0.060M/s ( 1.668M/s/cpu) uretprobe-nop ( 6 cpus): 9.812 ± 0.047M/s ( 1.635M/s/cpu) uretprobe-nop ( 7 cpus): 11.030 ± 0.048M/s ( 1.576M/s/cpu) uretprobe-nop ( 8 cpus): 12.453 ± 0.126M/s ( 1.557M/s/cpu) uretprobe-nop (10 cpus): 14.838 ± 0.044M/s ( 1.484M/s/cpu) uretprobe-nop (12 cpus): 17.092 ± 0.115M/s ( 1.424M/s/cpu) uretprobe-nop (14 cpus): 19.576 ± 0.022M/s ( 1.398M/s/cpu) uretprobe-nop (16 cpus): 22.264 ± 0.015M/s ( 1.391M/s/cpu) 
uretprobe-nop (24 cpus): 33.534 ± 0.078M/s ( 1.397M/s/cpu) uretprobe-nop (32 cpus): 43.262 ± 0.127M/s ( 1.352M/s/cpu) uretprobe-nop (40 cpus): 53.252 ± 0.080M/s ( 1.331M/s/cpu) uretprobe-nop (48 cpus): 55.778 ± 0.045M/s ( 1.162M/s/cpu) uretprobe-nop (56 cpus): 56.850 ± 0.227M/s ( 1.015M/s/cpu) uretprobe-nop (64 cpus): 62.005 ± 0.077M/s ( 0.969M/s/cpu) uretprobe-nop (72 cpus): 66.445 ± 0.236M/s ( 0.923M/s/cpu) uretprobe-nop (80 cpus): 68.353 ± 0.180M/s ( 0.854M/s/cpu) THIS PATCHSET (on top of latest perf/core) ========================================== uretprobe-nop ( 1 cpus): 2.253 ± 0.004M/s ( 2.253M/s/cpu) uretprobe-nop ( 2 cpus): 4.281 ± 0.003M/s ( 2.140M/s/cpu) uretprobe-nop ( 3 cpus): 6.389 ± 0.027M/s ( 2.130M/s/cpu) uretprobe-nop ( 4 cpus): 8.328 ± 0.005M/s ( 2.082M/s/cpu) uretprobe-nop ( 5 cpus): 10.353 ± 0.001M/s ( 2.071M/s/cpu) uretprobe-nop ( 6 cpus): 12.513 ± 0.010M/s ( 2.086M/s/cpu) uretprobe-nop ( 7 cpus): 14.525 ± 0.017M/s ( 2.075M/s/cpu) uretprobe-nop ( 8 cpus): 15.633 ± 0.013M/s ( 1.954M/s/cpu) uretprobe-nop (10 cpus): 19.532 ± 0.011M/s ( 1.953M/s/cpu) uretprobe-nop (12 cpus): 21.405 ± 0.009M/s ( 1.784M/s/cpu) uretprobe-nop (14 cpus): 24.857 ± 0.020M/s ( 1.776M/s/cpu) uretprobe-nop (16 cpus): 26.466 ± 0.018M/s ( 1.654M/s/cpu) uretprobe-nop (24 cpus): 40.513 ± 0.222M/s ( 1.688M/s/cpu) uretprobe-nop (32 cpus): 54.180 ± 0.074M/s ( 1.693M/s/cpu) uretprobe-nop (40 cpus): 66.100 ± 0.082M/s ( 1.652M/s/cpu) uretprobe-nop (48 cpus): 70.544 ± 0.068M/s ( 1.470M/s/cpu) uretprobe-nop (56 cpus): 74.494 ± 0.055M/s ( 1.330M/s/cpu) uretprobe-nop (64 cpus): 79.317 ± 0.029M/s ( 1.239M/s/cpu) uretprobe-nop (72 cpus): 84.875 ± 0.020M/s ( 1.179M/s/cpu) uretprobe-nop (80 cpus): 92.318 ± 0.224M/s ( 1.154M/s/cpu) For reference, with uprobe-nop we hit the following throughput: uprobe-nop (80 cpus): 143.485 ± 0.035M/s ( 1.794M/s/cpu) So now uretprobe stays a bit closer to that performance. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-5-andrii@kernel.org --- include/linux/uprobes.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 1d449978558d..b1df7d792fa1 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -124,6 +125,10 @@ struct uprobe_task { unsigned int depth; struct return_instance *return_instances; + struct return_instance *ri_pool; + struct timer_list ri_timer; + seqcount_t ri_seqcount; + union { struct { struct arch_uprobe_task autask; @@ -137,7 +142,6 @@ struct uprobe_task { }; struct uprobe *active_uprobe; - struct timer_list ri_timer; unsigned long xol_vaddr; struct arch_uprobe *auprobe; -- cgit v1.2.3 From 6057b90ecc84f232dd32a047a086a4c4c271765f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:40 -0800 Subject: perf/core: Export perf_exclude_event() While at it, rename the same function in s390 cpum_sf PMU. 
Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Acked-by: Thomas Richter Link: https://lore.kernel.org/r/20241203180441.1634709-2-namhyung@kernel.org --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bf831b1485ff..8333f132f4a9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1690,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1895,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -- cgit v1.2.3 From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... 
kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() -> kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe() -> vmbus_open() may not have initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening.
Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb91042..02a226bcf0ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); -- cgit v1.2.3 From df8e78607d4795806b59564ba7a3e2e125d119fc Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Wed, 4 Dec 2024 10:43:16 +0100 Subject: memory: ti-aemif: Export aemif_*_cs_timings() Export the aemif_set_cs_timings() and aemif_check_cs_timings() symbols so they can be used by other drivers. Add a mutex to protect the CS configuration register from concurrent accesses between the AEMIF and its 'children'.
Signed-off-by: Bastien Curutchet Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20241204094319.1050826-7-bastien.curutchet@bootlin.com [krzysztof: wrap aemif_set_cs_timings() at 80-char] Signed-off-by: Krzysztof Kozlowski --- include/linux/memory/ti-aemif.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 include/linux/memory/ti-aemif.h (limited to 'include/linux') diff --git a/include/linux/memory/ti-aemif.h b/include/linux/memory/ti-aemif.h new file mode 100644 index 000000000000..da94a9d985e7 --- /dev/null +++ b/include/linux/memory/ti-aemif.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __MEMORY_TI_AEMIF_H +#define __MEMORY_TI_AEMIF_H + +/** + * struct aemif_cs_timings: structure to hold CS timing configuration + * values are expressed in number of clock cycles - 1 + * @ta: minimum turn around time + * @rhold: read hold width + * @rstrobe: read strobe width + * @rsetup: read setup width + * @whold: write hold width + * @wstrobe: write strobe width + * @wsetup: write setup width + */ +struct aemif_cs_timings { + u32 ta; + u32 rhold; + u32 rstrobe; + u32 rsetup; + u32 whold; + u32 wstrobe; + u32 wsetup; +}; + +struct aemif_device; + +int aemif_set_cs_timings(struct aemif_device *aemif, u8 cs, struct aemif_cs_timings *timings); +int aemif_check_cs_timings(struct aemif_cs_timings *timings); + +#endif // __MEMORY_TI_AEMIF_H -- cgit v1.2.3 From d1fd972914239996dbd15c5142d7f6e09d95a002 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:29 +0000 Subject: ktime: Add us_to_ktime() Add a us_to_ktime() helper to go with ms_to_ktime() and ns_to_ktime(). 
Signed-off-by: David Howells cc: Thomas Gleixner cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 3a4e723eae0f..383ed9985802 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -222,6 +222,11 @@ static inline ktime_t ns_to_ktime(u64 ns) return ns; } +static inline ktime_t us_to_ktime(u64 us) +{ + return us * NSEC_PER_USEC; +} + static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; -- cgit v1.2.3 From 3f330db30638b6489d548084a7e8843374d41ad0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 Dec 2024 08:59:14 -0800 Subject: net: reformat kdoc return statements kernel-doc -Wall warns about missing Return: statement for non-void functions. We have a number of kdocs in our headers which are missing the colon, IOW they use * Return some value or * Returns some value Having the colon makes some sense, it should help kdoc parser avoid false positives. So add them. This is mostly done with a sed script, and removing the unnecessary cases (mostly the comments which aren't kdoc). 
Acked-by: Johannes Berg Acked-by: Richard Cochran Acked-by: Sergey Ryazanov Reviewed-by: Edward Cree Acked-by: Alexandra Winter Acked-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20241205165914.1071102-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 18 +++++++++--------- include/linux/ethtool.h | 6 +++--- include/linux/if_vlan.h | 28 ++++++++++++++-------------- include/linux/netdevice.h | 14 ++++++++------ include/linux/netfilter/x_tables.h | 2 +- include/linux/netfilter_netdev.h | 3 ++- include/linux/ptp_clock_kernel.h | 4 ++-- include/linux/rfkill.h | 2 +- include/linux/rtnetlink.h | 2 +- include/linux/skbuff.h | 16 ++++++++-------- include/linux/wwan.h | 2 +- 11 files changed, 50 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index ecf203f01034..9a1eacf35d37 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -81,7 +81,7 @@ static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per + * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. @@ -104,7 +104,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr) * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is all zeroes. + * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ @@ -123,7 +123,7 @@ static inline bool is_zero_ether_addr(const u8 *addr) * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. 
* @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a multicast address. + * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) @@ -157,7 +157,7 @@ static inline bool is_multicast_ether_addr_64bits(const u8 *addr) * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a local address. + * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { @@ -168,7 +168,7 @@ static inline bool is_local_ether_addr(const u8 *addr) * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is the broadcast address. + * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ @@ -183,7 +183,7 @@ static inline bool is_broadcast_ether_addr(const u8 *addr) * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a unicast address. + * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { @@ -197,7 +197,7 @@ static inline bool is_unicast_ether_addr(const u8 *addr) * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * - * Return true if the address is valid. + * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ @@ -214,7 +214,7 @@ static inline bool is_valid_ether_addr(const u8 *addr) * * Check that the value from the Ethertype/length field is a valid Ethertype. 
* - * Return true if the valid is an 802.3 supported Ethertype. + * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { @@ -458,7 +458,7 @@ static inline bool ether_addr_is_ip_mcast(const u8 *addr) * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return a u64 value of the address + * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b8b935b52603..e217c6321ed0 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,7 +257,7 @@ struct ethtool_link_ksettings { * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) * - * Returns true/false. + * Returns: true/false. */ #define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \ test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) @@ -1199,7 +1199,7 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, * @dev: pointer to net_device structure * @vclock_index: pointer to pointer of vclock index * - * Return number of phc vclocks + * Return: number of phc vclocks */ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); @@ -1253,7 +1253,7 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. * @dev: pointer to net_device structure * @info: buffer to hold the result - * Returns zero on success, non-zero otherwise. + * Returns: zero on success, non-zero otherwise. 
*/ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d6326b53e336 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -310,7 +310,7 @@ static inline bool vlan_uses_dev(const struct net_device *dev) * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * - * Returns true if the ether type is a vlan ether type. + * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { @@ -341,9 +341,9 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, @@ -390,9 +390,9 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. 
*/ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -533,7 +533,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not of VLAN type + * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -551,7 +551,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if @skb->vlan_tci is not set correctly + * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) @@ -570,7 +570,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not VLAN tagged + * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -587,7 +587,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @type: first vlan protocol * @depth: buffer to store length of eth and vlan tags in bytes * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, @@ -629,7 +629,7 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. 
*/ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) @@ -710,7 +710,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * - * Returns a new pointer to skb->data, or NULL on failure to pull. + * Returns: a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { @@ -727,7 +727,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * - * Returns true if the skb is tagged, regardless of whether it is hardware + * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) @@ -743,7 +743,7 @@ static inline bool skb_vlan_tagged(const struct sk_buff *skb) * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * - * Returns true if the skb is tagged with multiple vlan headers, regardless + * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) @@ -774,7 +774,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) * @skb: skbuff to query * @features: features to be checked * - * Returns features without unsafe ones if the skb has multiple tags. + * Returns: features without unsafe ones if the skb has multiple tags. 
*/ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 135105441681..d917949bba03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -509,7 +509,7 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n) * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * - * Return True if NAPI is scheduled, False otherwise. + * Return: True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { @@ -524,7 +524,7 @@ bool napi_schedule_prep(struct napi_struct *n); * * Schedule NAPI poll routine to be called if it is not already * running. - * Return true if we schedule a NAPI or false if not. + * Return: true if we schedule a NAPI or false if not. * Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ @@ -558,7 +558,7 @@ static inline void napi_schedule_irqoff(struct napi_struct *n) * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). - * Return false if device should avoid rearming interrupts. + * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); @@ -3851,7 +3851,7 @@ static inline bool netif_attr_test_mask(unsigned long j, * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * - * Returns true if a CPU/Rx queue is online. + * Returns: true if a CPU/Rx queue is online. 
*/ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, @@ -3871,7 +3871,8 @@ static inline bool netif_attr_test_online(unsigned long j, * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set. + * Returns: next (after n) CPU/Rx queue index in the mask; + * >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) @@ -3893,7 +3894,8 @@ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set in both. + * Returns: next (after n) CPU/Rx queue index set in both masks; + * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5897f3dbaf7c..f39f688d7285 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -357,7 +357,7 @@ extern struct static_key xt_tee_enabled; * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) - * Returns : + * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 8676316547cc..3175073a66ba 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -66,7 +66,6 @@ static inline bool nf_hook_egress_active(void) * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * - * Returns @skb on success or %NULL if 
the packet was consumed or filtered. * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. @@ -81,6 +80,8 @@ static inline bool nf_hook_egress_active(void) * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. + * + * Returns: @skb on success or %NULL if the packet was consumed or filtered. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index c892d22ce0a7..0d68d09bedd1 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -307,7 +307,7 @@ static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) * @info: Structure describing the new clock. * @parent: Pointer to the parent device of the new clock. * - * Returns a valid pointer on success or PTR_ERR on failure. If PHC + * Returns: a valid pointer on success or PTR_ERR on failure. If PHC * support is missing at the configuration level, this function * returns NULL, and drivers are expected to gracefully handle that * case separately. @@ -445,7 +445,7 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); * @hwtstamp: timestamp * @vclock_index: phc index of ptp vclock. * - * Returns converted timestamp, or 0 on error. + * Returns: converted timestamp, or 0 on error. */ ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index); #else diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 997b34197385..6816e4c5f3f0 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -241,7 +241,7 @@ bool rfkill_soft_blocked(struct rfkill *rfkill); * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type * - * Returns enum rfkill_type that corresponds to the name. 
+ * Returns: enum rfkill_type that corresponds to the name. */ enum rfkill_type rfkill_find_type(const char *name); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 14b88f551920..811ce44113f6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -78,7 +78,7 @@ static inline bool lockdep_rtnl_is_held(void) * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit + * Return: the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95452d1a07fc..69624b394cd9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1134,7 +1134,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb) * skb_dst - returns skb dst_entry * @skb: buffer * - * Returns skb dst_entry, regardless of reference taken or not. + * Returns: skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { @@ -1222,7 +1222,7 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) * skb_unref - decrement the skb's reference count * @skb: buffer * - * Returns true if we can free the skb. + * Returns: true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { @@ -1344,7 +1344,7 @@ struct sk_buff_fclones { * @sk: socket * @skb: buffer * - * Returns true if skb is a fast clone, and its clone is not freed. + * Returns: true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. 
*/ @@ -3516,7 +3516,7 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * - * Returns false if this page should be returned to page allocator, true + * Returns: false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) @@ -3633,7 +3633,7 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. The page must already + * Returns: the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) @@ -3648,7 +3648,7 @@ static inline void *skb_frag_address(const skb_frag_t *frag) * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. Checks that the page + * Returns: the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. */ static inline void *skb_frag_address_safe(const skb_frag_t *frag) @@ -3890,7 +3890,7 @@ static inline int skb_linearize(struct sk_buff *skb) * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * - * Return true if the skb has at least one frag that might be modified + * Return: true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) @@ -4612,7 +4612,7 @@ static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) /* Check if we need to perform checksum complete validation. 
* - * Returns true if checksum complete is needed, false otherwise + * Returns: true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 79c781875c09..a4d6cc0c9f68 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -97,7 +97,7 @@ struct wwan_port_caps { * * This function must be balanced with a call to wwan_remove_port(). * - * Returns a valid pointer to wwan_port on success or PTR_ERR on failure + * Returns: a valid pointer to wwan_port on success or PTR_ERR on failure */ struct wwan_port *wwan_create_port(struct device *parent, enum wwan_port_type type, -- cgit v1.2.3 From 31cdd8418234e70043abd26894b57eb201489cba Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:42:58 +0100 Subject: net: stmmac: Fix CSR divider comment The comment in declaration of STMMAC_CSR_250_300M incorrectly describes the constant as '/* MDC = clk_scr_i/122 */' but the DWC Ether QOS Handbook version 5.20a says it is CSR clock/124. 
Signed-off-by: Jan Petrous (OSS) Reviewed-by: Jacob Keller Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-1-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d79ff252cfdc..75cbfb576358 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -33,7 +33,7 @@ #define STMMAC_CSR_20_35M 0x2 /* MDC = clk_scr_i/16 */ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ -#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ +#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 -- cgit v1.2.3 From c8fab05d021dfc04401102f9fa1de07fc8f75d8d Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:42:59 +0100 Subject: net: stmmac: Extend CSR calc support Add support for CSR clock range up to 800 MHz. 
Reviewed-by: Jacob Keller Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-2-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 75cbfb576358..865d0fe26f98 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -34,6 +34,8 @@ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ #define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ +#define STMMAC_CSR_300_500M 0x6 /* MDC = clk_scr_i/204 */ +#define STMMAC_CSR_500_800M 0x7 /* MDC = clk_scr_i/324 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 -- cgit v1.2.3 From cb09f61a9ab84369c62f2ef7f8a2b797f596f6d1 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:00 +0100 Subject: net: stmmac: Fix clock rate variables size The clock API clk_get_rate() returns unsigned long value. Expand affected members of stmmac platform data and convert the stmmac_clk_csr_set() and dwmac4_core_init() methods to defining the unsigned long clk_rate local variables. 
Reviewed-by: Andrew Lunn Reviewed-by: Serge Semin Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-3-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 865d0fe26f98..c9878a612e53 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -252,8 +252,8 @@ struct plat_stmmacenet_data { struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - unsigned int clk_ptp_rate; - unsigned int clk_ref_rate; + unsigned long clk_ptp_rate; + unsigned long clk_ref_rate; unsigned int mult_fact_100ns; s32 ptp_max_adj; u32 cdc_error_adj; @@ -265,7 +265,7 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned int eee_usecs_rate; + unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; -- cgit v1.2.3 From 386aa60abdb600a4e5ad818e6dba171685942e54 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:01 +0100 Subject: net: phy: Add helper for mapping RGMII link speed to clock rate The RGMII interface supports three data rates: 10/100 Mbps and 1 Gbps. These speeds correspond to clock frequencies of 2.5/25 MHz and 125 MHz, respectively. Many Ethernet drivers, including glues in stmmac, follow a similar pattern of converting RGMII speed to clock frequency. To simplify code, define the helper rgmii_clock(speed) to convert connection speed to clock frequency. 
Suggested-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-4-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index bb157136351e..e597a32cc787 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -298,6 +298,29 @@ static inline const char *phy_modes(phy_interface_t interface) } } +/** + * rgmii_clock - map link speed to the clock rate + * @speed: link speed value + * + * Description: maps RGMII supported link speeds + * into the clock rates. + * + * Returns: clock rate or negative errno + */ +static inline long rgmii_clock(int speed) +{ + switch (speed) { + case SPEED_10: + return 2500000; + case SPEED_100: + return 25000000; + case SPEED_1000: + return 125000000; + default: + return -EINVAL; + } +} + #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 -- cgit v1.2.3 From 09463346b6c23672cdd451f500d2a23b792bd6f0 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 15 Nov 2024 19:26:50 +0800 Subject: crypto: hisilicon/zip - add data aggregation feature The zip device adds data aggregation feature, data with the same key can be combined. This patch enables the device data aggregation feature. New feature is called "hashagg" name and registered to the uacce subsystem to allow applications to submit data aggregation operations in user space. 
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 6dbd0d49628f..3a13fb719dd0 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -97,6 +97,8 @@ /* page number for queue file region */ #define QM_DOORBELL_PAGE_NR 1 +#define QM_DEV_ALG_MAX_LEN 256 + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ @@ -156,6 +158,7 @@ enum qm_cap_bits { QM_SUPPORT_MB_COMMAND, QM_SUPPORT_SVA_PREFETCH, QM_SUPPORT_RPM, + QM_SUPPORT_DAE, }; struct qm_dev_alg { -- cgit v1.2.3 From 771ba5c982a28ede1d33de9702c0f3501f1f9e1c Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 15 Nov 2024 19:26:51 +0800 Subject: crypto: hisilicon/zip - support new error report The error detection of the data aggregation feature is separated from the compression/decompression feature. This patch enables the error detection and reporting of the data aggregation feature. When an unrecoverable error occurs in the algorithm core, the device reports the error to the driver, and the driver will reset the device. 
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 3a13fb719dd0..c1dafbabbd6b 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -269,6 +269,8 @@ struct hisi_qm_err_ini { void (*show_last_dfx_regs)(struct hisi_qm *qm); void (*err_info_init)(struct hisi_qm *qm); enum acc_err_result (*get_err_result)(struct hisi_qm *qm); + bool (*dev_is_abnormal)(struct hisi_qm *qm); + int (*set_priv_status)(struct hisi_qm *qm); }; struct hisi_qm_cap_info { -- cgit v1.2.3 From ea79df10331218b04d90f028a605f33c879c518d Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 25 Nov 2024 14:23:11 +0100 Subject: mmc: core: Drop the MMC_RSP_R1_NO_CRC response The MMC_RSP_R1_NO_CRC type of response is not being used by the mmc core for any commands. Let's therefore drop it, together with the corresponding code in the host drivers. 
Signed-off-by: Ulf Hansson Acked-by: Wolfram Sang # for TMIO Reviewed-by: Avri Altman Message-ID: <20241125132311.23939-1-ulf.hansson@linaro.org> --- include/linux/mmc/core.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 56972bd78462..e13856ab6ad0 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -64,9 +64,6 @@ struct mmc_command { #define MMC_RSP_R6 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) #define MMC_RSP_R7 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) -/* Can be used by core to poll after switch to MMC HS mode */ -#define MMC_RSP_R1_NO_CRC (MMC_RSP_PRESENT|MMC_RSP_OPCODE) - #define mmc_resp_type(cmd) ((cmd)->flags & (MMC_RSP_PRESENT|MMC_RSP_136|MMC_RSP_CRC|MMC_RSP_BUSY|MMC_RSP_OPCODE)) /* -- cgit v1.2.3 From ed97550d470d00ebafe9de888fd100cb82d3abb6 Mon Sep 17 00:00:00 2001 From: Andy-ld Lu Date: Tue, 26 Nov 2024 20:48:22 +0800 Subject: mmc: core: Introduce the MMC_RSP_R1B_NO_CRC response The R1B response type with ignoring CRC is used in the mmc_cqe_recovery(), introduce the MMC_RSP_R1B_NO_CRC response type to simplify the code. 
Signed-off-by: Andy-ld Lu Message-ID: <20241126125041.16071-2-andy-ld.lu@mediatek.com> Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index e13856ab6ad0..01e0f591a20b 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -57,6 +57,7 @@ struct mmc_command { #define MMC_RSP_NONE (0) #define MMC_RSP_R1 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) #define MMC_RSP_R1B (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE|MMC_RSP_BUSY) +#define MMC_RSP_R1B_NO_CRC (MMC_RSP_PRESENT|MMC_RSP_OPCODE|MMC_RSP_BUSY) #define MMC_RSP_R2 (MMC_RSP_PRESENT|MMC_RSP_136|MMC_RSP_CRC) #define MMC_RSP_R3 (MMC_RSP_PRESENT) #define MMC_RSP_R4 (MMC_RSP_PRESENT) -- cgit v1.2.3 From d4cc8912cbff4990940b33cc61a9b09ddaee9704 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Mon, 9 Dec 2024 16:49:56 +0000 Subject: firmware: arm_scmi: Add module aliases to i.MX vendor protocols Using the pattern 'scmi-protocol-0x-' as MODULE_ALIAS allows the SCMI core to autoload this protocol, if built as a module, when its protocol operations are requested by an SCMI driver. 
Cc: Peng Fan Acked-by: Peng Fan Signed-off-by: Cristian Marussi Message-Id: <20241209164957.1801886-3-cristian.marussi@arm.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_imx_protocol.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h index 066216f1357a..53b356a26414 100644 --- a/include/linux/scmi_imx_protocol.h +++ b/include/linux/scmi_imx_protocol.h @@ -13,10 +13,11 @@ #include #include -enum scmi_nxp_protocol { - SCMI_PROTOCOL_IMX_BBM = 0x81, - SCMI_PROTOCOL_IMX_MISC = 0x84, -}; +#define SCMI_PROTOCOL_IMX_BBM 0x81 +#define SCMI_PROTOCOL_IMX_MISC 0x84 + +#define SCMI_IMX_VENDOR "NXP" +#define SCMI_IMX_SUBVENDOR "IMX" struct scmi_imx_bbm_proto_ops { int (*rtc_time_set)(const struct scmi_protocol_handle *ph, u32 id, -- cgit v1.2.3 From a94204f4d48e28a711b7ed10399f749286c433e3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:15 -0500 Subject: fsnotify: opt-in for permission events at file open time Legacy inotify/fanotify listeners can add watches for events on inode, parent or mount and expect to get events (e.g. FS_MODIFY) on files that were already open at the time of setting up the watches. fanotify permission events are typically used by Anti-malware software, that is watching the entire mount and it is not common to have more than one Anti-malware engine installed on a system. To reduce the overhead of the fsnotify_file_perm() hooks on every file access, relax the semantics of the legacy FAN_ACCESS_PERM event to generate events only if there were *any* permission event listeners on the filesystem at the time that the file was opened. The new semantic is implemented by extending the FMODE_NONOTIFY bit into two FMODE_NONOTIFY_* bits, that are used to store a mode for which of the event types to report. 
This is going to apply to the new fanotify pre-content events in order to reduce the cost of the new pre-content event vfs hooks. [Thanks to Bert Karwatzki for reporting a bug in this code with CONFIG_FANOTIFY_ACCESS_PERMISSIONS disabled] Suggested-by: Linus Torvalds Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wj8L=mtcRTi=NECHMGfZQgXOp_uix1YVh04fEmrKaMnXA@mail.gmail.com/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/5ea5f8e283d1edb55aa79c35187bfe344056af14.1731684329.git.josef@toxicpanda.com --- include/linux/fs.h | 43 ++++++++++++++++++++++++++++++++++++++----- include/linux/fsnotify.h | 39 +++++++++++++++++++++++---------------- 2 files changed, 61 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 93c2b720271e..5f7ac5b548a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -173,13 +173,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) -/* FMODE_* bit 24 */ - /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)(1 << 25)) +#define FMODE_BACKING ((__force fmode_t)(1 << 24)) + +/* + * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) -/* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) +/* + * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) @@ -190,6 +197,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) +/* + * The two 
FMODE_NONOTIFY* define which fsnotify events should not be generated + * for a file. These are the possible values of (f->f_mode & + * FMODE_FSNOTIFY_MASK) and their meaning: + * + * FMODE_NONOTIFY - suppress all (incl. non-permission) events. + * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. + * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. + */ +#define FMODE_FSNOTIFY_MASK \ + (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) + +#define FMODE_FSNOTIFY_NONE(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +#define FMODE_FSNOTIFY_PERM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ + (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) +#define FMODE_FSNOTIFY_HSM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0) +#else +#define FMODE_FSNOTIFY_PERM(mode) 0 +#define FMODE_FSNOTIFY_HSM(mode) 0 +#endif + + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 278620e063ab..8d1849137a96 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } -static inline int fsnotify_file(struct file *file, __u32 mask) +static inline int fsnotify_path(const struct path *path, __u32 mask) { - const struct path *path; + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); +} +static inline int fsnotify_file(struct file *file, __u32 mask) +{ /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. 
*/ - if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) + if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; - path = &file->f_path; - /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ - if (mask & ALL_FSNOTIFY_PERM_EVENTS && - !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, - FSNOTIFY_PRIO_CONTENT)) - return 0; - - return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); + return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + +void file_set_fsnotify_mode(struct file *file); + /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { - __u32 fsnotify_mask = FS_ACCESS_PERM; - /* * filesystem may be modified in the context of permission events * (e.g. by HSM filling a file on access), so sb freeze protection @@ -150,7 +147,10 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, if (!(perm_mask & MAY_READ)) return 0; - return fsnotify_file(file, fsnotify_mask); + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + + return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* @@ -168,16 +168,23 @@ static inline int fsnotify_open_perm(struct file *file) { int ret; + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } - return fsnotify_file(file, FS_OPEN_PERM); + return fsnotify_path(&file->f_path, FS_OPEN_PERM); } #else +static inline void file_set_fsnotify_mode(struct file *file) +{ +} + static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { -- cgit v1.2.3 From 318652e07fa5b1743d08eeccd69a1f47f2c15710 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:16 -0500 Subject: fsnotify: check 
if file is actually being watched for pre-content events on open So far, we set FMODE_NONOTIFY_ flags at open time if we know that there are no permission event watchers at all on the filesystem, but lack of FMODE_NONOTIFY_ flags does not mean that the file is actually watched. For pre-content events, it is possible to optimize things so that we don't bother trying to send pre-content events if file was not watched (through sb, mnt, parent or inode itself) on open. Set FMODE_NONOTIFY_ flags according to that. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/2ddcc9f8d1fde48d085318a6b5a889289d8871d8.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify_backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3ecf7768e577..9c105244815d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -77,6 +77,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Pre-content events can be used to fill file content */ +#define FSNOTIFY_PRE_CONTENT_EVENTS 0 + #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) -- cgit v1.2.3 From 0a076036b631f086a6bce93a45eaa216f234f121 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:19 -0500 Subject: fanotify: reserve event bit of deprecated FAN_DIR_MODIFY Avoid reusing it, because we would like to reserve it for future FAN_PATH_MODIFY pre-content event. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/632d9f80428e2e7a6b6a8ccc2925d87c92bbb518.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify_backend.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9c105244815d..c38762b62bf1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -55,6 +55,7 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ +/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ /* * Set on inode mark that cares about things that happen to its children. -- cgit v1.2.3 From f156524e5d72c81792eee81f828784dc8a37a7f2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:20 -0500 Subject: fsnotify: introduce pre-content permission events The new FS_PRE_ACCESS permission event is similar to FS_ACCESS_PERM, but it is meant for a different use case of filling file content before access to a file range, so it has slightly different semantics. Generate FS_PRE_ACCESS/FS_ACCESS_PERM as two separate events, so content scanners could inspect the content filled by pre-content event handler. Unlike FS_ACCESS_PERM, FS_PRE_ACCESS is also called before a file is modified by syscalls such as write() and fallocate(). FS_ACCESS_PERM is reported also on blockdev and pipes, but the new pre-content events are only reported for regular files and dirs. The pre-content events are meant to be used by hierarchical storage managers that want to fill the content of files on first access. There are some specific requirements from filesystems that could be used with pre-content events, so add a flag for fs to opt-in for pre-content events explicitly before they can be used. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b934c5e3af205abc4e0e4709f6486815937ddfdf.1731684329.git.josef@toxicpanda.com --- include/linux/fs.h | 1 + include/linux/fsnotify.h | 19 ++++++++++++++++++- include/linux/fsnotify_backend.h | 11 ++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5f7ac5b548a4..3f4d59464965 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1265,6 +1265,7 @@ extern int send_sigurg(struct file *file); #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 8d1849137a96..d91aa064f0e4 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -144,12 +144,29 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, */ lockdep_assert_once(file_write_not_started(file)); - if (!(perm_mask & MAY_READ)) + if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) return 0; + /* + * read()/write() and other types of access generate pre-content events. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + + if (ret) + return ret; + } + + if (!(perm_mask & MAY_READ)) + return 0; + + /* + * read() also generates the legacy FS_ACCESS_PERM event, so content + * scanners can inspect the content filled by pre-content event. 
+ */ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c38762b62bf1..9bda354b5538 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -57,6 +57,8 @@ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. @@ -78,11 +80,14 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Content events can be used to inspect file content */ +#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ + FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ -#define FSNOTIFY_PRE_CONTENT_EVENTS 0 +#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) -#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ - FS_OPEN_EXEC_PERM) +#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ + FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching -- cgit v1.2.3 From 9740d17162deca7138fad7dcf3ef52324832c32b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:21 -0500 Subject: fsnotify: pass optional file access range in pre-content event We would like to add file range information to pre-content events. Pass a struct file_range with offset and length to event handler along with pre-content permission event. The offset and length are aligned to page size, but we may need to align them to minimum folio size for filesystems with large block size. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/88eddee301231d814aede27fb4d5b41ae37c9702.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify.h | 4 ++-- include/linux/fsnotify_backend.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index d91aa064f0e4..87044acf8e79 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -154,7 +154,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, * read()/write() and other types of access generate pre-content events. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { - int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; @@ -171,7 +171,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9bda354b5538..0d24a21a8e60 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -294,6 +294,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, + FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, @@ -306,6 +307,17 @@ struct fs_error_report { struct super_block *sb; }; +struct file_range { + const struct path *path; + loff_t pos; + size_t count; +}; + +static inline const struct path *file_range_path(const struct file_range *range) +{ + return range->path; +} + static inline struct 
inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -315,6 +327,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_FILE_RANGE: + return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: @@ -330,6 +344,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry; default: return NULL; } @@ -341,6 +357,8 @@ static inline const struct path *fsnotify_data_path(const void *data, switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data); default: return NULL; } @@ -356,6 +374,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: @@ -375,6 +395,18 @@ static inline struct fs_error_report *fsnotify_data_error_report( } } +static inline const struct file_range *fsnotify_data_file_range( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_FILE_RANGE: + return (struct file_range *)data; + default: + return NULL; + } +} + /* * Index to merged marks iterator array that correlates to a type of watch. 
* The type of watched object can be deduced from the iterator type, but not @@ -863,9 +895,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count); #else +static inline int fsnotify_pre_content(const struct path *path, + const loff_t *ppos, size_t count) +{ + return 0; +} + static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) -- cgit v1.2.3 From 4acf3bc76e521b47acebcefc6312c97992f4ca29 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:22 -0500 Subject: fsnotify: generate pre-content permission event on truncate Generate FS_PRE_ACCESS event before truncate, without sb_writers held. Move the security hooks also before sb_start_write() to conform with other security hooks (e.g. in write, fallocate). The event will have a range info of the page surrounding the new size to provide an opportunity to fill the content at the end of file before truncating to non-page aligned size. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/23af8201db6ac2efdea94f09ab067d81ba5de7a7.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 87044acf8e79..1a9ef8f6784d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_truncate_perm - permission hook before file truncate + */ +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + struct inode *inode = d_inode(path->dentry); + + if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || + !fsnotify_sb_has_priority_watchers(inode->i_sb, + FSNOTIFY_PRIO_PRE_CONTENT)) + return 0; + + return fsnotify_pre_content(path, &length, 0); +} + /* * fsnotify_file_perm - permission hook before file access (unknown range) */ @@ -208,6 +223,11 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + return 0; +} + static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; -- cgit v1.2.3 From 4f8afa33817a6420398d1c177c6e220a05081f51 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:23 -0500 Subject: fanotify: introduce FAN_PRE_ACCESS permission event Similar to FAN_ACCESS_PERM permission event, but it is only allowed with class FAN_CLASS_PRE_CONTENT and only allowed on regular files and dirs. Unlike FAN_ACCESS_PERM, it is safe to write to the file being accessed in the context of the event handler. This pre-content event is meant to be used by hierarchical storage managers that want to fill the content of files on first read access. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b80986f8d5b860acea2c9a73c0acd93587be5fe4.1731684329.git.josef@toxicpanda.com --- include/linux/fanotify.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 89ff45bd6f01..c747af064d2c 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -89,6 +89,16 @@ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ FAN_RENAME) +/* Content events can be used to inspect file content */ +#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \ + FAN_ACCESS_PERM) +/* Pre-content events can be used to fill file content */ +#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS) + +/* Events that require a permission response from user */ +#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \ + FANOTIFY_PRE_CONTENT_EVENTS) + /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) @@ -104,10 +114,6 @@ FANOTIFY_INODE_EVENTS | \ FANOTIFY_ERROR_EVENTS) -/* Events that require a permission response from user */ -#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ - FAN_OPEN_EXEC_PERM) - /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) -- cgit v1.2.3 From b4b2ff4f61ded819bfa22e50fdec7693f51cbbee Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:25 -0500 Subject: fanotify: allow to set errno in FAN_DENY permission response With FAN_DENY response, user trying to perform the filesystem operation gets an error with errno set to EPERM. It is useful for hierarchical storage management (HSM) service to be able to deny access for reasons more diverse than EPERM, for example EAGAIN, if HSM could retry the operation later. 
Allow fanotify groups with priority FAN_CLASS_PRE_CONTENT to respond to permission events with the response value FAN_DENY_ERRNO(errno), instead of FAN_DENY to return a custom error. Limit custom error values to errors expected on read(2)/write(2) and open(2) of regular files. This list could be extended in the future. Userspace can test for legitimate values of FAN_DENY_ERRNO(errno) by writing a response to an fanotify group fd with a value of FAN_NOFD in the fd field of the response. The change in fanotify_response is backward compatible, because errno is written in the high 8 bits of the 32bit response field and old kernels reject response values with high bits set. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/1e5fb6af84b69ca96b5c849fa5f10bdf4d1dc414.1731684329.git.josef@toxicpanda.com --- include/linux/fanotify.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index c747af064d2c..78f660ebc318 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -132,7 +132,9 @@ /* These masks check for invalid bits in permission responses. */ #define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) #define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) -#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) +#define FANOTIFY_RESPONSE_VALID_MASK \ + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \ + (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT)) /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS -- cgit v1.2.3 From b04b981f3a842ef63e06048fafaa8d20c20334c6 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 30 Nov 2024 17:39:37 +0100 Subject: pmdomain: core: Support naming idle states Commit 422f2d418186 ("arm64: dts: qcom: Drop undocumented domain "idle-state-name"") brought to light the common misbelief that idle-state-names also applies to e.g. 
PSCI power domain idle states. Make that a reality, mimicking the property name used by cpuidle states. Signed-off-by: Konrad Dybcio Message-ID: <20241130-topic-idle_state_name-v1-2-d0ff67b0c8e9@oss.qualcomm.com> Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 45646bfcaf1a..1aab31370065 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -147,6 +147,7 @@ struct genpd_governor_data { }; struct genpd_power_state { + const char *name; s64 power_off_latency_ns; s64 power_on_latency_ns; s64 residency_ns; -- cgit v1.2.3 From 7d5265ffcd8b41da5e09066360540d6e0716e9cd Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 12 Nov 2024 10:28:26 -0500 Subject: rseq: Validate read-only fields under DEBUG_RSEQ config The rseq uapi requires cooperation between users of the rseq fields to ensure that all libraries and applications using rseq within a process do not interfere with each other. This is especially important for fields which are meant to be read-only from user-space, as documented in uapi/linux/rseq.h: - cpu_id_start, - cpu_id, - node_id, - mm_cid. Storing to those fields from a user-space library prevents any sharing of the rseq ABI with other libraries and applications, as other users are not aware that the content of those fields has been altered by a third-party library. This is unfortunately the current behavior of tcmalloc: it purposefully overlaps part of a cached value with the cpu_id_start upper bits to get notified about preemption, because the kernel clears those upper bits before returning to user-space. This behavior does not conform to the rseq uapi header ABI. This prevents tcmalloc from using rseq when rseq is registered by the GNU C library 2.35+. It requires tcmalloc users to disable glibc rseq registration with a glibc tunable, which is a sad state of affairs. 
Considering that tcmalloc and the GNU C library are the two first upstream projects using rseq, and that they are already incompatible due to use of this hack, adding kernel-level validation of all read-only fields content is necessary to ensure future users of rseq abide by the rseq ABI requirements. Validate that user-space does not corrupt the read-only fields and conform to the rseq uapi header ABI when the kernel is built with CONFIG_DEBUG_RSEQ=y. This is done by storing a copy of the read-only fields in the task_struct, and validating the prior values present in user-space before updating them. If the values do not match, print a warning on the console (printk_ratelimited()). This is a first step to identify misuses of the rseq ABI by printing a warning on the console. After giving some time to userspace to correct its use of rseq, the plan is to eventually terminate offending processes with SIGSEGV. This change is expected to produce warnings for the upstream tcmalloc implementation, but tcmalloc developers mentioned they were open to adapt their implementation to kernel-level change. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://github.com/google/tcmalloc/issues/144 --- include/linux/sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d380bffee2ef..b5916be49f62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1367,6 +1367,15 @@ struct task_struct { * with respect to preemption. */ unsigned long rseq_event_mask; +# ifdef CONFIG_DEBUG_RSEQ + /* + * This is a place holder to save a copy of the rseq fields for + * validation of read-only fields. The struct rseq has a + * variable-length array at the end, so it cannot be used + * directly. Reserve a size large enough for the known fields. 
+ */ + char rseq_fields[sizeof(struct rseq)]; +# endif #endif #ifdef CONFIG_SCHED_MM_CID -- cgit v1.2.3 From 3c48780d48df029cf9d5f42b8971663e6fb975ae Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 4 Dec 2024 11:48:05 -0800 Subject: of: Hide of_default_bus_match_table[] This isn't used outside this file. Hide the array in the C file. Signed-off-by: Stephen Boyd Acked-by: Saravana Kannan Link: https://lore.kernel.org/r/20241204194806.2665589-1-swboyd@chromium.org Signed-off-by: Rob Herring (Arm) --- include/linux/of_platform.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index a2ff1ad48f7f..17471ef8e092 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -47,8 +47,6 @@ struct of_dev_auxdata { { .compatible = _compat, .phys_addr = _phys, .name = _name, \ .platform_data = _pdata } -extern const struct of_device_id of_default_bus_match_table[]; - /* Platform drivers register/unregister */ extern struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, -- cgit v1.2.3 From 549de562d794a42bb647952e965e588390e16fe0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:18:57 -0600 Subject: ACPI: platform-profile: Add a name member to handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to prepare for allowing multiple handlers, introduce a name field that can be used to distinguish between different handlers. 
Tested-by: Mark Pearson Tested-by: Matthew Schwartz Reviewed-by: Hans de Goede Reviewed-by: Mark Pearson Reviewed-by: Maximilian Luz Reviewed-by: Ilpo Järvinen Reviewed-by: Armin Wolf Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-2-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index f5492ed413f3..6fa988e41742 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -27,6 +27,7 @@ enum platform_profile_option { }; struct platform_profile_handler { + const char *name; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, enum platform_profile_option *profile); -- cgit v1.2.3 From 6f5e63ddc333dae371be6f8a8f70a82043697a4c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:18:59 -0600 Subject: ACPI: platform_profile: Add device pointer into platform profile handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to let platform profile handlers manage platform profile for their driver the core code will need a pointer to the device. Add this to the structure and use it in the trivial driver cases. 
Reviewed-by: Armin Wolf Reviewed-by: Ilpo Järvinen Reviewed-by: Maximilian Luz Reviewed-by: Mark Pearson Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-4-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 6fa988e41742..daec6b9bad81 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,6 +28,7 @@ enum platform_profile_option { struct platform_profile_handler { const char *name; + struct device *dev; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, enum platform_profile_option *profile); -- cgit v1.2.3 From 9b3bb37b44a317626464e79da8b39989b421963f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:00 -0600 Subject: ACPI: platform_profile: Add platform handler argument to platform_profile_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To allow registering and unregistering multiple platform handlers calls to platform_profile_remove() will need to know which handler is to be removed. Add an argument for this. 
Tested-by: Mark Pearson Tested-by: Matthew Schwartz Reviewed-by: Hans de Goede Reviewed-by: Mark Pearson Reviewed-by: Maximilian Luz Reviewed-by: Ilpo Järvinen Reviewed-by: Armin Wolf Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-5-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index daec6b9bad81..bcaf3aa39160 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -37,7 +37,7 @@ struct platform_profile_handler { }; int platform_profile_register(struct platform_profile_handler *pprof); -int platform_profile_remove(void); +int platform_profile_remove(struct platform_profile_handler *pprof); int platform_profile_cycle(void); void platform_profile_notify(void); -- cgit v1.2.3 From 4d5c027bf55661da2621c694ea39908ae2d3a46a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:01 -0600 Subject: ACPI: platform_profile: Pass the profile handler into platform_profile_notify() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The profile handler will be used to notify the appropriate class devices. 
Reviewed-by: Armin Wolf Reviewed-by: Mark Pearson Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-6-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index bcaf3aa39160..8ec0b8da56db 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -39,6 +39,6 @@ struct platform_profile_handler { int platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_remove(struct platform_profile_handler *pprof); int platform_profile_cycle(void); -void platform_profile_notify(void); +void platform_profile_notify(struct platform_profile_handler *pprof); #endif /*_PLATFORM_PROFILE_H_*/ -- cgit v1.2.3 From 77be5cacb2c2d8c3ddd069f0b4e9408f553af1d8 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:06 -0600 Subject: ACPI: platform_profile: Create class for ACPI platform profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When registering a platform profile handler create a class device that will allow changing a single platform profile handler. The class and sysfs group are no longer needed when the platform profile core is a module and unloaded, so remove them at that time as well. 
Reviewed-by: Armin Wolf Tested-by: Mark Pearson Reviewed-by: Mark Pearson Reviewed-by: Ilpo Järvinen Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-11-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ec0b8da56db..a888fd085c51 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -29,6 +29,8 @@ enum platform_profile_option { struct platform_profile_handler { const char *name; struct device *dev; + struct device *class_dev; + int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, enum platform_profile_option *profile); -- cgit v1.2.3 From 494637cf5bf098ac0fe125dd6d23368419fe9da4 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:12 -0600 Subject: ACPI: platform_profile: Add concept of a "custom" profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two profile handlers don't agree on the current profile it's ambiguous what to show to the legacy sysfs interface. Add a "custom" profile string that userspace will be able to use the legacy sysfs interface to distinguish this situation. Additionally drivers can choose to use this to indicate that a user has modified driver settings in a way that the platform profile advertised by a driver is not accurate. 
Reviewed-by: Armin Wolf Tested-by: Mark Pearson Reviewed-by: Mark Pearson Reviewed-by: Ilpo Järvinen Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-17-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index a888fd085c51..0682bb4c57e5 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -23,6 +23,7 @@ enum platform_profile_option { PLATFORM_PROFILE_BALANCED, PLATFORM_PROFILE_BALANCED_PERFORMANCE, PLATFORM_PROFILE_PERFORMANCE, + PLATFORM_PROFILE_CUSTOM, PLATFORM_PROFILE_LAST, /*must always be last */ }; -- cgit v1.2.3 From 9029409d1a250da19f1086ab1113752411c5163d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 10 Dec 2024 22:55:49 +0100 Subject: power: supply: core: introduce power_supply_for_each_psy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All existing callers of power_supply_for_each_device() want to iterate over 'struct power_supply', not 'struct device'. The power_supply_for_each_device() forces each caller to duplicate the logic to go from one to the other. Introduce power_supply_for_each_psy() to simplify the callers. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241210-power-supply-dev_to_psy-v2-2-9d8c9d24cfe4@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index b98106e1a90f..11d54270eaa9 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -882,6 +882,7 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); extern void *power_supply_get_drvdata(struct power_supply *psy); extern int power_supply_for_each_device(void *data, int (*fn)(struct device *dev, void *data)); +extern int power_supply_for_each_psy(void *data, int (*fn)(struct power_supply *psy, void *data)); static inline bool power_supply_is_amp_property(enum power_supply_property psp) { -- cgit v1.2.3 From bfc330323cf3ea6d5c9985179384c0b56f2d5372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 10 Dec 2024 22:55:53 +0100 Subject: power: supply: core: remove power_supply_for_each_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users anymore. All potential future users are expected to use power_supply_for_each_psy(). 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241210-power-supply-dev_to_psy-v2-6-9d8c9d24cfe4@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 11d54270eaa9..3d67f4a6a1c9 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -881,7 +881,6 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); #define to_power_supply(device) container_of(device, struct power_supply, dev) extern void *power_supply_get_drvdata(struct power_supply *psy); -extern int power_supply_for_each_device(void *data, int (*fn)(struct device *dev, void *data)); extern int power_supply_for_each_psy(void *data, int (*fn)(struct power_supply *psy, void *data)); static inline bool power_supply_is_amp_property(enum power_supply_property psp) -- cgit v1.2.3 From f52204036326bd9c07db08bab6607f423c801716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 10 Dec 2024 22:55:54 +0100 Subject: power: supply: core: introduce dev_to_psy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The psy core and drivers currently use dev_get_drvdata() to go from a 'struct device' to its 'struct power_supply'. This is not typesafe or documented. Introduce a new helper to make this pattern explicit. Instead of using dev_get_drvdata(), use container_of_const() which also preserves the constness. Furthermore 'dev' does not need to be dereferenced anymore and at some point the drvdata could be reused for something else. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241210-power-supply-dev_to_psy-v2-7-9d8c9d24cfe4@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 3d67f4a6a1c9..17fc383785bf 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -318,6 +318,8 @@ struct power_supply { #endif }; +#define dev_to_psy(__dev) container_of_const(__dev, struct power_supply, dev) + /* * This is recommended structure to specify static power supply parameters. * Generic one, parametrizable for different power supplies. Power supply -- cgit v1.2.3 From be325f08c432ae5ac6d6594d163e1899cdf202df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:45 +0000 Subject: rtnetlink: add ndo_fdb_dump_context rtnl_fdb_dump() and various ndo_fdb_dump() helpers share a hidden layout of cb->ctx. Before switching rtnl_fdb_dump() to for_each_netdev_dump() in the following patch, make this more explicit. Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 811ce44113f6..c43cffb014a7 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -178,6 +178,13 @@ void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); +/* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. 
*/ +struct ndo_fdb_dump_context { + unsigned long s_h; + unsigned long s_idx; + unsigned long fdb_idx; +}; + extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From 53970a05f799087e2dd2005973609188504e7fcc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:46 +0000 Subject: rtnetlink: switch rtnl_fdb_dump() to for_each_netdev_dump() This is the last netdev iterator still using net->dev_index_head[]. Convert to modern for_each_netdev_dump() for better scalability, and use common patterns in our stack. Following patch in this series removes the pad field in struct ndo_fdb_dump_context. Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c43cffb014a7..5546571c2553 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -180,8 +180,8 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. 
*/ struct ndo_fdb_dump_context { - unsigned long s_h; - unsigned long s_idx; + unsigned long ifindex; + unsigned long pad; unsigned long fdb_idx; }; -- cgit v1.2.3 From 53a6d8912372fc23ea82cc7a49eb59047aa0a650 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:47 +0000 Subject: rtnetlink: remove pad field in ndo_fdb_dump_context I chose to remove this field in a separate patch to ease potential bisection, in case one ndo_fdb_dump() is still using the old way (cb->args[2] instead of ctx->fdb_idx) Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5546571c2553..3b9d132cbc9e 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -181,7 +181,6 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ struct ndo_fdb_dump_context { unsigned long ifindex; - unsigned long pad; unsigned long fdb_idx; }; -- cgit v1.2.3 From 51d20d1dacbec589d459e11fc88fbca419f84a99 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:40 +0800 Subject: iomap: fix zero padding data issue in concurrent append writes During concurrent append writes to XFS filesystem, zero padding data may appear in the file after power failure. This happens due to imprecise disk size updates when handling write completion. 
Consider this scenario with concurrent append writes to the same file: Thread 1: Thread 2: ------------ ----------- write [A, A+B] update inode size to A+B submit I/O [A, A+BS] write [A+B, A+B+C] update inode size to A+B+C After reboot: 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C] |< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000| ^ ^ ^ A A+B A+B+C (EOF) 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS] |< Block Size (BS) >|< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000| ^ ^ ^ ^ A A+B A+BS A+B+C (EOF) D = Valid Data 0 = Zero Padding The issue stems from disk size being set to min(io_offset + io_size, inode->i_size) at I/O completion. Since io_offset+io_size is block size granularity, it may exceed the actual valid file data size. In the case of concurrent append writes, inode->i_size may be larger than the actual range of valid file data written to disk, leading to inaccurate disk size updates. This patch modifies the meaning of io_size to represent the size of valid data within EOF in an ioend. If the ioend spans beyond i_size, io_size will be trimmed to provide the file with more accurate size information. This is particularly useful for on-disk size updates at completion time. After this change, ioends that span i_size will not grow or merge with other ioends in concurrent scenarios. However, these cases that need growth/merging rarely occur and there seems to be no noticeable performance impact. Although rounding up io_size could enable ioend growth/merging in these scenarios, we decided to keep the code simple after discussion [1]. Another benefit is that it makes the xfs_ioend_is_append() check more accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio() in certain scenarios, such as repeated writes at the file tail without extending the file size. 
Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c..75bf54e76f3b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ -- cgit v1.2.3 From 5aec7c065fba0c56d6c1ea5d629395210f174be8 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 28 Nov 2024 12:14:14 +0000 Subject: coresight: Drop atomics in connection refcounts These belong to the device being enabled or disabled and are only ever used inside the device's spinlock. Remove the atomics to not imply that there are any other concurrent accesses. If atomics were necessary I don't think they would have been enough anyway. There would be nothing to prevent an enable or disable running concurrently if not for the spinlock. 
Signed-off-by: James Clark Reviewed-by: Yeoreum Yun Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241128121414.2425119-1-james.clark@linaro.org --- include/linux/coresight.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c13342594278..834029cb9ba2 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -200,8 +200,8 @@ struct coresight_connection { struct coresight_device *dest_dev; struct coresight_sysfs_link *link; struct coresight_device *src_dev; - atomic_t src_refcnt; - atomic_t dest_refcnt; + int src_refcnt; + int dest_refcnt; }; /** -- cgit v1.2.3 From fd9b7e8e9fbc23d69fa4accc881dea2cf13a2e2e Mon Sep 17 00:00:00 2001 From: Mao Jinlong Date: Thu, 21 Nov 2024 14:28:28 +0800 Subject: coresight: Add support to get static id for system trace sources Dynamic trace id was introduced in coresight subsystem, so trace id is allocated dynamically. However, some hardware ATB source has static trace id and it cannot be changed via software programming. For such source, it can call coresight_get_static_trace_id to get the fixed trace id from device node and pass id to coresight_trace_id_get_static_system_id to reserve the id. 
Signed-off-by: Mao Jinlong Reviewed-by: Mike Leach Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241121062829.11571-3-quic_jinlmao@quicinc.com --- include/linux/coresight.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 834029cb9ba2..055ce5cd5c44 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -662,6 +662,7 @@ void coresight_relaxed_write64(struct coresight_device *csdev, void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset); extern int coresight_get_cpu(struct device *dev); +extern int coresight_get_static_trace_id(struct device *dev, u32 *id); struct coresight_platform_data *coresight_get_platform_data(struct device *dev); struct coresight_connection * -- cgit v1.2.3 From eb708cd631a8dca17ff004ccc39bbeb096c1db22 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 11 Dec 2024 13:35:58 +0000 Subject: regmap: regmap_multi_reg_read(): make register list const Mark the list of registers passed into regmap_multi_reg_read() as a pointer to const. This allows the caller to define the register list as const data. This requires making the same change to _regmap_bulk_read(), which is called by regmap_multi_reg_read(). 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241211133558.884669-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index fd41baccbf3e..3871c74f7677 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1244,7 +1244,7 @@ int regmap_noinc_read(struct regmap *map, unsigned int reg, void *val, size_t val_len); int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val, size_t val_count); -int regmap_multi_reg_read(struct regmap *map, unsigned int *reg, void *val, +int regmap_multi_reg_read(struct regmap *map, const unsigned int *reg, void *val, size_t val_count); int regmap_update_bits_base(struct regmap *map, unsigned int reg, unsigned int mask, unsigned int val, -- cgit v1.2.3 From 8392bc2ff8c8bf7c4c5e6dfa71ccd893a3c046f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:29 -0500 Subject: fsnotify: generate pre-content permission event on page fault FS_PRE_ACCESS will be generated on page fault depending on the faulting method. This pre-content event is meant to be used by hierarchical storage managers that want to fill in the file content on first read access. Export a simple helper that file systems that have their own ->fault() will use, and have a more complicated helper to do fancy things in filemap_fault. 
Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/aa56c50ce81b1fd18d7f5d71dd2dfced5eba9687.1731684329.git.josef@toxicpanda.com --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..e6c3c9cbcfe5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3420,6 +3420,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -- cgit v1.2.3 From 0357ef03c94ef835bd44a0658b8edb672a9dbf51 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Nov 2024 15:25:32 +0100 Subject: fs: don't block write during exec on pre-content watched files Commit 2a010c412853 ("fs: don't block i_writecount during exec") removed the legacy behavior of getting ETXTBSY on attempt to open an executable file for write while it is being executed. This commit was reverted because an application that depends on this legacy behavior was broken by the change. We need to allow HSM writing into executable files while executed to fill their content on-the-fly. To that end, disable the ETXTBSY legacy behavior for files that are watched by pre-content events. This change is not expected to cause regressions with existing systems which do not have any pre-content event listeners. 
Signed-off-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241128142532.465176-1-amir73il@gmail.com --- include/linux/fs.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f4d59464965..a1230c40fef1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3095,6 +3095,28 @@ static inline void allow_write_access(struct file *file) if (file) atomic_inc(&file_inode(file)->i_writecount); } + +/* + * Do not prevent write to executable file when watched by pre-content events. + * + * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at + * the time of file open and remains constant for entire lifetime of the file, + * so if pre-content watches are added post execution or removed before the end + * of the execution, it will not cause i_writecount reference leak. + */ +static inline int exe_file_deny_write_access(struct file *exe_file) +{ + if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) + return 0; + return deny_write_access(exe_file); +} +static inline void exe_file_allow_write_access(struct file *exe_file) +{ + if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) + return; + allow_write_access(exe_file); +} + static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; -- cgit v1.2.3 From a87ef09b1fdf75fdc2d6b386ff23a35589173055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 Dec 2024 18:28:36 +0100 Subject: iio: adc: ad_sigma_delta: Add support for reading irq status using a GPIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the ADCs by Analog signal their irq condition on the MISO line. So typically that line is connected to an SPI controller and a GPIO. 
The GPIO is used as input and the respective interrupt is enabled when the last SPI transfer is completed. Depending on the GPIO controller the toggling MISO line might make the interrupt pending even while it's masked. In that case the irq handler is called immediately after irq_enable() and so before the device actually pulls that line low which results in non-sense values being reported to the upper layers. The only way to find out if the line was actually pulled low is to read the GPIO. (There is a flag in AD7124's status register that also signals if an interrupt was asserted, but reading that register toggles the MISO line and so might trigger another spurious interrupt.) Add the possibility to specify an interrupt GPIO in the machine description in addition to the plain interrupt. This GPIO is used then to check if the irq line is actually active in the irq handler. Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/5be9a4cc4dc600ec384c88db01dd661a21506b9c.1733504533.git.u.kleine-koenig@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 1851f8fed3a4..895b7ebf4be5 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -29,6 +29,7 @@ struct ad_sd_calib_data { struct ad_sigma_delta; struct device; +struct gpio_desc; struct iio_dev; /** @@ -96,6 +97,7 @@ struct ad_sigma_delta { unsigned int active_slots; unsigned int current_slot; unsigned int num_slots; + struct gpio_desc *rdy_gpiod; int irq_line; bool status_appended; /* map slots to channels in order to know what to expect from devices */ -- cgit v1.2.3 From f522589c139debb8af56dbead0c6e9dfca2d5ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 Dec 2024 18:28:38 +0100 Subject: iio: adc: ad_sigma_delta: Fix a race 
condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ad_sigma_delta driver helper uses irq_disable_nosync(). With that one it is possible that the irq handler still runs after the irq_disable_nosync() function call returns. Also to properly synchronize irq disabling in the different threads proper locking is needed and because it's unclear if the irq handler's irq_disable_nosync() call comes first or the one in the enabler's error path, all code locations that disable the irq must check for .irq_dis first to ensure there is exactly one disable call per enable call. So add a spinlock to the struct ad_sigma_delta and use it to synchronize irq enabling and disabling. Also only act in the irq handler if the irq is still enabled. Fixes: af3008485ea0 ("iio:adc: Add common code for ADI Sigma Delta devices") Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/9e6def47e2e773e0e15b7a2c29d22629b53d91b1.1733504533.git.u.kleine-koenig@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 895b7ebf4be5..200130e4244d 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -86,6 +86,7 @@ struct ad_sigma_delta { /* private: */ struct completion completion; + spinlock_t irq_lock; /* protects .irq_dis and irq en/disable state */ bool irq_dis; bool bus_locked; -- cgit v1.2.3 From 07a28874bb49700036a3ab435dd95ae31afd21ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 Dec 2024 18:28:39 +0100 Subject: iio: adc: ad_sigma_delta: Store information about reset sequence length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The various chips can be reset using a sequence of SPI transfers with MOSI = 1. 
The length of such a sequence varies from chip to chip. Store that length in struct ad_sigma_delta_info and replace the respective parameter to ad_sd_reset() with it. Note the ad7192 used to pass 48 as length but the documentation specifies 40 as the required length. Assuming the latter is right. (Using a too long sequence doesn't hurt apart from using a longer spi transfer than necessary, so this is no relevant fix.) The motivation for storing this information is that this is useful to clear a pending R̅D̅Y̅ signal in the next change. Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/9750db62fce638bf140ff48172c23bff7f785e5b.1733504533.git.u.kleine-koenig@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 200130e4244d..417073c52380 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -54,6 +54,7 @@ struct iio_dev; * @irq_flags: flags for the interrupt used by the triggered buffer * @num_slots: Number of sequencer slots * @irq_line: IRQ for reading conversions. If 0, spi->irq will be used + * @num_resetclks: Number of SPI clk cycles with MOSI=1 to reset the chip. 
*/ struct ad_sigma_delta_info { int (*set_channel)(struct ad_sigma_delta *, unsigned int channel); @@ -70,6 +71,7 @@ struct ad_sigma_delta_info { unsigned long irq_flags; unsigned int num_slots; int irq_line; + unsigned int num_resetclks; }; /** @@ -181,8 +183,7 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); -int ad_sd_reset(struct ad_sigma_delta *sigma_delta, - unsigned int reset_length); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta); int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); -- cgit v1.2.3 From 22ccb0a1c57c436de899ccd3170d6d2ce7238836 Mon Sep 17 00:00:00 2001 From: Matteo Martelli Date: Mon, 2 Dec 2024 16:11:07 +0100 Subject: iio: consumers: ensure read buffers for labels and ext_info are page aligned Attributes of iio providers are exposed via sysfs. Typically, providers pass attribute values to the iio core, which handles formatting and printing to sysfs. However, some attributes, such as labels or extended info, are directly formatted and printed to sysfs by provider drivers using sysfs_emit() and sysfs_emit_at(). These helpers assume the read buffer, allocated by sysfs fop, is page-aligned. When these attributes are accessed by consumer drivers, the read buffer is allocated by the consumer and may not be page-aligned, leading to failures in the provider's callback that utilizes sysfs_emit*. Add a check to ensure that read buffers for labels and external info attributes are page-aligned. Update the prototype documentation as well. 
Signed-off-by: Matteo Martelli Link: https://patch.msgid.link/20241202-iio-kmalloc-align-v1-1-aa9568c03937@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 333d1d8ccb37..6a4479616479 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -418,7 +418,7 @@ unsigned int iio_get_channel_ext_info_count(struct iio_channel *chan); * @chan: The channel being queried. * @attr: The ext_info attribute to read. * @buf: Where to store the attribute value. Assumed to hold - * at least PAGE_SIZE bytes. + * at least PAGE_SIZE bytes and to be aligned at PAGE_SIZE. * * Returns the number of bytes written to buf (perhaps w/o zero termination; * it need not even be a string), or an error code. @@ -445,7 +445,7 @@ ssize_t iio_write_channel_ext_info(struct iio_channel *chan, const char *attr, * iio_read_channel_label() - read label for a given channel * @chan: The channel being queried. * @buf: Where to store the attribute value. Assumed to hold - * at least PAGE_SIZE bytes. + * at least PAGE_SIZE bytes and to be aligned at PAGE_SIZE. * * Returns the number of bytes written to buf, or an error code. */ -- cgit v1.2.3 From bad6722e478f5b17a5ceb039dfb4c680cf2c0b48 Mon Sep 17 00:00:00 2001 From: Eliav Farber Date: Wed, 4 Dec 2024 14:20:02 +0000 Subject: kexec: Consolidate machine_kexec_mask_interrupts() implementation Consolidate the machine_kexec_mask_interrupts implementation into a common function located in a new file: kernel/irq/kexec.c. This removes duplicate implementations from architecture-specific files in arch/arm, arch/arm64, arch/powerpc, and arch/riscv, reducing code duplication and improving maintainability. The new implementation retains architecture-specific behavior for CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD, which was previously implemented for ARM64. 
When enabled (currently for ARM64), it clears the active state of interrupts forwarded to virtual machines (VMs) before handling other interrupt masking operations. Signed-off-by: Eliav Farber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241204142003.32859-2-farbere@amazon.com --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index fa711f80957b..25f51bf3c351 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -694,6 +694,9 @@ extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif +/* Disable or mask interrupts during a kernel kexec */ +extern void machine_kexec_mask_interrupts(void); + /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); -- cgit v1.2.3 From 41d7ea30494cc0dde3e124a75ce0add93f988ba9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 10 Dec 2024 12:27:12 -0800 Subject: lib: packing: add pack_fields() and unpack_fields() This is new API which caters to the following requirements: - Pack or unpack a large number of fields to/from a buffer with a small code footprint. The current alternative is to open-code a large number of calls to pack() and unpack(), or to use packing() to reduce that number to half. But packing() is not const-correct. - Use unpacked numbers stored in variables smaller than u64. This reduces the rodata footprint of the stored field arrays. - Perform error checking at compile time, rather than runtime, and return void from the API functions. Because the C preprocessor can't generate variable length code (loops), this is a bit tricky to do with macros. To handle this, implement macros which sanity check the packed field definitions based on their size. 
Finally, a single macro with a chain of __builtin_choose_expr() is used to select the appropriate macros. We enforce the use of ascending or descending order to avoid O(N^2) scaling when checking for overlap. Note that the macros are written with care to ensure that the compilers can correctly evaluate the resulting code at compile time. In particular, care was taken with avoiding too many nested statement expressions. Nested statement expressions trip up some compilers, especially when passing down variables created in previous statement expressions. There are two key design choices intended to keep the overall macro code size small. First, the definition of each CHECK_PACKED_FIELDS_N macro is implemented recursively, by calling the N-1 macro. This avoids needing the code to repeat multiple times. Second, the CHECK_PACKED_FIELD macro enforces that the fields in the array are sorted in order. This allows checking for overlap only with neighboring fields, rather than the general overlap case where each field would need to be checked against other fields. The overlap checks use the first two fields to determine the order of the remaining fields, thus allowing either ascending or descending order. This gives drivers the flexibility to keep the fields ordered in whichever order most naturally fits their hardware design and its associated documentation. The CHECK_PACKED_FIELDS macro is directly called from within pack_fields and unpack_fields, ensuring that all drivers using the API receive the benefits of the compile-time checks. Users do not need to call any of the macros directly. The CHECK_PACKED_FIELDS and its helper macros CHECK_PACKED_FIELDS_(1..50) are generated using a simple C program in scripts/gen_packed_field_checks.c. This program can be compiled on demand and executed to generate the macro code in include/linux/packing.h. This will aid in the event that a driver needs more than 50 fields.
The generator can be updated with a new size, and used to update the packing.h header file. In practice, the ice driver will need to support 27 fields, and the sja1105 driver will need to support 0 fields. This on-demand generation avoids the need to modify Kbuild. We do not anticipate the maximum number of fields to grow very often. - Reduced rodata footprint for the storage of the packed field arrays. To that end, we have struct packed_field_u8 and packed_field_u16, which define the fields with the associated type. More can be added as needed (unlikely for now). On these types, the same generic pack_fields() and unpack_fields() API can be used, thanks to the new C11 _Generic() selection feature, which can call pack_fields_u8() or pack_fields_u16(), depending on the type of the "fields" array - a simplistic form of polymorphism. Which function will actually be called is determined at compile time. Over time, packing() is expected to be completely replaced either with pack() or with pack_fields(). Signed-off-by: Vladimir Oltean Co-developed-by: Jacob Keller Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-3-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 425 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) (limited to 'include/linux') diff --git a/include/linux/packing.h b/include/linux/packing.h index 5d36dcd06f60..0589d70bbe04 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -8,6 +8,83 @@ #include #include +#define GEN_PACKED_FIELD_STRUCT(__type) \ + struct packed_field_ ## __type { \ + __type startbit; \ + __type endbit; \ + __type offset; \ + __type size; \ + } + +/* struct packed_field_u8. Use with bit offsets < 256, buffers < 32B and + * unpacked structures < 256B. + */ +GEN_PACKED_FIELD_STRUCT(u8); + +/* struct packed_field_u16.
Use with bit offsets < 65536, buffers < 8KB and + * unpacked structures < 64KB. + */ +GEN_PACKED_FIELD_STRUCT(u16); + +#define PACKED_FIELD(start, end, struct_name, struct_field) \ +{ \ + (start), \ + (end), \ + offsetof(struct_name, struct_field), \ + sizeof_field(struct_name, struct_field), \ +} + +#define CHECK_PACKED_FIELD_OVERLAP(fields, index1, index2) ({ \ + typeof(&(fields)[0]) __f = (fields); \ + typeof(__f[0]) _f1 = __f[index1]; typeof(__f[0]) _f2 = __f[index2]; \ + const bool _ascending = __f[0].startbit < __f[1].startbit; \ + BUILD_BUG_ON_MSG(_ascending && _f1.startbit >= _f2.startbit, \ + __stringify(fields) " field " __stringify(index2) \ + " breaks ascending order"); \ + BUILD_BUG_ON_MSG(!_ascending && _f1.startbit <= _f2.startbit, \ + __stringify(fields) " field " __stringify(index2) \ + " breaks descending order"); \ + BUILD_BUG_ON_MSG(max(_f1.endbit, _f2.endbit) <= \ + min(_f1.startbit, _f2.startbit), \ + __stringify(fields) " field " __stringify(index2) \ + " overlaps with previous field"); \ +}) + +#define CHECK_PACKED_FIELD(fields, index) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(_f[0]) __f = _f[index]; \ + BUILD_BUG_ON_MSG(__f.startbit < __f.endbit, \ + __stringify(fields) " field " __stringify(index) \ + " start bit must not be smaller than end bit"); \ + BUILD_BUG_ON_MSG(__f.size != 1 && __f.size != 2 && \ + __f.size != 4 && __f.size != 8, \ + __stringify(fields) " field " __stringify(index) \ + " has unsupported unpacked storage size"); \ + BUILD_BUG_ON_MSG(__f.startbit - __f.endbit >= BITS_PER_BYTE * __f.size, \ + __stringify(fields) " field " __stringify(index) \ + " exceeds unpacked storage size"); \ + __builtin_choose_expr(index != 0, \ + CHECK_PACKED_FIELD_OVERLAP(fields, index - 1, index), \ + 1); \ +}) + +/* Note that the packed fields may be either in ascending or descending order. + * Thus, we must check that both the first and last field fit within the + * packed buffer size.
+ */ +#define CHECK_PACKED_FIELDS_SIZE(fields, pbuflen) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(pbuflen) _len = (pbuflen); \ + const size_t num_fields = ARRAY_SIZE(fields); \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_len), \ + __stringify(fields) " pbuflen " __stringify(pbuflen) \ + " must be a compile time constant"); \ + BUILD_BUG_ON_MSG(_f[0].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " first field exceeds packed buffer size"); \ + BUILD_BUG_ON_MSG(_f[num_fields - 1].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " last field exceeds packed buffer size"); \ +}) + #define QUIRK_MSB_ON_THE_RIGHT BIT(0) #define QUIRK_LITTLE_ENDIAN BIT(1) #define QUIRK_LSW32_IS_FIRST BIT(2) @@ -26,4 +103,352 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks); +void pack_fields_u8(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void pack_fields_u16(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u8(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u16(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +/* Do not hand-edit the following packed field check macros! + * + * They are generated using scripts/gen_packed_field_checks.c, which may be + * built via "make scripts_gen_packed_field_checks". If larger macro sizes are + * needed in the future, please use this program to re-generate the macros and + * insert them here. 
+ */ + +#define CHECK_PACKED_FIELDS_1(fields) \ + CHECK_PACKED_FIELD(fields, 0) + +#define CHECK_PACKED_FIELDS_2(fields) do { \ + CHECK_PACKED_FIELDS_1(fields); \ + CHECK_PACKED_FIELD(fields, 1); \ +} while (0) + +#define CHECK_PACKED_FIELDS_3(fields) do { \ + CHECK_PACKED_FIELDS_2(fields); \ + CHECK_PACKED_FIELD(fields, 2); \ +} while (0) + +#define CHECK_PACKED_FIELDS_4(fields) do { \ + CHECK_PACKED_FIELDS_3(fields); \ + CHECK_PACKED_FIELD(fields, 3); \ +} while (0) + +#define CHECK_PACKED_FIELDS_5(fields) do { \ + CHECK_PACKED_FIELDS_4(fields); \ + CHECK_PACKED_FIELD(fields, 4); \ +} while (0) + +#define CHECK_PACKED_FIELDS_6(fields) do { \ + CHECK_PACKED_FIELDS_5(fields); \ + CHECK_PACKED_FIELD(fields, 5); \ +} while (0) + +#define CHECK_PACKED_FIELDS_7(fields) do { \ + CHECK_PACKED_FIELDS_6(fields); \ + CHECK_PACKED_FIELD(fields, 6); \ +} while (0) + +#define CHECK_PACKED_FIELDS_8(fields) do { \ + CHECK_PACKED_FIELDS_7(fields); \ + CHECK_PACKED_FIELD(fields, 7); \ +} while (0) + +#define CHECK_PACKED_FIELDS_9(fields) do { \ + CHECK_PACKED_FIELDS_8(fields); \ + CHECK_PACKED_FIELD(fields, 8); \ +} while (0) + +#define CHECK_PACKED_FIELDS_10(fields) do { \ + CHECK_PACKED_FIELDS_9(fields); \ + CHECK_PACKED_FIELD(fields, 9); \ +} while (0) + +#define CHECK_PACKED_FIELDS_11(fields) do { \ + CHECK_PACKED_FIELDS_10(fields); \ + CHECK_PACKED_FIELD(fields, 10); \ +} while (0) + +#define CHECK_PACKED_FIELDS_12(fields) do { \ + CHECK_PACKED_FIELDS_11(fields); \ + CHECK_PACKED_FIELD(fields, 11); \ +} while (0) + +#define CHECK_PACKED_FIELDS_13(fields) do { \ + CHECK_PACKED_FIELDS_12(fields); \ + CHECK_PACKED_FIELD(fields, 12); \ +} while (0) + +#define CHECK_PACKED_FIELDS_14(fields) do { \ + CHECK_PACKED_FIELDS_13(fields); \ + CHECK_PACKED_FIELD(fields, 13); \ +} while (0) + +#define CHECK_PACKED_FIELDS_15(fields) do { \ + CHECK_PACKED_FIELDS_14(fields); \ + CHECK_PACKED_FIELD(fields, 14); \ +} while (0) + +#define CHECK_PACKED_FIELDS_16(fields) do { \ + 
CHECK_PACKED_FIELDS_15(fields); \ + CHECK_PACKED_FIELD(fields, 15); \ +} while (0) + +#define CHECK_PACKED_FIELDS_17(fields) do { \ + CHECK_PACKED_FIELDS_16(fields); \ + CHECK_PACKED_FIELD(fields, 16); \ +} while (0) + +#define CHECK_PACKED_FIELDS_18(fields) do { \ + CHECK_PACKED_FIELDS_17(fields); \ + CHECK_PACKED_FIELD(fields, 17); \ +} while (0) + +#define CHECK_PACKED_FIELDS_19(fields) do { \ + CHECK_PACKED_FIELDS_18(fields); \ + CHECK_PACKED_FIELD(fields, 18); \ +} while (0) + +#define CHECK_PACKED_FIELDS_20(fields) do { \ + CHECK_PACKED_FIELDS_19(fields); \ + CHECK_PACKED_FIELD(fields, 19); \ +} while (0) + +#define CHECK_PACKED_FIELDS_21(fields) do { \ + CHECK_PACKED_FIELDS_20(fields); \ + CHECK_PACKED_FIELD(fields, 20); \ +} while (0) + +#define CHECK_PACKED_FIELDS_22(fields) do { \ + CHECK_PACKED_FIELDS_21(fields); \ + CHECK_PACKED_FIELD(fields, 21); \ +} while (0) + +#define CHECK_PACKED_FIELDS_23(fields) do { \ + CHECK_PACKED_FIELDS_22(fields); \ + CHECK_PACKED_FIELD(fields, 22); \ +} while (0) + +#define CHECK_PACKED_FIELDS_24(fields) do { \ + CHECK_PACKED_FIELDS_23(fields); \ + CHECK_PACKED_FIELD(fields, 23); \ +} while (0) + +#define CHECK_PACKED_FIELDS_25(fields) do { \ + CHECK_PACKED_FIELDS_24(fields); \ + CHECK_PACKED_FIELD(fields, 24); \ +} while (0) + +#define CHECK_PACKED_FIELDS_26(fields) do { \ + CHECK_PACKED_FIELDS_25(fields); \ + CHECK_PACKED_FIELD(fields, 25); \ +} while (0) + +#define CHECK_PACKED_FIELDS_27(fields) do { \ + CHECK_PACKED_FIELDS_26(fields); \ + CHECK_PACKED_FIELD(fields, 26); \ +} while (0) + +#define CHECK_PACKED_FIELDS_28(fields) do { \ + CHECK_PACKED_FIELDS_27(fields); \ + CHECK_PACKED_FIELD(fields, 27); \ +} while (0) + +#define CHECK_PACKED_FIELDS_29(fields) do { \ + CHECK_PACKED_FIELDS_28(fields); \ + CHECK_PACKED_FIELD(fields, 28); \ +} while (0) + +#define CHECK_PACKED_FIELDS_30(fields) do { \ + CHECK_PACKED_FIELDS_29(fields); \ + CHECK_PACKED_FIELD(fields, 29); \ +} while (0) + +#define 
CHECK_PACKED_FIELDS_31(fields) do { \ + CHECK_PACKED_FIELDS_30(fields); \ + CHECK_PACKED_FIELD(fields, 30); \ +} while (0) + +#define CHECK_PACKED_FIELDS_32(fields) do { \ + CHECK_PACKED_FIELDS_31(fields); \ + CHECK_PACKED_FIELD(fields, 31); \ +} while (0) + +#define CHECK_PACKED_FIELDS_33(fields) do { \ + CHECK_PACKED_FIELDS_32(fields); \ + CHECK_PACKED_FIELD(fields, 32); \ +} while (0) + +#define CHECK_PACKED_FIELDS_34(fields) do { \ + CHECK_PACKED_FIELDS_33(fields); \ + CHECK_PACKED_FIELD(fields, 33); \ +} while (0) + +#define CHECK_PACKED_FIELDS_35(fields) do { \ + CHECK_PACKED_FIELDS_34(fields); \ + CHECK_PACKED_FIELD(fields, 34); \ +} while (0) + +#define CHECK_PACKED_FIELDS_36(fields) do { \ + CHECK_PACKED_FIELDS_35(fields); \ + CHECK_PACKED_FIELD(fields, 35); \ +} while (0) + +#define CHECK_PACKED_FIELDS_37(fields) do { \ + CHECK_PACKED_FIELDS_36(fields); \ + CHECK_PACKED_FIELD(fields, 36); \ +} while (0) + +#define CHECK_PACKED_FIELDS_38(fields) do { \ + CHECK_PACKED_FIELDS_37(fields); \ + CHECK_PACKED_FIELD(fields, 37); \ +} while (0) + +#define CHECK_PACKED_FIELDS_39(fields) do { \ + CHECK_PACKED_FIELDS_38(fields); \ + CHECK_PACKED_FIELD(fields, 38); \ +} while (0) + +#define CHECK_PACKED_FIELDS_40(fields) do { \ + CHECK_PACKED_FIELDS_39(fields); \ + CHECK_PACKED_FIELD(fields, 39); \ +} while (0) + +#define CHECK_PACKED_FIELDS_41(fields) do { \ + CHECK_PACKED_FIELDS_40(fields); \ + CHECK_PACKED_FIELD(fields, 40); \ +} while (0) + +#define CHECK_PACKED_FIELDS_42(fields) do { \ + CHECK_PACKED_FIELDS_41(fields); \ + CHECK_PACKED_FIELD(fields, 41); \ +} while (0) + +#define CHECK_PACKED_FIELDS_43(fields) do { \ + CHECK_PACKED_FIELDS_42(fields); \ + CHECK_PACKED_FIELD(fields, 42); \ +} while (0) + +#define CHECK_PACKED_FIELDS_44(fields) do { \ + CHECK_PACKED_FIELDS_43(fields); \ + CHECK_PACKED_FIELD(fields, 43); \ +} while (0) + +#define CHECK_PACKED_FIELDS_45(fields) do { \ + CHECK_PACKED_FIELDS_44(fields); \ + CHECK_PACKED_FIELD(fields, 44); \ +} while (0) 
+ +#define CHECK_PACKED_FIELDS_46(fields) do { \ + CHECK_PACKED_FIELDS_45(fields); \ + CHECK_PACKED_FIELD(fields, 45); \ +} while (0) + +#define CHECK_PACKED_FIELDS_47(fields) do { \ + CHECK_PACKED_FIELDS_46(fields); \ + CHECK_PACKED_FIELD(fields, 46); \ +} while (0) + +#define CHECK_PACKED_FIELDS_48(fields) do { \ + CHECK_PACKED_FIELDS_47(fields); \ + CHECK_PACKED_FIELD(fields, 47); \ +} while (0) + +#define CHECK_PACKED_FIELDS_49(fields) do { \ + CHECK_PACKED_FIELDS_48(fields); \ + CHECK_PACKED_FIELD(fields, 48); \ +} while (0) + +#define CHECK_PACKED_FIELDS_50(fields) do { \ + CHECK_PACKED_FIELDS_49(fields); \ + CHECK_PACKED_FIELD(fields, 49); \ +} while (0) + +#define CHECK_PACKED_FIELDS(fields) \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 1, ({ CHECK_PACKED_FIELDS_1(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 2, ({ CHECK_PACKED_FIELDS_2(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 3, ({ CHECK_PACKED_FIELDS_3(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 4, ({ CHECK_PACKED_FIELDS_4(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 5, ({ CHECK_PACKED_FIELDS_5(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 6, ({ CHECK_PACKED_FIELDS_6(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 7, ({ CHECK_PACKED_FIELDS_7(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 8, ({ CHECK_PACKED_FIELDS_8(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 9, ({ CHECK_PACKED_FIELDS_9(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 10, ({ CHECK_PACKED_FIELDS_10(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 11, ({ CHECK_PACKED_FIELDS_11(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 12, ({ CHECK_PACKED_FIELDS_12(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 13, ({ CHECK_PACKED_FIELDS_13(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 14, ({ CHECK_PACKED_FIELDS_14(fields); }), \ + 
__builtin_choose_expr(ARRAY_SIZE(fields) == 15, ({ CHECK_PACKED_FIELDS_15(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 16, ({ CHECK_PACKED_FIELDS_16(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 17, ({ CHECK_PACKED_FIELDS_17(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 18, ({ CHECK_PACKED_FIELDS_18(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 19, ({ CHECK_PACKED_FIELDS_19(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 20, ({ CHECK_PACKED_FIELDS_20(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 21, ({ CHECK_PACKED_FIELDS_21(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 22, ({ CHECK_PACKED_FIELDS_22(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 23, ({ CHECK_PACKED_FIELDS_23(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 24, ({ CHECK_PACKED_FIELDS_24(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 25, ({ CHECK_PACKED_FIELDS_25(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 26, ({ CHECK_PACKED_FIELDS_26(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 27, ({ CHECK_PACKED_FIELDS_27(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 28, ({ CHECK_PACKED_FIELDS_28(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 29, ({ CHECK_PACKED_FIELDS_29(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 30, ({ CHECK_PACKED_FIELDS_30(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 31, ({ CHECK_PACKED_FIELDS_31(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 32, ({ CHECK_PACKED_FIELDS_32(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 33, ({ CHECK_PACKED_FIELDS_33(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 34, ({ CHECK_PACKED_FIELDS_34(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 35, ({ CHECK_PACKED_FIELDS_35(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 36, ({ CHECK_PACKED_FIELDS_36(fields); }), \ 
+ __builtin_choose_expr(ARRAY_SIZE(fields) == 37, ({ CHECK_PACKED_FIELDS_37(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 38, ({ CHECK_PACKED_FIELDS_38(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 39, ({ CHECK_PACKED_FIELDS_39(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 40, ({ CHECK_PACKED_FIELDS_40(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 41, ({ CHECK_PACKED_FIELDS_41(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 42, ({ CHECK_PACKED_FIELDS_42(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 43, ({ CHECK_PACKED_FIELDS_43(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 44, ({ CHECK_PACKED_FIELDS_44(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 45, ({ CHECK_PACKED_FIELDS_45(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 46, ({ CHECK_PACKED_FIELDS_46(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 47, ({ CHECK_PACKED_FIELDS_47(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 48, ({ CHECK_PACKED_FIELDS_48(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 49, ({ CHECK_PACKED_FIELDS_49(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 50, ({ CHECK_PACKED_FIELDS_50(fields); }), \ + ({ BUILD_BUG_ON_MSG(1, "CHECK_PACKED_FIELDS() must be regenerated to support array sizes larger than 50."); }) \ +)))))))))))))))))))))))))))))))))))))))))))))))))) + +/* End of generated content */ + +#define pack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : pack_fields_u8, \ + const struct packed_field_u16 * : pack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + +#define unpack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); 
\ + _Generic((fields), \ + const struct packed_field_u8 * : unpack_fields_u8, \ + const struct packed_field_u16 * : unpack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + #endif -- cgit v1.2.3 From 082e8f6db9092d19ae84549874daaef240c2207b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 11 Nov 2024 11:03:45 +0100 Subject: turris-omnia-mcu-interface.h: Move command execution function to global header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the command execution functions from the turris-omnia-mcu platform driver private header to the global turris-omnia-mcu-interface.h header, so that they can be used by the LED driver. Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20241111100355.6978-2-kabel@kernel.org Signed-off-by: Lee Jones --- include/linux/turris-omnia-mcu-interface.h | 136 ++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h index 2da8cbeb158a..7f24cc682780 100644 --- a/include/linux/turris-omnia-mcu-interface.h +++ b/include/linux/turris-omnia-mcu-interface.h @@ -9,7 +9,10 @@ #define __TURRIS_OMNIA_MCU_INTERFACE_H #include -#include +#include +#include +#include +#include enum omnia_commands_e { OMNIA_CMD_GET_STATUS_WORD = 0x01, /* slave sends status word back */ @@ -246,4 +249,135 @@ enum omnia_cmd_usb_ovc_prot_e { OMNIA_CMD_xET_USB_OVC_PROT_ENABLE = BIT(4), }; +/* Command execution functions */ + +struct i2c_client; + +int omnia_cmd_write_read(const struct i2c_client *client, + void *cmd, unsigned int cmd_len, + void *reply, unsigned int reply_len); + +static inline int omnia_cmd_write(const struct i2c_client *client, void *cmd, + unsigned int len) +{ + return omnia_cmd_write_read(client, cmd, len, NULL, 0); +} + +static inline int omnia_cmd_write_u8(const struct 
i2c_client *client, u8 cmd, + u8 val) +{ + u8 buf[2] = { cmd, val }; + + return omnia_cmd_write(client, buf, sizeof(buf)); +} + +static inline int omnia_cmd_write_u16(const struct i2c_client *client, u8 cmd, + u16 val) +{ + u8 buf[3]; + + buf[0] = cmd; + put_unaligned_le16(val, &buf[1]); + + return omnia_cmd_write(client, buf, sizeof(buf)); +} + +static inline int omnia_cmd_write_u32(const struct i2c_client *client, u8 cmd, + u32 val) +{ + u8 buf[5]; + + buf[0] = cmd; + put_unaligned_le32(val, &buf[1]); + + return omnia_cmd_write(client, buf, sizeof(buf)); +} + +static inline int omnia_cmd_read(const struct i2c_client *client, u8 cmd, + void *reply, unsigned int len) +{ + return omnia_cmd_write_read(client, &cmd, 1, reply, len); +} + +static inline unsigned int +omnia_compute_reply_length(unsigned long mask, bool interleaved, + unsigned int offset) +{ + if (!mask) + return 0; + + return ((__fls(mask) >> 3) << interleaved) + 1 + offset; +} + +/* Returns 0 on success */ +static inline int omnia_cmd_read_bits(const struct i2c_client *client, u8 cmd, + unsigned long bits, unsigned long *dst) +{ + __le32 reply; + int err; + + if (!bits) { + *dst = 0; + return 0; + } + + err = omnia_cmd_read(client, cmd, &reply, + omnia_compute_reply_length(bits, false, 0)); + if (err) + return err; + + *dst = le32_to_cpu(reply) & bits; + + return 0; +} + +static inline int omnia_cmd_read_bit(const struct i2c_client *client, u8 cmd, + unsigned long bit) +{ + unsigned long reply; + int err; + + err = omnia_cmd_read_bits(client, cmd, bit, &reply); + if (err) + return err; + + return !!reply; +} + +static inline int omnia_cmd_read_u32(const struct i2c_client *client, u8 cmd, + u32 *dst) +{ + __le32 reply; + int err; + + err = omnia_cmd_read(client, cmd, &reply, sizeof(reply)); + if (err) + return err; + + *dst = le32_to_cpu(reply); + + return 0; +} + +static inline int omnia_cmd_read_u16(const struct i2c_client *client, u8 cmd, + u16 *dst) +{ + __le16 reply; + int err; + + err = 
omnia_cmd_read(client, cmd, &reply, sizeof(reply)); + if (err) + return err; + + *dst = le16_to_cpu(reply); + + return 0; +} + +static inline int omnia_cmd_read_u8(const struct i2c_client *client, u8 cmd, + u8 *reply) +{ + return omnia_cmd_read(client, cmd, reply, sizeof(*reply)); +} + #endif /* __TURRIS_OMNIA_MCU_INTERFACE_H */ -- cgit v1.2.3 From d665d7f2800fff5da9311e4c8c236966ba57d440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 11 Nov 2024 11:03:47 +0100 Subject: turris-omnia-mcu-interface.h: Add LED commands related definitions to global header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add definitions for contents of the OMNIA_CMD_LED_MODE and OMNIA_CMD_LED_STATE commands to the global turris-omnia-mcu-interface.h header. Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20241111100355.6978-4-kabel@kernel.org Signed-off-by: Lee Jones --- include/linux/turris-omnia-mcu-interface.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h index 7f24cc682780..06c94e032c6f 100644 --- a/include/linux/turris-omnia-mcu-interface.h +++ b/include/linux/turris-omnia-mcu-interface.h @@ -239,6 +239,18 @@ enum omnia_int_e { OMNIA_INT_LAN5_LED1 = BIT(31), }; +enum omnia_cmd_led_mode_e { + OMNIA_CMD_LED_MODE_LED_MASK = GENMASK(3, 0), +#define OMNIA_CMD_LED_MODE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_MODE_LED_MASK, _l) + OMNIA_CMD_LED_MODE_USER = BIT(4), +}; + +enum omnia_cmd_led_state_e { + OMNIA_CMD_LED_STATE_LED_MASK = GENMASK(3, 0), +#define OMNIA_CMD_LED_STATE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_STATE_LED_MASK, _l) + OMNIA_CMD_LED_STATE_ON = BIT(4), +}; + enum omnia_cmd_poweroff_e { OMNIA_CMD_POWER_OFF_POWERON_BUTTON = BIT(0), OMNIA_CMD_POWER_OFF_MAGIC = 0xdead, -- cgit v1.2.3 From 76850b54943ffd5037c97cc27794449ce05c31e9 Mon Sep 17 00:00:00 2001 From: Rick Wertenbroek Date: 
Thu, 12 Dec 2024 17:25:47 +0100 Subject: PCI: endpoint: Replace magic number '6' by PCI_STD_NUM_BARS Replace the constant "6" by PCI_STD_NUM_BARS, as defined in include/uapi/linux/pci_regs.h: #define PCI_STD_NUM_BARS 6 /* Number of standard BARs */ Link: https://lore.kernel.org/r/20241212162547.225880-1-rick.wertenbroek@gmail.com Signed-off-by: Rick Wertenbroek Signed-off-by: Bjorn Helgaas --- include/linux/pci-epf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 18a3aeb62ae4..ee6156bcbbd0 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -157,7 +157,7 @@ struct pci_epf { struct device dev; const char *name; struct pci_epf_header *header; - struct pci_epf_bar bar[6]; + struct pci_epf_bar bar[PCI_STD_NUM_BARS]; u8 msi_interrupts; u16 msix_interrupts; u8 func_no; @@ -174,7 +174,7 @@ struct pci_epf { /* Below members are to attach secondary EPC to an endpoint function */ struct pci_epc *sec_epc; struct list_head sec_epc_list; - struct pci_epf_bar sec_epc_bar[6]; + struct pci_epf_bar sec_epc_bar[PCI_STD_NUM_BARS]; u8 sec_epc_func_no; struct config_group *group; unsigned int is_bound; -- cgit v1.2.3 From d24bf99214b199c25f9c2cb04b3a4993d1c7ab60 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 11 Dec 2024 18:44:49 +0100 Subject: power: supply: core: Add new "charge_types" property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new "charge_types" property, this is identical to "charge_type" but reading returns a list of supported charge-types with the currently active type surrounded by square brackets, e.g.: Fast [Standard] "Long_Life" This has the advantage over the existing "charge_type" property that this allows userspace to find out which charge-types are supported for writable charge_type properties. 
Drivers which already support "charge_type" can easily add support for this by setting power_supply_desc.charge_types to a bitmask representing valid charge_type values. The existing "charge_type" get_property() and set_property() code paths can be re-used for "charge_types". Signed-off-by: Hans de Goede Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241211174451.355421-3-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 17fc383785bf..0d96657d1a2b 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -40,7 +40,7 @@ enum { }; /* What algorithm is the charger using? */ -enum { +enum power_supply_charge_type { POWER_SUPPLY_CHARGE_TYPE_UNKNOWN = 0, POWER_SUPPLY_CHARGE_TYPE_NONE, POWER_SUPPLY_CHARGE_TYPE_TRICKLE, /* slow speed */ @@ -99,6 +99,7 @@ enum power_supply_property { /* Properties of type `int' */ POWER_SUPPLY_PROP_STATUS = 0, POWER_SUPPLY_PROP_CHARGE_TYPE, + POWER_SUPPLY_PROP_CHARGE_TYPES, POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_ONLINE, @@ -245,6 +246,7 @@ struct power_supply_desc { const char *name; enum power_supply_type type; u8 charge_behaviours; + u32 charge_types; u32 usb_types; const enum power_supply_property *properties; size_t num_properties; @@ -946,6 +948,11 @@ ssize_t power_supply_charge_behaviour_show(struct device *dev, char *buf); int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf); +ssize_t power_supply_charge_types_show(struct device *dev, + unsigned int available_types, + enum power_supply_charge_type current_type, + char *buf); +int power_supply_charge_types_parse(unsigned int available_types, const char *buf); #else static inline ssize_t power_supply_charge_behaviour_show(struct device *dev, @@ -961,6 +968,20 
@@ static inline int power_supply_charge_behaviour_parse(unsigned int available_beh { return -EOPNOTSUPP; } + +static inline +ssize_t power_supply_charge_types_show(struct device *dev, + unsigned int available_types, + enum power_supply_charge_type current_type, + char *buf) +{ + return -EOPNOTSUPP; +} + +static inline int power_supply_charge_types_parse(unsigned int available_types, const char *buf) +{ + return -EOPNOTSUPP; +} #endif #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3 From 0dffdb3b3366c932fb7d210f5032476c552f7000 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:47 +0100 Subject: skbuff: allow 2-4-argument skb_frag_dma_map() skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE) is repeated across dozens of drivers and really wants a shorthand. Add a macro which will count args and handle all possible number from 2 to 5. Semantics: skb_frag_dma_map(dev, frag) -> __skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset) -> __skb_frag_dma_map(dev, frag, offset, skb_frag_size(frag) - offset, DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset, size) -> __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset, size, dir) -> __skb_frag_dma_map(dev, frag, offset, size, dir) No object code size changes for the existing callers. Users passing less arguments also won't have bigger size comparing to the full equivalent call. 
Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-11-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69624b394cd9..b2509cd0b930 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3674,7 +3674,7 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto, bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** - * skb_frag_dma_map - maps a paged fragment via the DMA API + * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the @@ -3684,15 +3684,36 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); * * Maps the page associated with @frag to @device. */ -static inline dma_addr_t skb_frag_dma_map(struct device *dev, - const skb_frag_t *frag, - size_t offset, size_t size, - enum dma_data_direction dir) +static inline dma_addr_t __skb_frag_dma_map(struct device *dev, + const skb_frag_t *frag, + size_t offset, size_t size, + enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } +#define skb_frag_dma_map(dev, frag, ...) 
\ + CONCATENATE(_skb_frag_dma_map, \ + COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) + +#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ + const skb_frag_t *uf = (frag); \ + size_t uo = (offset); \ + \ + __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ + DMA_TO_DEVICE); \ +}) +#define _skb_frag_dma_map1(dev, frag, offset) \ + __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ + __UNIQUE_ID(offset_)) +#define _skb_frag_dma_map0(dev, frag) \ + _skb_frag_dma_map1(dev, frag, 0) +#define _skb_frag_dma_map2(dev, frag, offset, size) \ + __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) +#define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ + __skb_frag_dma_map(dev, frag, offset, size, dir) + static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { -- cgit v1.2.3 From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. 
Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- include/linux/compiler.h | 37 ++++++++++++++++++++++++++----------- include/linux/static_call.h | 1 + 2 files changed, 27 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..240c632c5b95 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,28 +216,43 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. 
*/ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -/** - * offset_to_ptr - convert a relative memory offset to an absolute pointer - * @off: the address of the 32-bit offset value - */ -static inline void *offset_to_ptr(const int *off) -{ - return (void *)((unsigned long)off + *off); -} +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; -#endif /* __ASSEMBLY__ */ +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b..785980af8972 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ -- cgit v1.2.3 From 62374ce1876be26b3f33575680e67ca69a59db54 Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Fri, 13 Dec 2024 18:07:29 +0800 Subject: coresight: Add a helper to check if a device is source Since there are a lot of places in the code to check whether the device is source, add a helper to check it. 
Signed-off-by: Tao Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241213100731.25914-3-quic_taozha@quicinc.com --- include/linux/coresight.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 055ce5cd5c44..c50d128e8d93 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -588,9 +588,14 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o } #endif /* CONFIG_64BIT */ +static inline bool coresight_is_device_source(struct coresight_device *csdev) +{ + return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE); +} + static inline bool coresight_is_percpu_source(struct coresight_device *csdev) { - return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) && + return csdev && coresight_is_device_source(csdev) && (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC); } -- cgit v1.2.3 From ec9903d6cc34e61b77e609a0425e7a0a804fb95a Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Fri, 13 Dec 2024 18:07:30 +0800 Subject: coresight: Add support for trace filtering by source Some replicators have hard coded filtering of "trace" data, based on the source device. This is different from the trace filtering based on TraceID, available in the standard programmable replicators. e.g., Qualcomm replicators have filtering based on custom trace protocol format and is not programmable. The source device could be connected to the replicator via intermediate components (e.g., a funnel). Thus we need platform information from the firmware tables to decide the source device corresponding to a given output port from the replicator. Given this affects "trace path building" and traversing the path back from the sink to source, add the concept of "filtering by source" to the generic coresight connection. The specified source will be marked like below in the Devicetree. test-replicator { ... ... ... 
... out-ports { ... ... ... ... port@0 { reg = <0>; xyz: endpoint { remote-endpoint = <&zyx>; filter-source = <&source_1>; <-- To specify the source to }; be filtered out here. }; port@1 { reg = <1>; abc: endpoint { remote-endpoint = <&cba>; filter-source = <&source_2>; <-- To specify the source to }; be filtered out here. }; }; }; Signed-off-by: Tao Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241213100731.25914-4-quic_taozha@quicinc.com --- include/linux/coresight.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c50d128e8d93..17276965ff1d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -172,6 +172,9 @@ struct coresight_desc { * @dest_dev: a @coresight_device representation of the component connected to @src_port. NULL until the device is created * @link: Representation of the connection as a sysfs link. + * @filter_src_fwnode: filter source component's fwnode handle. + * @filter_src_dev: a @coresight_device representation of the component that + needs to be filtered. * * The full connection structure looks like this, where in_conns store * references to same connection as the source device's out_conns. @@ -200,6 +203,8 @@ struct coresight_connection { struct coresight_device *dest_dev; struct coresight_sysfs_link *link; struct coresight_device *src_dev; + struct fwnode_handle *filter_src_fwnode; + struct coresight_device *filter_src_dev; int src_refcnt; int dest_refcnt; }; -- cgit v1.2.3 From d54a3fc6bf3db0db0e16cfdf7f48a8bbb803f6b0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:14 +0000 Subject: firmware: cs_dsp: Add mock regmap for KUnit testing Add a mock regmap implementation to act as a simulated DSP for KUnit testing. This is built as a utility module so that it could be used by clients of cs_dsp to create a mock "DSP" for their own testing. 
cs_dsp interacts with the DSP only through registers. Most of the register space of the DSP is RAM. ADSP cores have a small set of control registers. HALO Core DSPs have a much larger set of control registers but only a small subset are used. Most writes are "blind" in the sense that cs_dsp does not expect to receive any sort of response from the DSP. So there isn't any need to emulate a "DSP", only a set of registers that can be written and read back. The idea of the mock regmap is to use the cache to accumulate writes which can then be tested against the values that are expected to be in the registers. Stray writes can be detected by dropping the cache entries for all addresses that should have been written and then issuing a regcache_sync(). If this causes bus writes it means there were writes to unexpected registers. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/firmware/cirrus/cs_dsp_test_utils.h (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h new file mode 100644 index 000000000000..ac6b03f4c084 --- /dev/null +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Support utilities for cs_dsp testing. + * + * Copyright (C) 2024 Cirrus Logic, Inc. and + * Cirrus Logic International Semiconductor Ltd. + */ + +#include +#include + +struct kunit; +struct cs_dsp_test; +struct cs_dsp_test_local; + +/** + * struct cs_dsp_test - base class for test utilities + * + * @test: Pointer to struct kunit instance. + * @dsp: Pointer to struct cs_dsp instance. + * @local: Private data for each test suite. 
+ */ +struct cs_dsp_test { + struct kunit *test; + struct cs_dsp *dsp; + + struct cs_dsp_test_local *local; + + /* Following members are private */ + bool saw_bus_write; +}; + +extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; +extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; +extern const unsigned int cs_dsp_mock_halo_core_base; +extern const unsigned int cs_dsp_mock_halo_sysinfo_base; + +int cs_dsp_mock_regmap_init(struct cs_dsp_test *priv); +void cs_dsp_mock_regmap_drop_range(struct cs_dsp_test *priv, + unsigned int first_reg, unsigned int last_reg); +void cs_dsp_mock_regmap_drop_regs(struct cs_dsp_test *priv, + unsigned int first_reg, size_t num_regs); +void cs_dsp_mock_regmap_drop_bytes(struct cs_dsp_test *priv, + unsigned int first_reg, size_t num_bytes); +void cs_dsp_mock_regmap_drop_system_regs(struct cs_dsp_test *priv); +bool cs_dsp_mock_regmap_is_dirty(struct cs_dsp_test *priv, bool drop_system_regs); -- cgit v1.2.3 From 41e78c0f44f97c958afcda3f82b23f4f4a05b968 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:15 +0000 Subject: firmware: cs_dsp: Add mock DSP memory map for KUnit testing Add helper functions to implement an emulation of the DSP memory map. There are three main groups of functionality: 1. Define a mock cs_dsp_region table. 2. Calculate the addresses of memory and algorithms from the firmware header in XM. 3. Build a mock XM header in emulated XM. 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 63 +++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index ac6b03f4c084..899ae94198aa 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -30,11 +30,74 @@ struct cs_dsp_test { bool saw_bus_write; }; +/** + * struct cs_dsp_mock_alg_def - Info for creating a mock algorithm entry. + * + * @id: Algorithm ID. + * @ver: Algorithm version. + * @xm_base_words: XM base address in DSP words. + * @xm_size_words: XM size in DSP words. + * @ym_base_words: YM base address in DSP words. + * @ym_size_words: YM size in DSP words. + * @zm_base_words: ZM base address in DSP words. + * @zm_size_words: ZM size in DSP words. + */ +struct cs_dsp_mock_alg_def { + unsigned int id; + unsigned int ver; + unsigned int xm_base_words; + unsigned int xm_size_words; + unsigned int ym_base_words; + unsigned int ym_size_words; + unsigned int zm_base_words; + unsigned int zm_size_words; +}; + +/** + * struct cs_dsp_mock_xm_header - XM header builder + * + * @test_priv: Pointer to the struct cs_dsp_test. + * @blob_data: Pointer to the created blob data. + * @blob_size_bytes: Size of the data at blob_data.
+ */ +struct cs_dsp_mock_xm_header { + struct cs_dsp_test *test_priv; + void *blob_data; + size_t blob_size_bytes; +}; + extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; extern const unsigned int cs_dsp_mock_halo_core_base; extern const unsigned int cs_dsp_mock_halo_sysinfo_base; +extern const struct cs_dsp_region cs_dsp_mock_halo_dsp1_regions[]; +extern const unsigned int cs_dsp_mock_halo_dsp1_region_sizes[]; +extern const struct cs_dsp_region cs_dsp_mock_adsp2_32bit_dsp1_regions[]; +extern const unsigned int cs_dsp_mock_adsp2_32bit_dsp1_region_sizes[]; +extern const struct cs_dsp_region cs_dsp_mock_adsp2_16bit_dsp1_regions[]; +extern const unsigned int cs_dsp_mock_adsp2_16bit_dsp1_region_sizes[]; +int cs_dsp_mock_count_regions(const unsigned int *region_sizes); +unsigned int cs_dsp_mock_size_of_region(const struct cs_dsp *dsp, int mem_type); +unsigned int cs_dsp_mock_base_addr_for_mem(struct cs_dsp_test *priv, int mem_type); +unsigned int cs_dsp_mock_reg_addr_inc_per_unpacked_word(struct cs_dsp_test *priv); +unsigned int cs_dsp_mock_reg_block_length_bytes(struct cs_dsp_test *priv, int mem_type); +unsigned int cs_dsp_mock_reg_block_length_registers(struct cs_dsp_test *priv, int mem_type); +unsigned int cs_dsp_mock_reg_block_length_dsp_words(struct cs_dsp_test *priv, int mem_type); +bool cs_dsp_mock_has_zm(struct cs_dsp_test *priv); +int cs_dsp_mock_packed_to_unpacked_mem_type(int packed_mem_type); +unsigned int cs_dsp_mock_num_dsp_words_to_num_packed_regs(unsigned int num_dsp_words); +unsigned int cs_dsp_mock_xm_header_get_alg_base_in_words(struct cs_dsp_test *priv, + unsigned int alg_id, + int mem_type); +unsigned int cs_dsp_mock_xm_header_get_fw_version_from_regmap(struct cs_dsp_test *priv); +unsigned int cs_dsp_mock_xm_header_get_fw_version(struct cs_dsp_mock_xm_header *header); +void cs_dsp_mock_xm_header_drop_from_regmap_cache(struct cs_dsp_test *priv); +int 
cs_dsp_mock_xm_header_write_to_regmap(struct cs_dsp_mock_xm_header *header); +struct cs_dsp_mock_xm_header *cs_dsp_create_mock_xm_header(struct cs_dsp_test *priv, + const struct cs_dsp_mock_alg_def *algs, + size_t num_algs); + int cs_dsp_mock_regmap_init(struct cs_dsp_test *priv); void cs_dsp_mock_regmap_drop_range(struct cs_dsp_test *priv, unsigned int first_reg, unsigned int last_reg); -- cgit v1.2.3 From 5cf1b7b471803f7cc654a29ee16cb085ad69c097 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:16 +0000 Subject: firmware: cs_dsp: Add mock wmfw file generator for KUnit testing Add a mock firmware file that emulates what the firmware build tools would normally create. This will be used by KUnit tests to generate a test wmfw file. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index 899ae94198aa..fde7e95a33e9 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -53,6 +53,17 @@ struct cs_dsp_mock_alg_def { unsigned int zm_size_words; }; +struct cs_dsp_mock_coeff_def { + const char *shortname; + const char *fullname; + const char *description; + u16 type; + u16 flags; + u16 mem_type; + unsigned int offset_dsp_words; + unsigned int length_bytes; +}; + /** * struct cs_dsp_mock_xm_header - XM header builder * @@ -66,6 +77,8 @@ struct cs_dsp_mock_xm_header { size_t blob_size_bytes; }; +struct cs_dsp_mock_wmfw_builder; + extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; extern const unsigned int cs_dsp_mock_halo_core_base; @@ -107,3 +120,23 @@ void 
cs_dsp_mock_regmap_drop_bytes(struct cs_dsp_test *priv, unsigned int first_reg, size_t num_bytes); void cs_dsp_mock_regmap_drop_system_regs(struct cs_dsp_test *priv); bool cs_dsp_mock_regmap_is_dirty(struct cs_dsp_test *priv, bool drop_system_regs); + +struct cs_dsp_mock_wmfw_builder *cs_dsp_mock_wmfw_init(struct cs_dsp_test *priv, + int format_version); +void cs_dsp_mock_wmfw_add_raw_block(struct cs_dsp_mock_wmfw_builder *builder, + int mem_region, unsigned int mem_offset_dsp_words, + const void *payload_data, size_t payload_len_bytes); +void cs_dsp_mock_wmfw_add_info(struct cs_dsp_mock_wmfw_builder *builder, + const char *info); +void cs_dsp_mock_wmfw_add_data_block(struct cs_dsp_mock_wmfw_builder *builder, + int mem_region, unsigned int mem_offset_dsp_words, + const void *payload_data, size_t payload_len_bytes); +void cs_dsp_mock_wmfw_start_alg_info_block(struct cs_dsp_mock_wmfw_builder *builder, + unsigned int alg_id, + const char *name, + const char *description); +void cs_dsp_mock_wmfw_add_coeff_desc(struct cs_dsp_mock_wmfw_builder *builder, + const struct cs_dsp_mock_coeff_def *def); +void cs_dsp_mock_wmfw_end_alg_info_block(struct cs_dsp_mock_wmfw_builder *builder); +struct firmware *cs_dsp_mock_wmfw_get_firmware(struct cs_dsp_mock_wmfw_builder *builder); +int cs_dsp_mock_wmfw_format_version(struct cs_dsp_mock_wmfw_builder *builder); -- cgit v1.2.3 From 7c052c6615297ff32032105130cd5f02059f7ae4 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:17 +0000 Subject: firmware: cs_dsp: Add mock bin file generator for KUnit testing Add a mock firmware file that emulates what the firmware build tools would normally create. This will be used by KUnit tests to generate a test bin file. The data payload in a bin is an opaque blob, so the mock bin only needs to generate the appropriate file header and description block for each payload blob. 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-5-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index fde7e95a33e9..4f87a908ab4f 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -78,6 +78,7 @@ struct cs_dsp_mock_xm_header { }; struct cs_dsp_mock_wmfw_builder; +struct cs_dsp_mock_bin_builder; extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; @@ -121,6 +122,23 @@ void cs_dsp_mock_regmap_drop_bytes(struct cs_dsp_test *priv, void cs_dsp_mock_regmap_drop_system_regs(struct cs_dsp_test *priv); bool cs_dsp_mock_regmap_is_dirty(struct cs_dsp_test *priv, bool drop_system_regs); +struct cs_dsp_mock_bin_builder *cs_dsp_mock_bin_init(struct cs_dsp_test *priv, + int format_version, + unsigned int fw_version); +void cs_dsp_mock_bin_add_raw_block(struct cs_dsp_mock_bin_builder *builder, + unsigned int alg_id, unsigned int alg_ver, + int type, unsigned int offset, + const void *payload_data, size_t payload_len_bytes); +void cs_dsp_mock_bin_add_info(struct cs_dsp_mock_bin_builder *builder, + const char *info); +void cs_dsp_mock_bin_add_name(struct cs_dsp_mock_bin_builder *builder, + const char *name); +void cs_dsp_mock_bin_add_patch(struct cs_dsp_mock_bin_builder *builder, + unsigned int alg_id, unsigned int alg_ver, + int mem_region, unsigned int reg_addr_offset, + const void *payload_data, size_t payload_len_bytes); +struct firmware *cs_dsp_mock_bin_get_firmware(struct cs_dsp_mock_bin_builder *builder); + struct cs_dsp_mock_wmfw_builder *cs_dsp_mock_wmfw_init(struct cs_dsp_test *priv, int format_version); void cs_dsp_mock_wmfw_add_raw_block(struct 
cs_dsp_mock_wmfw_builder *builder, -- cgit v1.2.3 From b76d32422c09bc9310f61a5a89671975db34bd2a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 9 Dec 2024 16:09:44 +0000 Subject: kref: Improve documentation There is already kernel-doc written for many of the functions in kref.h but it's not linked into the html docs anywhere. Add it to kref.rst. Improve the kref documentation by using the standard Return: section, rewording some unclear verbiage and adding docs for some undocumented functions. Update Thomas' email address to his current one. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20241209160953.757673-1-willy@infradead.org --- include/linux/kref.h | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kref.h b/include/linux/kref.h index d32e21a2538c..88e82ab1367c 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -46,18 +46,18 @@ static inline void kref_get(struct kref *kref) } /** - * kref_put - decrement refcount for object. - * @kref: object. - * @release: pointer to the function that will clean up the object when the + * kref_put - Decrement refcount for object + * @kref: Object + * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. - * This pointer is required, and it is not acceptable to pass kfree - * in as this function. * - * Decrement the refcount, and if 0, call release(). - * Return 1 if the object was removed, otherwise return 0. Beware, if this - * function returns 0, you still can not count on the kref from remaining in - * memory. Only use the return value if you want to see if the kref is now - * gone, not present. + * Decrement the refcount, and if 0, call @release. The caller may not + * pass NULL or kfree() as the release function. 
+ * + * Return: 1 if this call removed the object, otherwise return 0. Beware, + * if this function returns 0, another caller may have removed the object + * by the time this function returns. The return value is only certain + * if you want to see if the object is definitely released. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { @@ -68,17 +68,37 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) return 0; } +/** + * kref_put_mutex - Decrement refcount for object + * @kref: Object + * @release: Pointer to the function that will clean up the object when the + * last reference to the object is released. + * @mutex: Mutex which protects the release function. + * + * This variant of kref_put() calls the @release function with the @mutex + * held. The @release function will release the mutex. + */ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), - struct mutex *lock) + struct mutex *mutex) { - if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) { + if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) { release(kref); return 1; } return 0; } +/** + * kref_put_lock - Decrement refcount for object + * @kref: Object + * @release: Pointer to the function that will clean up the object when the + * last reference to the object is released. + * @lock: Spinlock which protects the release function. + * + * This variant of kref_put() calls the @release function with the @lock + * held. The @release function will release the lock. + */ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) @@ -94,8 +114,6 @@ static inline int kref_put_lock(struct kref *kref, * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * - * Return non-zero if the increment succeeded. Otherwise return 0.
- * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. @@ -105,6 +123,8 @@ static inline int kref_put_lock(struct kref *kref, * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. + * + * Return: non-zero if the increment succeeded. Otherwise return 0. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { -- cgit v1.2.3 From 7f78c081d44ce0f9d73dcef3204df5936ddbfea7 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Sun, 1 Sep 2024 14:32:59 +0000 Subject: soc: mediatek: cmdq: Remove cmdq_pkt_finalize() helper function In order to have fine-grained control, use cmdq_pkt_eoc() and cmdq_pkt_jump_rel() to replace cmdq_pkt_finalize(). Signed-off-by: Chun-Kuang Hu Acked-by: Matthias Brugger Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Mauro Carvalho Chehab --- include/linux/soc/mediatek/mtk-cmdq.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 5bee6f7fc400..0c3906e8ad19 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -391,14 +391,6 @@ int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa); */ int cmdq_pkt_eoc(struct cmdq_pkt *pkt); -/** - * cmdq_pkt_finalize() - Append EOC and jump command to pkt. 
- * @pkt: the CMDQ packet - * - * Return: 0 for success; else the error code is returned - */ -int cmdq_pkt_finalize(struct cmdq_pkt *pkt); - #else /* IS_ENABLED(CONFIG_MTK_CMDQ) */ static inline int cmdq_dev_get_client_reg(struct device *dev, @@ -519,11 +511,6 @@ static inline int cmdq_pkt_eoc(struct cmdq_pkt *pkt) return -EINVAL; } -static inline int cmdq_pkt_finalize(struct cmdq_pkt *pkt) -{ - return -EINVAL; -} - #endif /* IS_ENABLED(CONFIG_MTK_CMDQ) */ #endif /* __MTK_CMDQ_H__ */ -- cgit v1.2.3 From 4e885fab7164689f031a6c73522a3d91674c5bdc Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:28 +0000 Subject: bpf: Add a __btf_get_by_fd helper Add a new helper to get a pointer to a struct btf from a file descriptor. This helper doesn't increase a refcnt. Add a comment explaining this and pointing to a corresponding function which does take a reference. Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-2-aspsk@isovalent.com --- include/linux/bpf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c..ac44b857b2f9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2301,6 +2301,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); +/* + * The __bpf_map_get() and __btf_get_by_fd() functions parse a file + * descriptor and return a corresponding map or btf object. + * Their names are double underscored to emphasize the fact that they + * do not increase refcnt. To also increase refcnt use corresponding + * bpf_map_get() and btf_get_by_fd() functions. 
+ */ + static inline struct bpf_map *__bpf_map_get(struct fd f) { if (fd_empty(f)) @@ -2310,6 +2318,15 @@ static inline struct bpf_map *__bpf_map_get(struct fd f) return fd_file(f)->private_data; } +static inline struct btf *__btf_get_by_fd(struct fd f) +{ + if (fd_empty(f)) + return ERR_PTR(-EBADF); + if (unlikely(fd_file(f)->f_op != &btf_fops)) + return ERR_PTR(-EINVAL); + return fd_file(f)->private_data; +} + void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); -- cgit v1.2.3 From 6037802bbae892f3ad0c7b4c4faee39b967e32b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 11 Dec 2024 20:57:55 +0100 Subject: power: supply: core: implement extension API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various drivers, mostly in platform/x86 extend the ACPI battery driver with additional sysfs attributes to implement more UAPIs than are exposed through ACPI by using various side-channels, like WMI, nonstandard ACPI or EC communication. While the created sysfs attributes look similar to the attributes provided by the powersupply core, there are various deficiencies: * They don't show up in uevent payload. * They can't be queried with the standard in-kernel APIs. * They don't work with triggers. * The extending driver has to reimplement all of the parsing, formatting and sysfs display logic. * Writing a extension driver is completely different from writing a normal power supply driver. This extension API avoids all of these issues. An extension is just a "struct power_supply_ext" with the same kind of callbacks as in a normal "struct power_supply_desc". The API is meant to be used via battery_hook_register(), the same way as the current extensions. 
Signed-off-by: Thomas Weißschuh Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241211-power-supply-extensions-v6-1-9d9dc3f3d387@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 0d96657d1a2b..a877518cd963 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -283,6 +285,27 @@ struct power_supply_desc { int use_for_apm; }; +struct power_supply_ext { + u8 charge_behaviours; + const enum power_supply_property *properties; + size_t num_properties; + + int (*get_property)(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data, + enum power_supply_property psp, + union power_supply_propval *val); + int (*set_property)(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data, + enum power_supply_property psp, + const union power_supply_propval *val); + int (*property_is_writeable)(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data, + enum power_supply_property psp); +}; + struct power_supply { const struct power_supply_desc *desc; @@ -302,10 +325,13 @@ struct power_supply { struct delayed_work deferred_register_work; spinlock_t changed_lock; bool changed; + bool update_groups; bool initialized; bool removing; atomic_t use_cnt; struct power_supply_battery_info *battery_info; + struct rw_semaphore extensions_sem; /* protects "extensions" */ + struct list_head extensions; #ifdef CONFIG_THERMAL struct thermal_zone_device *tzd; struct thermal_cooling_device *tcd; @@ -882,6 +908,13 @@ devm_power_supply_register(struct device *parent, extern void power_supply_unregister(struct power_supply *psy); extern int power_supply_powers(struct power_supply *psy, struct device *dev); +extern int 
__must_check +power_supply_register_extension(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data); +extern void power_supply_unregister_extension(struct power_supply *psy, + const struct power_supply_ext *ext); + #define to_power_supply(device) container_of(device, struct power_supply, dev) extern void *power_supply_get_drvdata(struct power_supply *psy); -- cgit v1.2.3 From 5119e6b44f8ada5f5cea19935a7f005fee062aef Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 11 Dec 2024 21:42:27 +0000 Subject: memory: omap-gpmc: deadcode a pair of functions gpmc_get_client_irq() last use was removed by commit ac28e47ccc3f ("ARM: OMAP2+: Remove legacy gpmc-nand.c") gpmc_ticks_to_ns() last use was removed by commit 2514830b8b8c ("ARM: OMAP2+: Remove gpmc-onenand") Remove them. gpmc_clk_ticks_to_ns() is now only used in some DEBUG code; move inside the ifdef to avoid unused warnings. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Roger Quadros Acked-by: Kevin Hilman Link: https://lore.kernel.org/r/20241211214227.107980-1-linux@treblig.org Signed-off-by: Krzysztof Kozlowski --- include/linux/omap-gpmc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index c9e3843d2dd5..263b915df1fb 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -66,10 +66,6 @@ extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t, struct device_node; -extern int gpmc_get_client_irq(unsigned irq_config); - -extern unsigned int gpmc_ticks_to_ns(unsigned int ticks); - extern void gpmc_cs_write_reg(int cs, int idx, u32 val); extern int gpmc_calc_divider(unsigned int sync_clk); extern int gpmc_cs_set_timings(int cs, const struct gpmc_timings *t, -- cgit v1.2.3 From 9698d5a4836549d394e6efd858b5200878c9f255 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:02:23 +0100 Subject: pidfs: rework inode number allocation Recently we 
received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucial step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to lookup a struct pid based on the inode number allocated for that struct pid. The other part is to remove the custom inode number allocation on 32bit systems that is also an ugly wart that should go away. So, a new scheme is used that I was discussing with Tejun some time back. A cyclic ida is used for the lower 32 bits and the high 32 bits are used for the generation number. This gives a 64 bit inode number that is unique on both 32 bit and 64 bit. The lower 32 bit number is recycled slowly and can be used to lookup struct pids.
Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-1-61043d66fbce@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..2958652bb108 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +int pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From d2ab36bb115b720c9c738184d4007e1ca01c53da Mon Sep 17 00:00:00 2001 From: Erin Shepherd Date: Fri, 29 Nov 2024 14:38:00 +0100 Subject: pseudofs: add support for export_ops Pseudo-filesystems might reasonably wish to implement the export ops (particularly for name_to_handle_at/open_by_handle_at); plumb this through pseudo_fs_context Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Erin Shepherd Link: https://lore.kernel.org/r/20241113-pidfs_fh-v2-1-9a4d28155a37@e43.eu Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-1-87d803a42495@kernel.org Signed-off-by: Christian Brauner --- include/linux/pseudo_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 730f77381d55..2503f7625d65 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -5,6 +5,7 @@ struct pseudo_fs_context { const struct super_operations *ops; + const struct export_operations *eops; const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; -- cgit v1.2.3 From 50166d57ea8c5042ecba0ee22532617d72ed085a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:02 +0100 Subject: exportfs: add open method This allows filesystems such as pidfs 
to provide their custom open. Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-3-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 4cc8801e50e3..c69b79b64466 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -10,6 +10,7 @@ struct inode; struct iomap; struct super_block; struct vfsmount; +struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -225,6 +226,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * open: + * Allow filesystems to specify a custom open function. + * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * @@ -251,6 +255,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ -- cgit v1.2.3 From 0203b485d26d5b403ff4ed21e4cc85ba9ec0fe67 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 16:42:37 -0700 Subject: torture: Add dowarn argument to torture_sched_setaffinity() Current use cases of torture_sched_setaffinity() are well served by its unconditional warning on error. However, an upcoming use case for a preemption kthread needs to avoid warnings that might otherwise arise when that kthread attempted to bind itself to a CPU on its way offline. 
This commit therefore adds a dowarn argument that, when false, suppresses the warning. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index c2e979f82f8d..0134e7221cae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -130,7 +130,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); #endif #endif /* __LINUX_TORTURE_H */ -- cgit v1.2.3 From 868dc3cd1105bd328be864bf2c409891438df44a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 18 Nov 2024 07:15:58 +0100 Subject: thermal: core: Add stub for thermal_zone_device_update() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To simplify the !CONFIG_THERMAL case in the hwmon core, add a !CONFIG_THERMAL stub for thermal_zone_device_update(). Signed-off-by: Thomas Weißschuh Reviewed-by: Guenter Roeck Acked-by: Rafael J. 
Wysocki Signed-off-by: Guenter Roeck --- include/linux/thermal.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 754802478b96..69f9bedd0ee8 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -295,6 +295,10 @@ static inline struct thermal_zone_device *thermal_tripless_zone_device_register( static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz) { } +static inline void thermal_zone_device_update(struct thermal_zone_device *tz, + enum thermal_notify_event event) +{ } + static inline struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) -- cgit v1.2.3 From f40452577557caf0e5d0ff182da8479c3d492ac5 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 2 Dec 2024 11:28:00 +0100 Subject: hwmon: (pmbus/core) improve handling of write protected regulators Writing PMBus protected registers does succeed from the smbus perspective, even if the write is ignored by the device and a communication fault is raised. This fault will silently be caught and cleared by pmbus irq if one has been registered. This means that the regulator call may return succeed although the operation was ignored. With this change, the operation which are not supported will be properly flagged as such and the regulator framework won't even try to execute them. 
Signed-off-by: Jerome Brunet [groeck: Adjust to EXPORT_SYMBOL_NS_GPL API change] Signed-off-by: Guenter Roeck --- include/linux/pmbus.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pmbus.h b/include/linux/pmbus.h index fa9f08164c36..884040e1383b 100644 --- a/include/linux/pmbus.h +++ b/include/linux/pmbus.h @@ -73,6 +73,20 @@ */ #define PMBUS_USE_COEFFICIENTS_CMD BIT(5) +/* + * PMBUS_OP_PROTECTED + * Set if the chip OPERATION command is protected and protection is not + * determined by the standard WRITE_PROTECT command. + */ +#define PMBUS_OP_PROTECTED BIT(6) + +/* + * PMBUS_VOUT_PROTECTED + * Set if the chip VOUT_COMMAND command is protected and protection is not + * determined by the standard WRITE_PROTECT command. + */ +#define PMBUS_VOUT_PROTECTED BIT(7) + struct pmbus_platform_data { u32 flags; /* Device specific flags */ -- cgit v1.2.3 From 0f38c06cab7712fc82c314fe4264a8897f3e6365 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 13:07:11 -0700 Subject: rcutorture: Check preemption for failing reader This commit checks to see if the RCU reader has been preempted within its read-side critical section for RCU flavors supporting this notion (currently only preemptible RCU). If such a preemption occurred, then this is printed at the end of the "Failure/close-call rcutorture reader segments" list at the end of the rcutorture run. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. 
McKenney Cc: Frederic Weisbecker Tested-by: kernel test robot Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/rcupdate_wait.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 303ab9bee155..f9bed3d3f78d 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -65,4 +65,15 @@ static inline void cond_resched_rcu(void) #endif } +// Has the current task blocked within its current RCU read-side +// critical section? +static inline bool has_rcu_reader_blocked(void) +{ +#ifdef CONFIG_PREEMPT_RCU + return !list_empty(&current->rcu_node_entry); +#else + return false; +#endif +} + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ -- cgit v1.2.3 From 0fef924e3918e72768357a220c84e6b4dd2b6180 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Nov 2024 11:11:18 -0800 Subject: rcutorture: Use symbols for SRCU reader flavors This commit converts rcutorture.c values for the reader_flavor module parameter from hexadecimal to the SRCU_READ_FLAVOR_* C-preprocessor macros. The actual modprobe or kernel-boot-parameter values for read_flavor must still be entered in hexadecimal. Link: https://lore.kernel.org/all/c48c9dca-fe07-4833-acaa-28c827e5a79e@amd.com/ Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcu.h | 6 ++++++ include/linux/srcutree.h | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 08339eb8a01c..da8224d0f71c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,6 +43,12 @@ int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock().
+#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). +#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. + #ifdef CONFIG_TINY_SRCU #include #elif defined(CONFIG_TREE_SRCU) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 490aeecc6bb4..80016bbed672 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -26,6 +26,7 @@ struct srcu_data { atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ + /* Values: SRCU_READ_FLAVOR_.* */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; @@ -43,11 +44,6 @@ struct srcu_data { struct srcu_struct *ssp; }; -/* Values for ->srcu_reader_flavor. */ -#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). -#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). - /* * Node in SRCU combining tree, similar in function to rcu_data. */ -- cgit v1.2.3 From d465492a224b2409508224cf6970d7b97e2285cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Oct 2024 15:09:39 -0700 Subject: srcu: Guarantee non-negative return value from srcu_read_lock() For almost 20 years, the int return value from srcu_read_lock() has been always either zero or one. This commit therefore documents the fact that it will be non-negative, and does the same for the underlying __srcu_read_lock(). [ paulmck: Apply Andrii Nakryiko feedback. ] Signed-off-by: Paul E. 
McKenney Acked-by: Andrii Nakryiko Acked-by: Peter Zijlstra (Intel) Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcu.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 08339eb8a01c..abaddd7e6ddf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -232,13 +232,14 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * The return value from srcu_read_lock() must be passed unaltered - * to the matching srcu_read_unlock(). Note that srcu_read_lock() and - * the matching srcu_read_unlock() must occur in the same context, for - * example, it is illegal to invoke srcu_read_unlock() in an irq handler - * if the matching srcu_read_lock() was invoked in process context. Or, - * for that matter to invoke srcu_read_unlock() from one task and the - * matching srcu_read_lock() from another. + * The return value from srcu_read_lock() is guaranteed to be + * non-negative. This value must be passed unaltered to the matching + * srcu_read_unlock(). Note that srcu_read_lock() and the matching + * srcu_read_unlock() must occur in the same context, for example, it is + * illegal to invoke srcu_read_unlock() in an irq handler if the matching + * srcu_read_lock() was invoked in process context. Or, for that matter to + * invoke srcu_read_unlock() from one task and the matching srcu_read_lock() + * from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { -- cgit v1.2.3 From cfb07b07dda2a17feed96c80c5af85937fcd2e9c Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 12 Nov 2024 16:53:53 -0800 Subject: srcu: Fix typo s/srcu_check_read_flavor()/__srcu_check_read_flavor()/ This commit fixes a typo in which a comment needed to have been updated from srcu_check_read_flavor() to __srcu_check_read_flavor(). Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/b75d1fcd-6fcd-4619-bb5c-507fa599ee28@amd.com/ Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 490aeecc6bb4..4e69f88bcab9 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -258,7 +258,7 @@ static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) return; - // Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered. + // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); } -- cgit v1.2.3 From 288a2cabcf6bb35532e8b2708829bdc2b85bc690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 11 Dec 2024 20:57:58 +0100 Subject: power: supply: core: add UAPI to discover currently used extensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace wants to know about the used power supply extensions, for example to handle a device extended by a certain extension differently or to discover information about the extending device. Add a sysfs directory to the power supply device. This directory contains links which are named after the used extension and point to the device implementing that extension.
Signed-off-by: Thomas Weißschuh Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241211-power-supply-extensions-v6-4-9d9dc3f3d387@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index a877518cd963..c3ce9f2b17d4 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -286,6 +286,7 @@ struct power_supply_desc { }; struct power_supply_ext { + const char *const name; u8 charge_behaviours; const enum power_supply_property *properties; size_t num_properties; @@ -911,6 +912,7 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); extern int __must_check power_supply_register_extension(struct power_supply *psy, const struct power_supply_ext *ext, + struct device *dev, void *data); extern void power_supply_unregister_extension(struct power_supply *psy, const struct power_supply_ext *ext); -- cgit v1.2.3 From 2c2b61d2138f472e50b5531ec0cb4a1485837e21 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Wed, 11 Dec 2024 17:22:41 +0900 Subject: netlink: add IGMP/MLD join/leave notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces netlink notifications for multicast address changes. The following features are included: * Addition and deletion of multicast addresses are reported using RTM_NEWMULTICAST and RTM_DELMULTICAST messages with AF_INET and AF_INET6. * Two new notification groups: RTNLGRP_IPV4_MCADDR and RTNLGRP_IPV6_MCADDR are introduced for receiving these events. This change allows user space applications (e.g., ip monitor) to efficiently track multicast group memberships by listening for netlink events. Previously, applications relied on inefficient polling of procfs, introducing delays. 
With netlink notifications, applications receive realtime updates on multicast group membership changes, enabling more precise metrics collection and system monitoring.  This change also unlocks the potential for implementing a wide range of sophisticated multicast related features in user space by allowing applications to combine kernel provided multicast address information with user space data and communicate decisions back to the kernel for more fine grained control. This mechanism can be used for various purposes, including multicast filtering, IGMP/MLD offload, and IGMP/MLD snooping. Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Co-developed-by: Patrick Ruddy Signed-off-by: Patrick Ruddy Link: https://lore.kernel.org/r/20180906091056.21109-1-pruddy@vyatta.att-mail.com Signed-off-by: Yuyang Huang Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 5171231f70a8..073b30a9b850 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -87,6 +87,8 @@ struct ip_mc_list { char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; + unsigned long mca_cstamp; + unsigned long mca_tstamp; struct rcu_head rcu; }; -- cgit v1.2.3 From 2ef6fc99e0d922a54073e7b6d6465c62f4d3b62b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 5 Dec 2024 12:21:00 +1100 Subject: selinux: add netlink nlmsg_type audit message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new audit message type to capture nlmsg-related information. This is similar to LSM_AUDIT_DATA_IOCTL_OP which was added for the other SELinux extended permission (ioctl). Adding a new type is preferred to adding to the existing lsm_network_audit structure which contains irrelevant information for the netlink sockets (i.e., dport, sport). 
Signed-off-by: Thiébaud Weksteen [PM: change "nlnk-msgtype" to "nl-msgtype" as discussed] Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 97a8b21eb033..69d2b7bc00ed 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -77,6 +77,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_LOCKDOWN 15 #define LSM_AUDIT_DATA_NOTIFICATION 16 #define LSM_AUDIT_DATA_ANONINODE 17 +#define LSM_AUDIT_DATA_NLMSGTYPE 18 union { struct path path; struct dentry *dentry; @@ -98,6 +99,7 @@ struct common_audit_data { struct lsm_ibendport_audit *ibendport; int reason; const char *anonclass; + u16 nlmsg_type; } u; /* this union contains LSM specific data */ union { -- cgit v1.2.3 From aeb3ec99026979287266e4b5a1194789c1488c1a Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Fri, 13 Dec 2024 00:13:20 +0200 Subject: net/mlx5: Add device cap abs_native_port_num When the abs_native_port_num is set, the native_port_num reported by the device may not be continuous and bigger than the num_lag_ports. 
Signed-off-by: Rongwei Liu Reviewed-by: Shay Drory Reviewed-by: Saeed Mahameed Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241212221329.961628-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5451ff1d4356..43b3cb4bf8d1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1599,7 +1599,8 @@ enum { struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x6]; u8 page_request_disable[0x1]; - u8 reserved_at_7[0x9]; + u8 abs_native_port_num[0x1]; + u8 reserved_at_8[0x8]; u8 shared_object_to_user_object_allowed[0x1]; u8 reserved_at_13[0xe]; u8 vhca_resource_manager[0x1]; -- cgit v1.2.3 From 35f7cad1743e04bf2944a2aadb6b6a42adc57bca Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:43 +0100 Subject: net: Add the possibility to support a selected hwtstamp in netdevice Introduce the description of a hwtstamp provider, mainly defined with the hwtstamp source and the phydev pointer. Add a hwtstamp provider description within the netdev structure to allow saving the hwtstamp we want to use. This prepares for future support of an ethtool netlink command to select the desired hwtstamp provider. By default, the old API that does not support hwtstamp selectability is used, meaning the hwtstamp provider pointer is unset. Signed-off-by: Kory Maincent Signed-off-by: David S.
Miller --- include/linux/net_tstamp.h | 29 +++++++++++++++++++++++++++++ include/linux/netdevice.h | 4 ++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index 662074b08c94..ff0758e88ea1 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -19,6 +19,33 @@ enum hwtstamp_source { HWTSTAMP_SOURCE_PHYLIB, }; +/** + * struct hwtstamp_provider_desc - hwtstamp provider description + * + * @index: index of the hwtstamp provider. + * @qualifier: hwtstamp provider qualifier. + */ +struct hwtstamp_provider_desc { + int index; + enum hwtstamp_provider_qualifier qualifier; +}; + +/** + * struct hwtstamp_provider - hwtstamp provider object + * + * @rcu_head: RCU callback used to free the struct. + * @source: source of the hwtstamp provider. + * @phydev: pointer of the phydev source in case a PTP coming from phylib + * @desc: hwtstamp provider description. + */ + +struct hwtstamp_provider { + struct rcu_head rcu_head; + enum hwtstamp_source source; + struct phy_device *phydev; + struct hwtstamp_provider_desc desc; +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -31,6 +58,7 @@ enum hwtstamp_source { * copied the ioctl request back to user space * @source: indication whether timestamps should come from the netdev or from * an attached phylib PHY + * @qualifier: qualifier of the hwtstamp provider * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -43,6 +71,7 @@ struct kernel_hwtstamp_config { struct ifreq *ifr; bool copied_to_user; enum hwtstamp_source source; + enum hwtstamp_provider_qualifier qualifier; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d917949bba03..2593019ad5b1 100644 --- a/include/linux/netdevice.h +++ 
b/include/linux/netdevice.h @@ -82,6 +82,7 @@ struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; struct phy_link_topology; +struct hwtstamp_provider; typedef u32 xdp_features_t; @@ -2045,6 +2046,7 @@ enum netdev_reg_state { * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. + * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2457,6 +2459,8 @@ struct net_device { struct hlist_head neighbours[NEIGH_NR_TABLES]; + struct hwtstamp_provider __rcu *hwprov; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; -- cgit v1.2.3 From b9e3f7dc9ed95daeb83cfa45b821cacaa01aa906 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:44 +0100 Subject: net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology Either the MAC or the PHY can provide hwtstamp, so we should be able to read the tsinfo for any hwtstamp provider. Enhance 'get' command to retrieve tsinfo of hwtstamp providers within a network topology. Add support for a specific dump command to retrieve all hwtstamp providers within the network topology, with added functionality for filtered dump to target a single interface. Signed-off-by: Kory Maincent Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e217c6321ed0..f711bfd75c4d 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -711,6 +711,7 @@ struct ethtool_rxfh_param { * @cmd: command number = %ETHTOOL_GET_TS_INFO * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none + * @phc_qualifier: qualifier of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ @@ -718,6 +719,7 @@ struct kernel_ethtool_ts_info { u32 cmd; u32 so_timestamping; int phc_index; + enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; @@ -749,6 +751,7 @@ struct kernel_ethtool_ts_info { * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. + * @supported_hwtstamp_qualifiers: bitfield of supported hwtstamp qualifier. * @get_drvinfo: Report driver/device information. Modern drivers no * longer have to implement this callback. 
Most fields are * correctly filled in by the core using system information, or @@ -966,6 +969,7 @@ struct ethtool_ops { u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; + u32 supported_hwtstamp_qualifiers; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); -- cgit v1.2.3 From 1e7381f3617d14b3c11da80ff5f8a93ab14cfc46 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Oct 2024 08:04:50 -0700 Subject: KVM: Explicitly verify target vCPU is online in kvm_get_vcpu() Explicitly verify the target vCPU is fully online _prior_ to clamping the index in kvm_get_vcpu(). If the index is "bad", the nospec clamping will generate '0', i.e. KVM will return vCPU0 instead of NULL. In practice, the bug is unlikely to cause problems, as it will only come into play if userspace or the guest is buggy or misbehaving, e.g. KVM may send interrupts to vCPU0 instead of dropping them on the floor. However, returning vCPU0 when it shouldn't exist per online_vcpus is problematic now that KVM uses an xarray for the vCPUs array, as KVM needs to insert into the xarray before publishing the vCPU to userspace (see commit c5b077549136 ("KVM: Convert the kvm->vcpus array to a xarray")), i.e. before vCPU creation is guaranteed to succeed. As a result, incorrectly providing access to vCPU0 will trigger a use-after-free if vCPU0 is dereferenced and kvm_vm_ioctl_create_vcpu() bails out of vCPU creation due to an error and frees vCPU0. Commit afb2acb2e3a3 ("KVM: Fix vcpu_array[0] races") papered over that issue, but in doing so introduced an unsolvable teardown conundrum. Preventing accesses to vCPU0 before it's fully online will allow reverting commit afb2acb2e3a3, without re-introducing the vcpu_array[0] UAF race. 
Fixes: 1d487e9bf8ba ("KVM: fix spectrev1 gadgets") Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Michal Luczaj Reviewed-by: Pankaj Gupta Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241009150455.1057573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 401439bb21e3..b0b38744c4b0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -963,6 +963,15 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { int num_vcpus = atomic_read(&kvm->online_vcpus); + + /* + * Explicitly verify the target vCPU is online, as the anti-speculation + * logic only limits the CPU's ability to speculate, e.g. given a "bad" + * index, clamping the index to 0 would return vCPU0, not NULL. + */ + if (i >= num_vcpus) + return NULL; + i = array_index_nospec(i, num_vcpus); /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ -- cgit v1.2.3 From 0664dc74e9d004c36b4400081811df795169809a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Oct 2024 08:04:51 -0700 Subject: KVM: Verify there's at least one online vCPU when iterating over all vCPUs Explicitly check that there is at least one online vCPU before iterating over all vCPUs. Because the max index is an unsigned long, passing "0 - 1" in the online_vcpus==0 case results in xa_for_each_range() using an unlimited max, i.e. allows it to access vCPU0 when it shouldn't. This will allow KVM to safely _erase_ from vcpu_array if the last stages of vCPU creation fail, i.e. without generating a use-after-free if a different task happens to be concurrently iterating over all vCPUs. Note, because xa_for_each_range() is a macro, kvm_for_each_vcpu() subtly reloads online_vcpus after each iteration, i.e. 
adding an extra load doesn't meaningfully impact the total cost of iterating over all vCPUs. And because online_vcpus is never decremented, there is no risk of a reload triggering a walk of the entire xarray. Cc: Will Deacon Cc: Michal Luczaj Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241009150455.1057573-3-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b0b38744c4b0..92f192bec07b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -979,9 +979,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) return xa_load(&kvm->vcpu_array, i); } -#define kvm_for_each_vcpu(idx, vcpup, kvm) \ - xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ - (atomic_read(&kvm->online_vcpus) - 1)) +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + if (atomic_read(&kvm->online_vcpus)) \ + xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ + (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { -- cgit v1.2.3 From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... 
memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reading memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy(). 
We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from 
inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZER_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260..e4ce1cae03bf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. + */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy. 
*/ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* -- cgit v1.2.3 From 3a3f61ce5e0b4bcf730acc09c1af91012d241f85 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Nov 2024 20:06:55 -0800 Subject: exec: Make sure task->comm is always NUL-terminated Using strscpy() meant that the final character in task->comm may be non-NUL for a moment before the "string too long" truncation happens. Instead of adding a new use of the ambiguous strncpy(), we'd want to use memtostr_pad() which enforces being able to check at compile time that sizes are sensible, but this requires being able to see string buffer lengths. Instead of trying to inline __set_task_comm() (which needs to call trace and perf functions), just open-code it. But to make sure we're always safe, add compile-time checking like we already do for get_task_comm(). Suggested-by: Linus Torvalds Suggested-by: "Eric W. 
Biederman" Signed-off-by: Kees Cook --- include/linux/sched.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..ac9f429ddc17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1932,11 +1932,10 @@ static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); - -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ -- cgit v1.2.3 From 543841d1806029889c2f69f040e88b247aba8e22 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 21 Nov 2024 07:07:05 -0800 Subject: exec: fix up /proc/pid/comm in the execveat(AT_EMPTY_PATH) case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zbigniew mentioned at Linux Plumber's that systemd is interested in switching to execveat() for service execution, but can't, because the contents of /proc/pid/comm are the file descriptor which was used, instead of the path to the binary[1]. This makes the output of tools like top and ps useless, especially in a world where most fds are opened CLOEXEC so the number is truly meaningless. When the filename passed in is empty (e.g. with AT_EMPTY_PATH), use the dentry's filename for "comm" instead of using the useless numeral from the synthetic fdpath construction. This way the actual exec machinery is unchanged, but cosmetically the comm looks reasonable to admins investigating things. Instead of adding TASK_COMM_LEN more bytes to bprm, use one of the unused flag bits to indicate that we need to set "comm" from the dentry. 
Suggested-by: Zbigniew Jędrzejewski-Szmek Suggested-by: Tycho Andersen Suggested-by: Al Viro Suggested-by: Linus Torvalds Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec [1] Reviewed-by: Aleksa Sarai Tested-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Kees Cook --- include/linux/binfmts.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..3305c849abd6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,9 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* Set when "comm" must come from the dentry. */ + comm_from_dentry:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3 From c220e216d6bcd52cc7333e38edf43dc66ba0dd13 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:04 +0100 Subject: exportfs: add permission method This allows filesystems such as pidfs to provide their custom permission checks. 
Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-5-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index c69b79b64466..a087606ace19 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -3,6 +3,7 @@ #define LINUX_EXPORTFS_H 1 #include +#include struct dentry; struct iattr; @@ -10,7 +11,6 @@ struct inode; struct iomap; struct super_block; struct vfsmount; -struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -157,6 +157,17 @@ struct fid { }; }; +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ @@ -226,6 +237,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * permission: + * Allow filesystems to specify a custom permission function. + * * open: * Allow filesystems to specify a custom open function. 
* @@ -255,6 +269,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ -- cgit v1.2.3 From 16ecd47cb0cd895c7c2f5dd5db50f6c005c51639 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 14 Dec 2024 22:01:28 +0100 Subject: pidfs: lookup pid through rbtree The new pid inode number allocation scheme is neat but I overlooked a possible, even though unlikely, attack that can be used to trigger an overflow on both 32bit and 64bit. A unique 64 bit identifier was constructed for each struct pid by combining a 32 bit idr with a 32 bit generation number. A 32bit number was allocated using the idr_alloc_cyclic() infrastructure. When the idr wrapped around, a 32 bit wraparound counter was incremented. The 32 bit wraparound counter served as the upper 32 bits and the allocated idr number as the lower 32 bits. Since the idr can only allocate up to INT_MAX entries, every time a wraparound happens INT_MAX - 1 entries are lost (Ignoring that numbering always starts at 2 to avoid theoretical collisions with the root inode number.). If userspace fully populates the idr and puts itself into control of two entries such that one entry is somewhere in the middle and the other entry is the INT_MAX entry then it is possible to overflow the wraparound counter. That is probably difficult to pull off but the mere possibility is annoying. The problem could be contained to 32 bit by switching to a data structure such as the maple tree that allows allocating 64 bit numbers on 64 bit machines. That would leave 32 bit in a lurch but that probably doesn't matter that much. 
The other problem is that removing entries from the maple tree is somewhat non-trivial because the removal code can be called under the irq write lock of tasklist_lock and irq{save,restore} code. Instead, allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles without leaking actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to look up struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit userspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. 
Link: https://lore.kernel.org/r/20241214-gekoppelt-erdarbeiten-a1f9a982a5a6@brauner Signed-off-by: Christian Brauner --- include/linux/pid.h | 2 ++ include/linux/pidfs.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..fe575fcdb4af 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -59,6 +59,7 @@ struct pid spinlock_t lock; struct dentry *stashed; u64 ino; + struct rb_node pidfs_node; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -68,6 +69,7 @@ struct pid struct upid numbers[]; }; +extern seqcount_spinlock_t pidmap_lock_seq; extern struct pid init_struct_pid; struct file; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 2958652bb108..df574d6708d4 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,7 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); -int pidfs_add_pid(struct pid *pid); +void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From fa52c04daec9ff9820260901a8b1d271bb532d12 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 7 Nov 2024 12:47:05 +0100 Subject: mfd: core: Make platform_data pointer const in struct mfd_cell The content of the platform_data of a struct mfd_cell is simply passed on to the platform_device_add_data() call in mfd_add_device() . platform_device_add_data() already handles the data behind that pointer as const and also uses kmemdup to create a copy of the data before handing that copy over to the newly created platform-device, so there is no reason to not extend this to struct mfd_cell, as the old copy in the mfd_cell will be stale anyway. This allows to pass structs gathered from of_device_get_match_data() as platform-data to sub-devices - which is retrieved as const already. 
Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241107114712.538976-3-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index e8bcad641d8c..faeea7abd688 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -72,7 +72,7 @@ struct mfd_cell { int (*resume)(struct platform_device *dev); /* platform data passed to the sub devices drivers */ - void *platform_data; + const void *platform_data; size_t pdata_size; /* Matches ACPI */ -- cgit v1.2.3 From 998f70d1806bb718a7565f350283e4a79c8cbb4b Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 7 Nov 2024 12:47:07 +0100 Subject: mfd: Add base driver for qnap-mcu devices These microcontroller units are used in network-attached-storage devices made by QNAP and provide additional functionality to the system. This adds the base driver that implements the serial protocol via serdev and additionally hooks into the poweroff handlers to turn off the parts of the system not supplied by the general PMIC. Turning off (at least the TSx33 devices using Rockchip SoCs) consists of two separate actions. Turning off the MCU alone does not turn off the main SoC and turning off only the SoC/PMIC does not turn off the hard-drives. Also if the MCU is not turned off, the system also won't start again until it is unplugged from power. So on shutdown the MCU needs to be turned off separately before the main PMIC. The protocol spoken by the MCU is sadly not documented, but was obtained by listening to the chatter on the serial port, as thankfully the "hal_app" program from QNAP's firmware allows triggering all/most MCU actions from the command line. The implementation of how to talk to the serial device got some inspiration from the rave-sp serdev driver. 
Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241107114712.538976-5-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/qnap-mcu.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/mfd/qnap-mcu.h (limited to 'include/linux') diff --git a/include/linux/mfd/qnap-mcu.h b/include/linux/mfd/qnap-mcu.h new file mode 100644 index 000000000000..8d48c212fd44 --- /dev/null +++ b/include/linux/mfd/qnap-mcu.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Core definitions for QNAP MCU MFD driver. + * Copyright (C) 2024 Heiko Stuebner + */ + +#ifndef _LINUX_QNAP_MCU_H_ +#define _LINUX_QNAP_MCU_H_ + +struct qnap_mcu; + +struct qnap_mcu_variant { + u32 baud_rate; + int num_drives; + int fan_pwm_min; + int fan_pwm_max; + bool usb_led; +}; + +int qnap_mcu_exec(struct qnap_mcu *mcu, + const u8 *cmd_data, size_t cmd_data_size, + u8 *reply_data, size_t reply_data_size); +int qnap_mcu_exec_with_ack(struct qnap_mcu *mcu, + const u8 *cmd_data, size_t cmd_data_size); + +#endif /* _LINUX_QNAP_MCU_H_ */ -- cgit v1.2.3 From c925bb8853dae5cb25e7108298e905b55301bbff Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Tue, 10 Dec 2024 16:24:40 +0100 Subject: mfd: da9052: Store result from fault_log Other sub-components (da9052-wdt) could use the result to determine reboot cause. Expose the result by making it part of the da9052 structure. 
Signed-off-by: Marcus Folkesson Link: https://lore.kernel.org/r/20241210-da9052-wdt-v2-1-95a5756e9ac8@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/da9052/da9052.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/da9052/da9052.h b/include/linux/mfd/da9052/da9052.h index 76feb3a7066d..9cb2fc2938ce 100644 --- a/include/linux/mfd/da9052/da9052.h +++ b/include/linux/mfd/da9052/da9052.h @@ -93,6 +93,8 @@ struct da9052 { int chip_irq; + int fault_log; + /* SOC I/O transfer related fixes for DA9052/53 */ int (*fix_io) (struct da9052 *da9052, unsigned char reg); }; -- cgit v1.2.3 From c2b148f3bc94b61e885dc8529d6b6136576bd865 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Wed, 11 Dec 2024 17:27:16 +0100 Subject: mfd: Add support for AAEON UP board FPGA The UP boards implement some features (pin controller, LEDs) through an on-board FPGA. This MFD driver implements the line protocol to communicate with the FPGA through regmap, and registers pin controller and led cells. This commit adds support for UP and UP Squared boards. Based on the work done by Gary Wang . Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20241211-aaeon-up-board-pinctrl-support-v1-1-24719be27631@bootlin.com Signed-off-by: Lee Jones --- include/linux/mfd/upboard-fpga.h | 55 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/linux/mfd/upboard-fpga.h (limited to 'include/linux') diff --git a/include/linux/mfd/upboard-fpga.h b/include/linux/mfd/upboard-fpga.h new file mode 100644 index 000000000000..12231e40f5da --- /dev/null +++ b/include/linux/mfd/upboard-fpga.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * UP Board CPLD/FPGA driver + * + * Copyright (c) AAEON. All rights reserved. 
+ * Copyright (C) 2024 Bootlin + * + * Author: Gary Wang + * Author: Thomas Richard + * + */ + +#ifndef __LINUX_MFD_UPBOARD_FPGA_H +#define __LINUX_MFD_UPBOARD_FPGA_H + +#define UPBOARD_REGISTER_SIZE 16 + +enum upboard_fpgareg { + UPBOARD_REG_PLATFORM_ID = 0x10, + UPBOARD_REG_FIRMWARE_ID = 0x11, + UPBOARD_REG_FUNC_EN0 = 0x20, + UPBOARD_REG_FUNC_EN1 = 0x21, + UPBOARD_REG_GPIO_EN0 = 0x30, + UPBOARD_REG_GPIO_EN1 = 0x31, + UPBOARD_REG_GPIO_EN2 = 0x32, + UPBOARD_REG_GPIO_DIR0 = 0x40, + UPBOARD_REG_GPIO_DIR1 = 0x41, + UPBOARD_REG_GPIO_DIR2 = 0x42, + UPBOARD_REG_MAX, +}; + +enum upboard_fpga_type { + UPBOARD_UP_FPGA, + UPBOARD_UP2_FPGA, +}; + +struct upboard_fpga_data { + enum upboard_fpga_type type; + const struct regmap_config *regmap_config; +}; + +struct upboard_fpga { + struct device *dev; + struct regmap *regmap; + struct gpio_desc *enable_gpio; + struct gpio_desc *reset_gpio; + struct gpio_desc *clear_gpio; + struct gpio_desc *strobe_gpio; + struct gpio_desc *datain_gpio; + struct gpio_desc *dataout_gpio; + unsigned int firmware_version; + const struct upboard_fpga_data *fpga_data; +}; + +#endif /* __LINUX_MFD_UPBOARD_FPGA_H */ -- cgit v1.2.3 From 1c896113f04e34d0036ef506532d2e6cf77dd1e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Sun, 15 Dec 2024 22:13:23 +0100 Subject: turris-omnia-mcu-interface.h: Move macro definitions outside of enums MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the definitions of enumerator related macros outside of the enumerator definitions. 
Suggested-by: Lee Jones Link: https://lore.kernel.org/linux-leds/20241212183357.GK7139@google.com/ Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20241215211323.23364-1-kabel@kernel.org Signed-off-by: Lee Jones --- include/linux/turris-omnia-mcu-interface.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h index 06c94e032c6f..38b45ab00053 100644 --- a/include/linux/turris-omnia-mcu-interface.h +++ b/include/linux/turris-omnia-mcu-interface.h @@ -241,16 +241,18 @@ enum omnia_int_e { enum omnia_cmd_led_mode_e { OMNIA_CMD_LED_MODE_LED_MASK = GENMASK(3, 0), -#define OMNIA_CMD_LED_MODE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_MODE_LED_MASK, _l) OMNIA_CMD_LED_MODE_USER = BIT(4), }; +#define OMNIA_CMD_LED_MODE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_MODE_LED_MASK, _l) + enum omnia_cmd_led_state_e { OMNIA_CMD_LED_STATE_LED_MASK = GENMASK(3, 0), -#define OMNIA_CMD_LED_STATE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_STATE_LED_MASK, _l) OMNIA_CMD_LED_STATE_ON = BIT(4), }; +#define OMNIA_CMD_LED_STATE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_STATE_LED_MASK, _l) + enum omnia_cmd_poweroff_e { OMNIA_CMD_POWER_OFF_POWERON_BUTTON = BIT(0), OMNIA_CMD_POWER_OFF_MAGIC = 0xdead, -- cgit v1.2.3 From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complains about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358
(&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 
entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268..fd4cdb0860a2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; -- cgit v1.2.3 From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time a event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. 
The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But whether it is a pointer to a static string or to something on the heap cannot be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produce a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format.
When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the event's fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc..91b8ffbdfa8c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is
set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. + * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) -- cgit v1.2.3 From 7c4b497fd4032935676b9024396f187fee005739 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 17 Dec 2024 18:41:54 +0100 Subject: clk: davinci: remove platform data struct There are no board files using struct davinci_pll_platform_data anymore. The structure itself is currently used to store a single pointer. Let's remove the struct definition, the header and rework the driver to not require the syscon regmap to be stored in probe(). 
Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20241217174154.84441-1-brgl@bgdev.pl Reviewed-by: David Lechner Signed-off-by: Stephen Boyd --- include/linux/platform_data/clk-davinci-pll.h | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 include/linux/platform_data/clk-davinci-pll.h (limited to 'include/linux') diff --git a/include/linux/platform_data/clk-davinci-pll.h b/include/linux/platform_data/clk-davinci-pll.h deleted file mode 100644 index e55dab1d578b..000000000000 --- a/include/linux/platform_data/clk-davinci-pll.h +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PLL clock driver for TI Davinci SoCs - * - * Copyright (C) 2018 David Lechner - */ - -#ifndef __LINUX_PLATFORM_DATA_CLK_DAVINCI_PLL_H__ -#define __LINUX_PLATFORM_DATA_CLK_DAVINCI_PLL_H__ - -#include - -/** - * davinci_pll_platform_data - * @cfgchip: CFGCHIP syscon regmap - */ -struct davinci_pll_platform_data { - struct regmap *cfgchip; -}; - -#endif /* __LINUX_PLATFORM_DATA_CLK_DAVINCI_PLL_H__ */ -- cgit v1.2.3 From d5af79c05e9382d38b8546dc5362381ce07ba3d1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Dec 2024 16:00:41 -0800 Subject: Documentation: move dev-tools debugging files to process/debugging/ Move gdb and kgdb debugging documentation to the dedicated debugging directory (Documentation/process/debugging/). Adjust the index.rst files to follow the file movement. Adjust files that refer to these moved files to follow the file movement. Update location of kgdb.rst in MAINTAINERS file. Add a link from dev-tools/index to process/debugging/index. Note: translations are not updated. 
Signed-off-by: Randy Dunlap Cc: Sebastian Fricke Cc: Jonathan Corbet Cc: workflows@vger.kernel.org Cc: Jason Wessel Cc: Daniel Thompson Cc: Douglas Anderson Cc: linux-debuggers@vger.kernel.org Cc: kgdb-bugreport@lists.sourceforge.net Cc: Doug Anderson Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: linux-serial@vger.kernel.org Acked-by: Greg Kroah-Hartman Acked-by: Daniel Thompson Reviewed-by: Douglas Anderson Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20241210000041.305477-1-rdunlap@infradead.org --- include/linux/tty_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index dd4b31ce6d5d..d4cdc089f6c3 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -320,7 +320,7 @@ struct serial_struct; * * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` * - * kgdboc support (Documentation/dev-tools/kgdb.rst). This routine is + * kgdboc support (Documentation/process/debugging/kgdb.rst). This routine is * called to initialize the HW for later use by calling @poll_get_char or * @poll_put_char. * -- cgit v1.2.3 From 85b08180df07b9a5984b15ae31d76b904d42a115 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 10:51:29 -0800 Subject: x86/cpu: Expose only stepping min/max interface The x86_match_cpu() infrastructure can match CPU steppings. Since there are only 16 possible steppings, the matching infrastructure goes all out and stores the stepping match as a bitmap. That means it can match any possible steppings in a single list entry. Fun. But it exposes this bitmap to each of the X86_MATCH_*() helpers when none of them really need a bitmap. It makes up for this by exporting a helper (X86_STEPPINGS()) which converts a contiguous stepping range into the bitmap which every single user leverages. 
Instead of a bitmap, have the main helper for this sort of thing (X86_MATCH_VFM_STEPS()) just take a stepping range. This ends up actually being even more compact than before. Leave the helper in place (renamed to __X86_STEPPINGS()) to make it more clear what is going on instead of just having a random GENMASK() in the middle of an already complicated macro. One oddity that I hit was this macro: X86_MATCH_VFM_STEPS(vfm, X86_STEPPING_MIN, max_stepping, issues) It *could* have been converted over to take a min/max stepping value for each entry. But that would have been a bit too verbose and would prevent the one oddball in the list (INTEL_COMETLAKE_L stepping 0) from sticking out. Instead, just have it take a *maximum* stepping and imply that the match is from 0=>max_stepping. This is functional for all the cases now and also retains the nice property of having INTEL_COMETLAKE_L stepping 0 stick out like a sore thumb. skx_cpuids[] is goofy. It uses the stepping match but encodes all possible steppings. Just use a normal, non-stepping match helper. 
Suggested-by: Ingo Molnar Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213185129.65527B2A%40davehans-spike.ostc.intel.com --- include/linux/mod_devicetable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4338b1b4ac44..d67614f7b7f1 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -700,6 +700,8 @@ struct x86_cpu_id { #define X86_FAMILY_ANY 0 #define X86_MODEL_ANY 0 #define X86_STEPPING_ANY 0 +#define X86_STEP_MIN 0 +#define X86_STEP_MAX 0xf #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ /* -- cgit v1.2.3 From 647b7aad19490a7b90c52c883bda7df299457491 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:28 -0800 Subject: iommu: Remove the remove_dev_pasid op The iommu drivers that supports PASID have supported attaching pasid to the blocked_domain, hence remove the remove_dev_pasid op from the iommu_ops. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-8-yi.l.liu@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 318d27841130..38c65e92ecd0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -587,9 +587,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains - * @remove_dev_pasid: Remove any translation configurations of a specific - * pasid, so that any DMA transactions with this pasid - * will be blocked by the hardware. 
* @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind * the @dev, as the set of virtualization resources shared/passed * to user space IOMMU instance. And associate it with a nesting @@ -647,8 +644,6 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); - void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain); struct iommufd_viommu *(*viommu_alloc)( struct device *dev, struct iommu_domain *parent_domain, -- cgit v1.2.3 From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af8972..78a77a4ae0ea 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. 
*/ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) -- cgit v1.2.3 From ebeeee390b6a341770789a50d81e677da9a103d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Dec 2024 13:01:02 +0100 Subject: PM: EM: Move sched domains rebuild function from schedutil to EM Function sugov_eas_rebuild_sd() defined in the schedutil cpufreq governor implements generic functionality that may be useful in other places. In particular, there is a plan to use it in the intel_pstate driver in the future. For this reason, move it from schedutil to the energy model code and rename it to em_rebuild_sched_domains(). This also helps to get rid of some #ifdeffery in schedutil which is a plus. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Christian Loehle --- include/linux/energy_model.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 752e0b297582..78318d49276d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -179,6 +179,7 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int em_dev_update_chip_binning(struct device *dev); int em_update_performance_limits(struct em_perf_domain *pd, unsigned long freq_min_khz, unsigned long freq_max_khz); +void em_rebuild_sched_domains(void); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -404,6 +405,7 @@ int em_update_performance_limits(struct em_perf_domain *pd, { return -EINVAL; } +static inline void em_rebuild_sched_domains(void) {} #endif #endif -- cgit v1.2.3 From b317268368546d6401af788648668f82e3ba1bd3 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 18 Dec 2024 13:09:35 +0900 Subject: PM: wakeup: implement devm_device_init_wakeup() helper Some drivers that enable device wakeup fail to properly disable it during their cleanup, which results in a memory leak. To address this, introduce devm_device_init_wakeup(), a managed variant of device_init_wakeup(dev, true). With this managed helper, wakeup functionality will be automatically disabled when the device is released, ensuring a more reliable cleanup process. The need for this addition arose during a previous discussion [1]. Link: https://lore.kernel.org/linux-rtc/20241212100403.3799667-1-joe@pf.is.s.u-tokyo.ac.jp/ [1] Suggested-by: Alexandre Belloni Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241218040935.1921416-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_wakeup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 222f7530806c..d501c09c60cd 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -240,4 +240,21 @@ static inline int device_init_wakeup(struct device *dev, bool enable) return 0; } +static void device_disable_wakeup(void *dev) +{ + device_init_wakeup(dev, false); +} + +/** + * devm_device_init_wakeup - Resource managed device wakeup initialization. + * @dev: Device to handle. + * + * This function is the devm managed version of device_init_wakeup(dev, true). + */ +static inline int devm_device_init_wakeup(struct device *dev) +{ + device_init_wakeup(dev, true); + return devm_add_action_or_reset(dev, device_disable_wakeup, dev); +} + #endif /* _LINUX_PM_WAKEUP_H */ -- cgit v1.2.3 From 525f6a2c63e0958c25080e108a0cb7f8a3a23719 Mon Sep 17 00:00:00 2001 From: "Sicelo A. Mhlongo" Date: Mon, 25 Nov 2024 17:12:58 +0200 Subject: bq27xxx: add voltage min design for bq27000 and bq27200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bq27x00 gauges have an EEPROM register which contains the value of the voltage that should be considered to be zero battery capacity. Expose this to userspace using the VOLTAGE_MIN_DESIGN property. Tested on Nokia N900 with bq27200. Signed-off-by: Sicelo A. 
Mhlongo Acked-by: Pali Rohár Link: https://lore.kernel.org/r/20241125151321.45440-1-absicsz@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 5180dc9f1706..6b190639b08e 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -61,6 +61,7 @@ struct bq27xxx_device_info { struct bq27xxx_access_methods bus; struct bq27xxx_reg_cache cache; int charge_design_full; + int voltage_min_design; bool removed; unsigned long last_update; union power_supply_propval last_status; -- cgit v1.2.3 From a5874fde3c0884a33ed4145101052318c5e17c74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:16 +0100 Subject: exec: Add a new AT_EXECVE_CHECK flag to execveat(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new AT_EXECVE_CHECK flag to execveat(2) to check if a file would be allowed for execution. The main use case is for script interpreters and dynamic linkers to check execution permission according to the kernel's security policy. Another use case is to add context to access logs e.g., which script (instead of interpreter) accessed a file. As any executable code, scripts could also use this check [1]. This is different from faccessat(2) + X_OK which only checks a subset of access rights (i.e. inode permission and mount options for regular files), but not the full context (e.g. all LSM access checks). The main use case for access(2) is for SUID processes to (partially) check access on behalf of their caller. The main use case for execveat(2) + AT_EXECVE_CHECK is to check if a script execution would be allowed, according to all the different restrictions in place. 
Because the use of AT_EXECVE_CHECK follows the exact kernel semantic as for a real execution, user space gets the same error codes. An interesting point of using execveat(2) instead of openat2(2) is that it decouples the check from the enforcement. Indeed, the security check can be logged (e.g. with audit) without blocking an execution environment not yet ready to enforce a strict security policy. LSMs can control or log execution requests with security_bprm_creds_for_exec(). However, to enforce a consistent and complete access control (e.g. on binary's dependencies) LSMs should restrict file executability, or measure executed files, with security_file_open() by checking file->f_flags & __FMODE_EXEC. Because AT_EXECVE_CHECK is dedicated to user space interpreters, it doesn't make sense for the kernel to parse the checked files, look for interpreters known to the kernel (e.g. ELF, shebang), and return ENOEXEC if the format is unknown. Because of that, security_bprm_check() is never called when AT_EXECVE_CHECK is used. It should be noted that script interpreters cannot directly use execveat(2) (without this new AT_EXECVE_CHECK flag) because this could lead to unexpected behaviors e.g., `python script.sh` could lead to Bash being executed to interpret the script. Unlike the kernel, script interpreters may just interpret the shebang as a simple comment, which should not change for backward compatibility reasons. Because scripts or libraries files might not currently have the executable permission set, or because we might want specific users to be allowed to run arbitrary scripts, the following patch provides a dynamic configuration mechanism with the SECBIT_EXEC_RESTRICT_FILE and SECBIT_EXEC_DENY_INTERACTIVE securebits. 
This is a redesign of the CLIP OS 4's O_MAYEXEC: https://github.com/clipos-archive/src_platform_clip-patches/blob/f5cb330d6b684752e403b4e41b39f7004d88e561/1901_open_mayexec.patch This patch has been used for more than a decade with customized script interpreters. Some examples can be found here: https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC Cc: Al Viro Cc: Christian Brauner Cc: Kees Cook Acked-by: Paul Moore Reviewed-by: Serge Hallyn Reviewed-by: Jeff Xu Tested-by: Jeff Xu Link: https://docs.python.org/3/library/io.html#io.open_code [1] Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-2-mic@digikod.net Signed-off-by: Kees Cook --- include/linux/binfmts.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..8ff0eb3644a1 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,12 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* + * Set by user space to check executability according to the + * caller's environment. + */ + is_check:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3 From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). 
This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b52..85fe4e6b275c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { -- cgit v1.2.3 From a126061c80d5efb4baef4bcf346094139cd81df6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Dec 2024 13:51:21 +0000 Subject: ptr_ring: do not block hard interrupts in ptr_ring_resize_multiple() Jakub added a lockdep_assert_no_hardirq() check in __page_pool_put_page() to increase test coverage. syzbot found a splat caused by hard irq blocking in ptr_ring_resize_multiple() [1] As current users of ptr_ring_resize_multiple() do not require hard irqs being masked, replace it to only block BH. 
Rename helpers to better reflect they are safe against BH only. - ptr_ring_resize_multiple() to ptr_ring_resize_multiple_bh() - skb_array_resize_multiple() to skb_array_resize_multiple_bh() [1] WARNING: CPU: 1 PID: 9150 at net/core/page_pool.c:709 __page_pool_put_page net/core/page_pool.c:709 [inline] WARNING: CPU: 1 PID: 9150 at net/core/page_pool.c:709 page_pool_put_unrefed_netmem+0x157/0xa40 net/core/page_pool.c:780 Modules linked in: CPU: 1 UID: 0 PID: 9150 Comm: syz.1.1052 Not tainted 6.11.0-rc3-syzkaller-00202-gf8669d7b5f5d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 RIP: 0010:__page_pool_put_page net/core/page_pool.c:709 [inline] RIP: 0010:page_pool_put_unrefed_netmem+0x157/0xa40 net/core/page_pool.c:780 Code: 74 0e e8 7c aa fb f7 eb 43 e8 75 aa fb f7 eb 3c 65 8b 1d 38 a8 6a 76 31 ff 89 de e8 a3 ae fb f7 85 db 74 0b e8 5a aa fb f7 90 <0f> 0b 90 eb 1d 65 8b 1d 15 a8 6a 76 31 ff 89 de e8 84 ae fb f7 85 RSP: 0018:ffffc9000bda6b58 EFLAGS: 00010083 RAX: ffffffff8997e523 RBX: 0000000000000000 RCX: 0000000000040000 RDX: ffffc9000fbd0000 RSI: 0000000000001842 RDI: 0000000000001843 RBP: 0000000000000000 R08: ffffffff8997df2c R09: 1ffffd40003a000d R10: dffffc0000000000 R11: fffff940003a000e R12: ffffea0001d00040 R13: ffff88802e8a4000 R14: dffffc0000000000 R15: 00000000ffffffff FS: 00007fb7aaf716c0(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa15a0d4b72 CR3: 00000000561b0000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tun_ptr_free drivers/net/tun.c:617 [inline] __ptr_ring_swap_queue include/linux/ptr_ring.h:571 [inline] ptr_ring_resize_multiple_noprof include/linux/ptr_ring.h:643 [inline] tun_queue_resize drivers/net/tun.c:3694 [inline] tun_device_event+0xaaf/0x1080 drivers/net/tun.c:3714 notifier_call_chain+0x19f/0x3e0 
kernel/notifier.c:93 call_netdevice_notifiers_extack net/core/dev.c:2032 [inline] call_netdevice_notifiers net/core/dev.c:2046 [inline] dev_change_tx_queue_len+0x158/0x2a0 net/core/dev.c:9024 do_setlink+0xff6/0x41f0 net/core/rtnetlink.c:2923 rtnl_setlink+0x40d/0x5a0 net/core/rtnetlink.c:3201 rtnetlink_rcv_msg+0x73f/0xcf0 net/core/rtnetlink.c:6647 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2550 Fixes: ff4e538c8c3e ("page_pool: add a lockdep check for recycling in hardirq") Reported-by: syzbot+f56a5c5eac2b28439810@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/671e10df.050a0220.2b8c0f.01cf.GAE@google.com/T/ Signed-off-by: Eric Dumazet Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/20241217135121.326370-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 21 ++++++++++----------- include/linux/skb_array.h | 17 +++++++++-------- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index fd037c127bb0..551329220e4f 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -615,15 +615,14 @@ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. - * In particular if you consume ring in interrupt or BH context, you must - * disable interrupts/BH when doing so. + * In particular if you consume ring in BH context, you must + * disable BH when doing so. 
*/ -static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, - unsigned int nrings, - int size, - gfp_t gfp, void (*destroy)(void *)) +static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, + unsigned int nrings, + int size, gfp_t gfp, + void (*destroy)(void *)) { - unsigned long flags; void ***queues; int i; @@ -638,12 +637,12 @@ static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, } for (i = 0; i < nrings; ++i) { - spin_lock_irqsave(&(rings[i])->consumer_lock, flags); + spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); - spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags); + spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) @@ -662,8 +661,8 @@ nomem: noqueues: return -ENOMEM; } -#define ptr_ring_resize_multiple(...) \ - alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__)) +#define ptr_ring_resize_multiple_bh(...) 
\ + alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 926496c9cc9c..bf178238a308 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -199,17 +199,18 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } -static inline int skb_array_resize_multiple_noprof(struct skb_array **rings, - int nrings, unsigned int size, - gfp_t gfp) +static inline int skb_array_resize_multiple_bh_noprof(struct skb_array **rings, + int nrings, + unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); - return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings, - nrings, size, gfp, - __skb_array_destroy_skb); + return ptr_ring_resize_multiple_bh_noprof((struct ptr_ring **)rings, + nrings, size, gfp, + __skb_array_destroy_skb); } -#define skb_array_resize_multiple(...) \ - alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__)) +#define skb_array_resize_multiple_bh(...) \ + alloc_hooks(skb_array_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void skb_array_cleanup(struct skb_array *a) { -- cgit v1.2.3 From 31c5629920b82ddf66059f20f79be2bc00c4197b Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 10 Dec 2024 01:06:04 +0100 Subject: mm: add RCU annotation to pte_offset_map(_lock) RCU lock is taken by ___pte_offset_map() unless it returns NULL. Add this information to its inline callers to avoid sparse warning about context imbalance in pte_unmap(). 
Link: https://lkml.kernel.org/r/20241210000604.700710-1-oss@malat.biz Signed-off-by: Petr Malat Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..3a6ee6a05aa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3010,7 +3010,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -3023,7 +3031,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } -- cgit v1.2.3 From 5c0541e11c16bd2f162e23a22d07c09d58017e5a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:25 -0500 Subject: mm: introduce cpu_icache_is_aliasing() across all architectures In commit eacd0e950dc2 ("ARC: [mm] Lazy D-cache flush (non aliasing VIPT)"), arc adds the need to flush dcache to make icache see the code page change. This also requires special handling for clear_user_(high)page(). Introduce cpu_icache_is_aliasing() to make MM code query special clear_user_(high)page() easier. This will be used by the following commit. 
Link: https://lkml.kernel.org/r/20241209182326.2955963-1-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Suggested-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Geert Uytterhoeven Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/cacheinfo.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..7ad736538649 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING #define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() #else #include + +#ifndef cpu_icache_is_aliasing +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() +#endif + #endif #endif /* _LINUX_CACHEINFO_H */ -- cgit v1.2.3 From c51a4f11e6d8246590b5e64908c1ed84b33e8ba2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:26 -0500 Subject: mm: use clear_user_(high)page() for arch with special user folio handling Some architectures have special handling after clearing user folios: architectures, which set cpu_dcache_is_aliasing() to true, require flushing dcache; arc, which sets cpu_icache_is_aliasing() to true, changes folio->flags to make icache coherent to dcache. So __GFP_ZERO using only clear_page() is not enough to zero user folios and clear_user_(high)page() must be used. Otherwise, user data will be corrupted. Fix it by always clearing user folios with clear_user_(high)page() when cpu_dcache_is_aliasing() is true or cpu_icache_is_aliasing() is true. 
Rename alloc_zeroed() to user_alloc_needs_zeroing() and invert the logic to clarify its intent. Link: https://lkml.kernel.org/r/20241209182326.2955963-2-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV1hRp_NtR5YnJo=HsfgKQeH91J537Gh4gKk3PFZhSkbA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++++++- include/linux/mm.h | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6e452bd8e7e3..5c6bea81a90e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,7 +224,13 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); + struct folio *folio; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); + if (folio && user_alloc_needs_zeroing()) + clear_user_highpage(&folio->page, vaddr); + + return folio; } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a6ee6a05aa0..338a76ce9083 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4184,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not.
+ */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return true to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); -- cgit v1.2.3 From 42b2eb69835b0fda797f70eb5b4fc213dbe3a7ea Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Dec 2024 18:33:51 +0000 Subject: mm: convert partially_mapped set/clear operations to be atomic Other page flags in the 2nd page, like PG_hwpoison and PG_anon_exclusive can get modified concurrently. Changes to other page flags might be lost if they are happening at the same time as non-atomic partially_mapped operations. Hence, make partially_mapped operations atomic.
Link: https://lkml.kernel.org/r/20241212183351.1345389-1-usamaarif642@gmail.com Fixes: 8422acdc97ed ("mm: introduce a pageflag for partially mapped folios") Reported-by: David Hildenbrand Link: https://lore.kernel.org/all/e53b04ad-1827-43a2-a1ab-864c7efecf6e@redhat.com/ Signed-off-by: Usama Arif Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Barry Song Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cf46ac720802..691506bdf2c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -/* - * PG_partially_mapped is protected by deferred_split split_queue_lock, - * so its safe to use non-atomic set/clear. 
- */ -__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) -FOLIO_TEST_FLAG_FALSE(partially_mapped) -__FOLIO_SET_FLAG_NOOP(partially_mapped) -__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) +FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) -- cgit v1.2.3 From 30c2de0a267c04046d89e678cc0067a9cfb455df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:31:26 -0800 Subject: mm/vmstat: fix a W=1 clang compiler warning Fix the following clang compiler warning that is reported if the kernel is built with W=1: ./include/linux/vmstat.h:518:36: error: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Werror,-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ Link: https://lkml.kernel.org/r/20241212213126.1269116-1-bvanassche@acm.org Fixes: 9d7ea9a297e6 ("mm/vmstat: add helpers to get vmstat item names for each enum type") Signed-off-by: Bart Van Assche Cc: Konstantin Khlebnikov Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2761bf8ff32..9f3a04345b86 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) -- cgit v1.2.3 From 640a603943a7659340c10044c0a1c98ae4e13189 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 13 Dec 2024 
09:33:32 +0800 Subject: mm/codetag: clear tags before swap When CONFIG_MEM_ALLOC_PROFILING_DEBUG is set, kernel WARN would be triggered when calling __alloc_tag_ref_set() during swap: alloc_tag was not cleared (got tag for mm/filemap.c:1951) WARNING: CPU: 0 PID: 816 at ./include/linux/alloc_tag.h... Clear code tags before swap can fix the warning. And this patch also fix a potential invalid address dereference in alloc_tag_add_check() when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set and ref->ct is CODETAG_EMPTY, which is defined as ((void *)1). Link: https://lkml.kernel.org/r/20241213013332.89910-1-00107082@163.com Fixes: 51f43d5d82ed ("mm/codetag: swap tags when migrate pages") Signed-off-by: David Wang <00107082@163.com> Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412112227.df61ebb-lkp@intel.com Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 7c0786bdf9af..cba024bf2db3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,7 +135,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) { - WARN_ONCE(ref && ref->ct, + WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref), "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); -- cgit v1.2.3 From 60da7445a142bd15e67f3cda915497781c3f781f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:23 -0800 Subject: alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG It was recently noticed that set_codetag_empty() might be used not only to mark NULL alloctag references as empty to avoid warnings but also to reset valid tags 
(in clear_page_tag_ref()). Since set_codetag_empty() is defined as NOOP for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n, such use of set_codetag_empty() leads to subtle bugs. Fix set_codetag_empty() for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n to reset the tag reference. Link: https://lkml.kernel.org/r/20241130001423.1114965-2-surenb@google.com Fixes: a8fc28dad6d5 ("alloc_tag: introduce clear_page_tag_ref() helper function") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index cba024bf2db3..0bbbe537c5f9 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -static inline void set_codetag_empty(union codetag_ref *ref) {} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = NULL; +} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -- cgit v1.2.3 From 08a7ead3242f70f4415232800104ae458fd96d17 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:49 -0800 Subject: mmc: crypto: add mmc_from_crypto_profile() Add a helper function that encapsulates a container_of expression. For now there is just one user but soon there will be more. 
Signed-off-by: Eric Biggers Message-ID: <20241213041958.202565-7-ebiggers@kernel.org> Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index f166d6611ddb..68f09a955a90 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -590,6 +590,14 @@ static inline struct mmc_host *mmc_from_priv(void *priv) return container_of(priv, struct mmc_host, private); } +#ifdef CONFIG_MMC_CRYPTO +static inline struct mmc_host * +mmc_from_crypto_profile(struct blk_crypto_profile *profile) +{ + return container_of(profile, struct mmc_host, crypto_profile); +} +#endif + #define mmc_host_is_spi(host) ((host)->caps & MMC_CAP_SPI) #define mmc_dev(x) ((x)->parent) -- cgit v1.2.3 From 7d3707bbbbb1dc8b1802886e9ff0506b7d8b323b Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:18 +0000 Subject: ACPI: header: implement acpi_device_handle when !ACPI Provide an implementation of acpi_device_handle that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-4-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6adcd1b92b20..4e495b29c640 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -854,6 +854,11 @@ static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) return NULL; } +static inline acpi_handle acpi_device_handle(struct acpi_device *adev) +{ + return NULL; +} + static inline bool has_acpi_companion(struct device *dev) { return false; -- cgit v1.2.3 From b55498ff14bd14860d48dc8d2a0b6889b218c408 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Dec 2024 22:31:18 +0100 Subject: net: phy: add phy_disable_eee If a MAC driver doesn't support EEE, then the PHY shouldn't advertise it. Add phy_disable_eee() for this purpose. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/fd51738c-dcd6-4d61-b8c5-faa6ac0f1026@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e597a32cc787..5bc71d59910c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,6 +2071,7 @@ void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_support_eee(struct phy_device *phydev); +void phy_disable_eee(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); -- cgit v1.2.3 From 68ddc8ae17685a8c4ac78260bde8fe4a79511aef Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:30 +0100 Subject: xdp: add generic xdp_buff_add_frag() The code piece which would attach a frag to &xdp_buff is almost identical across the drivers supporting XDP multi-buffer on 
Rx. Make it a generic elegant "oneliner". Also, I see lots of drivers calculating frags_truesize as `xdp->frame_sz * nr_frags`. I can't say this is fully correct, since frags might be backed by chunks of different sizes, especially with stuff like the header split. Even page_pool_alloc() can give you two different truesizes on two subsequent requests to allocate the same buffer size. Add a field to &skb_shared_info (unionized as there's no free slot currently on x86_64) to track the "true" truesize. It can be used later when updating the skb. Reviewed-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2509cd0b930..bb2b751d274a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -608,11 +608,19 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; - unsigned int xdp_frags_size; - /* Intermediate layers must ensure that destructor_arg - * remains valid until skb destructor */ - void * destructor_arg; + union { + struct { + u32 xdp_frags_size; + u32 xdp_frags_truesize; + }; + + /* + * Intermediate layers must ensure that destructor_arg + * remains valid until skb destructor. + */ + void *destructor_arg; + }; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; -- cgit v1.2.3 From a430d99e349026d53e2557b7b22bd2ebd61fe12a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Dec 2024 06:32:19 +0000 Subject: sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat In /proc/schedstat, lb_hot_gained reports the number of hot tasks pulled during load balance. This value is incremented in can_migrate_task() if the task is migratable and hot.
After incrementing the value, load balancer can still decide not to migrate this task leading to wrong accounting. Fix this by incrementing stats when hot tasks are detached. This issue only exists in detach_tasks() where we can decide to not migrate hot task even if it is migratable. However, in detach_one_task(), we migrate it unconditionally. [Swapnil: Handled the case where nr_failed_migrations_hot was not accounted properly and wrote commit log] Fixes: d31980846f96 ("sched: Move up affinity check to mitigate useless redoing overhead") Signed-off-by: Peter Zijlstra (Intel) Reported-by: "Gautham R. Shenoy" Not-yet-signed-off-by: Peter Zijlstra Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-2-swapnil.sapkal@amd.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b5916be49f62..8c6a2ed9f80e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -937,6 +937,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; -- cgit v1.2.3 From 3b2a793ea70fd14136b442df31e53935e8095034 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:21 +0000 Subject: sched: Report the different kinds of imbalances in /proc/schedstat In /proc/schedstat, lb_imbalance reports the sum of imbalances discovered in sched domains with each call to sched_balance_rq(), which is not very useful because lb_imbalance does not mention whether the imbalance is due to load, utilization, nr_tasks or misfit_tasks. Remove this field from /proc/schedstat. Currently there is no field in /proc/schedstat to report different types of imbalances. 
Introduce new fields in /proc/schedstat to report the total imbalances in load, utilization, nr_tasks or misfit_tasks. Added fields to /proc/schedstat: - lb_imbalance_load: Total imbalance due to load. - lb_imbalance_util: Total imbalance due to utilization. - lb_imbalance_task: Total imbalance due to number of tasks. - lb_imbalance_misfit: Total imbalance due to misfit tasks. Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20241220063224.17767-4-swapnil.sapkal@amd.com --- include/linux/sched/topology.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4237daa5ac7a..76a662e1ec24 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -114,7 +114,10 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; -- cgit v1.2.3 From 1c055a0f5d3bafaca5d218bbb3e4e63d6307be45 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:22 +0000 Subject: sched: Move sched domain name out of CONFIG_SCHED_DEBUG /proc/schedstat file shows cpu and sched domain level scheduler statistics. It does not show the domain name; instead it shows the domain level. It will be very useful for tools like `perf sched stats`[1] to aggregate domain level stats if domain names are shown in /proc/schedstat. But sched domain name is guarded by CONFIG_SCHED_DEBUG.
As per the discussion[2], move sched domain name out of CONFIG_SCHED_DEBUG. [1] https://lore.kernel.org/lkml/20241122084452.1064968-1-swapnil.sapkal@amd.com/ [2] https://lore.kernel.org/lkml/fcefeb4d-3acb-462d-9c9b-3df8d927e522@amd.com/ Suggested-by: "Gautham R. Shenoy" Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-5-swapnil.sapkal@amd.com --- include/linux/sched/topology.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 76a662e1ec24..7f3dbafe1817 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -143,9 +143,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -201,18 +199,12 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ -- cgit v1.2.3 From abfdccd6af2b071951633e57d6322c46a1ea791f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 16 Dec 2024 20:07:35 -0800 Subject: sched/wake_q: Add helper to call wake_up_q after unlock with preemption disabled A common pattern seen when wake_qs are used to defer a wakeup until after a lock is released is something like: preempt_disable(); raw_spin_unlock(lock); wake_up_q(wake_q); preempt_enable(); So create some raw_spin_unlock*_wake() helper functions to clean this up. 
Applies on top of the fix I submitted here: https://lore.kernel.org/lkml/20241212222138.2400498-1-jstultz@google.com/ NOTE: I recognise that the unlock()/unlock_irq()/unlock_irqrestore() variants create their own duplication, and we could use a macro to generate the similar functions, but I often dislike how those generation macros make finding the actual implementation harder, so I left the three functions as is. If folks would prefer otherwise, let me know and I'll switch it. Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241217040803.243420-1-jstultz@google.com --- include/linux/sched/wake_q.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 06cd8fb2f409..0f28b4623ad4 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -63,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); +/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */ +static inline +void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irq(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags, + struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irqrestore(lock, flags); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} #endif /* _LINUX_SCHED_WAKE_Q_H */ -- cgit
v1.2.3 From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the functions sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths, we use "msg->skb" to test whether the sk_msg is backed by an skb. If it's not, we shall do the memory accounting explicitly. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else {
sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) -- cgit v1.2.3 From 208fff3f567e2a3c3e7e4788845e90245c3891b4 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Wed, 18 Dec 2024 14:12:37 +0100 Subject: PCI: Add PCI_VDEVICE_SUB helper macro PCI_VDEVICE_SUB generates the pci_device_id struct layout for the specific PCI device/subdevice. Private data may follow the output. Reviewed-by: Przemek Kitszel Signed-off-by: Piotr Kwapulinski Acked-by: Bjorn Helgaas Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- include/linux/pci.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..414ee5fff66b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1046,6 +1046,20 @@ struct pci_driver { .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, 0, 0 +/** + * PCI_VDEVICE_SUB - describe a specific PCI device/subdevice in a short form + * @vend: the vendor name + * @dev: the 16 bit PCI Device ID + * @subvend: the 16 bit PCI Subvendor ID + * @subdev: the 16 bit PCI Subdevice ID + * + * Generate the pci_device_id struct layout for the specific PCI + * device/subdevice. Private data may follow the output. 
+ */ +#define PCI_VDEVICE_SUB(vend, dev, subvend, subdev) \ + .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ + .subvendor = (subvend), .subdevice = (subdev), 0, 0 + /** * PCI_DEVICE_DATA - macro used to describe a specific PCI device in very short form * @vend: the vendor name (without PCI_VENDOR_ID_ prefix) -- cgit v1.2.3 From 4acb665cf4f3e5436844f17ece0a8a55ce688c7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:08 +0000 Subject: netfs: Work around recursion by abandoning retry if nothing read syzkaller reported recursion with a loop of three calls (netfs_rreq_assess, netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack during an unbuffered or direct I/O read. There are a number of issues: (1) There is no limit on the number of retries. (2) A subrequest is supposed to be abandoned if it does not transfer anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all circumstances. (3) The actual root cause, which is this: if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_terminated(rreq, ...); When we do a retry, we bump the rreq->nr_outstanding counter to prevent the final cleanup phase running before we've finished dispatching the retries. The problem is if we hit 0, we have to do the cleanup phase - but we're in the cleanup phase and end up repeating the retry cycle, hence the recursion. Work around the problem by limiting the number of retries. This is based on Lizhi Xu's patch[1], and makes the following changes: (1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make the filesystem set it if it managed to read or write at least one byte of data. Clear this bit before issuing a subrequest. (2) Add a ->retry_count member to the subrequest and increment it any time we do a retry. (3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with ->retry_count. If the latter is non-zero, we're doing a retry. 
(4) Abandon a subrequest if retry_count is non-zero and we made no progress. (5) Use ->retry_count in both the write-side and the read-side. [?] Question: Should I set a hard limit on retry_count in both read and write? Say it hits 50, we always abandon it. The problem is that these changes only mitigate the issue. As long as it made at least one byte of progress, the recursion is still an issue. This patch mitigates the problem, but does not fix the underlying cause. I have patches that will do that, but it's an intrusive fix that's currently pending for the next merge window. The oops generated by KASAN looks something like: BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000) Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI ... RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686 ... mark_usage kernel/locking/lockdep.c:4646 [inline] __lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825 local_lock_acquire include/linux/local_lock_internal.h:29 [inline] ___slab_alloc+0x123/0x1880 mm/slub.c:3695 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141 radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253 idr_get_free+0x528/0xa40 lib/radix-tree.c:1506 idr_alloc_u32+0x191/0x2f0 lib/idr.c:46 idr_alloc+0xc1/0x130 lib/idr.c:87 p9_tag_alloc+0x394/0x870 net/9p/client.c:321 p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644 p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793 p9_client_read_once+0x443/0x820 net/9p/client.c:1570 p9_client_read+0x13f/0x1b0 net/9p/client.c:1534 v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74 netfs_retry_read_subrequests fs/netfs/read_retry.c:60 [inline] netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 ... netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline] netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline] netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221 netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256 v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361 do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832 vfs_readv+0x4cf/0x890 fs/read_write.c:1025 do_preadv fs/read_write.c:1142 [inline] __do_sys_preadv fs/read_write.c:1192 [inline] __se_sys_preadv fs/read_write.c:1187 [inline] __x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6 Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com Tested-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Suggested-by: Lizhi Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: 
linux-fsdevel@vger.kernel.org Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/netfs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6c..4083d77e3f39 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. 
ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { -- cgit v1.2.3 From d4e338de17cb6532bf805fae00db8b41e914009b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:34:45 +0000 Subject: netfs: Fix is-caching check in read-retry netfs: Fix is-caching check in read-retry The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine if there might be failed reads from the cache that need turning into reads from the server, with the intention of skipping the complicated part if it can. The code that set the flag, however, got lost during the read-side rewrite. Fix the check to see if the cache_resources are valid instead. The flag can then be removed. 
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4083d77e3f39..ecdd5ced16a8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ -- cgit v1.2.3 From eb1181594417dafad0f75808ead71f6d5170c1ea Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:53 +0000 Subject: netfs: Use a folio_queue allocation and free functions Provide and use folio_queue allocation and free functions to combine the allocation, initialisation and stat (un)accounting steps that are repeated in several places. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-4-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ecdd5ced16a8..c69e0f02c30f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,6 +21,7 @@ enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; +struct folio_queue; /** * folio_start_private_2 - Start an fscache write on a folio. 
[DEPRECATED] @@ -453,6 +454,10 @@ void netfs_end_io_write(struct inode *inode); int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); +/* Miscellaneous APIs. */ +struct folio_queue *netfs_folioq_alloc(gfp_t gfp); +void netfs_folioq_free(struct folio_queue *folioq); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query -- cgit v1.2.3 From aabcabf2746062253565b33aa3f8d25999a5ac01 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:54 +0000 Subject: netfs: Add a tracepoint to log the lifespan of folio_queue structs Add a tracepoint to log the lifespan of folio_queue structs. For tracing illustrative purposes, folio_queues are tagged with the debug ID of whatever they're related to (typically a netfs_io_request) and a debug ID of their own. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-5-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 12 +++++++++--- include/linux/netfs.h | 6 ++++-- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 3abe614ef5f0..4d3f8074c137 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -37,16 +37,20 @@ struct folio_queue { #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif + unsigned int rreq_id; + unsigned int debug_id; }; /** * folioq_init - Initialise a folio queue segment * @folioq: The segment to initialise + * @rreq_id: The request identifier to use in tracelines. * - * Initialise a folio queue segment. Note that the folio pointers are - * left uninitialised. + * Initialise a folio queue segment and set an identifier to be used in traces. + * + * Note that the folio pointers are left uninitialised. 
*/ -static inline void folioq_init(struct folio_queue *folioq) +static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) { folio_batch_init(&folioq->vec); folioq->next = NULL; @@ -54,6 +58,8 @@ static inline void folioq_init(struct folio_queue *folioq) folioq->marks = 0; folioq->marks2 = 0; folioq->marks3 = 0; + folioq->rreq_id = rreq_id; + folioq->debug_id = 0; } /** diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c69e0f02c30f..5b2f427f8e3e 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -455,8 +455,10 @@ int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); /* Miscellaneous APIs. */ -struct folio_queue *netfs_folioq_alloc(gfp_t gfp); -void netfs_folioq_free(struct folio_queue *folioq); +struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, + unsigned int trace /*enum netfs_folioq_trace*/); +void netfs_folioq_free(struct folio_queue *folioq, + unsigned int trace /*enum netfs_trace_folioq*/); /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query -- cgit v1.2.3 From 06fa229ceb36898e68022b5654c017d2c6582d7d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:55 +0000 Subject: netfs: Abstract out a rolling folio buffer implementation A rolling buffer is a series of folios held in a list of folio_queues. New folios and folio_queue structs may be inserted at the head simultaneously with spent ones being removed from the tail without the need for locking. The rolling buffer includes an iov_iter and it has to be careful managing this as the list of folio_queues is extended such that an oops isn't incurred because the iterator was pointing to the end of a folio_queue segment that got appended to and then removed. We need to use the mechanism twice, once for read and once for write, and, in future patches, we will use a second rolling buffer to handle bounce buffering for content encryption.
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-6-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 10 +++---- include/linux/rolling_buffer.h | 61 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 include/linux/rolling_buffer.h (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5b2f427f8e3e..bd922f0936e3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -18,6 +18,7 @@ #include #include #include +#include enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; @@ -238,10 +239,9 @@ struct netfs_io_request { struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ - struct folio_queue *buffer; /* Head of I/O buffer */ - struct folio_queue *buffer_tail; /* Tail of I/O buffer */ - struct iov_iter iter; /* Unencrypted-side iterator */ - struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ + struct rolling_buffer buffer; /* Unencrypted buffer */ +#define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 +#define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ @@ -259,8 +259,6 @@ struct netfs_io_request { long error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ - u8 buffer_head_slot; /* First slot in ->buffer */ - u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer 
folio cursor */ diff --git a/include/linux/rolling_buffer.h b/include/linux/rolling_buffer.h new file mode 100644 index 000000000000..ac15b1ffdd83 --- /dev/null +++ b/include/linux/rolling_buffer.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Rolling buffer of folios + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _ROLLING_BUFFER_H +#define _ROLLING_BUFFER_H + +#include +#include + +/* + * Rolling buffer. Whilst the buffer is live and in use, folios and folio + * queue segments can be added to one end by one thread and removed from the + * other end by another thread. The buffer isn't allowed to be empty; it must + * always have at least one folio_queue in it so that neither side has to + * modify both queue pointers. + * + * The iterator in the buffer is extended as buffers are inserted. It can be + * snapshotted to use a segment of the buffer. + */ +struct rolling_buffer { + struct folio_queue *head; /* Producer's insertion point */ + struct folio_queue *tail; /* Consumer's removal point */ + struct iov_iter iter; /* Iterator tracking what's left in the buffer */ + u8 next_head_slot; /* Next slot in ->head */ + u8 first_tail_slot; /* First slot in ->tail */ +}; + +/* + * Snapshot of a rolling buffer. + */ +struct rolling_buffer_snapshot { + struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ + unsigned char curr_slot; /* Folio currently being read */ + unsigned char curr_order; /* Order of folio */ +}; + +/* Marks to store per-folio in the internal folio_queue structs. 
*/ +#define ROLLBUF_MARK_1 BIT(0) +#define ROLLBUF_MARK_2 BIT(1) + +int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id, + unsigned int direction); +int rolling_buffer_make_space(struct rolling_buffer *roll); +ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll, + struct readahead_control *ractl, + struct folio_batch *put_batch); +ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio, + unsigned int flags); +struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll); +void rolling_buffer_clear(struct rolling_buffer *roll); + +static inline void rolling_buffer_advance(struct rolling_buffer *roll, size_t amount) +{ + iov_iter_advance(&roll->iter, amount); +} + +#endif /* _ROLLING_BUFFER_H */ -- cgit v1.2.3 From 360157829ee3dba848ffa817792d9a07969e0a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:58 +0000 Subject: netfs: Drop the error arg from netfs_read_subreq_terminated() Drop the error argument from netfs_read_subreq_terminated() in favour of passing the value in subreq->error. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-9-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index bd922f0936e3..a882921460a9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -427,10 +427,9 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. 
*/ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, - bool was_async); -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, - int error, bool was_async); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, -- cgit v1.2.3 From 31fc366aa7aa911ebc0744e99c82caee4e97315a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:59 +0000 Subject: netfs: Drop the was_async arg from netfs_read_subreq_terminated() Drop the was_async argument from netfs_read_subreq_terminated(). Almost every caller is in process context and passes false. Some filesystems delegate the call to a workqueue to avoid doing the work in their network message queue parsing thread. The only exception is netfs_cache_read_terminated() which handles completion in the cache - which is usually a callback from the backing filesystem in softirq context, though it can be from process context if an error occurred. In this case, delegate to a workqueue.
Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wiVC5Cgyz6QKXFu6fTaA6h4CjexDR-OV9kL6Vo5x9v8=A@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-10-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index a882921460a9..374e54beacbe 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -427,8 +427,8 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, bool was_async); -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); -- cgit v1.2.3 From e61bfaad8fd86ac84eac633e0bbaac47a5dfd358 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:08 +0000 Subject: netfs: Add functions to build/clean a buffer in a folio_queue Add two netfslib functions to build up or clean up a buffer in a folio_queue. The first, netfs_alloc_folioq_buffer() will add folios to a buffer, extending up at least to the given size. If it can, it will add multipage folios. The folios optionally have the mapping set and will have the index set according to the distance from the front of the folio queue. The second function will free up a folio queue and put any folios in the queue that have the first mark set.
The netfs_folio tracepoint is also altered to cope with folios that have a NULL mapping, and the folios being added/put will have trace lines emitted and will be accounted in the stats. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-19-dhowells@redhat.com cc: Jeff Layton cc: Marc Dionne cc: netfs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 374e54beacbe..dd737344cff3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -457,6 +457,12 @@ struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, void netfs_folioq_free(struct folio_queue *folioq, unsigned int trace /*enum netfs_trace_folioq*/); +/* Buffer wrangling helpers API. */ +int netfs_alloc_folioq_buffer(struct address_space *mapping, + struct folio_queue **_buffer, + size_t *_cur_size, ssize_t size, gfp_t gfp); +void netfs_free_folioq_buffer(struct folio_queue *fq); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query -- cgit v1.2.3 From 49866ce7ea8d41a3dc198f519cc9caa2d6be1891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:09 +0000 Subject: netfs: Add support for caching single monolithic objects such as AFS dirs Add support for caching the content of a file that contains a single monolithic object that must be read/written with a single I/O operation, such as an AFS directory. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-20-dhowells@redhat.com cc: Jeff Layton cc: Marc Dionne cc: netfs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index dd737344cff3..27e62f7d2940 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,6 +73,7 @@ struct netfs_inode { #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ +#define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */ }; /* @@ -210,9 +211,11 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ + NETFS_READ_SINGLE, /* This read should be treated as a single object */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_WRITE, /* This is a direct I/O write */ @@ -408,6 +411,13 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * struct netfs_group *netfs_group); ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); +/* Single, monolithic object read/write API. 
*/ +void netfs_single_mark_inode_dirty(struct inode *inode); +ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter); +int netfs_writeback_single(struct address_space *mapping, + struct writeback_control *wbc, + struct iov_iter *iter); + /* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); -- cgit v1.2.3 From e2d46f2ec332533816417b60933954173f602121 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:17 +0000 Subject: netfs: Change the read result collector to only use one work item Change the way netfslib collects read results to do all the collection for a particular read request using a single work item that walks along the subrequest queue as subrequests make progress or complete, unlocking folios progressively rather than doing the unlock in parallel as parallel requests come in. The code is remodelled to be more like the write-side code, though only using a single stream. This makes it more directly comparable and thus easier to duplicate fixes between the two sides. This has a number of advantages: (1) It's simpler. There doesn't need to be a complex donation mechanism to handle mismatches between the size and alignment of subrequests and folios. The collector unlocks folios as the subrequests covering each complete. (2) It should cause less scheduler overhead as there's a single work item in play unlocking pages in parallel when a read gets split up into a lot of subrequests instead of one per subrequest. Whilst the parallelism is nice in theory, in practice, the vast majority of loads are sequential reads of the whole file, so committing a bunch of threads to unlocking folios out of order doesn't help in those cases. (3) It should make it easier to implement content decryption.
A folio cannot be decrypted until all the requests that contribute to it have completed - and, again, most loads are sequential and so, most of the time, we want to begin decryption sequentially (though it's great if the decryption can happen in parallel). There is a disadvantage in that we're losing the ability to decrypt and unlock things on an as-things-arrive basis which may affect some applications. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-28-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 27e62f7d2940..071d05d81d38 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -181,9 +181,6 @@ struct netfs_io_subrequest { unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ - size_t consumed; /* Amount of read data consumed */ - size_t prev_donated; /* Amount of data donated from previous subreq */ - size_t next_donated; /* Amount of data donated from next subreq */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ @@ -191,9 +188,6 @@ struct netfs_io_subrequest { u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ - unsigned char curr_folioq_slot; /* Folio currently being read */ - unsigned char curr_folio_order; /* Order of folio */ - struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 
/* Set if the rest of the read should be cleared */ @@ -236,15 +230,16 @@ struct netfs_io_request { struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; + struct netfs_io_request *copy_to_cache; /* Request to write just-read data to the cache */ struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ - struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ struct rolling_buffer buffer; /* Unencrypted buffer */ #define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 #define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 + wait_queue_head_t waitq; /* Processor waiter */ void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ @@ -255,7 +250,6 @@ struct netfs_io_request { atomic_t subreq_counter; /* Next subreq->debug_index */ unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ - atomic_t nr_outstanding; /* Number of ops in progress */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ @@ -267,14 +261,17 @@ struct netfs_io_request { atomic64_t issued_to; /* Write issuer folio cursor */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ + unsigned long long abandon_to; /* Position to abandon folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ - size_t prev_donated; /* Fallback for 
subreq->prev_donated */ + unsigned char front_folio_order; /* Order (size) of front folio */ refcount_t ref; unsigned long flags; +#define NETFS_RREQ_OFFLOAD_COLLECTION 0 /* Offload collection to workqueue */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ +#define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ @@ -439,7 +436,6 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr /* (Sub)request management API. */ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); -void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, -- cgit v1.2.3 From 394033dcc976d1f83f0fc6e7d4dd041ce376d245 Mon Sep 17 00:00:00 2001 From: Integral Date: Wed, 23 Oct 2024 18:00:33 +0800 Subject: bcachefs: add support for true/false & yes/no in bool-type options Here is the patch which uses existing constant table: Currently, when using bcachefs-tools to set options, bool-type options can only accept 1 or 0. Add support for accepting true/false and yes/no for these options. 
Signed-off-by: Integral Signed-off-by: Kent Overstreet Acked-by: David Howells --- include/linux/fs_parser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 3cef566088fc..53e566efd5fd 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -84,6 +84,8 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); +extern const struct constant_table bool_names[]; + #ifdef CONFIG_VALIDATE_FS_PARSER extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special); -- cgit v1.2.3 From dec6c0aac4fc5e4266cea18e9e6e47eecb2333e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:16:02 -0500 Subject: lib min_heap: Switch to size_t size_t is the correct type for a count of objects that can fit in memory: this also means heaps now have the same memory layout as darrays (fs/bcachefs/darray.h), and darrays can be used as heaps. Cc: Kuan-Wei Chiu Cc: Ian Rogers Cc: Andrew Morton Cc: Coly Li Cc: Peter Zijlstra Signed-off-by: Kent Overstreet --- include/linux/min_heap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index e781727c8916..6325f6ffb895 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -15,8 +15,8 @@ */ #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ struct _name { \ - int nr; \ - int size; \ + size_t nr; \ + size_t size; \ _type *data; \ _type preallocated[_nr]; \ } -- cgit v1.2.3 From 4e39aded665f9c8966d0fd487d37fa3f30b94ba4 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Tue, 17 Dec 2024 01:38:59 +0000 Subject: video: hdmi: Remove unused hdmi_infoframe_check hdmi_infoframe_check() has been unused since it was added in commit c5e69ab35c0d ("video/hdmi: Constify infoframe passed to the pack functions") Remove it. Note that the individual check functions for each type are actually used, so they're staying. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Helge Deller --- include/linux/hdmi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 455f855bc084..96bda41d9148 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -445,7 +445,6 @@ ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, size_t size); ssize_t hdmi_infoframe_pack_only(const union hdmi_infoframe *frame, void *buffer, size_t size); -int hdmi_infoframe_check(union hdmi_infoframe *frame); int hdmi_infoframe_unpack(union hdmi_infoframe *frame, const void *buffer, size_t size); void hdmi_infoframe_log(const char *level, struct device *dev, -- cgit v1.2.3 From ef4144ac2dec35d47de666f35cd873eb1be4172e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Dec 2024 18:01:32 +0100 Subject: pidfs: allow bind-mounts Allow bind-mounting pidfds. Similar to nsfs let's allow bind-mounts for pidfds. This allows pidfds to be safely recovered and checked for process recycling. 
Link: https://lore.kernel.org/r/20241219-work-pidfs-mount-v1-1-dbc56198b839@kernel.org Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index df574d6708d4..7c830d0dec9a 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,5 +6,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From c7175957b28a69947dd1d36e8b19ac0d3c1a5d7d Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jul 2023 20:03:55 +0200 Subject: seqlock: annotate spinning as unlikely() in __read_seqcount_begin Annotation already used to be there, but got lost in 52ac39e5db5148f7 ("seqlock: seqcount_t: Implement all read APIs as statement expressions"). Does not look like it was intentional. Without it gcc 12 decides to compile the following in path_init: nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); into 2 cases of conditional jumps forward if the value is even, aka branch prediction miss by default in the common case on x86-64. With the patch jumps are only for odd values. 
before: [snip] mov 0x104fe96(%rip),%eax # 0xffffffff82409680 test $0x1,%al je 0xffffffff813b97fa pause mov 0x104fe8a(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b97ee mov %eax,0x48(%rbx) mov 0x104fdfd(%rip),%eax # 0xffffffff82409600 test $0x1,%al je 0xffffffff813b9813 pause mov 0x104fdf1(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b9807 [/snip] after: [snip] mov 0x104fec6(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b99af mov %eax,0x48(%rbx) mov 0x104fe35(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b999d [/snip] Interestingly .text gets slightly smaller (as reported by size(1)): before: 20702563 after: 20702429 Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20230727180355.813995-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..eb20dcaa51b5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s)) & 1) \ + while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ -- cgit v1.2.3 From 135ec43eb29c68ed26e2d10f221d43f7d9139a8f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Nov 2024 17:13:52 -0800 Subject: fiemap: use kernel-doc includes in fiemap docbook Add some kernel-doc notation to structs in fiemap header files then pull that into Documentation/filesystems/fiemap.rst instead of duplicating the header file structs in fiemap.rst. This helps to future-proof fiemap.rst against struct changes. Add missing flags documentation from header files into fiemap.rst for FIEMAP_FLAG_CACHE and FIEMAP_EXTENT_SHARED. 
Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20241121011352.201907-1-rdunlap@infradead.org Cc: Christoph Hellwig Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: Matthew Wilcox Signed-off-by: Christian Brauner --- include/linux/fiemap.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index c50882f19235..966092ffa89a 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -5,12 +5,18 @@ #include #include +/** + * struct fiemap_extent_info - fiemap request to a filesystem + * @fi_flags: Flags as passed from user + * @fi_extents_mapped: Number of mapped extents + * @fi_extents_max: Size of fiemap_extent array + * @fi_extents_start: Start of fiemap_extent array + */ struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ + unsigned int fi_flags; + unsigned int fi_extents_mapped; + unsigned int fi_extents_max; + struct fiemap_extent __user *fi_extents_start; }; int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, -- cgit v1.2.3 From ea382199071931d19aac5f688b543e07360e2b64 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:34 +0100 Subject: vfs: support caching symlink lengths in inodes When utilized it dodges strlen() in vfs_readlink(), giving about 1.5% speed up when issuing readlink on /initrd.img on ext4. Filesystems opt in by calling inode_set_cached_link() when creating an inode. The size is stored in a new union utilizing the same space as i_devices, thus avoiding growing the struct or taking up any more space. 
Churn-wise the current readlink_copy() helper is patched to accept the size instead of calculating it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc98de5af43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for @@ -723,7 +724,10 @@ struct inode { }; struct file_lock_context *i_flctx; struct address_space i_data; - struct list_head i_devices; + union { + struct list_head i_devices; + int i_linklen; + }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; @@ -749,6 +753,13 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) +{ + inode->i_link = link; + inode->i_linklen = linklen; + inode->i_opflags |= IOP_CACHED_LINK; +} + /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. 
@@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); -- cgit v1.2.3 From 3212a8f34021a16d13ace91d3ac5f451ef8d0103 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 30 Nov 2024 06:17:11 +0100 Subject: fs: use a consume fence in mnt_idmap() The routine is used in link_path_walk() for every path component. To my reading the entire point of the fence was to grab a fully populated mnt_idmap, but that's already going to happen with mere consume fence. Eliminates an actual fence on arm64. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241130051712.1036527-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..33f17b6e8732 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ struct vfsmount { static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ - return smp_load_acquire(&mnt->mnt_idmap); + return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); -- cgit v1.2.3 From 7533d0df69452c3e7b69c727c1e8e1a7e1afc83c Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:43 +0800 Subject: soundwire: mipi_disco: read lane mapping properties from ACPI The DisCo for SoundWire 2.0 added support for the 'mipi-sdw-lane-<n>-mapping' property.
Co-developed-by: Chao Song Signed-off-by: Chao Song Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241218080155.102405-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index bd9836690da6..bb4e33a4db17 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -54,6 +54,8 @@ struct sdw_slave; #define SDW_MAX_PORTS 15 #define SDW_VALID_PORT_RANGE(n) ((n) < SDW_MAX_PORTS && (n) >= 1) +#define SDW_MAX_LANES 8 + enum { SDW_PORT_DIRN_SINK = 0, SDW_PORT_DIRN_SOURCE, @@ -356,6 +358,7 @@ struct sdw_dpn_prop { * and masks are supported * @commit_register_supported: is PCP_Commit register supported * @scp_int1_mask: SCP_INT1_MASK desired settings + * @lane_maps: Lane mapping for the slave, only valid if lane_control_support is set * @clock_reg_supported: the Peripheral implements the clock base and scale * registers introduced with the SoundWire 1.2 specification. SDCA devices * do not need to set this boolean property as the registers are required. @@ -385,6 +388,7 @@ struct sdw_slave_prop { u32 sdca_interrupt_register_list; u8 commit_register_supported; u8 scp_int1_mask; + u8 lane_maps[SDW_MAX_LANES]; bool clock_reg_supported; bool use_domain_irq; }; @@ -450,6 +454,7 @@ struct sdw_master_prop { int sdw_master_read_prop(struct sdw_bus *bus); int sdw_slave_read_prop(struct sdw_slave *slave); +int sdw_slave_read_lane_mapping(struct sdw_slave *slave); /* * SDW Slave Structures and APIs -- cgit v1.2.3 From b6a2e1be7d9303d07eff72a13132a37e035fbcfa Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:44 +0800 Subject: soundwire: add lane_used_bandwidth in struct sdw_bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To support multi-lane, we need to know how much bandwidth is used on each lane. 
And to use the lane that has enough bandwidth. Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20241218080155.102405-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index bb4e33a4db17..ae38ac848d38 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -893,6 +893,7 @@ struct sdw_master_ops { * @multi_link: Store bus property that indicates if multi links * are supported. This flag is populated by drivers after reading * appropriate firmware (ACPI/DT). + * @lane_used_bandwidth: how much bandwidth in bits per second is used by each lane */ struct sdw_bus { struct device *dev; @@ -924,6 +925,7 @@ struct sdw_bus { struct dentry *debugfs; #endif bool multi_link; + unsigned int lane_used_bandwidth[SDW_MAX_LANES]; }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, -- cgit v1.2.3 From 8f4e3343eda8cdedaf711bf3d8ef2d6ed571f420 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:47 +0800 Subject: Soundwire: add sdw_slave_get_scale_index helper Currently, we only set peripheral frequency when the peripheral is initialized. However, curr_dr_freq may change to get required bandwidth. For example, curr_dr_freq may increase from 4.8MHz to 9.6MHz when the 4th stream is opened. Add a helper to get the scale index so that we can get the scale index and program it. 
Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20241218080155.102405-7-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index ae38ac848d38..05a85e2bd96d 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1052,6 +1052,8 @@ int sdw_stream_add_slave(struct sdw_slave *slave, int sdw_stream_remove_slave(struct sdw_slave *slave, struct sdw_stream_runtime *stream); +int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); + /* messaging and data APIs */ int sdw_read(struct sdw_slave *slave, u32 addr); int sdw_write(struct sdw_slave *slave, u32 addr, u8 value); -- cgit v1.2.3 From 645291cfe5e52cce9571d73542476bec1d79ce26 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:48 +0800 Subject: Soundwire: stream: program BUSCLOCK_SCALE We need to program bus clock scale to adjust the bus clock if current bus clock doesn't fit the bandwidth. 
Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20241218080155.102405-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 05a85e2bd96d..fc0a203c3ae0 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1041,6 +1041,7 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus); int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id); void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id); +bool is_clock_scaling_supported_by_slave(struct sdw_slave *slave); #if IS_ENABLED(CONFIG_SOUNDWIRE) -- cgit v1.2.3 From 168cdf9cdef232225f6b6c617fd347b4d1c4a7d7 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:54 +0800 Subject: SoundWire: pass stream to compute_params() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stream parameter will be used in the follow up commit. No function change. Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Péter Ujfalusi Link: https://lore.kernel.org/r/20241218080155.102405-14-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 148 +++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index fc0a203c3ae0..2d6c30317792 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -855,79 +855,6 @@ struct sdw_master_ops { int dev_num); }; -/** - * struct sdw_bus - SoundWire bus - * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. 
- * @md: Master device - * @bus_lock_key: bus lock key associated to @bus_lock - * @bus_lock: bus lock - * @slaves: list of Slaves on this bus - * @msg_lock_key: message lock key associated to @msg_lock - * @msg_lock: message lock - * @m_rt_list: List of Master instance of all stream(s) running on Bus. This - * is used to compute and program bus bandwidth, clock, frame shape, - * transport and port parameters - * @defer_msg: Defer message - * @params: Current bus parameters - * @stream_refcount: number of streams currently using this bus - * @ops: Master callback ops - * @port_ops: Master port callback ops - * @prop: Master properties - * @vendor_specific_prop: pointer to non-standard properties - * @hw_sync_min_links: Number of links used by a stream above which - * hardware-based synchronization is required. This value is only - * meaningful if multi_link is set. If set to 1, hardware-based - * synchronization will be used even if a stream only uses a single - * SoundWire segment. - * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. - * @link_id: Link id number, can be 0 to N, unique for each Controller - * @id: bus system-wide unique id - * @compute_params: points to Bus resource management implementation - * @assigned: Bitmap for Slave device numbers. - * Bit set implies used number, bit clear implies unused number. - * @clk_stop_timeout: Clock stop timeout computed - * @bank_switch_timeout: Bank switch timeout computed - * @domain: IRQ domain - * @irq_chip: IRQ chip - * @debugfs: Bus debugfs (optional) - * @multi_link: Store bus property that indicates if multi links - * are supported. This flag is populated by drivers after reading - * appropriate firmware (ACPI/DT). 
- * @lane_used_bandwidth: how much bandwidth in bits per second is used by each lane - */ -struct sdw_bus { - struct device *dev; - struct sdw_master_device *md; - struct lock_class_key bus_lock_key; - struct mutex bus_lock; - struct list_head slaves; - struct lock_class_key msg_lock_key; - struct mutex msg_lock; - struct list_head m_rt_list; - struct sdw_defer defer_msg; - struct sdw_bus_params params; - int stream_refcount; - const struct sdw_master_ops *ops; - const struct sdw_master_port_ops *port_ops; - struct sdw_master_prop prop; - void *vendor_specific_prop; - int hw_sync_min_links; - int controller_id; - unsigned int link_id; - int id; - int (*compute_params)(struct sdw_bus *bus); - DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); - unsigned int clk_stop_timeout; - u32 bank_switch_timeout; - struct irq_chip irq_chip; - struct irq_domain *domain; -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs; -#endif - bool multi_link; - unsigned int lane_used_bandwidth[SDW_MAX_LANES]; -}; - int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, struct fwnode_handle *fwnode); void sdw_bus_master_delete(struct sdw_bus *bus); @@ -1017,10 +944,83 @@ struct sdw_stream_runtime { struct list_head master_list; }; +/** + * struct sdw_bus - SoundWire bus + * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. + * @md: Master device + * @bus_lock_key: bus lock key associated to @bus_lock + * @bus_lock: bus lock + * @slaves: list of Slaves on this bus + * @msg_lock_key: message lock key associated to @msg_lock + * @msg_lock: message lock + * @m_rt_list: List of Master instance of all stream(s) running on Bus. 
This + * is used to compute and program bus bandwidth, clock, frame shape, + * transport and port parameters + * @defer_msg: Defer message + * @params: Current bus parameters + * @stream_refcount: number of streams currently using this bus + * @ops: Master callback ops + * @port_ops: Master port callback ops + * @prop: Master properties + * @vendor_specific_prop: pointer to non-standard properties + * @hw_sync_min_links: Number of links used by a stream above which + * hardware-based synchronization is required. This value is only + * meaningful if multi_link is set. If set to 1, hardware-based + * synchronization will be used even if a stream only uses a single + * SoundWire segment. + * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. + * @link_id: Link id number, can be 0 to N, unique for each Controller + * @id: bus system-wide unique id + * @compute_params: points to Bus resource management implementation + * @assigned: Bitmap for Slave device numbers. + * Bit set implies used number, bit clear implies unused number. + * @clk_stop_timeout: Clock stop timeout computed + * @bank_switch_timeout: Bank switch timeout computed + * @domain: IRQ domain + * @irq_chip: IRQ chip + * @debugfs: Bus debugfs (optional) + * @multi_link: Store bus property that indicates if multi links + * are supported. This flag is populated by drivers after reading + * appropriate firmware (ACPI/DT). 
+ * @lane_used_bandwidth: how much bandwidth in bits per second is used by each lane + */ +struct sdw_bus { + struct device *dev; + struct sdw_master_device *md; + struct lock_class_key bus_lock_key; + struct mutex bus_lock; + struct list_head slaves; + struct lock_class_key msg_lock_key; + struct mutex msg_lock; + struct list_head m_rt_list; + struct sdw_defer defer_msg; + struct sdw_bus_params params; + int stream_refcount; + const struct sdw_master_ops *ops; + const struct sdw_master_port_ops *port_ops; + struct sdw_master_prop prop; + void *vendor_specific_prop; + int hw_sync_min_links; + int controller_id; + unsigned int link_id; + int id; + int (*compute_params)(struct sdw_bus *bus, struct sdw_stream_runtime *stream); + DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); + unsigned int clk_stop_timeout; + u32 bank_switch_timeout; + struct irq_chip irq_chip; + struct irq_domain *domain; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs; +#endif + bool multi_link; + unsigned int lane_used_bandwidth[SDW_MAX_LANES]; +}; + struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name); void sdw_release_stream(struct sdw_stream_runtime *stream); -int sdw_compute_params(struct sdw_bus *bus); +int sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream); int sdw_stream_add_master(struct sdw_bus *bus, struct sdw_stream_config *stream_config, -- cgit v1.2.3 From 0b7a66a2c864859fbf9bb16229c03172eef02c05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 5 Dec 2024 17:06:02 +0100 Subject: preempt: Move PREEMPT_RT before PREEMPT in vermagic. Since the dynamic preemption has been enabled for PREEMPT_RT we have now CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. This affects the vermagic strings which comes now PREEMPT with PREEMPT_RT enabled. The PREEMPT_RT module usually can not be loaded on a PREEMPT kernel because some symbols are missing. However if the symbols are fine then it continues and it crashes later. 
The problem is that the struct module has a different layout and the num_exentries or init members are at a different position leading to a crash later on. This is not necessarily caught by the size check in elf_validity_cache_index_mod() because the mem member has an alignment requirement of __module_memory_align which is big enough to keep the total size unchanged. Therefore we should keep the string accurate instead of removing it. Move the PREEMPT_RT check before the PREEMPT so that it takes precedence if both symbols are enabled. Fixes: 35772d627b55c ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241205160602.3lIAsJRT@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/vermagic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a54046bf37e5..939ceabcaf06 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,10 +15,10 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT_BUILD -#define MODULE_VERMAGIC_PREEMPT "preempt " -#elif defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_PREEMPT_RT #define MODULE_VERMAGIC_PREEMPT "preempt_rt " +#elif defined(CONFIG_PREEMPT_BUILD) +#define MODULE_VERMAGIC_PREEMPT "preempt " #else #define MODULE_VERMAGIC_PREEMPT "" #endif -- cgit v1.2.3 From b89c0ed09e1189217cd9d516b739627c523d53a4 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 19 Nov 2024 18:56:36 +0100 Subject: opp: core: implement dev_pm_opp_get_bw Add and implement dev_pm_opp_get_bw() to retrieve the OPP's bandwidth in the same way as the dev_pm_opp_get_voltage() helper. Retrieving bandwidth is required in the case of the Adreno GPU where the GPU Management Unit can handle the Bandwidth scaling. The helper can get the peak or average bandwidth for any of the interconnect paths.
Signed-off-by: Neil Armstrong [ Viresh: Fixed commit log and a comment in code ] Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 568183e3e641..414146abfe81 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -102,6 +102,8 @@ struct dev_pm_opp_data { struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); +unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index); + unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp); int dev_pm_opp_get_supplies(struct dev_pm_opp *opp, struct dev_pm_opp_supply *supplies); @@ -205,6 +207,11 @@ static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device * static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} +static inline unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index) +{ + return 0; +} + static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) { return 0; -- cgit v1.2.3 From 67b43038ce14d6b0673bdffb2052d879065c94ae Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 4 Nov 2024 16:43:03 +0800 Subject: KVM: guest_memfd: Remove RCU-protected attribute from slot->gmem.file Remove the RCU-protected attribute from slot->gmem.file. No need to use RCU primitives rcu_assign_pointer()/synchronize_rcu() to update this pointer. - slot->gmem.file is updated in 3 places: kvm_gmem_bind(), kvm_gmem_unbind(), kvm_gmem_release(). All of them are protected by kvm->slots_lock. - slot->gmem.file is read in 2 paths: (1) kvm_gmem_populate kvm_gmem_get_file __kvm_gmem_get_pfn (2) kvm_gmem_get_pfn kvm_gmem_get_file __kvm_gmem_get_pfn Path (1) kvm_gmem_populate() requires holding kvm->slots_lock, so slot->gmem.file is protected by the kvm->slots_lock in this path. 
Path (2) kvm_gmem_get_pfn() does not require holding kvm->slots_lock. However, it's also not guarded by rcu_read_lock() and rcu_read_unlock(). So synchronize_rcu() in kvm_gmem_unbind()/kvm_gmem_release() actually will not wait for the readers in kvm_gmem_get_pfn() due to lack of RCU read-side critical section. The path (2) kvm_gmem_get_pfn() is safe without RCU protection because: a) kvm_gmem_bind() is called on a new memslot, before the memslot is visible to kvm_gmem_get_pfn(). b) kvm->srcu ensures that kvm_gmem_unbind() and freeing of a memslot occur after the memslot is no longer visible to kvm_gmem_get_pfn(). c) get_file_active() ensures that kvm_gmem_get_pfn() will not access the stale file if kvm_gmem_release() sets it to NULL. This is because if kvm_gmem_release() occurs before kvm_gmem_get_pfn(), get_file_active() will return NULL; if get_file_active() does not return NULL, kvm_gmem_release() should not occur until after kvm_gmem_get_pfn() releases the file reference. Signed-off-by: Yan Zhao Message-ID: <20241104084303.29909-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 401439bb21e3..9df216943eb4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -596,7 +596,12 @@ struct kvm_memory_slot { #ifdef CONFIG_KVM_PRIVATE_MEM struct { - struct file __rcu *file; + /* + * Writes protected by kvm->slots_lock. Acquiring a + * reference via kvm_gmem_get_file() is protected by + * either kvm->slots_lock or kvm->srcu. 
+ */ + struct file *file; pgoff_t pgoff; } gmem; #endif -- cgit v1.2.3 From dca6c88532322830d5d92486467fcc91b67a9ad8 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 18 Jul 2024 14:12:14 -0700 Subject: KVM: Add member to struct kvm_gfn_range to indicate private/shared Add new members to struct kvm_gfn_range to indicate which mapping (private-vs-shared) to operate on: enum kvm_gfn_range_filter attr_filter. Update the core zapping operations to set them appropriately. TDX utilizes two GPA aliases for the same memslots, one for memory that is for private memory and one that is for shared. For private memory, KVM cannot always perform the same operations it does on memory for default VMs, such as zapping pages and having them be faulted back in, as this requires guest coordination. However, some operations such as guest driven conversion of memory between private and shared should zap private memory. Internally to the MMU, private and shared mappings are tracked on separate roots. Mapping and zapping operations will operate on the respective GFN alias for each root (private or shared). So zapping operations will by default zap both aliases. Add fields in struct kvm_gfn_range to allow callers to specify which aliases so they can only target the aliases appropriate for their specific operation. There was feedback that target aliases should be specified such that the default value (0) is to operate on both aliases. Several options were considered. Several variations of having separate bools defined such that the default behavior was to process both aliases. They either allowed nonsensical configurations, or were confusing for the caller. A simple enum was also explored and was close, but was hard to process in the caller. Instead, use an enum with the default value (0) reserved as a disallowed value. Catch ranges that didn't have the target aliases specified by looking for that specific value.
Set target alias with enum appropriately for these MMU operations: - For KVM's mmu notifier callbacks, zap shared pages only because private pages won't have a userspace mapping - For setting memory attributes, kvm_arch_pre_set_memory_attributes() chooses the aliases based on the attribute. - For guest_memfd invalidations, zap private only. Link: https://lore.kernel.org/kvm/ZivIF9vjKcuGie3s@google.com/ Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Message-ID: <20240718211230.1492011-3-rick.p.edgecombe@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9df216943eb4..c788d0bd952a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -255,11 +255,17 @@ union kvm_mmu_notifier_arg { unsigned long attributes; }; +enum kvm_gfn_range_filter { + KVM_FILTER_SHARED = BIT(0), + KVM_FILTER_PRIVATE = BIT(1), +}; + struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; + enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); -- cgit v1.2.3 From 943d0609d0571af092dc13456cbca70351e4d20e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:22 +0000 Subject: io_uring: rename ->resize_lock ->resize_lock is used for resizing rings, but it's a good idea to reuse it in other cases as well. Rename it into mmap_lock as it protects from races with mmap.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/68f705306f3ac4d2fb999eb80ea1615015ce9f7f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fd4cdb0860a2..fafc1d779eb1 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,7 +424,7 @@ struct io_ring_ctx { * side will need to grab this lock, to prevent either side from * being run concurrently with the other. */ - struct mutex resize_lock; + struct mutex mmap_lock; /* * If IORING_SETUP_NO_MMAP is used, then the below holds -- cgit v1.2.3 From a730d2047d4ef822262c37f51ff7267f2f0e7167 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:24 +0000 Subject: io_uring/memmap: flag vmap'ed regions Add internal flags for struct io_mapped_region. The first flag we need is IO_REGION_F_VMAPPED, that indicates that the pointer has to be unmapped on region destruction. For now all regions are vmap'ed, so it's set unconditionally. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5a3d8046a038da97c0f8a8c8f1733fa3fc689d31.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fafc1d779eb1..0793a91b66a5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -78,8 +78,9 @@ struct io_hash_table { struct io_mapped_region { struct page **pages; - void *vmap_ptr; - size_t nr_pages; + void *ptr; + unsigned nr_pages; + unsigned flags; }; /* -- cgit v1.2.3 From 8078486e1d53591ed946c943177339e59e3089e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:34 +0000 Subject: io_uring: use region api for SQ Convert internal parts of the SQ management to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1fb73ced6b835cb319ab0fe1dc0b2e982a9a5650.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0793a91b66a5..808accf1776b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -432,10 +432,9 @@ struct io_ring_ctx { * the gup'ed pages for the two rings, and the sqes.
*/ unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; + struct io_mapped_region sq_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; -- cgit v1.2.3 From 81a4058e0cd0f07139f088fbeb65bc488f687829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:35 +0000 Subject: io_uring: use region api for CQ Convert internal parts of the CQ/SQ array management to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46fc3c801290d6b1ac16023d78f6b8e685c87fd6.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 808accf1776b..b63f44220e8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -427,14 +427,8 @@ struct io_ring_ctx { */ struct mutex mmap_lock; - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. - */ - unsigned short n_ring_pages; - struct page **ring_pages; - struct io_mapped_region sq_region; + struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; -- cgit v1.2.3 From 78fda3d056417ccb9921663383b12f771aa0dd43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:36 +0000 Subject: io_uring/kbuf: use mmap_lock to sync with mmap A preparation / cleanup patch simplifying the buf ring - mmap synchronisation. Instead of relying on RCU, which is trickier, do it by grabbing the mmap_lock when anyone tries to publish or remove a registered buffer to / from ->io_bl_xa.
Modifications of the xarray should always be protected by both ->uring_lock and ->mmap_lock, while lookups should hold either of them. While a struct io_buffer_list is in the xarray, the mmap related fields like ->flags and ->buf_pages should stay stable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/af13bde56ee1a26bcaefaa9aad37a9ea318a590e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b63f44220e8b..73575d545d3c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,6 +294,11 @@ struct io_ring_ctx { struct io_submit_state submit_state; + /* + * Modifications are protected by ->uring_lock and ->mmap_lock. + * The flags, buf_pages and buf_nr_pages fields should be stable + * once published. + */ struct xarray io_bl_xa; struct io_hash_table cancel_table; -- cgit v1.2.3 From 5dbb3cbd060aa86a722d7d44278e537ae3f63081 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:31 +0530 Subject: block: define set of integrity flags to be inherited by cloned bip Introduce BIP_CLONE_FLAGS describing integrity flags that should be inherited in the cloned bip from the parent. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dbf0f74c1529..0f0cf10222e8 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -30,6 +30,9 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ -- cgit v1.2.3 From fe8f4ca7107e968b0eb7328155c8811f2a19424a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:33 +0530 Subject: block: modify bio_integrity_map_user to accept iov_iter as argument This patch refactors bio_integrity_map_user to accept iov_iter as argument. This is a prep patch. 
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0f0cf10222e8..58ff9988433a 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -75,7 +75,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -101,8 +101,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len) +static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } -- cgit v1.2.3 From 10783d0ba0d7731ec81d88c54f83cf0ff89d1c2a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:34 +0530 Subject: fs, iov_iter: define meta io descriptor Add flags to describe checks for integrity meta buffer. Also, introduce a new 'uio_meta' structure that upper layer can use to pass the meta/integrity information. 
Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-5-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 853f9de5aa05..8ada84e85447 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -82,6 +82,15 @@ struct iov_iter { }; }; +typedef __u16 uio_meta_flags_t; + +struct uio_meta { + uio_meta_flags_t flags; + u16 app_tag; + u64 seed; + struct iov_iter iter; +}; + static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) -- cgit v1.2.3 From 4de2ce04c862db66a7c1dbe0f358fc6df3825bac Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:35 +0530 Subject: fs: introduce IOCB_HAS_METADATA for metadata Introduce an IOCB_HAS_METADATA flag for the kiocb struct, for handling requests containing meta payload. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-6-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc3d45da7b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -348,6 +348,7 @@ struct readahead_control; #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. 
*/ #define IOCB_AIO_RW (1 << 23) +#define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ -- cgit v1.2.3 From 2c0487d8b1f1351d48a13b77b254a2bb6de49eb3 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:37 +0530 Subject: block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags This patch introduces BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags which indicate how the hardware should check the integrity payload. BIP_CHECK_GUARD/REFTAG are conversion of existing semantics, while BIP_CHECK_APPTAG is a new flag. The driver can now just rely on block layer flags, and doesn't need to know the integrity source. Submitter of PI decides which tags to check. This would also give us a unified interface for user and kernel generated integrity. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-8-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 58ff9988433a..fe2bfe122db2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -11,6 +11,9 @@ enum bip_flags { BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 6, /* guard check */ + BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ }; struct bio_integrity_payload { @@ -31,7 +34,8 @@ struct bio_integrity_payload { }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ + BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY -- cgit v1.2.3 From 
18623503a3a514780214850bf8ba8b03ea0f3a4b Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:39 +0530 Subject: scsi: add support for user-meta interface Add support for sending user-meta buffer. Set tags to be checked using flags specified by user/block-layer. With this change, BIP_CTRL_NOCHECK becomes unused. Remove it. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index fe2bfe122db2..2195bc06dcde 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -7,13 +7,12 @@ enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ - BIP_CHECK_GUARD = 1 << 6, /* guard check */ - BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ - BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ + BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ + BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 5, /* guard check */ + BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ }; struct bio_integrity_payload { @@ -33,8 +32,7 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; -#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | 
BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY -- cgit v1.2.3 From 3d8b5a22d40435b4a7e58f06ae2cd3506b222898 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:40 +0530 Subject: block: add support to pass user meta buffer If an iocb contains metadata, extract that and prepare the bip. Based on flags specified by the user, set corresponding guard/app/ref tags to be checked in bip. Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 2195bc06dcde..de0a6c9de4d1 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -23,6 +23,7 @@ struct bio_integrity_payload { unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + u16 app_tag; /* application tag value */ struct bvec_iter bio_iter; /* for rewinding parent bio */ @@ -78,6 +79,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -108,6 +110,11 @@ static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) return -EINVAL; } +static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + return -EINVAL; +} + 
static inline void bio_integrity_unmap_user(struct bio *bio) { } -- cgit v1.2.3 From 6f491a8d4b92d1a840fd9209cba783c84437d0b7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:28 +0800 Subject: block: track disk DEAD state automatically for modeling queue freeze lockdep Now we only verify the outmost freeze & unfreeze in current context in case that !q->mq_freeze_depth, so it is reliable to save disk DEAD state when we want to lock the freeze queue since the state is one per-task variable now. Doing this way can kill lots of false positives when freeze queue is called before adding disk[1]. [1] https://lore.kernel.org/linux-block/6741f6b2.050a0220.1cc393.0017.GAE@google.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 378d3a1a22fc..522cf8eef66c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -581,6 +581,8 @@ struct request_queue { #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; + /* Records disk state in current context, used in unfreeze queue */ + bool mq_freeze_disk_dead; #endif wait_queue_head_t mq_freeze_wq; /* -- cgit v1.2.3 From f6661b1d0525f3764596a1b65eeed9e75aecafa7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:30 +0800 Subject: block: track queue dying state automatically for modeling queue freeze lockdep Now we only verify the outmost freeze & unfreeze in current context in case that !q->mq_freeze_depth, so it is reliable to save queue dying state when we want to lock the freeze queue since the state is one per-task variable now.
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 522cf8eef66c..5d40af2ef971 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -581,8 +581,12 @@ struct request_queue { #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; - /* Records disk state in current context, used in unfreeze queue */ + /* + * Records disk & queue state in current context, used in unfreeze + * queue + */ bool mq_freeze_disk_dead; + bool mq_freeze_queue_dying; #endif wait_queue_head_t mq_freeze_wq; /* -- cgit v1.2.3 From 5c292ac6e69f390179b93dc104b40903cddce636 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:56 +0000 Subject: block: Delete bio_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_prio() does nothing but return the value in bio->bi_ioprio. Most other places just read bio->bi_ioprio directly, so replace bio_prio() callsites with reading bio->bi_ioprio directly and delete that macro.
Signed-off-by: John Garry Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7a1b3b1a8fed..99676916f3db 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,7 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) #define bio_iter_iovec(bio, iter) \ -- cgit v1.2.3 From 19206d3f5ef7f051056d2fb49203a347e4844e6e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:57 +0000 Subject: block: Delete bio_set_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_set_prio() does nothing but set bio->bi_ioprio. All other places just set bio->bi_ioprio directly, so replace bio_set_prio() remaining callsites with setting bio->bi_ioprio directly and delete that macro. 
Signed-off-by: John Garry Acked-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 99676916f3db..1eec59699100 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,8 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) - #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) -- cgit v1.2.3 From fea4952df0eeec4e1a295ebaac9f61c0065fae87 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:09 +0100 Subject: driver core: bus: add irq_get_affinity callback to bus_type Introducing a callback in struct bus_type so that a subsystem can hook up the getters directly. This approach avoids exposing random getters in any subsystems APIs. Acked-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Acked-by: Greg Kroah-Hartman Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-1-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/device/bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index cdc4757217f9..b18658bce2c3 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -48,6 +48,7 @@ struct fwnode_handle; * will never get called until they do. * @remove: Called when a device removed from this bus. * @shutdown: Called at shut-down time to quiesce the device. + * @irq_get_affinity: Get IRQ affinity mask for the device on this bus. * * @online: Called to put the device back online (after offlining it). 
* @offline: Called to put the device offline for hot-removal. May fail. @@ -87,6 +88,8 @@ struct bus_type { void (*sync_state)(struct device *dev); void (*remove)(struct device *dev); void (*shutdown)(struct device *dev); + const struct cpumask *(*irq_get_affinity)(struct device *dev, + unsigned int irq_vec); int (*online)(struct device *dev); int (*offline)(struct device *dev); -- cgit v1.2.3 From 1452e9b470c903fc4137a448e9f5767e92d68229 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:12 +0100 Subject: blk-mq: introduce blk_mq_map_hw_queues blk_mq_pci_map_queues and blk_mq_virtio_map_queues will create a CPU to hardware queue mapping based on affinity information. These two functions share common code and only differ on how the affinity information is retrieved. Also, those functions are located in the block subsystem where it doesn't really fit in. They are virtio and pci subsystem specific. Thus provide a generic mapping function which uses the irq_get_affinity callback from bus_type.
Original idea from Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-4-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c596e0e4cb75..769eab6247d4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -921,6 +921,8 @@ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); void blk_freeze_queue_start_non_owner(struct request_queue *q); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); -- cgit v1.2.3 From 9bc1e897a821f19ba3775bb013a8a6fb121c3ca1 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:16 +0100 Subject: blk-mq: remove unused queue mapping helpers There are no users left of the pci and virtio queue mapping helpers. Thus remove them.
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-8-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq-pci.h | 11 ----------- include/linux/blk-mq-virtio.h | 11 ----------- 2 files changed, 22 deletions(-) delete mode 100644 include/linux/blk-mq-pci.h delete mode 100644 include/linux/blk-mq-virtio.h (limited to 'include/linux') diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h deleted file mode 100644 index ca544e1d3508..000000000000 --- a/include/linux/blk-mq-pci.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BLK_MQ_PCI_H -#define _LINUX_BLK_MQ_PCI_H - -struct blk_mq_queue_map; -struct pci_dev; - -void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); - -#endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h deleted file mode 100644 index 13226e9b22dd..000000000000 --- a/include/linux/blk-mq-virtio.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BLK_MQ_VIRTIO_H -#define _LINUX_BLK_MQ_VIRTIO_H - -struct blk_mq_queue_map; -struct virtio_device; - -void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, - struct virtio_device *vdev, int first_vec); - -#endif /* _LINUX_BLK_MQ_VIRTIO_H */ -- cgit v1.2.3 From cc76ace465d6977b47daa427379b7be1e0976f12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Dec 2024 07:01:59 +0100 Subject: block: remove BLK_MQ_F_SHOULD_MERGE BLK_MQ_F_SHOULD_MERGE is set for all tag_sets except those that purely process passthrough commands (bsg-lib, ufs tmf, various nvme admin queues) and thus don't even check the flag. Remove it to simplify the driver interface. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241219060214.1928848-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 769eab6247d4..7f6c482ebf54 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -668,7 +668,6 @@ struct blk_mq_ops { /* Keep hctx_flag_name[] in sync with the definitions below */ enum { - BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for -- cgit v1.2.3 From 546d191427cf5cf3215529744c2ea8558f0279db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Nov 2024 15:53:58 -0700 Subject: block: make bio_integrity_map_user() static inline If CONFIG_BLK_DEV_INTEGRITY isn't set, then the dummy helper must be static inline to avoid complaints about the function being unused. Fixes: fe8f4ca7107e ("block: modify bio_integrity_map_user to accept iov_iter as argument") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411300229.y7h60mDg-lkp@intel.com/ Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index de0a6c9de4d1..802f52e38efd 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -105,7 +105,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) +static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } -- cgit v1.2.3 From 94ddd8bf98d76f03297a2b33a951711b31f7bc38 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 23 Dec 2024 15:18:37 +0000 Subject: misc: trivial: Remove undesired double space from struct definition When one is too lazy to 
use an LSP to conduct look-ups on struct definitions, one might use the ever useful `struct {` search string. However this doesn't work with `struct miscdevice {` because of a stray double space. Assuming that this wasn't intentional, let's simply remove it. Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20241223151843.472645-1-lee@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/miscdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index c0fea6ca5076..69e110c2b86a 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -76,7 +76,7 @@ struct device; struct attribute_group; -struct miscdevice  { +struct miscdevice { int minor; const char *name; const struct file_operations *fops; -- cgit v1.2.3 From 95f68e06b41b9e88291796efa3969409d13fdd4c Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 19 Dec 2024 19:58:33 +0200 Subject: net/mlx5: fs, add counter object to flow destination Currently mlx5_flow_destination includes counter_id which is assigned in case we use flow counter on the flow steering rule. However, counter_id is not enough data in case of using HW Steering. Thus, have mlx5_fc object as part of mlx5_flow_destination instead of counter_id and assign it where needed. In case counter_id is received from user space, create a local counter object to represent it. 
Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 438db888bde0..2a69d9d71276 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -163,7 +163,7 @@ struct mlx5_flow_destination { u32 tir_num; u32 ft_num; struct mlx5_flow_table *ft; - u32 counter_id; + struct mlx5_fc *counter; struct { u16 num; u16 vhca_id; @@ -299,6 +299,8 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); +struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); +void mlx5_fc_local_destroy(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); -- cgit v1.2.3 From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. 
However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. The same process applies to all the cases when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f5991168ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. */ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { -- cgit v1.2.3 From ef94ea4fc18ff3fa5034d0da4c1a52dba0b23f8c Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 17 Dec 2024 23:41:53 +0200 Subject: clk: Drop obsolete devm_clk_bulk_get_all_enable() helper Commit 265b07df758a ("clk: Provide managed helper to get and enable bulk clocks") added devm_clk_bulk_get_all_enable() function, but missed to return the number of clocks stored in the clk_bulk_data table referenced by the clks argument. 
Without knowing the number, it's not possible to iterate these clocks when needed, hence the argument is useless and could have been simply removed. A new helper devm_clk_bulk_get_all_enabled() has been introduced, which is consistent with devm_clk_bulk_get_all() in terms of the returned value. Drop the obsolete function since all users switched to the new helper. Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Manivannan Sadhasivam Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20241217-clk_bulk_ena_fix-v5-3-aafbbb245155@collabora.com Signed-off-by: Stephen Boyd --- include/linux/clk.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 1dcee6d701e4..b607482ca77e 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -1138,15 +1138,6 @@ static inline void clk_restore_context(void) {} #endif -/* Deprecated. Use devm_clk_bulk_get_all_enabled() */ -static inline int __must_check -devm_clk_bulk_get_all_enable(struct device *dev, struct clk_bulk_data **clks) -{ - int ret = devm_clk_bulk_get_all_enabled(dev, clks); - - return ret > 0 ? 0 : ret; -} - /* clk_prepare_enable helps cases using clk_enable in non-atomic context. */ static inline int clk_prepare_enable(struct clk *clk) { -- cgit v1.2.3 From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. 
Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..58ad4ead33fc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + 
const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; -- cgit v1.2.3 From 8cda395b79d90709fde3a9963c667d849cc5718f Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Tue, 10 Dec 2024 19:07:09 -0800 Subject: usb: typec: tcpm: Add new AMS for Get_Revision response This commit adds a new AMS for responding to a "Get_Revision" request. Revision message consists of the following fields: +----------------------------------------------------+ | Header | RMDO | | No. of data objects = 1 | | +----------------------------------------------------+ While RMDO consists of: * B31..28 Revision Major * B27..24 Revision Minor * B23..20 Version Major * B19..16 Version Minor * B15..0 Reserved, shall be set to zero. As per the PD spec ("8.3.3.16.2.1 PR_Give_Revision State"), a request is only expected when an explicit contract is established and the port is in ready state. This AMS is only supported for PD >= 3.0. Signed-off-by: Amit Sunil Dhamne Reviewed-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20241210-get_rev_upstream-v2-3-d0094e52d48f@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index d50098fb16b5..3068c3084eb6 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -33,7 +33,9 @@ enum pd_ctrl_msg_type { PD_CTRL_FR_SWAP = 19, PD_CTRL_GET_PPS_STATUS = 20, PD_CTRL_GET_COUNTRY_CODES = 21, - /* 22-31 Reserved */ + /* 22-23 Reserved */ + PD_CTRL_GET_REVISION = 24, + /* 25-31 Reserved */ }; enum pd_data_msg_type { @@ -46,7 +48,9 @@ enum pd_data_msg_type { PD_DATA_ALERT = 6, PD_DATA_GET_COUNTRY_INFO = 7, PD_DATA_ENTER_USB = 8, - /* 9-14 Reserved */ + /* 9-11 Reserved */ + PD_DATA_REVISION = 12, + /* 13-14 Reserved */ PD_DATA_VENDOR_DEF = 15, /* 16-31 Reserved */ }; @@ -453,6 +457,20 @@ static inline unsigned int 
rdo_max_power(u32 rdo) #define EUDO_TBT_SUPPORT BIT(14) #define EUDO_HOST_PRESENT BIT(13) +/* + * Revision Message Data Object (PD Revision 3.1+ only) + * -------- + * <31:28> :: Revision Major + * <27:24> :: Revision Minor + * <23:20> :: Version Major + * <19:16> :: Version Minor + * <15:0> :: Reserved, Shall be set to zero + */ + +#define RMDO(rev_maj, rev_min, ver_maj, ver_min) \ + (((rev_maj) & 0xf) << 28 | ((rev_min) & 0xf) << 24 | \ + ((ver_maj) & 0xf) << 20 | ((ver_min) & 0xf) << 16) + /* USB PD timers and counters */ #define PD_T_NO_RESPONSE 5000 /* 4.5 - 5.5 seconds */ #define PD_T_DB_DETECT 10000 /* 10 - 15 seconds */ -- cgit v1.2.3 From 100e257386595b3f1865ca8a991e2ba74f9701ff Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 13 Dec 2024 15:35:43 -0800 Subject: usb: typec: Add driver for Thunderbolt 3 Alternate Mode Thunderbolt 3 Alternate Mode entry flow is described in USB Type-C Specification Release 2.0. Signed-off-by: Heikki Krogerus Co-developed-by: Abhishek Pandit-Subedi Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20241213153543.v5.2.I3080b036e8de0b9957c57c1c3059db7149c5e549@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_tbt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index fa97d7e00f5c..55dcea12082c 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -44,6 +44,7 @@ struct typec_thunderbolt_data { #define TBT_GEN3_NON_ROUNDED 0 #define TBT_GEN3_GEN4_ROUNDED_NON_ROUNDED 1 +#define TBT_CABLE_ROUNDED BIT(19) #define TBT_CABLE_OPTICAL BIT(21) #define TBT_CABLE_RETIMER BIT(22) #define TBT_CABLE_LINK_TRAINING BIT(23) -- cgit v1.2.3 From 183b194d8fb62694e81c18e1faec9ad418f952e3 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Fri, 13 Dec 2024 15:35:44 -0800 Subject: usb: typec: Make active on port altmode writable The active property 
of port altmode should be writable (to prevent or allow partner altmodes from entering) and needs to be part of typec_altmode_desc so we can initialize the port to an inactive state if desired. Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20241213153543.v5.3.I794566684ab2965e209f326b08232006eff333f8@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index d616b8807000..252af3f77039 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -140,6 +140,7 @@ int typec_cable_set_identity(struct typec_cable *cable); * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @roles: Only for ports. DRP if the mode is available in both roles + * @inactive: Only for ports. Make this port inactive (default is active). * * Description of an Alternate Mode which a connector, cable plug or partner * supports. @@ -150,6 +151,7 @@ struct typec_altmode_desc { u32 vdo; /* Only used with ports */ enum typec_port_data roles; + bool inactive; }; void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision); -- cgit v1.2.3 From fdf3ee5c6e5278dab4f60b998b47ed2d510bf80f Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Wed, 20 Nov 2024 14:45:02 +0530 Subject: mtd: nand: Add qpic_common API file Add qpic_common.c file which hold all the common qpic APIs which will be used by both qpic raw nand driver and qpic spi nand driver. 
Signed-off-by: Md Sadre Alam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 468 +++++++++++++++++++++++++++++++++++ 1 file changed, 468 insertions(+) create mode 100644 include/linux/mtd/nand-qpic-common.h (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h new file mode 100644 index 000000000000..425994429387 --- /dev/null +++ b/include/linux/mtd/nand-qpic-common.h @@ -0,0 +1,468 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * QCOM QPIC common APIs header file + * + * Copyright (c) 2023 Qualcomm Inc. + * Authors: Md sadre Alam + * + */ +#ifndef __MTD_NAND_QPIC_COMMON_H__ +#define __MTD_NAND_QPIC_COMMON_H__ + +/* NANDc reg offsets */ +#define NAND_FLASH_CMD 0x00 +#define NAND_ADDR0 0x04 +#define NAND_ADDR1 0x08 +#define NAND_FLASH_CHIP_SELECT 0x0c +#define NAND_EXEC_CMD 0x10 +#define NAND_FLASH_STATUS 0x14 +#define NAND_BUFFER_STATUS 0x18 +#define NAND_DEV0_CFG0 0x20 +#define NAND_DEV0_CFG1 0x24 +#define NAND_DEV0_ECC_CFG 0x28 +#define NAND_AUTO_STATUS_EN 0x2c +#define NAND_DEV1_CFG0 0x30 +#define NAND_DEV1_CFG1 0x34 +#define NAND_READ_ID 0x40 +#define NAND_READ_STATUS 0x44 +#define NAND_DEV_CMD0 0xa0 +#define NAND_DEV_CMD1 0xa4 +#define NAND_DEV_CMD2 0xa8 +#define NAND_DEV_CMD_VLD 0xac +#define SFLASHC_BURST_CFG 0xe0 +#define NAND_ERASED_CW_DETECT_CFG 0xe8 +#define NAND_ERASED_CW_DETECT_STATUS 0xec +#define NAND_EBI2_ECC_BUF_CFG 0xf0 +#define FLASH_BUF_ACC 0x100 + +#define NAND_CTRL 0xf00 +#define NAND_VERSION 0xf08 +#define NAND_READ_LOCATION_0 0xf20 +#define NAND_READ_LOCATION_1 0xf24 +#define NAND_READ_LOCATION_2 0xf28 +#define NAND_READ_LOCATION_3 0xf2c +#define NAND_READ_LOCATION_LAST_CW_0 0xf40 +#define NAND_READ_LOCATION_LAST_CW_1 0xf44 +#define NAND_READ_LOCATION_LAST_CW_2 0xf48 +#define NAND_READ_LOCATION_LAST_CW_3 0xf4c + +/* dummy register offsets, used by qcom_write_reg_dma */ +#define NAND_DEV_CMD1_RESTORE 0xdead +#define NAND_DEV_CMD_VLD_RESTORE 0xbeef + +/* 
NAND_FLASH_CMD bits */ +#define PAGE_ACC BIT(4) +#define LAST_PAGE BIT(5) + +/* NAND_FLASH_CHIP_SELECT bits */ +#define NAND_DEV_SEL 0 +#define DM_EN BIT(2) + +/* NAND_FLASH_STATUS bits */ +#define FS_OP_ERR BIT(4) +#define FS_READY_BSY_N BIT(5) +#define FS_MPU_ERR BIT(8) +#define FS_DEVICE_STS_ERR BIT(16) +#define FS_DEVICE_WP BIT(23) + +/* NAND_BUFFER_STATUS bits */ +#define BS_UNCORRECTABLE_BIT BIT(8) +#define BS_CORRECTABLE_ERR_MSK 0x1f + +/* NAND_DEVn_CFG0 bits */ +#define DISABLE_STATUS_AFTER_WRITE 4 +#define CW_PER_PAGE 6 +#define UD_SIZE_BYTES 9 +#define UD_SIZE_BYTES_MASK GENMASK(18, 9) +#define ECC_PARITY_SIZE_BYTES_RS 19 +#define SPARE_SIZE_BYTES 23 +#define SPARE_SIZE_BYTES_MASK GENMASK(26, 23) +#define NUM_ADDR_CYCLES 27 +#define STATUS_BFR_READ 30 +#define SET_RD_MODE_AFTER_STATUS 31 + +/* NAND_DEVn_CFG0 bits */ +#define DEV0_CFG1_ECC_DISABLE 0 +#define WIDE_FLASH 1 +#define NAND_RECOVERY_CYCLES 2 +#define CS_ACTIVE_BSY 5 +#define BAD_BLOCK_BYTE_NUM 6 +#define BAD_BLOCK_IN_SPARE_AREA 16 +#define WR_RD_BSY_GAP 17 +#define ENABLE_BCH_ECC 27 + +/* NAND_DEV0_ECC_CFG bits */ +#define ECC_CFG_ECC_DISABLE 0 +#define ECC_SW_RESET 1 +#define ECC_MODE 4 +#define ECC_PARITY_SIZE_BYTES_BCH 8 +#define ECC_NUM_DATA_BYTES 16 +#define ECC_NUM_DATA_BYTES_MASK GENMASK(25, 16) +#define ECC_FORCE_CLK_OPEN 30 + +/* NAND_DEV_CMD1 bits */ +#define READ_ADDR 0 + +/* NAND_DEV_CMD_VLD bits */ +#define READ_START_VLD BIT(0) +#define READ_STOP_VLD BIT(1) +#define WRITE_START_VLD BIT(2) +#define ERASE_START_VLD BIT(3) +#define SEQ_READ_START_VLD BIT(4) + +/* NAND_EBI2_ECC_BUF_CFG bits */ +#define NUM_STEPS 0 + +/* NAND_ERASED_CW_DETECT_CFG bits */ +#define ERASED_CW_ECC_MASK 1 +#define AUTO_DETECT_RES 0 +#define MASK_ECC BIT(ERASED_CW_ECC_MASK) +#define RESET_ERASED_DET BIT(AUTO_DETECT_RES) +#define ACTIVE_ERASED_DET (0 << AUTO_DETECT_RES) +#define CLR_ERASED_PAGE_DET (RESET_ERASED_DET | MASK_ECC) +#define SET_ERASED_PAGE_DET (ACTIVE_ERASED_DET | MASK_ECC) + +/* 
NAND_ERASED_CW_DETECT_STATUS bits */ +#define PAGE_ALL_ERASED BIT(7) +#define CODEWORD_ALL_ERASED BIT(6) +#define PAGE_ERASED BIT(5) +#define CODEWORD_ERASED BIT(4) +#define ERASED_PAGE (PAGE_ALL_ERASED | PAGE_ERASED) +#define ERASED_CW (CODEWORD_ALL_ERASED | CODEWORD_ERASED) + +/* NAND_READ_LOCATION_n bits */ +#define READ_LOCATION_OFFSET 0 +#define READ_LOCATION_SIZE 16 +#define READ_LOCATION_LAST 31 + +/* Version Mask */ +#define NAND_VERSION_MAJOR_MASK 0xf0000000 +#define NAND_VERSION_MAJOR_SHIFT 28 +#define NAND_VERSION_MINOR_MASK 0x0fff0000 +#define NAND_VERSION_MINOR_SHIFT 16 + +/* NAND OP_CMDs */ +#define OP_PAGE_READ 0x2 +#define OP_PAGE_READ_WITH_ECC 0x3 +#define OP_PAGE_READ_WITH_ECC_SPARE 0x4 +#define OP_PAGE_READ_ONFI_READ 0x5 +#define OP_PROGRAM_PAGE 0x6 +#define OP_PAGE_PROGRAM_WITH_ECC 0x7 +#define OP_PROGRAM_PAGE_SPARE 0x9 +#define OP_BLOCK_ERASE 0xa +#define OP_CHECK_STATUS 0xc +#define OP_FETCH_ID 0xb +#define OP_RESET_DEVICE 0xd + +/* Default Value for NAND_DEV_CMD_VLD */ +#define NAND_DEV_CMD_VLD_VAL (READ_START_VLD | WRITE_START_VLD | \ + ERASE_START_VLD | SEQ_READ_START_VLD) + +/* NAND_CTRL bits */ +#define BAM_MODE_EN BIT(0) + +/* + * the NAND controller performs reads/writes with ECC in 516 byte chunks. + * the driver calls the chunks 'step' or 'codeword' interchangeably + */ +#define NANDC_STEP_SIZE 512 + +/* + * the largest page size we support is 8K, this will have 16 steps/codewords + * of 512 bytes each + */ +#define MAX_NUM_STEPS (SZ_8K / NANDC_STEP_SIZE) + +/* we read at most 3 registers per codeword scan */ +#define MAX_REG_RD (3 * MAX_NUM_STEPS) + +/* ECC modes supported by the controller */ +#define ECC_NONE BIT(0) +#define ECC_RS_4BIT BIT(1) +#define ECC_BCH_4BIT BIT(2) +#define ECC_BCH_8BIT BIT(3) + +/* + * Returns the actual register address for all NAND_DEV_ registers + * (i.e. 
NAND_DEV_CMD0, NAND_DEV_CMD1, NAND_DEV_CMD2 and NAND_DEV_CMD_VLD) + */ +#define dev_cmd_reg_addr(nandc, reg) ((nandc)->props->dev_cmd_reg_start + (reg)) + +/* Returns the NAND register physical address */ +#define nandc_reg_phys(chip, offset) ((chip)->base_phys + (offset)) + +/* Returns the dma address for reg read buffer */ +#define reg_buf_dma_addr(chip, vaddr) \ + ((chip)->reg_read_dma + \ + ((u8 *)(vaddr) - (u8 *)(chip)->reg_read_buf)) + +#define QPIC_PER_CW_CMD_ELEMENTS 32 +#define QPIC_PER_CW_CMD_SGL 32 +#define QPIC_PER_CW_DATA_SGL 8 + +#define QPIC_NAND_COMPLETION_TIMEOUT msecs_to_jiffies(2000) + +/* + * Flags used in DMA descriptor preparation helper functions + * (i.e. qcom_read_reg_dma/qcom_write_reg_dma/qcom_read_data_dma/qcom_write_data_dma) + */ +/* Don't set the EOT in current tx BAM sgl */ +#define NAND_BAM_NO_EOT BIT(0) +/* Set the NWD flag in current BAM sgl */ +#define NAND_BAM_NWD BIT(1) +/* Finish writing in the current BAM sgl and start writing in another BAM sgl */ +#define NAND_BAM_NEXT_SGL BIT(2) +/* + * Erased codeword status is being used two times in single transfer so this + * flag will determine the current value of erased codeword status register + */ +#define NAND_ERASED_CW_SET BIT(4) + +#define MAX_ADDRESS_CYCLE 5 + +/* + * This data type corresponds to the BAM transaction which will be used for all + * NAND transfers. + * @bam_ce - the array of BAM command elements + * @cmd_sgl - sgl for NAND BAM command pipe + * @data_sgl - sgl for NAND BAM consumer/producer pipe + * @last_data_desc - last DMA desc in data channel (tx/rx). + * @last_cmd_desc - last DMA desc in command channel. + * @txn_done - completion for NAND transfer. + * @bam_ce_pos - the index in bam_ce which is available for next sgl + * @bam_ce_start - the index in bam_ce which marks the start position ce + * for current sgl. It will be used for size calculation + * for current sgl + * @cmd_sgl_pos - current index in command sgl. 
+ * @cmd_sgl_start - start index in command sgl. + * @tx_sgl_pos - current index in data sgl for tx. + * @tx_sgl_start - start index in data sgl for tx. + * @rx_sgl_pos - current index in data sgl for rx. + * @rx_sgl_start - start index in data sgl for rx. + */ +struct bam_transaction { + struct bam_cmd_element *bam_ce; + struct scatterlist *cmd_sgl; + struct scatterlist *data_sgl; + struct dma_async_tx_descriptor *last_data_desc; + struct dma_async_tx_descriptor *last_cmd_desc; + struct completion txn_done; + u32 bam_ce_pos; + u32 bam_ce_start; + u32 cmd_sgl_pos; + u32 cmd_sgl_start; + u32 tx_sgl_pos; + u32 tx_sgl_start; + u32 rx_sgl_pos; + u32 rx_sgl_start; +}; + +/* + * This data type corresponds to the nand dma descriptor + * @dma_desc - low level DMA engine descriptor + * @list - list for desc_info + * + * @adm_sgl - sgl which will be used for single sgl dma descriptor. Only used by + * ADM + * @bam_sgl - sgl which will be used for dma descriptor. Only used by BAM + * @sgl_cnt - number of SGL in bam_sgl. Only used by BAM + * @dir - DMA transfer direction + */ +struct desc_info { + struct dma_async_tx_descriptor *dma_desc; + struct list_head node; + + union { + struct scatterlist adm_sgl; + struct { + struct scatterlist *bam_sgl; + int sgl_cnt; + }; + }; + enum dma_data_direction dir; +}; + +/* + * holds the current register values that we want to write. acts as a contiguous + * chunk of memory which we use to write the controller registers through DMA. 
+ */ +struct nandc_regs { + __le32 cmd; + __le32 addr0; + __le32 addr1; + __le32 chip_sel; + __le32 exec; + + __le32 cfg0; + __le32 cfg1; + __le32 ecc_bch_cfg; + + __le32 clrflashstatus; + __le32 clrreadstatus; + + __le32 cmd1; + __le32 vld; + + __le32 orig_cmd1; + __le32 orig_vld; + + __le32 ecc_buf_cfg; + __le32 read_location0; + __le32 read_location1; + __le32 read_location2; + __le32 read_location3; + __le32 read_location_last0; + __le32 read_location_last1; + __le32 read_location_last2; + __le32 read_location_last3; + + __le32 erased_cw_detect_cfg_clr; + __le32 erased_cw_detect_cfg_set; +}; + +/* + * NAND controller data struct + * + * @dev: parent device + * + * @base: MMIO base + * + * @core_clk: controller clock + * @aon_clk: another controller clock + * + * @regs: a contiguous chunk of memory for DMA register + * writes. contains the register values to be + * written to controller + * + * @props: properties of current NAND controller, + * initialized via DT match data + * + * @controller: base controller structure + * @host_list: list containing all the chips attached to the + * controller + * + * @chan: dma channel + * @cmd_crci: ADM DMA CRCI for command flow control + * @data_crci: ADM DMA CRCI for data flow control + * + * @desc_list: DMA descriptor list (list of desc_infos) + * + * @data_buffer: our local DMA buffer for page read/writes, + * used when we can't use the buffer provided + * by upper layers directly + * @reg_read_buf: local buffer for reading back registers via DMA + * + * @base_phys: physical base address of controller registers + * @base_dma: dma base address of controller registers + * @reg_read_dma: contains dma address for register read buffer + * + * @buf_size/count/start: markers for chip->legacy.read_buf/write_buf + * functions + * @max_cwperpage: maximum QPIC codewords required. 
calculated + * from all connected NAND devices pagesize + * + * @reg_read_pos: marker for data read in reg_read_buf + * + * @cmd1/vld: some fixed controller register values + * + * @exec_opwrite: flag to select correct number of code word + * while reading status + */ +struct qcom_nand_controller { + struct device *dev; + + void __iomem *base; + + struct clk *core_clk; + struct clk *aon_clk; + + struct nandc_regs *regs; + struct bam_transaction *bam_txn; + + const struct qcom_nandc_props *props; + + struct nand_controller *controller; + struct list_head host_list; + + union { + /* will be used only by QPIC for BAM DMA */ + struct { + struct dma_chan *tx_chan; + struct dma_chan *rx_chan; + struct dma_chan *cmd_chan; + }; + + /* will be used only by EBI2 for ADM DMA */ + struct { + struct dma_chan *chan; + unsigned int cmd_crci; + unsigned int data_crci; + }; + }; + + struct list_head desc_list; + + u8 *data_buffer; + __le32 *reg_read_buf; + + phys_addr_t base_phys; + dma_addr_t base_dma; + dma_addr_t reg_read_dma; + + int buf_size; + int buf_count; + int buf_start; + unsigned int max_cwperpage; + + int reg_read_pos; + + u32 cmd1, vld; + bool exec_opwrite; +}; + +/* + * This data type corresponds to the NAND controller properties which varies + * among different NAND controllers. 
+ * @ecc_modes - ecc mode for NAND + * @dev_cmd_reg_start - NAND_DEV_CMD_* registers starting offset + * @supports_bam - whether NAND controller is using BAM + * @nandc_part_of_qpic - whether NAND controller is part of qpic IP + * @qpic_version2 - flag to indicate QPIC IP version 2 + * @use_codeword_fixup - whether NAND has different layout for boot partitions + */ +struct qcom_nandc_props { + u32 ecc_modes; + u32 dev_cmd_reg_start; + bool supports_bam; + bool nandc_part_of_qpic; + bool qpic_version2; + bool use_codeword_fixup; +}; + +void qcom_free_bam_transaction(struct qcom_nand_controller *nandc); +struct bam_transaction *qcom_alloc_bam_transaction(struct qcom_nand_controller *nandc); +void qcom_clear_bam_transaction(struct qcom_nand_controller *nandc); +void qcom_qpic_bam_dma_done(void *data); +void qcom_nandc_dev_to_mem(struct qcom_nand_controller *nandc, bool is_cpu); +int qcom_prepare_bam_async_desc(struct qcom_nand_controller *nandc, + struct dma_chan *chan, unsigned long flags); +int qcom_prep_bam_dma_desc_cmd(struct qcom_nand_controller *nandc, bool read, + int reg_off, const void *vaddr, int size, unsigned int flags); +int qcom_prep_bam_dma_desc_data(struct qcom_nand_controller *nandc, bool read, + const void *vaddr, int size, unsigned int flags); +int qcom_prep_adm_dma_desc(struct qcom_nand_controller *nandc, bool read, int reg_off, + const void *vaddr, int size, bool flow_control); +int qcom_read_reg_dma(struct qcom_nand_controller *nandc, int first, int num_regs, + unsigned int flags); +int qcom_write_reg_dma(struct qcom_nand_controller *nandc, __le32 *vaddr, int first, + int num_regs, unsigned int flags); +int qcom_read_data_dma(struct qcom_nand_controller *nandc, int reg_off, const u8 *vaddr, + int size, unsigned int flags); +int qcom_write_data_dma(struct qcom_nand_controller *nandc, int reg_off, const u8 *vaddr, + int size, unsigned int flags); +int qcom_submit_descs(struct qcom_nand_controller *nandc); +void qcom_clear_read_regs(struct 
qcom_nand_controller *nandc); +void qcom_nandc_unalloc(struct qcom_nand_controller *nandc); +int qcom_nandc_alloc(struct qcom_nand_controller *nandc); +#endif + -- cgit v1.2.3 From 0c08080fd71cd5dd59643104b39d3c89d793ab3c Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Wed, 20 Nov 2024 14:45:03 +0530 Subject: mtd: rawnand: qcom: use FIELD_PREP and GENMASK Use the bitfield macro FIELD_PREP, and GENMASK to do the shift and mask in one go. This makes the code more readable. Reviewed-by: Konrad Dybcio Signed-off-by: Md Sadre Alam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index 425994429387..e79c79775eb8 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -70,35 +70,42 @@ #define BS_CORRECTABLE_ERR_MSK 0x1f /* NAND_DEVn_CFG0 bits */ -#define DISABLE_STATUS_AFTER_WRITE 4 +#define DISABLE_STATUS_AFTER_WRITE BIT(4) #define CW_PER_PAGE 6 +#define CW_PER_PAGE_MASK GENMASK(8, 6) #define UD_SIZE_BYTES 9 #define UD_SIZE_BYTES_MASK GENMASK(18, 9) -#define ECC_PARITY_SIZE_BYTES_RS 19 +#define ECC_PARITY_SIZE_BYTES_RS GENMASK(22, 19) #define SPARE_SIZE_BYTES 23 #define SPARE_SIZE_BYTES_MASK GENMASK(26, 23) #define NUM_ADDR_CYCLES 27 -#define STATUS_BFR_READ 30 -#define SET_RD_MODE_AFTER_STATUS 31 +#define NUM_ADDR_CYCLES_MASK GENMASK(29, 27) +#define STATUS_BFR_READ BIT(30) +#define SET_RD_MODE_AFTER_STATUS BIT(31) /* NAND_DEVn_CFG0 bits */ -#define DEV0_CFG1_ECC_DISABLE 0 -#define WIDE_FLASH 1 +#define DEV0_CFG1_ECC_DISABLE BIT(0) +#define WIDE_FLASH BIT(1) #define NAND_RECOVERY_CYCLES 2 -#define CS_ACTIVE_BSY 5 +#define NAND_RECOVERY_CYCLES_MASK GENMASK(4, 2) +#define CS_ACTIVE_BSY BIT(5) #define BAD_BLOCK_BYTE_NUM 6 -#define BAD_BLOCK_IN_SPARE_AREA 16 +#define BAD_BLOCK_BYTE_NUM_MASK GENMASK(15, 6) 
+#define BAD_BLOCK_IN_SPARE_AREA BIT(16) #define WR_RD_BSY_GAP 17 -#define ENABLE_BCH_ECC 27 +#define WR_RD_BSY_GAP_MASK GENMASK(22, 17) +#define ENABLE_BCH_ECC BIT(27) /* NAND_DEV0_ECC_CFG bits */ -#define ECC_CFG_ECC_DISABLE 0 -#define ECC_SW_RESET 1 +#define ECC_CFG_ECC_DISABLE BIT(0) +#define ECC_SW_RESET BIT(1) #define ECC_MODE 4 +#define ECC_MODE_MASK GENMASK(5, 4) #define ECC_PARITY_SIZE_BYTES_BCH 8 +#define ECC_PARITY_SIZE_BYTES_BCH_MASK GENMASK(12, 8) #define ECC_NUM_DATA_BYTES 16 #define ECC_NUM_DATA_BYTES_MASK GENMASK(25, 16) -#define ECC_FORCE_CLK_OPEN 30 +#define ECC_FORCE_CLK_OPEN BIT(30) /* NAND_DEV_CMD1 bits */ #define READ_ADDR 0 -- cgit v1.2.3 From 9e49ca756d207f4313fb7af48648a67da8e4e250 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Dec 2024 10:33:13 -0500 Subject: tracing/string: Create and use __free(argv_free) in trace_dynevent.c The function dyn_event_release() uses argv_split() which must be freed via argv_free(). It contains several error paths that do a goto out to call argv_free() for cleanup. This makes the code complex and error prone. Create a new __free() directive __free(argv_free) that will call argv_free() for data allocated with argv_split(), and use it in the dyn_event_release() function. 
Cc: Kees Cook Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Andy Shevchenko Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/20241220103313.4a74ec8e@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 493ac4862c77..86d5d352068b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -4,6 +4,7 @@ #include #include +#include /* for DEFINE_FREE() */ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ @@ -312,6 +313,8 @@ extern void *kmemdup_array(const void *src, size_t count, size_t element_size, g extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); +DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T)) + /* lib/cmdline.c */ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); -- cgit v1.2.3 From cff6d93eab00bacf8b6bffdef775fc2de0273c96 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Thu, 12 Dec 2024 13:12:37 +0000 Subject: tracepoint: Reduce duplication of __DO_TRACE_CALL The logic for invoking __DO_TRACE_CALL was extracted to a static inline function called __rust_do_trace_##name so that Rust can call it directly. This logic does not include the static branch, to avoid a function call when the tracepoint is disabled. Since the C code needs to perform the same logic after checking the static key, this logic is currently duplicated. Thus, remove this duplication by having C call the static inline function too. 
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241212131237.1988409-1-aliceryhl@google.com Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 76d9055b2cff..a351763e6965 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -218,7 +218,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DEFINE_RUST_DO_TRACE(name, proto, args) \ notrace void rust_do_trace_##name(proto) \ { \ - __rust_do_trace_##name(args); \ + __do_trace_##name(args); \ } /* @@ -268,7 +268,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ if (cond) { \ guard(preempt_notrace)(); \ @@ -277,12 +277,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ static inline void trace_##name(proto) \ { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - if (cond) { \ - guard(preempt_notrace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ @@ -291,7 +287,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ 
guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -299,10 +295,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline void trace_##name(proto) \ { \ might_fault(); \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - guard(rcu_tasks_trace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ -- cgit v1.2.3 From 41705c4262aaca49b8d9fe9b24fe048dc6c2b301 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:11:40 +0900 Subject: fgraph: Pass ftrace_regs to entryfunc Pass ftrace_regs to the fgraph_ops::entryfunc(). If ftrace_regs is not available, it passes a NULL instead. User callback function can access some registers (including return address) via this ftrace_regs. Note that the ftrace_regs can be NULL when the arch does NOT define: HAVE_DYNAMIC_FTRACE_WITH_ARGS or HAVE_DYNAMIC_FTRACE_WITH_REGS. More specifically, if HAVE_DYNAMIC_FTRACE_WITH_REGS is defined but not the HAVE_DYNAMIC_FTRACE_WITH_ARGS, and the ftrace ops used to register the function callback does not set FTRACE_OPS_FL_SAVE_REGS. In this case, ftrace_regs can be NULL in user callback. Signed-off-by: Masami Hiramatsu (Google) Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. 
Peter Anvin" Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173518990044.391279.17406984900626078579.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index aa9ddd1e4bb6..c86ac786da3d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1071,10 +1071,12 @@ struct fgraph_ops; typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, struct fgraph_ops *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, - struct fgraph_ops *); /* entry */ + struct fgraph_ops *, + struct ftrace_regs *); /* entry */ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops); + struct fgraph_ops *gops, + struct ftrace_regs *fregs); bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -1114,8 +1116,15 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp); +function_graph_enter_regs(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp, + struct ftrace_regs *fregs); + +static inline int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long fp, unsigned long *retp) +{ + return function_graph_enter_regs(ret, func, fp, retp, NULL); +} struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int skip); -- cgit v1.2.3 From a3ed4157b7d89800a0008de0c9e46a438a5c3745 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:11:55 +0900 Subject: fgraph: Replace fgraph_ret_regs with ftrace_regs Use ftrace_regs instead of fgraph_ret_regs for tracing return value on function_graph tracer because of simplifying the callback interface. 
The CONFIG_HAVE_FUNCTION_GRAPH_RETVAL is also replaced by CONFIG_HAVE_FUNCTION_GRAPH_FREGS. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens Acked-by: Will Deacon Cc: Catalin Marinas Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Huacai Chen Cc: WANG Xuerui Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173518991508.391279.16635322774382197642.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 12 +++++++++--- include/linux/ftrace_regs.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c86ac786da3d..069f270bd7ae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,9 +43,8 @@ struct dyn_ftrace; char *arch_ftrace_match_adjust(char *str, const char *search); -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL -struct fgraph_ret_regs; -unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs); +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS +unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs); #else unsigned long ftrace_return_to_handler(unsigned long frame_pointer); #endif @@ -134,6 +133,13 @@ extern int ftrace_enabled; * Also, architecture dependent fields can be used for internal process. * (e.g. orig_ax on x86_64) * + * Basically, ftrace_regs stores the registers related to the context. + * On function entry, registers for function parameters and hooking the + * function call are stored, and on function exit, registers for function + * return value and frame pointers are stored. 
+ * Also, it depends on the context which registers are restored + * from the ftrace_regs. * On the function entry, those registers will be restored except for * the stack pointer, so that user can change the function parameters * and instruction pointer (e.g. live patching.) diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index be1ed0c891d0..bbc1873ca6b8 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -30,6 +30,8 @@ struct ftrace_regs; override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) +#define ftrace_regs_get_frame_pointer(fregs) \ + frame_pointer(&arch_ftrace_regs(fregs)->regs) #endif /* HAVE_ARCH_FTRACE_REGS */ -- cgit v1.2.3 From 2ca8c112c9676e2394d76760db78ffddf21d93b5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:09 +0900 Subject: fgraph: Pass ftrace_regs to retfunc Pass ftrace_regs to the fgraph_ops::retfunc(). If ftrace_regs is not available, it passes a NULL instead. User callback function can access some registers (including return address) via this ftrace_regs.
Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173518992972.391279.14055405490327765506.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 069f270bd7ae..9a1e768e47da 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1075,7 +1075,8 @@ struct fgraph_ops; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, - struct fgraph_ops *); /* return */ + struct fgraph_ops *, + struct ftrace_regs *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, struct fgraph_ops *, struct ftrace_regs *); /* entry */ -- cgit v1.2.3 From 46bc082388560a95e3649b698a4675e5ea3262e6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:20 +0900 Subject: fprobe: Use ftrace_regs in fprobe entry handler This allows fprobes to be available with CONFIG_DYNAMIC_FTRACE_WITH_ARGS instead of CONFIG_DYNAMIC_FTRACE_WITH_REGS, then we can enable fprobe on arm64. 
Cc: Alexei Starovoitov Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173518994037.391279.2786805566359674586.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Acked-by: Florent Revest Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index f39869588117..ca64ee5e45d2 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -10,7 +10,7 @@ struct fprobe; typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, -- cgit v1.2.3 From 762abbc0d09f7ae123c82d315eb1a961c1a2cf7b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:31 +0900 Subject: fprobe: Use ftrace_regs in fprobe exit handler Change the fprobe exit handler to use ftrace_regs structure instead of pt_regs. This also introduces HAVE_FTRACE_REGS_HAVING_PT_REGS, which means that ftrace_regs includes pt_regs, so that ftrace_regs can provide pt_regs without memory allocation. Fprobe introduces a new dependency with that. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Cc: Huacai Chen Cc: Alexei Starovoitov Cc: Florent Revest Cc: bpf Cc: Alan Maguire Cc: Heiko Carstens Cc: WANG Xuerui Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H.
Peter Anvin" Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Song Liu Cc: Jiri Olsa Cc: KP Singh Cc: Matt Bobrowski Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Eduard Zingerman Cc: Yonghong Song Cc: John Fastabend Cc: Stanislav Fomichev Cc: Hao Luo Cc: Andrew Morton Link: https://lore.kernel.org/173518995092.391279.6765116450352977627.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 2 +- include/linux/ftrace.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index ca64ee5e45d2..ef609bcca0f9 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -14,7 +14,7 @@ typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, void *entry_data); typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); /** diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9a1e768e47da..bf8bb6c10553 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -176,6 +176,12 @@ static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) #define ftrace_regs_set_instruction_pointer(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ +#ifdef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS + +static_assert(sizeof(struct pt_regs) == ftrace_regs_size()); + +#endif /* CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ + static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) { if (!fregs) -- cgit v1.2.3 From b9b55c8912ce1e5555715d126486bdd63ddfeaec Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:47 +0900 Subject: tracing: Add ftrace_partial_regs() for converting ftrace_regs to pt_regs Add ftrace_partial_regs() which converts the ftrace_regs to pt_regs. 
This is for the eBPF which needs this to keep the same pt_regs interface to access registers. Thus when replacing the pt_regs with ftrace_regs in fprobes (which is used by kprobe_multi eBPF event), this will be used. If the architecture defines its own ftrace_regs, this copies partial registers to pt_regs and returns it. If not, ftrace_regs is the same as pt_regs and ftrace_partial_regs() will return ftrace_regs::regs. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Florent Revest Cc: Alexei Starovoitov Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Link: https://lore.kernel.org/173518996761.391279.4987911298206448122.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index bf8bb6c10553..ad2b46e1d5b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -190,6 +190,23 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs return arch_ftrace_get_regs(fregs); } +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \ + defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS) + +static __always_inline struct pt_regs * +ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + /* + * If CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS=y, ftrace_regs memory + * layout is including pt_regs. So always returns that address. + * Since arch_ftrace_get_regs() will check some members and may return + * NULL, we can not use it. + */ + return &arch_ftrace_regs(fregs)->regs; +} + +#endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ + /* * When true, the ftrace_regs_{get,set}_*() functions may be used on fregs. * Note: this can be true even when ftrace_get_regs() cannot provide a pt_regs. 
-- cgit v1.2.3 From d5d01b71996ec03af51b3c0736c92d0fc89703b5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:59 +0900 Subject: tracing: Add ftrace_fill_perf_regs() for perf event Add ftrace_fill_perf_regs() which should be compatible with the perf_fetch_caller_regs(). In other words, the pt_regs returned from the ftrace_fill_perf_regs() must satisfy 'user_mode(regs) == false' and can be used for stack tracing. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Will Deacon Acked-by: Heiko Carstens # s390 Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Heiko Carstens Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Link: https://lore.kernel.org/173518997908.391279.15910334347345106424.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ad2b46e1d5b0..6d29c640697c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -207,6 +207,37 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) #endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + +/* + * Please define arch dependent pt_regs which compatible to the + * perf_arch_fetch_caller_regs() but based on ftrace_regs. + * This requires + * - user_mode(_regs) returns false (always kernel mode). + * - able to use the _regs for stack trace. 
+ */ +#ifndef arch_ftrace_fill_perf_regs +/* As same as perf_arch_fetch_caller_regs(), do nothing by default */ +#define arch_ftrace_fill_perf_regs(fregs, _regs) do {} while (0) +#endif + +static __always_inline struct pt_regs * +ftrace_fill_perf_regs(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + arch_ftrace_fill_perf_regs(fregs, regs); + return regs; +} + +#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ + +static __always_inline struct pt_regs * +ftrace_fill_perf_regs(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + return &arch_ftrace_regs(fregs)->regs; +} + +#endif + /* * When true, the ftrace_regs_{get,set}_*() functions may be used on fregs. * Note: this can be true even when ftrace_get_regs() cannot provide a pt_regs. -- cgit v1.2.3 From 0566cefe73b9a6ea38357b428d27460db032a03d Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:13:13 +0900 Subject: tracing/fprobe: Enable fprobe events with CONFIG_DYNAMIC_FTRACE_WITH_ARGS Allow fprobe events to be enabled with CONFIG_DYNAMIC_FTRACE_WITH_ARGS. With this change, fprobe events mostly use ftrace_regs instead of pt_regs. Note that if the arch doesn't enable HAVE_FTRACE_REGS_HAVING_PT_REGS, fprobe events will not be able to be used from perf. 
Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173518999352.391279.13332699755290175168.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d29c640697c..4c553fe9c026 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,23 @@ static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs) return ftrace_get_regs(fregs) != NULL; } +#ifdef CONFIG_HAVE_REGS_AND_STACK_ACCESS_API +static __always_inline unsigned long +ftrace_regs_get_kernel_stack_nth(struct ftrace_regs *fregs, unsigned int nth) +{ + unsigned long *stackp; + + stackp = (unsigned long *)ftrace_regs_get_stack_pointer(fregs); + if (((unsigned long)(stackp + nth) & ~(THREAD_SIZE - 1)) == + ((unsigned long)stackp & ~(THREAD_SIZE - 1))) + return *(stackp + nth); + + return 0; +} +#else /* !CONFIG_HAVE_REGS_AND_STACK_ACCESS_API */ +#define ftrace_regs_get_kernel_stack_nth(fregs, nth) (0L) +#endif /* CONFIG_HAVE_REGS_AND_STACK_ACCESS_API */ + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); -- cgit v1.2.3 From 4346ba1604093305a287e08eb465a9c15ba05b80 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:13:59 +0900 Subject: fprobe: Rewrite fprobe on function-graph tracer Rewrite fprobe implementation on function-graph tracer. Major API changes are: - 'nr_maxactive' field is deprecated. - This depends on CONFIG_DYNAMIC_FTRACE_WITH_ARGS or !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS, and CONFIG_HAVE_FUNCTION_GRAPH_FREGS. So currently works only on x86_64. - Currently the entry size is limited in 15 * sizeof(long). 
- If there are too many fprobe exit handlers set on the same function, it will fail to probe. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Heiko Carstens Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/173519003970.391279.14406792285453830996.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 58 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index ef609bcca0f9..91337bcb452f 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -5,10 +5,11 @@ #include #include -#include +#include +#include +#include struct fprobe; - typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); @@ -17,35 +18,57 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); +/** + * struct fprobe_hlist_node - address based hash list node for fprobe. + * + * @hlist: The hlist node for address search hash table. + * @addr: One of the probing address of @fp. + * @fp: The fprobe which owns this.
+ */ +struct fprobe_hlist_node { + struct hlist_node hlist; + unsigned long addr; + struct fprobe *fp; +}; + +/** + * struct fprobe_hlist - hash list nodes for fprobe. + * + * @hlist: The hlist node for existence checking hash table. + * @rcu: rcu_head for RCU deferred release. + * @fp: The fprobe which owns this fprobe_hlist. + * @size: The size of @array. + * @array: The fprobe_hlist_node for each address to probe. + */ +struct fprobe_hlist { + struct hlist_node hlist; + struct rcu_head rcu; + struct fprobe *fp; + int size; + struct fprobe_hlist_node array[] __counted_by(size); +}; + /** * struct fprobe - ftrace based probe. - * @ops: The ftrace_ops. + * * @nmissed: The counter for missing events. * @flags: The status flag. - * @rethook: The rethook data structure. (internal data) * @entry_data_size: The private data storage size. - * @nr_maxactive: The max number of active functions. + * @nr_maxactive: The max number of active functions. (*deprecated) * @entry_handler: The callback function for function entry. * @exit_handler: The callback function for function exit. + * @hlist_array: The fprobe_hlist for fprobe search from IP hash table. */ struct fprobe { -#ifdef CONFIG_FUNCTION_TRACER - /* - * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too. - * But user of fprobe may keep embedding the struct fprobe on their own - * code. To avoid build error, this will keep the fprobe data structure - * defined here, but remove ftrace_ops data structure. - */ - struct ftrace_ops ops; -#endif unsigned long nmissed; unsigned int flags; - struct rethook *rethook; size_t entry_data_size; int nr_maxactive; fprobe_entry_cb entry_handler; fprobe_exit_cb exit_handler; + + struct fprobe_hlist *hlist_array; }; /* This fprobe is soft-disabled. 
*/ @@ -121,4 +144,9 @@ static inline void enable_fprobe(struct fprobe *fp) fp->flags &= ~FPROBE_FL_DISABLED; } +/* The entry data size is 4 bits (=16) * sizeof(long) in maximum */ +#define FPROBE_DATA_SIZE_BITS 4 +#define MAX_FPROBE_DATA_SIZE_WORD ((1L << FPROBE_DATA_SIZE_BITS) - 1) +#define MAX_FPROBE_DATA_SIZE (MAX_FPROBE_DATA_SIZE_WORD * sizeof(long)) + #endif -- cgit v1.2.3 From a2224559cbba1db3a998dd100c60c85a1d078ad6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:14:32 +0900 Subject: tracing/fprobe: Remove nr_maxactive from fprobe Remove deprecated fprobe::nr_maxactive. This makes fprobe events reject the maxactive number. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173519007257.391279.946804046982289337.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 91337bcb452f..702099f08929 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -54,7 +54,6 @@ struct fprobe_hlist { * @nmissed: The counter for missing events. * @flags: The status flag. * @entry_data_size: The private data storage size. - * @nr_maxactive: The max number of active functions. (*deprecated) * @entry_handler: The callback function for function entry. * @exit_handler: The callback function for function exit. * @hlist_array: The fprobe_hlist for fprobe search from IP hash table.
@@ -63,7 +62,6 @@ struct fprobe { unsigned long nmissed; unsigned int flags; size_t entry_data_size; - int nr_maxactive; fprobe_entry_cb entry_handler; fprobe_exit_cb exit_handler; -- cgit v1.2.3 From 2bc56fdae1ba3fc80ee37a648346abc5f152357d Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:15:14 +0900 Subject: ftrace: Add ftrace_get_symaddr to convert fentry_ip to symaddr This introduces ftrace_get_symaddr() which tries to convert fentry_ip passed by ftrace or fgraph callback to symaddr without calling kallsyms API. It returns the symbol address or 0 if it fails to convert it. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173519011487.391279.5450806886342723151.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412061423.K79V55Hd-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202412061804.5VRzF14E-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4c553fe9c026..07092dfb21a4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -622,6 +622,19 @@ enum { FTRACE_MAY_SLEEP = (1 << 5), }; +/* Arches can override ftrace_get_symaddr() to convert fentry_ip to symaddr. */ +#ifndef ftrace_get_symaddr +/** + * ftrace_get_symaddr - return the symbol address from fentry_ip + * @fentry_ip: the address of ftrace location + * + * Get the symbol address from @fentry_ip (fast path). If there is no fast + * search path, this returns 0. + * User may need to use kallsyms API to find the symbol address. 
+ */ +#define ftrace_get_symaddr(fentry_ip) (0) +#endif + #ifdef CONFIG_DYNAMIC_FTRACE void ftrace_arch_code_modify_prepare(void); -- cgit v1.2.3 From 1143be17d7acb02a7c4dba6169a33534983f4960 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 07:44:25 -0700 Subject: io_uring/rw: don't mask in f_iocb_flags A previous commit changed overwriting kiocb->ki_flags with ->f_iocb_flags with masking it in. This breaks for retry situations, where we don't necessarily want to retain previously set flags, like IOCB_NOWAIT. The use case needs IOCB_HAS_METADATA to be persistent, but the change makes all flags persistent, which is an issue. Add a request flag to track whether the request has metadata or not, as that is persistent across issues. Fixes: 59a7d12a7fb5 ("io_uring: introduce attributes for read/write and PI support") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73575d545d3c..623d8e798a11 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -480,6 +480,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_HAS_METADATA_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -560,6 +561,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* request has read/write metadata assigned */ + REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From 9351bbb1b022227644022850bf2160b04e970195 Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Sat, 14 Dec 2024 20:14:21 +0100 Subject: iio: core: mark scan_timestamp as __private Since there are no more direct accesses to the 
indio_dev->scan_timestamp value, it can be marked as __private and use the macro ACCESS_PRIVATE() in order to access it. Like this, static checkers will be able to inform in case someone tries to either write to the value, or read its value directly. Signed-off-by: Vasileios Amoiridis Link: https://patch.msgid.link/20241214191421.94172-5-vassilisamir@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 2 +- include/linux/iio/iio.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 418b1307d3f2..3b8d618bb3df 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -37,7 +37,7 @@ int iio_pop_from_buffer(struct iio_buffer *buffer, void *data); static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, void *data, int64_t timestamp) { - if (indio_dev->scan_timestamp) { + if (ACCESS_PRIVATE(indio_dev, scan_timestamp)) { size_t ts_offset = indio_dev->scan_bytes / sizeof(int64_t) - 1; ((int64_t *)data)[ts_offset] = timestamp; } diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index ae65890d4567..56161e02f002 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -611,7 +611,7 @@ struct iio_dev { const unsigned long *available_scan_masks; unsigned int __private masklength; const unsigned long *active_scan_mask; - bool scan_timestamp; + bool __private scan_timestamp; struct iio_trigger *trig; struct iio_poll_func *pollfunc; struct iio_poll_func *pollfunc_event; -- cgit v1.2.3 From e2f9d754fc5b5dcb53a0df627f386b63f8ba2d68 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Fri, 20 Dec 2024 10:59:21 +0100 Subject: iio: trigger: stm32-timer: add support for stm32mp25 Add support for STM32MP25 SoC. Use newly introduced compatible to handle this new HW variant. Add TIM20 trigger definitions that can be used by the stm32 analog-to-digital converter. 
Use compatible data to identify it. As the counter framework is now superseding the deprecated IIO counter interface (IIO_COUNT), don't support it. Only register IIO trigger devices for ADC usage. So, make the valids_table a cfg option. Signed-off-by: Fabrice Gasnier Link: https://patch.msgid.link/20241220095927.1122782-4-fabrice.gasnier@foss.st.com Signed-off-by: Jonathan Cameron --- include/linux/iio/timer/stm32-timer-trigger.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/timer/stm32-timer-trigger.h b/include/linux/iio/timer/stm32-timer-trigger.h index 37572e4dc73a..1ee237b56183 100644 --- a/include/linux/iio/timer/stm32-timer-trigger.h +++ b/include/linux/iio/timer/stm32-timer-trigger.h @@ -72,6 +72,12 @@ #define TIM17_OC1 "tim17_oc1" +#define TIM20_OC1 "tim20_oc1" +#define TIM20_OC2 "tim20_oc2" +#define TIM20_OC3 "tim20_oc3" +#define TIM20_TRGO "tim20_trgo" +#define TIM20_TRGO2 "tim20_trgo2" + #if IS_REACHABLE(CONFIG_IIO_STM32_TIMER_TRIGGER) bool is_stm32_timer_trigger(struct iio_trigger *trig); #else -- cgit v1.2.3 From f718faf3940e95d5d34af9041f279f598396ab7d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 17 Dec 2024 00:48:18 +0000 Subject: freezer, sched: Report frozen tasks as 'D' instead of 'R' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") the frozen task stat was reported as 'D' in cgroup v1. However, after rewriting the core freezer logic, the frozen task stat is reported as 'R'. This is confusing, especially when a task with stat of 'S' is frozen. 
This bug can be reproduced with these steps: $ cd /sys/fs/cgroup/freezer/ $ mkdir test $ sleep 1000 & [1] 739 // task whose stat is 'S' $ echo 739 > test/cgroup.procs $ echo FROZEN > test/freezer.state $ ps -aux | grep 739 root 739 0.1 0.0 8376 1812 pts/0 R 10:56 0:00 sleep 1000 As shown above, a task whose stat is 'S' was changed to 'R' when it was frozen. To solve this regression, simply maintain the same reported state as before the rewrite. [ mingo: Enhanced the changelog and comments ] Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Chen Ridong Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Michal Koutný Link: https://lore.kernel.org/r/20241217004818.3200515-1-chenridong@huaweicloud.com --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 66b311fbd5d6..64934e0830af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); -- cgit v1.2.3 From b651ea8a44aab69f71c5ebeec7e472b03f1b2ca2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Tue, 24 Dec 2024 09:01:32 -0500 Subject: ACPI: platform_profile: Add devm_platform_profile_register() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform profile's lifetime is usually tied to a device's lifetime, therefore add a device managed version of platform_profile_register(). 
Signed-off-by: Kurt Borja Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241224140131.30362-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 0682bb4c57e5..f1cd4b65e351 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -41,6 +41,7 @@ struct platform_profile_handler { int platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_remove(struct platform_profile_handler *pprof); +int devm_platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_cycle(void); void platform_profile_notify(struct platform_profile_handler *pprof); -- cgit v1.2.3 From 5ffa0dbfdc9fc05acae02d5b0dc766ec778569ac Mon Sep 17 00:00:00 2001 From: Dawid Niedzwiecki Date: Fri, 6 Dec 2024 09:15:13 +0000 Subject: platform/chrome: cros_ec: jump to RW before probing There are EC devices, like FPMCU, that use RWSIG as a method of authenticating RW section. After the authentication succeeds, EC device waits some time before jumping to RW. EC can be probed before the jump, which means there is a time window after jump to RW in which EC won't respond, because it is not initialized. It can cause communication errors after probing. To avoid such problems, send the RWSIG continue command first, which skips waiting for the jump to RW. Send the command more times, to make sure EC is ready in RW before the start of the actual probing process. If an EC device doesn't support the RWSIG, it will respond with invalid command error code and probing will continue as usual.
Signed-off-by: Dawid Niedzwiecki Link: https://lore.kernel.org/r/20241206091514.2538350-2-dawidn@google.com Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index b34ed0cc1f8d..701389c16fa7 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -246,6 +246,8 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, struct cros_ec_command *msg); +int cros_ec_rwsig_continue(struct cros_ec_device *ec_dev); + int cros_ec_query_all(struct cros_ec_device *ec_dev); int cros_ec_get_next_event(struct cros_ec_device *ec_dev, -- cgit v1.2.3 From fb1e493426d4da77a1d192fffa4dc55fc4ad5741 Mon Sep 17 00:00:00 2001 From: Rob Barnes Date: Wed, 18 Dec 2024 01:57:59 +0000 Subject: platform/chrome: cros_ec_lpc: Only check for events on MKBP notifies Only check EC for MKBP events when the ACPI notify value indicates the notify is due to an MKBP host event. This reduces unnecessary queries to the EC. Notify value 0x80 is reserved for devices specific notifies. It is used by many devices to indicate various events. It's only used by cros_ec for MKBP events. Signed-off-by: Rob Barnes Link: https://lore.kernel.org/r/20241218015759.3558830-1-robbarnes@google.com Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 701389c16fa7..3ec24f445c29 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -41,6 +41,11 @@ #define EC_MAX_REQUEST_OVERHEAD 1 #define EC_MAX_RESPONSE_OVERHEAD 32 +/* + * ACPI notify value for MKBP host event. 
+ */ +#define ACPI_NOTIFY_CROS_EC_MKBP 0x80 + /* * EC panic is not covered by the standard (0-F) ACPI notify values. * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF -- cgit v1.2.3 From 6fdbc7b9aa20b1db47d13a5f2a4d31fb2f8f3822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Mon, 30 Dec 2024 14:30:27 +0000 Subject: nvmem: specify ->reg_read/reg_write() expected return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both ->reg_read() and ->reg_write() return values are not easy to deduce. Explicit that they should return zero on success (and negative values otherwise). Such callbacks, in some alternative world, could return the number of bytes in the success case. That would be translated to errors in the nvmem core because of checks like: ret = nvmem->reg_write(nvmem->priv, offset, val, bytes); if (ret) { // error case } This mistake is not just theoretical, see commit 28b008751aa2 ("nvmem: rmem: Fix return value of rmem_read()"). Signed-off-by: Théo Lebrun Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20241230143035.265518-4-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 3ebeaa0ded00..515676ebe598 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -92,8 +92,8 @@ struct nvmem_cell_info { * @read_only: Device is read-only. * @root_only: Device is accessibly to root only. * @of_node: If given, this will be used instead of the parent's of_node. - * @reg_read: Callback to read data. - * @reg_write: Callback to write data. + * @reg_read: Callback to read data; return zero if successful. + * @reg_write: Callback to write data; return zero if successful. * @size: Device size. 
* @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. -- cgit v1.2.3 From 4f3d1be4c2f8a22470f3625cbc778ba2e2130def Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 14 Nov 2024 02:18:32 +0900 Subject: compiler.h: add const_true() __builtin_constant_p() is known for not always being able to produce constant expression [1] which led to the introduction of __is_constexpr() [2]. Because of its dependency on __builtin_constant_p(), statically_true() suffers from the same issues. For example: void foo(int a) { /* fail on GCC */ BUILD_BUG_ON_ZERO(statically_true(a)); /* fail on both clang and GCC */ static char arr[statically_true(a) ? 1 : 2]; } For the same reasons why __is_constexpr() was created to cover __builtin_constant_p() edge cases, __is_constexpr() can be used to resolve statically_true() limitations. Note that, somehow, GCC is not always able to fold this: __is_constexpr(x) && (x) It is OK in BUILD_BUG_ON_ZERO() but not in array declarations nor in static_assert(): void bar(int a) { /* success */ BUILD_BUG_ON_ZERO(__is_constexpr(a) && (a)); /* fail on GCC */ static char arr[__is_constexpr(a) && (a) ? 1 : 2]; /* fail on GCC */ static_assert(__is_constexpr(a) && (a)); } Encapsulating the expression in a __builtin_choose_expr() switch resolves all these failed tests. Define a new const_true() macro which, by making use of the __builtin_choose_expr() and __is_constexpr(x) combo, always produces a constant expression. It should be noted that statically_true() is the only one able to fold tautological expressions in which at least one on the operands is not a constant expression. For example: statically_true(true || var) statically_true(var == var) statically_true(var * 0 + 1) statically_true(!(var * 8 % 4)) always evaluates to true, whereas all of these would be false under const_true() if var is not a constant expression [3]. For this reason, usage of const_true() should be the exception. 
Reflect in the documentation that const_true() is less powerful and that statically_true() is the overall preferred solution. [1] __builtin_constant_p cannot resolve to const when optimizing Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=19449 [2] commit 3c8ba0d61d04 ("kernel.h: Retain constant expression output for max()/min()") Link: https://git.kernel.org/torvalds/c/3c8ba0d61d04 [3] https://godbolt.org/z/c61PMxqbK CC: Linus Torvalds CC: Rasmus Villemoes CC: Luc Van Oostenryck Reviewed-by: Yury Norov , Signed-off-by: Vincent Mailhol Signed-off-by: Yury Norov --- include/linux/compiler.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 240c632c5b95..bea92d20f9d2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -330,6 +330,28 @@ static inline void *offset_to_ptr(const int *off) */ #define statically_true(x) (__builtin_constant_p(x) && (x)) +/* + * Similar to statically_true() but produces a constant expression + * + * To be used in conjunction with macros, such as BUILD_BUG_ON_ZERO(), + * which require their input to be a constant expression and for which + * statically_true() would otherwise fail. + * + * This is a trade-off: const_true() requires all its operands to be + * compile time constants. Else, it would always return false even on + * the most trivial cases like: + * + * true || non_const_var + * + * On the opposite, statically_true() is able to fold more complex + * tautologies and will return true on expressions such as: + * + * !(non_const_var * 8 % 4) + * + * For the general case, statically_true() is better. + */ +#define const_true(x) __builtin_choose_expr(__is_constexpr(x), x, false) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example.
-- cgit v1.2.3 From 4463a445a64b719e6f501d80dcc5872dde42eb73 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 14 Nov 2024 02:18:33 +0900 Subject: linux/bits.h: simplify GENMASK_INPUT_CHECK() In GENMASK_INPUT_CHECK(), __builtin_choose_expr(__is_constexpr((l) > (h)), (l) > (h), 0) is the exact expansion of: const_true((l) > (h)) Apply const_true() to simplify GENMASK_INPUT_CHECK(). CC: Linus Torvalds CC: Rasmus Villemoes CC: Luc Van Oostenryck Reviewed-by: Yury Norov , Signed-off-by: Vincent Mailhol Signed-off-by: Yury Norov --- include/linux/bits.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bits.h b/include/linux/bits.h index 60044b608817..61a75d3f294b 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -20,9 +20,8 @@ */ #if !defined(__ASSEMBLY__) #include -#define GENMASK_INPUT_CHECK(h, l) \ - (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ - __is_constexpr((l) > (h)), (l) > (h), 0))) +#include +#define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) #else /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, -- cgit v1.2.3 From 7f15d4abf925f33015fb62973ce2ddb45ce04bb9 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Dec 2024 16:57:39 +0000 Subject: cpu: Remove unused init_cpu_online The last use of init_cpu_online() was removed by the commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Yury Norov --- include/linux/cpumask.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9278a50d514f..590d8438514c 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1043,7 +1043,6 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); -void init_cpu_online(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -- cgit v1.2.3 From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:17 +0000 Subject: mm: reinstate ability to map write-sealed memfd mappings read-only Patch series "mm: reinstate ability to map write-sealed memfd mappings read-only". In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. This series reworks how we both permit write-sealed mappings being mapped read-only and disallow mprotect() from undoing the write-seal, fixing this regression. We also add a regression test to ensure that we do not accidentally regress this in future. Thanks to Julian Orth for reporting this regression. This patch (of 2): In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. 
This was previously unnecessarily disallowed, despite the man page documentation indicating that it would be, thereby limiting the usefulness of F_SEAL_WRITE logic. We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE seal (one which disallows future writes to the memfd) to also be used for F_SEAL_WRITE. For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a read-only mapping to disallow mprotect() from overriding the seal - an operation performed by seal_check_write(), invoked from shmem_mmap(), the f_op->mmap() hook used by shmem mappings. By extending this to F_SEAL_WRITE and critically - checking mapping_map_writable() to determine if we may map the memfd AFTER we invoke shmem_mmap() - the desired logic becomes possible. This is because mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will have cleared. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. We reinstate this functionality by moving the check out of shmem_mmap() and instead performing it in do_mmap() at the point at which VMA flags are being determined, which seems in any case to be a more appropriate place in which to make this determination. In order to achieve this we rework memfd seal logic to allow us access to this information using existing logic and eliminate the clearing of VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() instead. Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") Signed-off-by: Lorenzo Stoakes Reported-by: Julian Orth Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/memfd.h | 14 +++++++++++++ include/linux/mm.h | 58 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 54 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceaf..d437e3070850 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083..fb397918c43d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4101,6 +4101,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowed moving + * forwards. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writes should be disallowed + * going forward.
+ */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4143,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + if (!is_write_sealed(seals)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } -- cgit v1.2.3 From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 16 Dec 2024 15:11:47 +0800 Subject: mm: hugetlb: independent PMD page table shared count The folio refcount may be increased unexpectedly through try_get_folio() by callers such as split_huge_pages.
In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ... CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. 
Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Ken Chen Cc: Muchun Song Cc: Nanyong Sun Cc: Jane Chu Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fb397918c43d..b1c3db9cf355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..332cee285662 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. 
@@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ -- cgit v1.2.3 From 11673247700e2af3a6a95f7b3f1bb80b691c950e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 19 Dec 2024 14:18:28 +0200 Subject: percpu: remove intermediate variable in PERCPU_PTR() The intermediate variable in the PERCPU_PTR() macro results in a kernel panic on boot [1] due to a compiler bug seen when compiling the kernel (+ KASAN) with gcc 11.3.1, but not when compiling with latest gcc (v14.2)/clang(v18.1). To solve it, remove the intermediate variable (which is not needed) and keep the casting that resolves the address space checks. 
[1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 547 Comm: iptables Not tainted 6.13.0-rc1_external_tested-master #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_ct_netns_do_get+0x139/0x540 Code: 03 00 00 48 81 c4 88 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 4d 8d 75 08 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 27 03 00 00 41 8b 45 08 83 c0 RSP: 0018:ffff888116df75e8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff11022dbeebe RCX: ffffffff839a2382 RDX: 0000000000000003 RSI: 0000000000000008 RDI: ffff88842ec46d10 RBP: 0000000000000002 R08: 0000000000000000 R09: fffffbfff0b0860c R10: ffff888116df75e8 R11: 0000000000000001 R12: ffffffff879d6a80 R13: 0000000000000016 R14: 000000000000001e R15: ffff888116df7908 FS: 00007fba01646740(0000) GS:ffff88842ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bd901800d8 CR3: 00000001205f0003 CR4: 0000000000172eb0 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? __mutex_lock+0x2c2/0x1d70 ? nf_ct_netns_do_get+0x139/0x540 ? nf_ct_netns_do_get+0xb5/0x540 ? net_generic+0x1f0/0x1f0 ? __create_object+0x5e/0x80 xt_check_target+0x1f0/0x930 ? textify_hooks.constprop.0+0x110/0x110 ? pcpu_alloc_noprof+0x7cd/0xcf0 ? xt_find_target+0x148/0x1e0 find_check_entry.constprop.0+0x6c0/0x920 ? get_info+0x380/0x380 ? __virt_addr_valid+0x1df/0x3b0 ? kasan_quarantine_put+0xe3/0x200 ? kfree+0x13e/0x3d0 ? translate_table+0xaf5/0x1750 translate_table+0xbd8/0x1750 ? ipt_unregister_table_exit+0x30/0x30 ? __might_fault+0xbb/0x170 do_ipt_set_ctl+0x408/0x1340 ? nf_sockopt_find.constprop.0+0x17b/0x1f0 ? lock_downgrade+0x680/0x680 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? 
ipt_register_table+0x440/0x440 ? bit_wait_timeout+0x160/0x160 nf_setsockopt+0x6f/0xd0 raw_setsockopt+0x7e/0x200 ? raw_bind+0x590/0x590 ? do_user_addr_fault+0x812/0xd20 do_sock_setsockopt+0x1e2/0x3f0 ? move_addr_to_user+0x90/0x90 ? lock_downgrade+0x680/0x680 __sys_setsockopt+0x9e/0x100 __x64_sys_setsockopt+0xb9/0x150 ? do_syscall_64+0x33/0x140 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fba015134ce Code: 0f 1f 40 00 48 8b 15 59 69 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 0f 1f 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 21 RSP: 002b:00007ffd9de6f388 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 000055bd9017f490 RCX: 00007fba015134ce RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000500 R08: 0000000000000560 R09: 0000000000000052 R10: 000055bd901800e0 R11: 0000000000000246 R12: 000055bd90180140 R13: 000055bd901800e0 R14: 000055bd9017f498 R15: 000055bd9017ff10 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc mlx4_ib mlx4_en mlx4_core rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi fuse ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_core ---[ end trace 0000000000000000 ]--- [akpm@linux-foundation.org: simplification, per Uros] Link: https://lkml.kernel.org/r/20241219121828.2120780-1-gal@nvidia.com Fixes: dabddd687c9e ("percpu: cast percpu pointer in PERCPU_PTR() via unsigned long") Signed-off-by: Gal Pressman Closes: https://lore.kernel.org/all/7590f546-4021-4602-9252-0d525de35b52@nvidia.com Cc: Uros Bizjak Cc: Bill Wendling Cc: Christoph Lameter Cc: Dennis Zhou Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 
'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e3879..5b520fe86b60 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP -- cgit v1.2.3 From 202580b60229345dc2637099f10c8a8857c1fdc2 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 20 Dec 2024 15:35:07 +0530 Subject: soc: ti: pruss: Fix pruss APIs PRUSS APIs in pruss_driver.h produce lots of compilation errors when CONFIG_TI_PRUSS is not set. The errors and warnings, warning: returning 'void *' from a function with return type 'int' makes integer from pointer without a cast [-Wint-conversion] error: expected identifier or '(' before '{' token Fix these warnings and errors by fixing the return type of pruss APIs as well as removing the misplaced semicolon from pruss_cfg_xfr_enable() Fixes: 0211cc1e4fbb ("soc: ti: pruss: Add helper functions to set GPI mode, MII_RT_event and XFR") Signed-off-by: MD Danish Anwar Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20241220100508.1554309-2-danishanwar@ti.com Signed-off-by: Nishanth Menon --- include/linux/pruss_driver.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pruss_driver.h b/include/linux/pruss_driver.h index c9a31c567e85..2e18fef1a2e1 100644 --- a/include/linux/pruss_driver.h +++ b/include/linux/pruss_driver.h @@ -144,32 +144,32 @@ static inline int pruss_release_mem_region(struct pruss *pruss, static inline int pruss_cfg_get_gpmux(struct pruss *pruss, enum pruss_pru_id pru_id, u8 *mux) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_set_gpmux(struct pruss *pruss, enum pruss_pru_id pru_id, u8 
mux) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_gpimode(struct pruss *pruss, enum pruss_pru_id pru_id, enum pruss_gpi_mode mode) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_miirt_enable(struct pruss *pruss, bool enable) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_xfr_enable(struct pruss *pruss, enum pru_type pru_type, - bool enable); + bool enable) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } #endif /* CONFIG_TI_PRUSS */ -- cgit v1.2.3 From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 
("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. 
*/ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query -- cgit v1.2.3 From 9b6442a3bdd7e0d528122d63c24bd57f6cb05671 Mon Sep 17 00:00:00 2001 From: Vitaliy Shevtsov Date: Wed, 25 Dec 2024 01:45:30 +0000 Subject: ipmi: make ipmi_destroy_user() return void Return value of ipmi_destroy_user() has no meaning, because it's always zero and callers can do nothing with it. And in most cases it's not checked. So make this function return void. This also will eliminate static code analyzer warnings such as unreachable code/redundant comparison when the return value is checked against non-zero value. Found by Linux Verification Center (linuxtesting.org) with Svace. 
Signed-off-by: Vitaliy Shevtsov Message-ID: <20241225014532.20091-1-v.shevtsov@maxima.ru> Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index a1c9c0d48ebf..2f74dd90c271 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -126,7 +126,7 @@ int ipmi_create_user(unsigned int if_num, * the users before you destroy the callback structures, it should be * safe, too. */ -int ipmi_destroy_user(struct ipmi_user *user); +void ipmi_destroy_user(struct ipmi_user *user); /* Get the IPMI version of the BMC we are talking to. */ int ipmi_get_version(struct ipmi_user *user, -- cgit v1.2.3 From 401d07d530bfaf7131d6ab9acd32ff12b9a6ddf1 Mon Sep 17 00:00:00 2001 From: Pavan Holla Date: Tue, 31 Dec 2024 13:10:46 +0000 Subject: platform/chrome: Update ChromeOS EC header for UCSI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add EC host commands for reading and writing UCSI structures in the EC. The corresponding kernel driver is cros-ec-ucsi. Also update PD events supported by the EC. 
Acked-by: Tzung-Bi Shih Signed-off-by: Pavan Holla Signed-off-by: Łukasz Bartosik Link: https://lore.kernel.org/r/20241231131047.1757434-2-ukaszb@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/cros_ec_commands.h | 28 +++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index b3c4993e656e..ecf290a0c98f 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5044,8 +5044,11 @@ struct ec_response_pd_status { #define PD_EVENT_POWER_CHANGE BIT(1) #define PD_EVENT_IDENTITY_RECEIVED BIT(2) #define PD_EVENT_DATA_SWAP BIT(3) +#define PD_EVENT_TYPEC BIT(4) +#define PD_EVENT_PPM BIT(5) + struct ec_response_host_event_status { - uint32_t status; /* PD MCU host event status */ + uint32_t status; /* PD MCU host event status */ } __ec_align4; /* Set USB type-C port role and muxes */ @@ -6105,6 +6108,29 @@ struct ec_response_typec_vdm_response { #undef VDO_MAX_SIZE +/* + * UCSI OPM-PPM commands + * + * These commands are used for communication between OPM and PPM. + * Only UCSI3.0 is tested. + */ + +#define EC_CMD_UCSI_PPM_SET 0x0140 + +/* The data size is stored in the host command protocol header. */ +struct ec_params_ucsi_ppm_set { + uint16_t offset; + uint8_t data[]; +} __ec_align2; + +#define EC_CMD_UCSI_PPM_GET 0x0141 + +/* For 'GET' sub-commands, data will be returned as a raw payload. */ +struct ec_params_ucsi_ppm_get { + uint16_t offset; + uint8_t size; +} __ec_align2; + /*****************************************************************************/ /* The command range 0x200-0x2FF is reserved for Rotor. 
*/ -- cgit v1.2.3 From f1e8bf56320a7fb32095b6c51b707459361b403b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 24 Dec 2024 21:05:03 +0800 Subject: driver core: Constify API device_find_child() and adapt for various usages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify the following API: struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); To : struct device *device_find_child(struct device *dev, const void *data, device_match_t match); typedef int (*device_match_t)(struct device *dev, const void *data); with the following reasons: - Protect caller's match data @*data which is for comparison and lookup and the API does not actually need to modify @*data. - Make the API's parameters (@match)() and @data have the same type as all of other device finding APIs (bus|class|driver)_find_device(). - All kinds of existing device match functions can be directly taken as the API's argument, they were exported by driver core. Constify the API and adapt for various existing usages. BTW, various subsystem changes are squashed into this commit to meet 'git bisect' requirement, and this commit has the minimal and simplest changes to complement squashing shortcoming, and that may bring extra code improvement. 
Reviewed-by: Alison Schofield Reviewed-by: Takashi Sakamoto Acked-by: Uwe Kleine-König # for drivers/pwm Signed-off-by: Zijun Hu Reviewed-by: Jonathan Cameron Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20241224-const_dfc_done-v5-4-6623037414d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 667cb6db9019..0e0bc9bfe0d1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1081,8 +1081,8 @@ int device_for_each_child_reverse(struct device *dev, void *data, int device_for_each_child_reverse_from(struct device *parent, struct device *from, const void *data, int (*fn)(struct device *, const void *)); -struct device *device_find_child(struct device *dev, void *data, - int (*match)(struct device *dev, void *data)); +struct device *device_find_child(struct device *dev, const void *data, + device_match_t match); struct device *device_find_child_by_name(struct device *parent, const char *name); struct device *device_find_any_child(struct device *parent); -- cgit v1.2.3 From adf908c965798c33d1148393927a7c0c5d08053c Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 24 Dec 2024 21:05:08 +0800 Subject: driver core: Introduce a device matching API device_match_type() Introduce device_match_type() for the purposes below: - Test if a device matches a specified device type. - As argument of various device finding APIs to find a device with a specified type. device_find_child() will use it to simplify operations later.
Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241224-const_dfc_done-v5-9-6623037414d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index cdc4757217f9..bc3fd74bb763 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -131,6 +131,7 @@ typedef int (*device_match_t)(struct device *dev, const void *data); /* Generic device matching functions that all busses can use to match with */ int device_match_name(struct device *dev, const void *name); +int device_match_type(struct device *dev, const void *type); int device_match_of_node(struct device *dev, const void *np); int device_match_fwnode(struct device *dev, const void *fwnode); int device_match_devt(struct device *dev, const void *pdevt); -- cgit v1.2.3 From 56a50667cbcfaf95eea9128d5676af94e54b51a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 1 Nov 2024 23:09:51 +0100 Subject: i2c: Replace list-based mechanism for handling auto-detected clients So far a list is used to track auto-detected clients per driver. The same functionality can be achieved much simpler by flagging auto-detected clients. Two notes regarding the usage of driver_for_each_device: In our case it can't fail, however the function is annotated __must_check. So a little workaround is needed to avoid a compiler warning. Then we may remove nodes from the list over which we iterate. This is safe, see the explanation at the beginning of lib/klist.c. 
Signed-off-by: Heiner Kallweit [wsa: fixed description of the new flag] Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 388ce71a29a9..072a62244e14 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -244,7 +244,6 @@ enum i2c_driver_flags { * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) - * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. @@ -299,7 +298,6 @@ struct i2c_driver { /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; - struct list_head clients; u32 flags; }; @@ -334,6 +332,7 @@ struct i2c_client { #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ +#define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ -- cgit v1.2.3 From 3cfe39b3a845593a485ab1c716615979004ef9f6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 1 Nov 2024 23:11:39 +0100 Subject: i2c: Replace list-based mechanism for handling userspace-created clients Similar to the list of auto-detected clients, we can also replace the list of userspace-created clients with flagging such client devices. 
Signed-off-by: Heiner Kallweit [wsa: fixed description of the new flag; reordered new code in 'device_store' to have single exit point; fixed whitespace errors; folded cleanup patch into this one] Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 072a62244e14..66fb3d6cf686 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -313,8 +313,6 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) - * @detected: member of an i2c_driver.clients list or i2c-core's - * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources @@ -333,6 +331,7 @@ struct i2c_client { #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ +#define I2C_CLIENT_USER 0x200 /* client was userspace-created */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ @@ -344,7 +343,6 @@ struct i2c_client { struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ - struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif @@ -750,9 +748,6 @@ struct i2c_adapter { char name[48]; struct completion dev_released; - struct mutex userspace_clients_lock; - struct list_head userspace_clients; - struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; -- cgit v1.2.3 From 
45d339fefaa3dcd237038769e0d34584fb867390 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 19 Dec 2024 14:23:36 +0200 Subject: RDMA/mlx5: Enable multiplane mode only when it is supported Driver queries vport_cxt.num_plane and enables multiplane when it is greater than 0, but some old FWs (versions from x.40.1000 till x.42.1000), report vport_cxt.num_plane = 1 unexpectedly. Fix it by querying num_plane only when HCA_CAP2.multiplane bit is set. Fixes: 2a5db20fa532 ("RDMA/mlx5: Add support to multi-plane device and port") Link: https://patch.msgid.link/r/1ef901acdf564716fcf550453cf5e94f343777ec.1734610916.git.leon@kernel.org Cc: stable@vger.kernel.org Reported-by: Francesco Poli Closes: https://lore.kernel.org/all/nvs4i2v7o6vn6zhmtq4sgazy2hu5kiulukxcntdelggmznnl7h@so3oul6uwgbl/ Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Reviewed-by: Michal Swiatkowski Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..48d47181c7cd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2119,7 +2119,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0x1]; u8 sf_eq_usage[0x1]; - u8 reserved_at_d3[0xd]; + u8 reserved_at_d3[0x5]; + u8 multiplane[0x1]; + u8 reserved_at_d9[0x7]; u8 cross_vhca_object_to_object_supported[0x20]; -- cgit v1.2.3 From 96ea081ed52bf077cad6d00153b6fba68e510767 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 20 Dec 2024 12:18:18 -0800 Subject: bpf: Reject struct_ops registration that uses module ptr and the module btf_id is missing There is a UAF report in the bpf_struct_ops when CONFIG_MODULES=n. In particular, the report is on tcp_congestion_ops that has a "struct module *owner" member.
For a struct_ops that has a "struct module *owner" member, it can be extended either by the regular kernel module or by the bpf_struct_ops. bpf_try_module_get() will be used to do the refcounting and different refcount is done based on the owner pointer. When CONFIG_MODULES=n, the btf_id of the "struct module" is missing: WARN: resolve_btfids: unresolved symbol module Thus, the bpf_try_module_get() cannot do the correct refcounting. Not every subsystem's struct_ops requires the "struct module *owner" member, e.g. the recent sched_ext_ops. This patch disables bpf_struct_ops registration if the struct_ops has the "struct module *" member and the "struct module" btf_id is missing. The btf_type_is_fwd() helper is moved to the btf.h header file for this test. This issue has been present since the beginning of bpf_struct_ops, which has gone through many changes. The Fixes tag is set to a recent commit on which this patch applies cleanly. Considering CONFIG_MODULES=n is not common and the age of the issue, targeting for bpf-next also.
Fixes: 1611603537a4 ("bpf: Create argument information for nullable arguments.") Reported-by: Robert Morris Closes: https://lore.kernel.org/bpf/74665.1733669976@localhost/ Signed-off-by: Martin KaFai Lau Tested-by: Eduard Zingerman Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241220201818.127152-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 4214e76c9168..2a08a2b55592 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -353,6 +353,11 @@ static inline bool btf_type_is_scalar(const struct btf_type *t) return btf_type_is_int(t) || btf_type_is_enum(t); } +static inline bool btf_type_is_fwd(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + static inline bool btf_type_is_typedef(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; -- cgit v1.2.3 From af6505e5745b9f3a670de405b08b73573343c15c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. 
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; -- cgit v1.2.3 
From d7bde4f27ceef3dc6d72010a20d4da23db835a32 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:18 -0500 Subject: Revert "libfs: Add simple_offset_empty()" simple_empty() and simple_offset_empty() perform the same task. The latter's use as a canary to find bugs has not found any new issues. A subsequent patch will remove the use of the mtree for iterating directory contents, so revert back to using a similar mechanism for determining whether a directory is indeed empty. Only one such mechanism is ever needed. Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-3-cel@kernel.org Reviewed-by: Yang Erkun Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..f7efc6866ebc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3468,7 +3468,6 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); -int simple_offset_empty(struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, -- cgit v1.2.3 From 7716d085531bf797c882ed67eda184ac58a387a8 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 30 Dec 2024 16:13:52 +0100 Subject: iio: gts-helper: add helpers to ease searches of gain_sel and new_gain These helper functions reduce the burden on the drivers that want to fetch a gain and time selector for a given scale or a new optimal gain. The former is currently achieved by calling iio_gts_find_gain_sel_for_scale_using_time() for the current time selector, and then iterating over the rest of time selectors if the gain selector was not found.
The latter requires a combination of multiple iio-gts helpers to find the new gain, look for an optimal gain if there was no exact match, and set a minimum gain if the optimal gain is not in the range of available gains. Provide simpler workflows by means of functions that address common patterns in the users of the iio-gts helpers. Acked-by: Matti Vaittinen Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241230-veml3235_scale-v3-1-48a5795e2f64@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-gts-helper.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio-gts-helper.h b/include/linux/iio/iio-gts-helper.h index 9cb6c80dea71..e5de7a124bad 100644 --- a/include/linux/iio/iio-gts-helper.h +++ b/include/linux/iio/iio-gts-helper.h @@ -188,6 +188,9 @@ int iio_gts_total_gain_to_scale(struct iio_gts *gts, int total_gain, int iio_gts_find_gain_sel_for_scale_using_time(struct iio_gts *gts, int time_sel, int scale_int, int scale_nano, int *gain_sel); +int iio_gts_find_gain_time_sel_for_scale(struct iio_gts *gts, int scale_int, + int scale_nano, int *gain_sel, + int *time_sel); int iio_gts_get_scale(struct iio_gts *gts, int gain, int time, int *scale_int, int *scale_nano); int iio_gts_find_new_gain_sel_by_old_gain_time(struct iio_gts *gts, @@ -196,6 +199,9 @@ int iio_gts_find_new_gain_sel_by_old_gain_time(struct iio_gts *gts, int iio_gts_find_new_gain_by_old_gain_time(struct iio_gts *gts, int old_gain, int old_time, int new_time, int *new_gain); +int iio_gts_find_new_gain_by_gain_time_min(struct iio_gts *gts, int old_gain, + int old_time, int new_time, + int *new_gain, bool *in_range); int iio_gts_avail_times(struct iio_gts *gts, const int **vals, int *type, int *length); int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type, -- cgit v1.2.3 From 7ccbe076d987598b04b4b9c9b61f042291f9cc77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 
22 Nov 2024 15:33:31 +0100 Subject: lsm: Only build lsm_audit.c if CONFIG_SECURITY and CONFIG_AUDIT are set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_AUDIT is set, its CONFIG_NET dependency is also set, and the dev_get_by_index and init_net symbols (used by dump_common_audit_data) are found by the linker. dump_common_audit_data() should then fail to build when CONFIG_NET is not set. However, because the compiler is smart, it knows that audit_log_start() always returns NULL when !CONFIG_AUDIT, and it doesn't build the body of common_lsm_audit(). As a side effect, dump_common_audit_data() is not built and the linker doesn't error out because of missing symbols. Let's only build lsm_audit.o when CONFIG_SECURITY and CONFIG_AUDIT are both set, which is checked with the new CONFIG_HAS_SECURITY_AUDIT. ipv4_skb_to_auditdata() and ipv6_skb_to_auditdata() are only used by Smack if CONFIG_AUDIT is set, so they don't need fake implementations. Because common_lsm_audit() is used in multiple places without CONFIG_AUDIT checks, add a fake implementation. Link: https://lore.kernel.org/r/20241122143353.59367-2-mic@digikod.net Cc: Casey Schaufler Cc: James Morris Cc: Paul Moore Cc: Serge E.
Hallyn Signed-off-by: Mickaël Salaün Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 97a8b21eb033..c2b01380262c 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -116,14 +116,28 @@ struct common_audit_data { #define v4info fam.v4 #define v6info fam.v6 +#ifdef CONFIG_AUDIT + int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto); +#if IS_ENABLED(CONFIG_IPV6) int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto); +#endif /* IS_ENABLED(CONFIG_IPV6) */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)); +#else /* CONFIG_AUDIT */ + +static inline void common_lsm_audit(struct common_audit_data *a, + void (*pre_audit)(struct audit_buffer *, void *), + void (*post_audit)(struct audit_buffer *, void *)) +{ +} + +#endif /* CONFIG_AUDIT */ + #endif -- cgit v1.2.3 From 6aeb4f836480617be472de767c4cb09c1060a067 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jan 2025 08:33:57 +0100 Subject: block: remove bio_add_pc_page Lift bio_split_rw_at into blk_rq_append_bio so that it validates the hardware limits. With this all passthrough callers can simply add bio_add_page to build the bio and delay checking for exceeding of limits to this point instead of doing it for each page. While this looks like adding a new expensive loop over all bio_vecs, blk_rq_append_bio is already doing that just to count the number of segments.
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250103073417.459715-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1eec59699100..4b79bf50f4f0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,8 +413,6 @@ int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); -extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, - unsigned int, unsigned int); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, -- cgit v1.2.3 From 02ee5d69e3baf2796ba75b928fcbc9cf7884c5e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jan 2025 08:33:58 +0100 Subject: block: remove blk_rq_bio_prep There is no real point in a helper just to assign three values to four fields, especially when the surrounding code is working on the neighbor fields directly.
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250103073417.459715-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7f6c482ebf54..6340293511c9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -978,14 +978,6 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; -} - void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); -- cgit v1.2.3 From 5bb494d5cbb9a3403ba8b1c8bc145b42fc119078 Mon Sep 17 00:00:00 2001 From: Gao Shiyuan Date: Sat, 4 Jan 2025 00:58:08 +0800 Subject: iommu/amd: remove return value of amd_iommu_detect The return value of amd_iommu_detect is not used, so remove it to be consistent with other iommu detect functions.
Signed-off-by: Gao Shiyuan Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250103165808.80939-1-gaoshiyuan@baidu.com Signed-off-by: Joerg Roedel --- include/linux/amd-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 2b90c48a6a87..062fbd4c9b77 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -31,11 +31,11 @@ struct amd_iommu_pi_data { struct task_struct; struct pci_dev; -extern int amd_iommu_detect(void); +extern void amd_iommu_detect(void); #else /* CONFIG_AMD_IOMMU */ -static inline int amd_iommu_detect(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } #endif /* CONFIG_AMD_IOMMU */ -- cgit v1.2.3 From dadf03cfd4eaa09f1d0e8b2521de1e11d3e3bec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:23 +0000 Subject: io_uring/cmd: rename struct uring_cache to io_uring_cmd_data In preparation for making this more generically available for ->uring_cmd() usage that needs stable command data, rename it and move it to io_uring/cmd.h instead. 
Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55e..24cff2b9b9d4 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -18,6 +18,10 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +struct io_uring_cmd_data { + struct io_uring_sqe sqes[2]; +}; + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; -- cgit v1.2.3 From 3347fa658a1baecd61b007787d031b729cd86537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:24 +0000 Subject: io_uring/cmd: add per-op data to struct io_uring_cmd_data In case an op handler for ->uring_cmd() needs stable storage for user data, it can allocate io_uring_cmd_data->op_data and use it for the duration of the request. When the request gets cleaned up, uring_cmd will free it automatically. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 24cff2b9b9d4..3df6636ec3a3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -20,6 +20,7 @@ struct io_uring_cmd { struct io_uring_cmd_data { struct io_uring_sqe sqes[2]; + void *op_data; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -- cgit v1.2.3 From b0af20d33f63c74985a6dd98344326e5111b2fea Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:25 +0000 Subject: io_uring: add io_uring_cmd_get_async_data helper Add a helper function in include/linux/io_uring/cmd.h to read the async_data pointer from a struct io_uring_cmd. 
Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 3df6636ec3a3..b0aeec834c1d 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -118,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->task; } +static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->async_data; +} + #endif /* _LINUX_IO_URING_CMD_H */ -- cgit v1.2.3 From 1156b5e8be98c97087f8971609c852e418daf03b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:57 +0530 Subject: regulator: Guard of_regulator_bulk_get_all() with CONFIG_OF Since the definition is in drivers/regulator/of_regulator.c and compiled only if CONFIG_OF is enabled, building the consumer driver without CONFIG_OF and with CONFIG_REGULATOR will result in below build error: ERROR: modpost: "of_regulator_bulk_get_all" [drivers/pci/pwrctrl/pci-pwrctl-slot.ko] undefined! 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412181640.12Iufkvd-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250104115058.19216-2-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 8c3c372ad735..85be83c8fa17 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -175,6 +175,8 @@ struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, struct device_node *node, const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); #else static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, @@ -189,6 +191,13 @@ static inline struct regulator *__must_check devm_of_regulator_get_optional(stru { return ERR_PTR(-ENODEV); } + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + #endif int regulator_register_supply_alias(struct device *dev, const char *id, @@ -223,8 +232,6 @@ int regulator_disable_deferred(struct regulator *regulator, int ms); int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data 
*consumers); void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); @@ -483,12 +490,6 @@ static inline int devm_regulator_bulk_get(struct device *dev, int num_consumers, return 0; } -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - static inline int devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, -- cgit v1.2.3 From 907af7d6e0c8cf4086b1bc5218281b2ca09f130b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:58 +0530 Subject: regulator: Move OF_ API declarations/definitions outside CONFIG_REGULATOR Since these are hidden inside CONFIG_REGULATOR, building the consumer drivers without CONFIG_REGULATOR will result in the following build error: >> drivers/pci/pwrctrl/slot.c:39:15: error: implicit declaration of function 'of_regulator_bulk_get_all'; did you mean 'regulator_bulk_get'? [-Werror=implicit-function-declaration] 39 | ret = of_regulator_bulk_get_all(dev, dev_of_node(dev), | ^~~~~~~~~~~~~~~~~~~~~~~~~ | regulator_bulk_get cc1: some warnings being treated as errors This also removes the duplicated definitions that were possibly added to fix the build issues. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501020407.HmQQQKa0-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250104115058.19216-3-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 78 ++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 85be83c8fa17..bcba3935c6f9 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -168,38 +168,6 @@ int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); -#if IS_ENABLED(CONFIG_OF) -struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); -#else -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - -#endif - int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id); @@ -380,20 +348,6 @@ 
devm_regulator_get_optional(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - static inline void regulator_put(struct regulator *regulator) { } @@ -701,6 +655,38 @@ regulator_is_equal(struct regulator *reg1, struct regulator *reg2) } #endif +#if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_REGULATOR) +struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); +#else +static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + +#endif + static inline int regulator_set_voltage_triplet(struct regulator *regulator, int min_uV, int target_uV, int max_uV) -- cgit v1.2.3 From 2caca8fc7aad9ea9a6ea3ed26ed146b1e5f06fab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:14:37 +0100 Subject: block: use page_to_phys in bvec_phys Use page_to_phys instead of open coding it now that it is available in an architecture independent way. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106081437.798213-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f41c7f0ef91e..ba8f52d48b94 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -286,12 +286,7 @@ static inline void *bvec_virt(struct bio_vec *bvec) */ static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) { - /* - * Note this open codes page_to_phys because page_to_phys is defined in - * , which we don't want to pull in here. If it ever moves to - * a sensible place we should start using it. - */ - return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset; + return page_to_phys(bvec->bv_page) + bvec->bv_offset; } #endif /* __LINUX_BVEC_H */ -- cgit v1.2.3 From 6e1d75f778d644d02147d8e61ca2cef033ce045d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Dec 2024 13:55:53 +1100 Subject: sunrpc/svc: use store_release_wake_up() svc_thread_init_status() contains an open-coded store_release_wake_up(). It is cleaner to use that function directly rather than needing to remember the barrier. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e68fecf6eab5..e4f09f58d58c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -327,12 +327,7 @@ static inline bool svc_thread_should_stop(struct svc_rqst *rqstp) */ static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err) { - rqstp->rq_err = err; - /* memory barrier ensures assignment to error above is visible before - * waitqueue_active() test below completes. 
- */ - smp_mb(); - wake_up_var(&rqstp->rq_err); + store_release_wake_up(&rqstp->rq_err, err); if (err) kthread_exit(1); } -- cgit v1.2.3 From eccbbc7c00a5aae5e704d4002adfaf4c3fa4b30d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Dec 2024 11:41:26 +1100 Subject: nfsd: don't use sv_nrthreads in connection limiting calculations. The heuristic for limiting the number of incoming connections to nfsd currently uses sv_nrthreads - allowing more connections if more threads were configured. A future patch will allow number of threads to grow dynamically so that there will be no need to configure sv_nrthreads. So we need a different solution for limiting connections. It isn't clear what problem is solved by limiting connections (as mentioned in a code comment) but the most likely problem is a connection storm - many connections that are not doing productive work. These will be closed after about 6 minutes already but it might help to slow down a storm. This patch adds a per-connection flag XPT_PEER_VALID which indicates that the peer has presented a filehandle for which it has some sort of access. i.e. the peer is known to be trusted in some way. We now only count connections which have NOT been determined to be valid. There should be relatively few of these at any given time. If the number of non-validated peers exceeds a limit - currently 64 - we close the oldest non-validated peer to avoid having too many of these useless connections. Note that this patch significantly changes the meaning of the various configuration parameters for "max connections". The next patch will remove all of these.
Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- include/linux/sunrpc/svc_xprt.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e4f09f58d58c..4f9418cbf8c9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -81,7 +81,7 @@ struct svc_serv { unsigned int sv_xdrsize; /* XDR buffer size */ struct list_head sv_permsocks; /* all permanent sockets */ struct list_head sv_tempsocks; /* all temporary sockets */ - int sv_tmpcnt; /* count of temporary sockets */ + int sv_tmpcnt; /* count of temporary "valid" sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ char * sv_name; /* service name */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0981e35a9fed..7064ebbd550b 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -99,8 +99,24 @@ enum { XPT_HANDSHAKE, /* xprt requests a handshake */ XPT_TLS_SESSION, /* transport-layer security established */ XPT_PEER_AUTH, /* peer has been authenticated */ + XPT_PEER_VALID, /* peer has presented a filehandle that + * it has access to. It is NOT counted + * in ->sv_tmpcnt. 
+ */ }; +static inline void svc_xprt_set_valid(struct svc_xprt *xpt) +{ + if (test_bit(XPT_TEMP, &xpt->xpt_flags) && + !test_and_set_bit(XPT_PEER_VALID, &xpt->xpt_flags)) { + struct svc_serv *serv = xpt->xpt_server; + + spin_lock(&serv->sv_lock); + serv->sv_tmpcnt -= 1; + spin_unlock(&serv->sv_lock); + } +} + static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) { spin_lock(&xpt->xpt_lock); -- cgit v1.2.3 From a4b853f183a19a88ad635f9ae8ba97e7cb377a23 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Dec 2024 11:41:27 +1100 Subject: sunrpc: remove all connection limit configuration Now that the connection limit only applies to unconfirmed connections, there is no need to configure it. So remove all the configuration and fix the number of unconfirmed connections as always 64 - which is now given a name: XPT_MAX_TMP_CONN Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ---- include/linux/sunrpc/svc_xprt.h | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4f9418cbf8c9..74658cca0f38 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -72,10 +72,6 @@ struct svc_serv { spinlock_t sv_lock; unsigned int sv_nprogs; /* Number of sv_programs */ unsigned int sv_nrthreads; /* # of server threads */ - unsigned int sv_maxconn; /* max connections allowed or - * '0' causing max to be based - * on number of threads.
*/ - unsigned int sv_max_payload; /* datagram payload size */ unsigned int sv_max_mesg; /* max_payload + 1 page for overheads */ unsigned int sv_xdrsize; /* XDR buffer size */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 7064ebbd550b..72be60952579 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -105,6 +105,12 @@ enum { */ }; +/* + * Maximum number of "tmp" connections - those without XPT_PEER_VALID - + * permitted on any service. + */ +#define XPT_MAX_TMP_CONN 64 + static inline void svc_xprt_set_valid(struct svc_xprt *xpt) { if (test_bit(XPT_TEMP, &xpt->xpt_flags) && -- cgit v1.2.3 From 2f55dbe4e2072c9e99298c6f37473778a98c9107 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 25 Dec 2024 14:59:05 +0800 Subject: SUNRPC: introduce cache_check_rcu to help check in rcu context This is a preparatory patch that adds cache_check_rcu; a follow-up patch will use it. Suggested-by: NeilBrown Signed-off-by: Yang Erkun Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 35766963dd14..e783132e481f 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -222,6 +222,8 @@ static inline bool cache_is_expired(struct cache_detail *detail, struct cache_he return detail->flush_time >= h->last_refresh; } +extern int cache_check_rcu(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp); extern int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp); extern void cache_flush(void); -- cgit v1.2.3 From e7602bb4f3a1234df8b75728ac3260bcb8242612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:10 +0100 Subject: block: remove BLK_MQ_F_NO_SCHED The only queues that really can't support a scheduler are those that do not have a gendisk
associated with them, and thus can't be used for non-passthrough commands. In addition to those null_blk can optionally set the flag, which is a bit odd. Replace the null_blk usage with BLK_MQ_F_NO_SCHED_BY_DEFAULT to keep the expected semantics and then remove BLK_MQ_F_NO_SCHED as the non-disk queues never call into elevator_init_mq or blk_register_queue which adds the sysfs attributes. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106083531.799976-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6340293511c9..f2ff0ffa0535 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -676,8 +676,6 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, - /* Do not allow an I/O scheduler to be configured. */ - BLK_MQ_F_NO_SCHED = 1 << 5, /* * Select 'none' during queue registration in case of a single hwq -- cgit v1.2.3 From ce32496ec1abe866225f2e2005ceda68cf4c7bf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:11 +0100 Subject: block: simplify tag allocation policy selection Use a plain BLK_MQ_F_* flag to select the round robin tag selection instead of overlaying an enum with just two possible values into the flags space. Doing so allows adding a BLK_MQ_F_MAX sentinel for simplified overflow checking in the messy debugfs helpers.
Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250106083531.799976-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 22 +++++++--------------- include/linux/libata.h | 4 ++-- 2 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2ff0ffa0535..a0a9007cc1e3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -296,13 +296,6 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, }; -/* Keep alloc_policy_name[] in sync with the definitions below */ -enum { - BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */ - BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */ - BLK_TAG_ALLOC_MAX -}; - /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device @@ -677,20 +670,19 @@ enum { BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, + /* + * Alloc tags on a round-robin base instead of the first available one. + */ + BLK_MQ_F_TAG_RR = 1 << 5, + /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. 
*/ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, - BLK_MQ_F_ALLOC_POLICY_START_BIT = 7, - BLK_MQ_F_ALLOC_POLICY_BITS = 1, + + BLK_MQ_F_MAX = 1 << 7, }; -#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ - ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ - ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) -#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ - ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ - << BLK_MQ_F_ALLOC_POLICY_START_BIT) #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) diff --git a/include/linux/libata.h b/include/linux/libata.h index c1a85d46eba6..be5183d75736 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1467,13 +1467,13 @@ extern const struct attribute_group *ata_common_sdev_groups[]; #define ATA_SUBBASE_SHT(drv_name) \ __ATA_BASE_SHT(drv_name), \ .can_queue = ATA_DEF_QUEUE, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd) \ __ATA_BASE_SHT(drv_name), \ .can_queue = drv_qd, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure #define ATA_BASE_SHT(drv_name) \ -- cgit v1.2.3 From b168ed458ddecc176f3b9a1f4bcd83d7a4541c14 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 4 Dec 2024 15:31:11 +0100 Subject: kernel/cgroup: Add "dmem" memory accounting cgroup This code is based on the RDMA and misc cgroup initially, but now uses page_counter. It uses the same min/low/max semantics as the memory cgroup as a result. There's a small mismatch as TTM uses u64, and page_counter long pages. In practice it's not a problem. 32-bits systems don't really come with >=4GB cards and as long as we're consistently wrong with units, it's fine. The device page size may not be in the same units as kernel page size, and each region might also have a different page size (VRAM vs GART for example). 
The interface is simple: - Call dmem_cgroup_register_region() - Use dmem_cgroup_try_charge to check if you can allocate a chunk of memory, use dmem_cgroup_uncharge when freeing it. This may return an error code, or -EAGAIN when the cgroup limit is reached. In that case a reference to the limiting pool is returned. - The limiting cs can be used as compare function for dmem_cgroup_state_evict_valuable. - After having evicted enough, drop reference to limiting cs with dmem_cgroup_pool_state_put. This API allows you to limit device resources with cgroups. You can see the supported cards in /sys/fs/cgroup/dmem.capacity You need to echo +dmem to cgroup.subtree_control, and then you can partition device memory. Co-developed-by: Friedrich Vock Signed-off-by: Friedrich Vock Co-developed-by: Maxime Ripard Signed-off-by: Maarten Lankhorst Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20241204143112.1250983-1-dev@lankhorst.se Signed-off-by: Maxime Ripard --- include/linux/cgroup_dmem.h | 66 +++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup_subsys.h | 4 +++ include/linux/page_counter.h | 2 +- 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 include/linux/cgroup_dmem.h (limited to 'include/linux') diff --git a/include/linux/cgroup_dmem.h b/include/linux/cgroup_dmem.h new file mode 100644 index 000000000000..dd4869f1d736 --- /dev/null +++ b/include/linux/cgroup_dmem.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023-2024 Intel Corporation + */ + +#ifndef _CGROUP_DMEM_H +#define _CGROUP_DMEM_H + +#include +#include + +struct dmem_cgroup_pool_state; + +/* Opaque definition of a cgroup region, used internally */ +struct dmem_cgroup_region; + +#if IS_ENABLED(CONFIG_CGROUP_DMEM) +struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *name_fmt, ...)
__printf(2,3); +void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region); +int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool); +void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size); +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low); + +void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool); +#else +static inline __printf(2,3) struct dmem_cgroup_region * +dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) +{ + return NULL; +} + +static inline void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) +{ } + +static inline int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + *ret_pool = NULL; + + if (ret_limit_pool) + *ret_limit_pool = NULL; + + return 0; +} + +static inline void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) +{ } + +static inline +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low) +{ + return true; +} + +static inline void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) +{ } + +#endif +#endif /* _CGROUP_DMEM_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 445235487230..3fd0bcbf3080 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -65,6 +65,10 @@ SUBSYS(rdma) SUBSYS(misc) #endif +#if IS_ENABLED(CONFIG_CGROUP_DMEM) +SUBSYS(dmem) +#endif + /* * The following subsystems are not supported on the default hierarchy. 
*/ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 79dbd8bc35a7..46406f3fe34d 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -96,7 +96,7 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = usage; } -#ifdef CONFIG_MEMCG +#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); -- cgit v1.2.3 From 95fc45d1dea8e1253f8ec58abc5befb71553d666 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 21:05:14 +0000 Subject: ax25: rcu protect dev->ax25_ptr syzbot found a lockdep issue [1]. We should remove ax25 RTNL dependency in ax25_setsockopt() This should also fix a variety of possible UAF in ax25. [1] WARNING: possible circular locking dependency detected 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Not tainted ------------------------------------------------------ syz.5.1818/12806 is trying to acquire lock: ffffffff8fcb3988 (rtnl_mutex){+.+.}-{4:4}, at: ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 but task is already holding lock: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1618 [inline] ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: ax25_setsockopt+0x209/0xe90 net/ax25/af_ax25.c:574 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_AX25){+.+.}-{0:0}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 lock_sock_nested+0x48/0x100 net/core/sock.c:3642 lock_sock include/net/sock.h:1618 [inline] ax25_kill_by_device net/ax25/af_ax25.c:101 [inline] ax25_device_event+0x24d/0x580 net/ax25/af_ax25.c:146 notifier_call_chain+0x1a5/0x3f0 kernel/notifier.c:85 __dev_notify_flags+0x207/0x400 dev_change_flags+0xf0/0x1a0 net/core/dev.c:9026 dev_ifsioc+0x7c8/0xe70 net/core/dev_ioctl.c:563 dev_ioctl+0x719/0x1340 net/core/dev_ioctl.c:820 sock_do_ioctl+0x240/0x460 net/socket.c:1234 sock_ioctl+0x626/0x8e0 net/socket.c:1339 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (rtnl_mutex){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x1ac/0xee0 kernel/locking/mutex.c:735 ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 do_sock_setsockopt+0x3af/0x720 net/socket.c:2324 __sys_setsockopt net/socket.c:2349 [inline] __do_sys_setsockopt net/socket.c:2355 [inline] __se_sys_setsockopt net/socket.c:2352 [inline] __x64_sys_setsockopt+0x1ee/0x280 net/socket.c:2352 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_AX25); lock(rtnl_mutex); lock(sk_lock-AF_AX25); lock(rtnl_mutex); *** DEADLOCK *** 1 lock held by syz.5.1818/12806: #0: 
ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1618 [inline] #0: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: ax25_setsockopt+0x209/0xe90 net/ax25/af_ax25.c:574 stack backtrace: CPU: 1 UID: 0 PID: 12806 Comm: syz.5.1818 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x1ac/0xee0 kernel/locking/mutex.c:735 ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 do_sock_setsockopt+0x3af/0x720 net/socket.c:2324 __sys_setsockopt net/socket.c:2349 [inline] __do_sys_setsockopt net/socket.c:2355 [inline] __se_sys_setsockopt net/socket.c:2352 [inline] __x64_sys_setsockopt+0x1ee/0x280 net/socket.c:2352 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7b62385d29 Fixes: c433570458e4 ("ax25: fix a use-after-free in ax25_fillin_cb()") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250103210514.87290-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2593019ad5b1..e84602e0226c 100644 --- a/include/linux/netdevice.h +++ 
b/include/linux/netdevice.h @@ -2261,7 +2261,7 @@ struct net_device { void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) - void *ax25_ptr; + struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; -- cgit v1.2.3 From fbb9a9d263a68f60a16c8ba5a51d6198d67171cd Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:31 +0000 Subject: net: phylink: add support for PCS supported_interfaces bitmap Add support for the PCS to specify which interfaces it supports, which can be used by MAC drivers to build the main supported_interfaces bitmap. Phylink also validates that the PCS returned by the MAC driver supports the interface that the MAC was asked for. An empty supported_interfaces bitmap from the PCS indicates that it does not provide this information, and we handle that appropriately. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tTffL-007RoD-1Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5462cc6a37dc..4b7a20620b49 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -393,6 +393,8 @@ struct phylink_pcs_ops; /** * struct phylink_pcs - PHYLINK PCS instance + * @supported_interfaces: describing which PHY_INTERFACE_MODE_xxx + * are supported by this PCS. * @ops: a pointer to the &struct phylink_pcs_ops structure * @phylink: pointer to &struct phylink_config * @neg_mode: provide PCS neg mode via "mode" argument @@ -409,6 +411,7 @@ struct phylink_pcs_ops; * the PCS driver. 
*/ struct phylink_pcs { + DECLARE_PHY_INTERFACE_MASK(supported_interfaces); const struct phylink_pcs_ops *ops; struct phylink *phylink; bool neg_mode; -- cgit v1.2.3 From 2410719cdd49d9b062e87dddaf5ec990edafc6e3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:56 +0000 Subject: net: pcs: xpcs: make xpcs_get_interfaces() static xpcs_get_interfaces() should no longer be used outside of the XPCS code, so make it static. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tTffk-007Roi-JM@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index b5b5d17998b8..733f4ddd2ef1 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -50,7 +50,6 @@ struct dw_xpcs; struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); -- cgit v1.2.3 From 7bd72a4aa226c3ef752bcd6b33c54f6e85efcc60 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 4 Jan 2025 17:21:48 +0900 Subject: rtnetlink: Add rtnl_net_lock_killable(). rtnl_lock_killable() is used only in register_netdev() and will be converted to per-netns RTNL. Let's unexport it and add the corresponding helper. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3b9d132cbc9e..4bc2ee0b10b0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -102,6 +102,7 @@ void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); int rtnl_net_trylock(struct net *net); +int rtnl_net_lock_killable(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); @@ -138,6 +139,11 @@ static inline int rtnl_net_trylock(struct net *net) return rtnl_trylock(); } +static inline int rtnl_net_lock_killable(struct net *net) +{ + return rtnl_lock_killable(); +} + static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); -- cgit v1.2.3 From 7f2ef1bfc758f0f206eac863ff8ee417d5bb1493 Mon Sep 17 00:00:00 2001 From: Bibek Kumar Patro Date: Thu, 12 Dec 2024 20:44:00 +0530 Subject: iommu/arm-smmu: Add support for PRR bit setup Add an adreno-smmu-priv interface for drm/msm to call into arm-smmu-qcom and initiate the "Partially Resident Region" (PRR) bit setup or reset sequence as per request. This will be used by GPU to setup the PRR bit and related configuration registers through adreno-smmu private interface instead of directly poking the smmu hardware. 
Suggested-by: Rob Clark Signed-off-by: Bibek Kumar Patro Link: https://lore.kernel.org/r/20241212151402.159102-4-quic_bibekkum@quicinc.com Signed-off-by: Will Deacon --- include/linux/adreno-smmu-priv.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/adreno-smmu-priv.h b/include/linux/adreno-smmu-priv.h index c637e0997f6d..abec23c7744f 100644 --- a/include/linux/adreno-smmu-priv.h +++ b/include/linux/adreno-smmu-priv.h @@ -50,6 +50,11 @@ struct adreno_smmu_fault_info { * the GPU driver must call resume_translation() * @resume_translation: Resume translation after a fault * + * @set_prr_bit: [optional] Configure the GPU's Partially Resident + * Region (PRR) bit in the ACTLR register. + * @set_prr_addr: [optional] Configure the PRR_CFG_*ADDR register with + * the physical address of PRR page passed from GPU + * driver. * * The GPU driver (drm/msm) and adreno-smmu work together for controlling * the GPU's SMMU instance. This is by necessity, as the GPU is directly @@ -67,6 +72,8 @@ struct adreno_smmu_priv { void (*get_fault_info)(const void *cookie, struct adreno_smmu_fault_info *info); void (*set_stall)(const void *cookie, bool enabled); void (*resume_translation)(const void *cookie, bool terminate); + void (*set_prr_bit)(const void *cookie, bool set); + void (*set_prr_addr)(const void *cookie, phys_addr_t page_addr); }; #endif /* __ADRENO_SMMU_PRIV_H */ -- cgit v1.2.3 From c79a39dc8d060b9e64e8b0fa9d245d44befeefbe Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Mon, 11 Nov 2024 20:13:29 -0800 Subject: pps: Fix a use-after-free On a board running ntpd and gpsd, I'm seeing a consistent use-after-free in sys_exit() from gpsd when rebooting: pps pps1: removed ------------[ cut here ]------------ kobject: '(null)' (00000000db4bec24): is not initialized, yet kobject_put() is being called. 
WARNING: CPU: 2 PID: 440 at lib/kobject.c:734 kobject_put+0x120/0x150 CPU: 2 UID: 299 PID: 440 Comm: gpsd Not tainted 6.11.0-rc6-00308-gb31c44928842 #1 Hardware name: Raspberry Pi 4 Model B Rev 1.1 (DT) pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : kobject_put+0x120/0x150 lr : kobject_put+0x120/0x150 sp : ffffffc0803d3ae0 x29: ffffffc0803d3ae0 x28: ffffff8042dc9738 x27: 0000000000000001 x26: 0000000000000000 x25: ffffff8042dc9040 x24: ffffff8042dc9440 x23: ffffff80402a4620 x22: ffffff8042ef4bd0 x21: ffffff80405cb600 x20: 000000000008001b x19: ffffff8040b3b6e0 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 696e6920746f6e20 x14: 7369203a29343263 x13: 205d303434542020 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000 x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: kobject_put+0x120/0x150 cdev_put+0x20/0x3c __fput+0x2c4/0x2d8 ____fput+0x1c/0x38 task_work_run+0x70/0xfc do_exit+0x2a0/0x924 do_group_exit+0x34/0x90 get_signal+0x7fc/0x8c0 do_signal+0x128/0x13b4 do_notify_resume+0xdc/0x160 el0_svc+0xd4/0xf8 el0t_64_sync_handler+0x140/0x14c el0t_64_sync+0x190/0x194 ---[ end trace 0000000000000000 ]--- ...followed by more symptoms of corruption, with similar stacks: refcount_t: underflow; use-after-free. kernel BUG at lib/list_debug.c:62! Kernel panic - not syncing: Oops - BUG: Fatal exception This happens because pps_device_destruct() frees the pps_device with the embedded cdev immediately after calling cdev_del(), but, as the comment above cdev_del() notes, fops for previously opened cdevs are still callable even after cdev_del() returns. I think this bug has always been there: I can't explain why it suddenly started happening every time I reboot this particular board. 
In commit d953e0e837e6 ("pps: Fix a use-after free bug when unregistering a source."), George Spelvin suggested removing the embedded cdev. That seems like the simplest way to fix this, so I've implemented his suggestion, using __register_chrdev() with pps_idr becoming the source of truth for which minor corresponds to which device. But now that pps_idr defines userspace visibility instead of cdev_add(), we need to be sure the pps->dev refcount can't reach zero while userspace can still find it again. So, the idr_remove() call moves to pps_unregister_cdev(), and pps_idr now holds a reference to pps->dev. pps_core: source serial1 got cdev (251:1) <...> pps pps1: removed pps_core: unregistering pps1 pps_core: deallocating pps1 Fixes: d953e0e837e6 ("pps: Fix a use-after free bug when unregistering a source.") Cc: stable@vger.kernel.org Signed-off-by: Calvin Owens Reviewed-by: Michal Schmidt Link: https://lore.kernel.org/r/a17975fd5ae99385791929e563f72564edbcf28f.1731383727.git.calvin@wbinvd.org Signed-off-by: Greg Kroah-Hartman --- include/linux/pps_kernel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 78c8ac4951b5..c7abce28ed29 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -56,8 +56,7 @@ struct pps_device { unsigned int id; /* PPS source unique ID */ void const *lookup_cookie; /* For pps_lookup_dev() only */ - struct cdev cdev; - struct device *dev; + struct device dev; struct fasync_struct *async_queue; /* fasync method */ spinlock_t lock; }; -- cgit v1.2.3 From aff028a8192d056d346541c5fc7d88c0eb43412c Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 10 Dec 2024 08:51:21 -0800 Subject: iommu/io-pgtable-arm: Add way to debug pgtable walk Add an io-pgtable method to walk the pgtable returning the raw PTEs that would be traversed for a given iova access. 
Signed-off-by: Rob Clark Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241210165127.600817-4-robdclark@gmail.com [will: Removed 'arm_lpae_io_pgtable_walk_data::level' per Mostafa] Signed-off-by: Will Deacon --- include/linux/io-pgtable.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ce86b09ae80f..bba2a51c87d2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -180,12 +180,22 @@ struct io_pgtable_cfg { }; }; +/** + * struct arm_lpae_io_pgtable_walk_data - information from a pgtable walk + * + * @ptes: The recorded PTE values from the walk + */ +struct arm_lpae_io_pgtable_walk_data { + u64 ptes[4]; +}; + /** * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * * @map_pages: Map a physically contiguous range of pages of the same size. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. + * @pgtable_walk: (optional) Perform a page table walk for a given iova. * * These functions map directly onto the iommu_ops member functions with * the same names. @@ -199,6 +209,7 @@ struct io_pgtable_ops { struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + int (*pgtable_walk)(struct io_pgtable_ops *ops, unsigned long iova, void *wd); int (*read_and_clear_dirty)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, unsigned long flags, -- cgit v1.2.3 From 92d6254f58120011c93610b4cb7def214409731d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Dec 2024 18:07:31 +0100 Subject: sysfs: constify macro BIN_ATTRIBUTE_GROUPS() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As there is only one in-tree user, avoid a transition phase and switch that user in the same commit. 
As there are some interdependencies between the constness of the different symbols in the s390 driver, convert the whole driver at once. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241205-sysfs-const-bin_attr-groups_macro-v1-1-ac5e855031e8@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 0f2fcd244523..b4368377fac9 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -293,7 +293,7 @@ __ATTRIBUTE_GROUPS(_name) #define BIN_ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ - .bin_attrs = _name##_attrs, \ + .bin_attrs_new = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) -- cgit v1.2.3 From 1bd13edbbed6e7e396f1aab92b224a4775218e68 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 27 Dec 2024 13:07:57 +0900 Subject: tracing/hist: Add poll(POLLIN) support on hist file Add poll syscall support on the `hist` file. The waiter will be woken up when the histogram is updated with POLLIN. Currently, there is no way to wait for a specific event in userspace. So the user needs to peek at the `trace` periodically, or wait on `trace_pipe`. But it is not a good idea to peek at the `trace` for an event that randomly happens. And `trace_pipe` is not coming back until a page is filled with events. This allows a user to wait for a specific event on the `hist` file. User can set a histogram trigger on the event which they want to monitor and poll() on its `hist` file. Since this poll() returns POLLIN, the next poll() will return soon unless a read() happens on that hist file. NOTE: To read the hist file again, you must set the file offset to 0, but just for monitoring the event, you may not need to read the histogram. 
Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527247756.464571.14236296701625509931.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..02cde1174487 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,6 +673,20 @@ struct trace_event_file { atomic_t tm_ref; /* trigger-mode reference counter */ }; +#ifdef CONFIG_HIST_TRIGGERS +extern struct irq_work hist_poll_work; +extern wait_queue_head_t hist_poll_wq; + +static inline void hist_poll_wakeup(void) +{ + if (wq_has_sleeper(&hist_poll_wq)) + irq_work_queue(&hist_poll_work); +} + +#define hist_poll_wait(file, wait) \ + poll_wait(file, &hist_poll_wq, wait) +#endif + #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ -- cgit v1.2.3 From 85b60ca9ad2c94661acf86a0c11278246cc5ea86 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:25 +0530 Subject: x86/sev: Add Secure TSC support for SNP guests Add support for Secure TSC in SNP-enabled guests. Secure TSC allows guests to securely use RDTSC/RDTSCP instructions, ensuring that the parameters used cannot be altered by the hypervisor once the guest is launched. Secure TSC-enabled guests need to query TSC information from the AMD Security Processor. This communication channel is encrypted between the AMD Security Processor and the guest, with the hypervisor acting merely as a conduit to deliver the guest messages to the AMD Security Processor. Each message is protected with AEAD (AES-256 GCM). 
[ bp: Zap a stray newline over amd_cc_platform_has() while at it, simplify CC_ATTR_GUEST_SNP_SECURE_TSC check ] Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250106124633.1418972-6-nikunj@amd.com --- include/linux/cc_platform.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index caa4b4430634..0bf7d33a1048 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -81,6 +81,14 @@ enum cc_attr { */ CC_ATTR_GUEST_SEV_SNP, + /** + * @CC_ATTR_GUEST_SNP_SECURE_TSC: SNP Secure TSC is active. + * + * The platform/OS is running as a guest/virtual machine and actively + * using AMD SEV-SNP Secure TSC feature. + */ + CC_ATTR_GUEST_SNP_SECURE_TSC, + /** * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. * -- cgit v1.2.3 From 7b16e60b31202254c62a29f5c709ffb42684b6f9 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 15:44:03 +0000 Subject: soundwire: SDCA: Add additional SDCA address macros Complement the existing macro to construct an SDCA control address with macros to extract the constituent parts, and validation of such an address. Also update the masks for the original macro to use GENMASK to make mental comparisons with the included comment on the address format easier. 
Acked-by: Vinod Koul Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107154408.814455-2-ckeepax@opensource.cirrus.com Reviewed-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_registers.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h index 658b10fa5b20..0a5939285583 100644 --- a/include/linux/soundwire/sdw_registers.h +++ b/include/linux/soundwire/sdw_registers.h @@ -4,6 +4,9 @@ #ifndef __SDW_REGISTERS_H #define __SDW_REGISTERS_H +#include +#include + /* * SDW registers as defined by MIPI 1.2 Spec */ @@ -329,16 +332,27 @@ * 2:0 Control Number[2:0] */ -#define SDW_SDCA_CTL(fun, ent, ctl, ch) (BIT(30) | \ - (((fun) & 0x7) << 22) | \ - (((ent) & 0x40) << 15) | \ - (((ent) & 0x3f) << 7) | \ - (((ctl) & 0x30) << 15) | \ - (((ctl) & 0x0f) << 3) | \ - (((ch) & 0x38) << 12) | \ - ((ch) & 0x07)) +#define SDW_SDCA_CTL(fun, ent, ctl, ch) (BIT(30) | \ + (((fun) & GENMASK(2, 0)) << 22) | \ + (((ent) & BIT(6)) << 15) | \ + (((ent) & GENMASK(5, 0)) << 7) | \ + (((ctl) & GENMASK(5, 4)) << 15) | \ + (((ctl) & GENMASK(3, 0)) << 3) | \ + (((ch) & GENMASK(5, 3)) << 12) | \ + ((ch) & GENMASK(2, 0))) + +#define SDW_SDCA_CTL_FUNC(reg) FIELD_GET(GENMASK(24, 22), (reg)) +#define SDW_SDCA_CTL_ENT(reg) ((FIELD_GET(BIT(21), (reg)) << 6) | \ + FIELD_GET(GENMASK(12, 7), (reg))) +#define SDW_SDCA_CTL_CSEL(reg) ((FIELD_GET(GENMASK(20, 19), (reg)) << 4) | \ + FIELD_GET(GENMASK(6, 3), (reg))) +#define SDW_SDCA_CTL_CNUM(reg) ((FIELD_GET(GENMASK(17, 15), (reg)) << 3) | \ + FIELD_GET(GENMASK(2, 0), (reg))) #define SDW_SDCA_MBQ_CTL(reg) ((reg) | BIT(13)) #define SDW_SDCA_NEXT_CTL(reg) ((reg) | BIT(14)) +/* Check the reserved and fixed bits in address */ +#define SDW_SDCA_VALID_CTL(reg) (((reg) & (GENMASK(31, 25) | BIT(18) | BIT(13))) == BIT(30)) + #endif /* __SDW_REGISTERS_H */ -- cgit 
v1.2.3 From fdd9ef3dce98e035d21c17fac587cb6e3c7706fd Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 15:44:05 +0000 Subject: regmap: sdw-mbq: Add support for further MBQ register sizes SoundWire MBQ register maps typically contain a variety of register sizes, which doesn't map ideally to the regmap abstraction which expects register maps to have a consistent size. Currently the MBQ register map only allows 16-bit registers to be defined, however this leads to complex CODEC driver implementations with an 8-bit register map and a 16-bit MBQ, every control will then have a custom get and put handler that allows them to access different register maps. Furthermore, 32-bit MBQ quantities are not currently supported. Add support for additional MBQ sizes and to avoid the complexity of multiple register maps treat the val_size as a maximum size for the register map. Within the regmap use an ancillary callback to determine how many bytes to actually read/write to the hardware for a specific register. In the case that no callback is defined the behaviour defaults back to the existing behaviour of a fixed size register map. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107154408.814455-4-ckeepax@opensource.cirrus.com Reviewed-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- include/linux/regmap.h | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index fd41baccbf3e..dd96a22f5657 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -506,6 +506,17 @@ struct regmap_range_cfg { unsigned int window_len; }; +/** + * struct regmap_sdw_mbq_cfg - Configuration for Multi-Byte Quantities + * + * @mbq_size: Callback returning the actual size of the given register. + * + * Provides additional configuration required for SoundWire MBQ register maps. 
+ */ +struct regmap_sdw_mbq_cfg { + int (*mbq_size)(struct device *dev, unsigned int reg); +}; + struct regmap_async; typedef int (*regmap_hw_write)(void *context, const void *data, @@ -652,6 +663,7 @@ struct regmap *__regmap_init_sdw(struct sdw_slave *sdw, const char *lock_name); struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, const struct regmap_config *config, + const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, const char *lock_name); struct regmap *__regmap_init_spi_avmm(struct spi_device *spi, @@ -713,6 +725,7 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw, const char *lock_name); struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, const struct regmap_config *config, + const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, const char *lock_name); struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus, @@ -942,7 +955,22 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ - sdw, config) + sdw, config, NULL) + +/** + * regmap_init_sdw_mbq_cfg() - Initialise MBQ SDW register map with config + * + * @sdw: Device that will be interacted with + * @config: Configuration for register map + * @mbq_config: Properties for the MBQ registers + * + * The return value will be an ERR_PTR() on error or a valid pointer + * to a struct regmap. The regmap will be automatically freed by the + * device management code. 
+ */ +#define regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ + __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ + sdw, config, mbq_config) /** * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave @@ -1155,7 +1183,22 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define devm_regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, #config, \ - sdw, config) + sdw, config, NULL) + +/** + * devm_regmap_init_sdw_mbq_cfg() - Initialise managed MBQ SDW register map with config + * + * @sdw: Device that will be interacted with + * @config: Configuration for register map + * @mbq_config: Properties for the MBQ registers + * + * The return value will be an ERR_PTR() on error or a valid pointer + * to a struct regmap. The regmap will be automatically freed by the + * device management code. + */ +#define devm_regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ + __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, \ + #config, sdw, config, mbq_config) /** * devm_regmap_init_slimbus() - Initialise managed register map -- cgit v1.2.3 From 5bc493bf0c37c157bf2eb364e55a1c6f8bc43a69 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 15:44:06 +0000 Subject: regmap: sdw-mbq: Add support for SDCA deferred controls The SDCA specification allows for controls to be deferred. In the case of a deferred control the device will return COMMAND_IGNORED to the 8-bit operation that would cause the value to commit. Which is the final 8-bits on a write, or the first 8-bits on a read. In the case of receiving a defer, the regmap will poll the SDCA function busy bit, after which the transaction will be retried, returning an error if the function busy does not clear within a chip specific timeout. Since this is common SDCA functionality which is the 99% use-case for MBQs it makes sense to incorporate this functionality into the register map. 
If no MBQ configuration is specified, the behaviour will default to the existing behaviour. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107154408.814455-5-ckeepax@opensource.cirrus.com Reviewed-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- include/linux/regmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index dd96a22f5657..198067d3cf10 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -510,11 +510,26 @@ struct regmap_range_cfg { * struct regmap_sdw_mbq_cfg - Configuration for Multi-Byte Quantities * * @mbq_size: Callback returning the actual size of the given register. + * @deferrable: Callback returning true if the hardware can defer + * transactions to the given register. Deferral should + * only be used by SDCA parts and typically which controls + * are deferrable will be specified in either as a hard + * coded list or from the DisCo tables in the platform + * firmware. + * + * @timeout_us: The time in microseconds after which waiting for a deferred + * transaction should time out. + * @retry_us: The time in microseconds between polls of the function busy + * status whilst waiting for an opportunity to retry a deferred + * transaction. * * Provides additional configuration required for SoundWire MBQ register maps. */ struct regmap_sdw_mbq_cfg { int (*mbq_size)(struct device *dev, unsigned int reg); + bool (*deferrable)(struct device *dev, unsigned int reg); + unsigned long timeout_us; + unsigned long retry_us; }; struct regmap_async; -- cgit v1.2.3 From b04ce63859793e3439b394976b8d29e785d4d69a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 11 Dec 2024 11:23:35 +0100 Subject: i2c: davinci: kill platform data There are no more board file users of this driver. The platform data structure is only used internally. Two of the four fields it stores are not used at all anymore. 
Pull the remainder into the driver data struct and shrink code by removing parts that are now dead code. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20241211102337.37956-1-brgl@bgdev.pl Signed-off-by: Andi Shyti --- include/linux/platform_data/i2c-davinci.h | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 include/linux/platform_data/i2c-davinci.h (limited to 'include/linux') diff --git a/include/linux/platform_data/i2c-davinci.h b/include/linux/platform_data/i2c-davinci.h deleted file mode 100644 index 98967df07468..000000000000 --- a/include/linux/platform_data/i2c-davinci.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * DaVinci I2C controller platform_device info - * - * Author: Vladimir Barinov, MontaVista Software, Inc. - * - * 2007 (c) MontaVista Software, Inc. This file is licensed under - * the terms of the GNU General Public License version 2. This program - * is licensed "as is" without any warranty of any kind, whether express - * or implied. -*/ - -#ifndef __ASM_ARCH_I2C_H -#define __ASM_ARCH_I2C_H - -/* All frequencies are expressed in kHz */ -struct davinci_i2c_platform_data { - unsigned int bus_freq; /* standard bus frequency (kHz) */ - unsigned int bus_delay; /* post-transaction delay (usec) */ - bool gpio_recovery; /* Use GPIO recovery method */ - bool has_pfunc; /* Chip has a ICPFUNC register */ -}; - -/* for board setup code */ -void davinci_init_i2c(struct davinci_i2c_platform_data *); - -#endif /* __ASM_ARCH_I2C_H */ -- cgit v1.2.3 From 1b960cd19311c0bb653afa3633aaa9ef8edcfdde Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Jan 2025 09:09:24 +0000 Subject: net: watchdog: rename __dev_watchdog_up() and dev_watchdog_down() In commit d7811e623dd4 ("[NET]: Drop tx lock in dev_watchdog_up") dev_watchdog_up() became a simple wrapper for __netdev_watchdog_up() Herbert also said : "In 2.6.19 we can eliminate the unnecessary __dev_watchdog_up and replace it with dev_watchdog_up." 
This patch consolidates things to have only two functions, with a common prefix. - netdev_watchdog_up(), exported for the sake of one freescale driver. This replaces __netdev_watchdog_up() and dev_watchdog_up(). - netdev_watchdog_down(), static to net/sched/sch_generic.c This replaces dev_watchdog_down(). Signed-off-by: Eric Dumazet Cc: Herbert Xu Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250105090924.1661822-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e84602e0226c..1812564b5204 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4295,7 +4295,7 @@ static inline bool netif_carrier_ok(const struct net_device *dev) unsigned long dev_trans_start(struct net_device *dev); -void __netdev_watchdog_up(struct net_device *dev); +void netdev_watchdog_up(struct net_device *dev); void netif_carrier_on(struct net_device *dev); void netif_carrier_off(struct net_device *dev); -- cgit v1.2.3 From d8c2e5f33acec38cf478c509c65646d029cc378e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 09:46:20 -0800 Subject: if_vlan: fix kdoc warnings While merging net to net-next I noticed that the kdoc above __vlan_get_protocol_offset() has the wrong function name. Fix that and all the other kdoc warnings in this file. 
Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250106174620.1855269-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index d495cbdb52cb..38456b42cdb5 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -176,6 +176,7 @@ struct netpoll; * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats + * @netpoll: netpoll instance "propagated" down to @real_dev */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; @@ -414,6 +415,8 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, @@ -443,6 +446,8 @@ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -461,6 +466,8 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. + * + * Return: modified @skb on success, NULL on error (@skb is freed). 
*/ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, @@ -582,7 +589,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) } /** - * vlan_get_protocol - get protocol EtherType. + * __vlan_get_protocol_offset() - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @mac_offset: MAC offset @@ -808,9 +815,11 @@ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, * @h1: Pointer to vlan header * @h2: Pointer to vlan header * - * Compare two vlan headers, returns 0 if equal. + * Compare two vlan headers. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. + * + * Return: 0 if equal, arbitrary non-zero value if not equal. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) -- cgit v1.2.3 From 9696d9ae016573568dfd65dd2a92d6e8d277b25b Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Mon, 25 Nov 2024 15:24:40 -0800 Subject: hyperv: Move hv_connection_id to hyperv-tlfs.h This definition is in the wrong file; it is part of the TLFS doc. Signed-off-by: Nuno Das Neves Acked-by: Wei Liu Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1732577084-2122-2-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1732577084-2122-2-git-send-email-nunodasneves@linux.microsoft.com> --- include/linux/hyperv.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 02a226bcf0ed..b0dbba3b9108 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -768,15 +768,6 @@ struct vmbus_close_msg { struct vmbus_channel_close_channel msg; }; -/* Define connection identifier type. 
*/ -union hv_connection_id { - u32 asu32; - struct { - u32 id:24; - u32 reserved:8; - } u; -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, -- cgit v1.2.3 From 86b525bed2758878e788c9fb6b8fb281fd61bdb0 Mon Sep 17 00:00:00 2001 From: Rodolfo Giometti Date: Fri, 8 Nov 2024 08:31:12 +0100 Subject: drivers pps: add PPS generators support Sometimes one needs to be able not only to catch PPS signals but to produce them also. For example, running a distributed simulation, which requires computers' clock to be synchronized very tightly. This patch adds PPS generators class in order to have a well-defined interface for these devices. Signed-off-by: Rodolfo Giometti Link: https://lore.kernel.org/r/20241108073115.759039-2-giometti@enneenne.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pps_gen_kernel.h | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 include/linux/pps_gen_kernel.h (limited to 'include/linux') diff --git a/include/linux/pps_gen_kernel.h b/include/linux/pps_gen_kernel.h new file mode 100644 index 000000000000..022ea0ac4440 --- /dev/null +++ b/include/linux/pps_gen_kernel.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PPS generator API kernel header + * + * Copyright (C) 2024 Rodolfo Giometti + */ + +#ifndef LINUX_PPS_GEN_KERNEL_H +#define LINUX_PPS_GEN_KERNEL_H + +#include +#include +#include + +/* + * Global defines + */ + +#define PPS_GEN_MAX_SOURCES 16 /* should be enough... */ + +struct pps_gen_device; + +/** + * struct pps_gen_source_info - the specific PPS generator info + * @use_system_clock: true, if the system clock is used to generate pulses + * @get_time: query the time stored into the generator clock + * @enable: enable/disable the PPS pulses generation + * + * This is the main generator struct where all needed information must be + * placed before calling the pps_gen_register_source(). 
+ */ +struct pps_gen_source_info { + bool use_system_clock; + + int (*get_time)(struct pps_gen_device *pps_gen, + struct timespec64 *time); + int (*enable)(struct pps_gen_device *pps_gen, bool enable); + +/* private: internal use only */ + struct module *owner; + struct device *parent; /* for device_create */ +}; + +/* The main struct */ +struct pps_gen_device { + struct pps_gen_source_info info; /* PSS generator info */ + bool enabled; /* PSS generator status */ + + unsigned int event; + unsigned int sequence; + + unsigned int last_ev; /* last PPS event id */ + wait_queue_head_t queue; /* PPS event queue */ + + unsigned int id; /* PPS generator unique ID */ + struct cdev cdev; + struct device *dev; + struct fasync_struct *async_queue; /* fasync method */ + spinlock_t lock; +}; + +/* + * Global variables + */ + +extern const struct attribute_group *pps_gen_groups[]; + +/* + * Exported functions + */ + +extern struct pps_gen_device *pps_gen_register_source( + struct pps_gen_source_info *info); +extern void pps_gen_unregister_source(struct pps_gen_device *pps_gen); +extern void pps_gen_event(struct pps_gen_device *pps_gen, + unsigned int event, void *data); + +#endif /* LINUX_PPS_GEN_KERNEL_H */ -- cgit v1.2.3 From 567a311d0a1a433f1e5bff508f3eb7ebfa189aa3 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 20 Dec 2024 00:29:57 +0100 Subject: VMCI: remove unused ioctl definitions IOCTL_VMCI_SOCKETS_VERSION and IOCTL_VMCI_SOCKETS_GET_AF_VALUE were never implemented, because VSOCK ended up being implemented as a generic mechanism with a static AF value. Likewise, IOCTL_VMCI_SOCKETS_GET_LOCAL_CID ended up being implemented as IOCTL_VM_SOCKETS_GET_LOCAL_CID. This isn't a UAPI header, so it should be fine to remove the unused values. I've left a comment noting IOCTL_VM_SOCKETS_GET_LOCAL_CID is in the VMCI range to avoid unintentional reuse. 
Signed-off-by: Alyssa Ross Acked-by: Vishnu Dasa Link: https://lore.kernel.org/r/fzdcrz4yfedokmbm22h2iwsluix4jwejwaltuwcdr6kz3yu6eu@nue5xc6ayevo Signed-off-by: Greg Kroah-Hartman --- include/linux/vmw_vmci_defs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h index 6fb663b36f72..c2df94696593 100644 --- a/include/linux/vmw_vmci_defs.h +++ b/include/linux/vmw_vmci_defs.h @@ -453,9 +453,7 @@ enum { #define IOCTL_VMCI_CTX_GET_CPT_STATE _IO(7, 0xb1) #define IOCTL_VMCI_CTX_SET_CPT_STATE _IO(7, 0xb2) #define IOCTL_VMCI_GET_CONTEXT_ID _IO(7, 0xb3) -#define IOCTL_VMCI_SOCKETS_VERSION _IO(7, 0xb4) -#define IOCTL_VMCI_SOCKETS_GET_AF_VALUE _IO(7, 0xb8) -#define IOCTL_VMCI_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) +/*IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9)*/ #define IOCTL_VMCI_SET_NOTIFY _IO(7, 0xcb) /* 1995 */ /*IOCTL_VMMON_START _IO(7, 0xd1)*/ /* 2001 */ -- cgit v1.2.3 From 3a5446612a3f2579c751ddb77c5e16b9a0d47001 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:59 +0200 Subject: sched,arm64: Handle CPU isolation on last resort fallback rq selection When a kthread or any other task has an affinity mask that is fully offline or unallowed, the scheduler reaffines the task to all possible CPUs as a last resort. This default decision doesn't mix up very well with nohz_full CPUs that are part of the possible cpumask but don't want to be disturbed by unbound kthreads or even detached pinned user tasks. Make the fallback affinity setting aware of nohz_full. 
Suggested-by: Michal Hocko Acked-by: Will Deacon Signed-off-by: Frederic Weisbecker --- include/linux/mmu_context.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index bbaec80c78c5..ac01dc4eb2ce 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -24,6 +24,7 @@ static inline void leave_mm(void) { } #ifndef task_cpu_possible_mask # define task_cpu_possible_mask(p) cpu_possible_mask # define task_cpu_possible(cpu, p) true +# define task_cpu_fallback_mask(p) housekeeping_cpumask(HK_TYPE_TICK) #else # define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) #endif -- cgit v1.2.3 From d1a89197589c4a77468298ef2b14ff4084c4ea29 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:01 +0200 Subject: kthread: Default affine kthread to its preferred NUMA node Kthreads attached to a preferred NUMA node for their task structure allocation can also be assumed to run preferably within that same node. A more precise affinity is usually set by calling kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup. For the others, a default affinity to the node is desired and sometimes implemented with more or less success when it comes to dealing with hotplug events and nohz_full / CPU Isolation interactions: - kcompactd is affine to its node and handles hotplug but not CPU Isolation - kswapd is affine to its node and ignores hotplug and CPU Isolation - A bunch of drivers create their kthreads on a specific node and don't take care of affining further. Handle that default node affinity preference at the generic level instead, provided a kthread is created on an actual node and doesn't apply any specific affinity such as a given CPU or a custom cpumask to bind to before its first wake-up.
This generic handling is aware of CPU hotplug events and CPU isolation such that: * When a housekeeping CPU goes up that is part of the node of a given kthread, the related task is re-affined to its own node if it was previously running on the default last resort online housekeeping set from other nodes. * When a housekeeping CPU goes down while it was part of the node of a kthread, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the same node or, as a last resort, to all housekeepers from other nodes. Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index a04b73c40173..6cc5e484547c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -240,6 +240,7 @@ enum cpuhp_state { CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RANDOM_ONLINE, CPUHP_AP_RCUTREE_ONLINE, + CPUHP_AP_KTHREADS_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40, -- cgit v1.2.3 From 4d13f4304fa43471bfea101658a11feec7b28ac0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:04 +0200 Subject: kthread: Implement preferred affinity Affining kthreads follows one of four existing patterns: 1) Per-CPU kthreads must stay affine to a single CPU and never execute relevant code on any other CPU. This is currently handled by smpboot code which takes care of CPU-hotplug operations. 2) Kthreads that _have_ to be affine to a specific set of CPUs and can't run anywhere else. The affinity is set through kthread_bind_mask() and the subsystem takes care by itself to handle CPU-hotplug operations. 3) Kthreads that prefer to be affine to a specific NUMA node.
That preferred affinity is applied by default when an actual node ID is passed on kthread creation, provided the kthread is not per-CPU and no call to kthread_bind_mask() has been issued before the first wake-up. 4) Similar to the previous point but kthreads have a preferred affinity different than a node. It is set manually like any other task and CPU-hotplug is supposed to be handled by the relevant subsystem so that the task is properly reaffined whenever a given CPU from the preferred affinity comes up. Also care must be taken so that the preferred affinity doesn't cross housekeeping cpumask boundaries. Provide a function to handle the last usecase, mostly reusing the current node default affinity infrastructure. kthread_affine_preferred() is introduced, to be used just like kthread_bind_mask(), right after kthread creation and before the first wake up. The kthread is then affine right away to the cpumask passed through the API if it has online housekeeping CPUs. Otherwise it will be affine to all online housekeeping CPUs as a last resort. As with node affinity, it is aware of CPU hotplug events such that: * When a housekeeping CPU goes up that is part of the preferred affinity of a given kthread, the related task is re-affined to that preferred affinity if it was previously running on the default last resort online housekeeping set. * When a housekeeping CPU goes down while it was part of the preferred affinity of a kthread, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the preferred affinity or, as a last resort, to all housekeepers from other nodes. 
Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- include/linux/kthread.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index b11f53c1ba2e..30209bdf83a2 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -85,6 +85,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data, void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask); int kthread_stop(struct task_struct *k); int kthread_stop_put(struct task_struct *k); bool kthread_should_stop(void); -- cgit v1.2.3 From 41f70d8e16349c65abdc0dd88a7d0ab94e5ce639 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:06 +0200 Subject: kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format kthread_create_on_cpu() uses the CPU argument as an implicit and unique printf argument to add to the format whereas kthread_create_worker_on_cpu() still relies on explicitly passing the printf arguments. This difference in behaviour is error prone and doesn't help standardizing per-CPU kthread names. Unify the behaviours and convert kthread_create_worker_on_cpu() to use the printf behaviour of kthread_create_on_cpu(). 
Signed-off-by: Frederic Weisbecker --- include/linux/kthread.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 30209bdf83a2..0c66e7c1092a 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -187,13 +187,24 @@ extern void __kthread_init_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); -__printf(2, 3) -struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...); +__printf(3, 4) +struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, + int node, + const char namefmt[], ...); -__printf(3, 4) struct kthread_worker * +#define kthread_create_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ + namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ +}) + +struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...); + const char namefmt[]); bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); -- cgit v1.2.3 From b04e317b522630b46f78ee62ecbdc5734e8d43de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:07 +0200 Subject: treewide: Introduce kthread_run_worker[_on_cpu]() kthread_create() creates a kthread without running it yet. kthread_run() creates a kthread and runs it. On the other hand, kthread_create_worker() creates a kthread worker and runs it. This difference in behaviours is confusing. Also there is no way to create a kthread worker and affine it using kthread_bind_mask() or kthread_affine_preferred() before starting it. Consolidate the behaviours and introduce kthread_run_worker[_on_cpu]() that behaves just like kthread_run(). kthread_create_worker[_on_cpu]() will now only create a kthread worker without starting it. 
Signed-off-by: Frederic Weisbecker Signed-off-by: Dan Carpenter --- include/linux/kthread.h | 48 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0c66e7c1092a..8d27403888ce 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -193,19 +193,53 @@ struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, const char namefmt[], ...); #define kthread_create_worker(flags, namefmt, ...) \ -({ \ - struct kthread_worker *__kw \ - = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ - namefmt, ## __VA_ARGS__); \ - if (!IS_ERR(__kw)) \ - wake_up_process(__kw->task); \ - __kw; \ + kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); + +/** + * kthread_run_worker - create and wake a kthread worker. + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. + * + * Description: Convenient wrapper for kthread_create_worker() followed by + * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). + */ +#define kthread_run_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); +/** + * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. + * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_worker_on_cpu() + * followed by wake_up_process(). Returns the kthread_worker or + * ERR_PTR(-ENOMEM). 
+ */ +static inline struct kthread_worker * +kthread_run_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[]) +{ + struct kthread_worker *kw; + + kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); + if (!IS_ERR(kw)) + wake_up_process(kw->task); + + return kw; +} + bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); -- cgit v1.2.3 From b9371866799d67a80be0ea9e01bd41987db22f26 Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Mon, 6 Jan 2025 18:45:58 +0530 Subject: mtd: rawnand: qcom: Fix build issue on x86 architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a buffer overflow issue in qcom_clear_bam_transaction by using struct_group to group related fields and avoid FORTIFY_SOURCE warnings. On x86 architecture, the following error occurs due to warnings being treated as errors: In function ‘fortify_memset_chk’, inlined from ‘qcom_clear_bam_transaction’ at drivers/mtd/nand/qpic_common.c:88:2: ./include/linux/fortify-string.h:480:25: error: call to ‘__write_overflow_field’ declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 480 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ LD [M] drivers/mtd/nand/nandcore.o CC [M] drivers/w1/masters/mxc_w1.o cc1: all warnings being treated as errors This patch addresses the issue by grouping the related fields in struct bam_transaction using struct_group and updating the memset call accordingly. 
Fixes: 8c52932da5e6 ("mtd: rawnand: qcom: cleanup qcom_nandc driver") Signed-off-by: Md Sadre Alam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index e79c79775eb8..4d9b736ff8b7 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -254,14 +254,17 @@ struct bam_transaction { struct dma_async_tx_descriptor *last_data_desc; struct dma_async_tx_descriptor *last_cmd_desc; struct completion txn_done; - u32 bam_ce_pos; - u32 bam_ce_start; - u32 cmd_sgl_pos; - u32 cmd_sgl_start; - u32 tx_sgl_pos; - u32 tx_sgl_start; - u32 rx_sgl_pos; - u32 rx_sgl_start; + struct_group(bam_positions, + u32 bam_ce_pos; + u32 bam_ce_start; + u32 cmd_sgl_pos; + u32 cmd_sgl_start; + u32 tx_sgl_pos; + u32 tx_sgl_start; + u32 rx_sgl_pos; + u32 rx_sgl_start; + + ); }; /* -- cgit v1.2.3 From f90877dd7fb5085dd9abd6399daf63dd2969fc90 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 8 Jan 2025 23:44:45 +0100 Subject: seccomp: Stub for !CONFIG_SECCOMP When using !CONFIG_SECCOMP with CONFIG_GENERIC_ENTRY, the randconfig bots found the following snag: kernel/entry/common.c: In function 'syscall_trace_enter': >> kernel/entry/common.c:52:23: error: implicit declaration of function '__secure_computing' [-Wimplicit-function-declaration] 52 | ret = __secure_computing(NULL); | ^~~~~~~~~~~~~~~~~~ Since generic entry calls __secure_computing() unconditionally, fix this by moving the stub out of the ifdef clause for CONFIG_HAVE_ARCH_SECCOMP_FILTER so it's always available. 
Link: https://lore.kernel.org/oe-kbuild-all/202501061240.Fzk9qiFZ-lkp@intel.com/ Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250108-seccomp-stub-2-v2-1-74523d49420f@linaro.org Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 341980599c71..e45531455d3b 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -55,10 +55,10 @@ struct seccomp_data; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { return 0; } -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif +static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } static inline long prctl_get_seccomp(void) { -- cgit v1.2.3 From 1d45a1cd9f3ae849db868e07e5fee5e5b37eff55 Mon Sep 17 00:00:00 2001 From: Gaurav Kashyap Date: Thu, 12 Dec 2024 20:19:51 -0800 Subject: firmware: qcom: scm: add calls for wrapped key support Add helper functions for the SCM calls required to support hardware-wrapped inline storage encryption keys. These SCM calls manage wrapped keys via Qualcomm's Hardware Key Manager (HWKM), which can only be accessed from TrustZone. QCOM_SCM_ES_GENERATE_ICE_KEY and QCOM_SCM_ES_IMPORT_ICE_KEY create a new long-term wrapped key, with the former making the hardware generate the key and the latter importing a raw key. QCOM_SCM_ES_PREPARE_ICE_KEY converts the key to ephemerally-wrapped form so that it can be used for inline storage encryption. These are planned to be wired up to new ioctls via the blk-crypto framework; see the proposed documentation for the hardware-wrapped keys feature for more information. 
Similarly there's also QCOM_SCM_ES_DERIVE_SW_SECRET which derives a "software secret" from an ephemerally-wrapped key and will be wired up to the corresponding operation in the blk_crypto_profile. These will all be used by the ICE driver in drivers/soc/qcom/ice.c. [EB: merged related patches, fixed error handling, fixed naming, fixed docs for size parameters, fixed qcom_scm_has_wrapped_key_support(), improved comments, improved commit message.] Signed-off-by: Gaurav Kashyap Signed-off-by: Bartosz Golaszewski Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20241213041958.202565-9-ebiggers@kernel.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_scm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index 4621aec0328c..983e1591bbba 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -105,6 +105,14 @@ bool qcom_scm_ice_available(void); int qcom_scm_ice_invalidate_key(u32 index); int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size, enum qcom_scm_ice_cipher cipher, u32 data_unit_size); +bool qcom_scm_has_wrapped_key_support(void); +int qcom_scm_derive_sw_secret(const u8 *eph_key, size_t eph_key_size, + u8 *sw_secret, size_t sw_secret_size); +int qcom_scm_generate_ice_key(u8 *lt_key, size_t lt_key_size); +int qcom_scm_prepare_ice_key(const u8 *lt_key, size_t lt_key_size, + u8 *eph_key, size_t eph_key_size); +int qcom_scm_import_ice_key(const u8 *raw_key, size_t raw_key_size, + u8 *lt_key, size_t lt_key_size); bool qcom_scm_hdcp_available(void); int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp); -- cgit v1.2.3 From 80818fdc068eaab729bb793d790ae9fd053f7053 Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Fri, 20 Dec 2024 19:23:18 +0000 Subject: HID: fix generic desktop D-Pad controls The addition of the "System Do Not Disturb" event 
code caused the Generic Desktop D-Pad configuration to be skipped. This commit allows both to be configured without conflicting with each other. Fixes: 22d6d060ac77 ("input: Add support for "Do Not Disturb"") Signed-off-by: Terry Tritton Reviewed-by: Aseda Aboagye Reviewed-by: Carlos Llamas Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index d11e9c9a5f15..cdc0dc13c87f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -218,6 +218,7 @@ struct hid_item { #define HID_GD_DOWN 0x00010091 #define HID_GD_RIGHT 0x00010092 #define HID_GD_LEFT 0x00010093 +#define HID_GD_DO_NOT_DISTURB 0x0001009b /* Microsoft Win8 Wireless Radio Controls CA usage codes */ #define HID_GD_RFKILL_BTN 0x000100c6 #define HID_GD_RFKILL_LED 0x000100c7 -- cgit v1.2.3 From 6657d899ce35a313d7e0e7ddc0988aa80d304ca8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 25 Dec 2024 01:55:08 +0000 Subject: HID: intel-ish-hid: Remove unused ishtp_cl_tx_empty ishtp_cl_tx_empty() was added in 2018 by commit a1c40ce62fd2 ("HID: intel-ish-hid: ishtp: add helper functions for client buffer operation") but has remained unused. Remove it. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 771622650247..dfbf7d9d7bb5 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -100,7 +100,6 @@ void ishtp_cl_destroy_connection(struct ishtp_cl *cl, bool reset); int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length); int ishtp_cl_flush_queues(struct ishtp_cl *cl); int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb); -bool ishtp_cl_tx_empty(struct ishtp_cl *cl); struct ishtp_cl_rb *ishtp_cl_rx_get_rb(struct ishtp_cl *cl); void *ishtp_get_client_data(struct ishtp_cl *cl); void ishtp_set_client_data(struct ishtp_cl *cl, void *data); -- cgit v1.2.3 From 4751113f24048f96cb696ff8d939e38530dcdfc1 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:41 +0800 Subject: HID: intel-thc-hid: intel-quickspi: Add THC QuickSPI driver hid layer Add HID Low level driver callbacks and hid probe function to register QuickSPI as a HID driver, and external touch device as a HID device.
Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-spi.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 include/linux/hid-over-spi.h (limited to 'include/linux') diff --git a/include/linux/hid-over-spi.h b/include/linux/hid-over-spi.h new file mode 100644 index 000000000000..ddbe41c5d8fd --- /dev/null +++ b/include/linux/hid-over-spi.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2024 Intel Corporation */ + +#ifndef _HID_OVER_SPI_H_ +#define _HID_OVER_SPI_H_ + +/** + * struct hidspi_dev_descriptor - HIDSPI device descriptor definition + * @dev_desc_len: The length of the complete device descriptor, fixed to 0x18 (24). + * @bcd_ver: The version number of the HIDSPI protocol supported. + * In binary coded decimal (BCD) format. Must be fixed to 0x0300. + * @rep_desc_len: The length of the report descriptor + * @max_input_len: The length of the largest possible HID input (or feature) report + * @max_output_len: The length of the largest output (or feature) report + * @max_frag_len: The length of the largest fragment, where a fragment represents + * the body of an input report. 
+ * @vendor_id: Device manufacturers vendor ID + * @product_id: Device unique model/product ID + * @version_id: Device’s unique version + * @flags: Specify flags for the device’s operation + * @reserved: Reserved and should be 0 + */ +struct hidspi_dev_descriptor { + __le16 dev_desc_len; + __le16 bcd_ver; + __le16 rep_desc_len; + __le16 max_input_len; + __le16 max_output_len; + __le16 max_frag_len; + __le16 vendor_id; + __le16 product_id; + __le16 version_id; + __le16 flags; + __le32 reserved; +}; + +#endif /* _HID_OVER_SPI_H_ */ -- cgit v1.2.3 From 9d8d51735a3af40b722346931a6a1e50227df4b5 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:43 +0800 Subject: HID: intel-thc-hid: intel-quickspi: Add HIDSPI protocol implementation Intel QuickSPI driver uses THC hardware to accelerate HID over SPI (HIDSPI) protocol flow. This patch implements all data flows described in HID over SPI protocol SPEC by using THC hardware layer APIs. HID over SPI SPEC: https://www.microsoft.com/download/details.aspx?id=103325 Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-spi.h | 118 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid-over-spi.h b/include/linux/hid-over-spi.h index ddbe41c5d8fd..da5a14b5e89b 100644 --- a/include/linux/hid-over-spi.h +++ b/include/linux/hid-over-spi.h @@ -4,6 +4,120 @@ #ifndef _HID_OVER_SPI_H_ #define _HID_OVER_SPI_H_ +#include +#include + +/* Input report type definition in HIDSPI protocol */ +enum input_report_type { + INVALID_INPUT_REPORT_TYPE_0 = 0, + DATA = 1, + INVALID_TYPE_2 = 2, + RESET_RESPONSE = 3, + COMMAND_RESPONSE = 4, + GET_FEATURE_RESPONSE = 5, + INVALID_TYPE_6 = 6, + DEVICE_DESCRIPTOR_RESPONSE = 7, + REPORT_DESCRIPTOR_RESPONSE = 8, + 
SET_FEATURE_RESPONSE = 9, + OUTPUT_REPORT_RESPONSE = 10, + GET_INPUT_REPORT_RESPONSE = 11, + INVALID_INPUT_REPORT_TYPE = 0xF, +}; + +/* Output report type definition in HIDSPI protocol */ +enum output_report_type { + INVALID_OUTPUT_REPORT_TYPE_0 = 0, + DEVICE_DESCRIPTOR = 1, + REPORT_DESCRIPTOR = 2, + SET_FEATURE = 3, + GET_FEATURE = 4, + OUTPUT_REPORT = 5, + GET_INPUT_REPORT = 6, + COMMAND_CONTENT = 7, +}; + +/* Set power command ID for output report */ +#define HIDSPI_SET_POWER_CMD_ID 1 + +/* Power state definition in HIDSPI protocol */ +enum hidspi_power_state { + HIDSPI_ON = 1, + HIDSPI_SLEEP = 2, + HIDSPI_OFF = 3, +}; + +/** + * Input report header definition in HIDSPI protocol + * Report header size is 32bits, it includes: + * protocol_ver: [0:3] Current supported HIDSPI protocol version, must be 0x3 + * reserved0: [4:7] Reserved bits + * input_report_len: [8:21] Input report length in number bytes divided by 4 + * last_frag_flag: [22]Indicate if this packet is last fragment. + * 1 - indicates last fragment + * 0 - indicates additional fragments + * reserved1: [23] Reserved bits + * @sync_const: [24:31] Used to validate input report header, must be 0x5A + */ +#define HIDSPI_INPUT_HEADER_SIZE sizeof(u32) +#define HIDSPI_INPUT_HEADER_VER GENMASK(3, 0) +#define HIDSPI_INPUT_HEADER_REPORT_LEN GENMASK(21, 8) +#define HIDSPI_INPUT_HEADER_LAST_FLAG BIT(22) +#define HIDSPI_INPUT_HEADER_SYNC GENMASK(31, 24) + +/** + * struct input_report_body_header - Input report body header definition in HIDSPI protocol + * @input_report_type: indicate input report type, reference to enum input_report_type + * @content_len: this input report body packet length + * @content_id: indicate this input report's report id + */ +struct input_report_body_header { + u8 input_report_type; + __le16 content_len; + u8 content_id; +} __packed; + +#define HIDSPI_INPUT_BODY_HEADER_SIZE sizeof(struct input_report_body_header) + +/** + * struct input_report_body - Input report body definition in 
HIDSPI protocol + * @body_hdr: input report body header + * @content: input report body content + */ +struct input_report_body { + struct input_report_body_header body_hdr; + u8 content[]; +} __packed; + +#define HIDSPI_INPUT_BODY_SIZE(content_len) ((content_len) + HIDSPI_INPUT_BODY_HEADER_SIZE) + +/** + * struct output_report_header - Output report header definition in HIDSPI protocol + * @report_type: output report type, reference to enum output_report_type + * @content_len: length of content + * @content_id: 0x00 - descriptors + * report id - Set/Feature feature or Input/Output Reports + * command opcode - for commands + */ +struct output_report_header { + u8 report_type; + __le16 content_len; + u8 content_id; +} __packed; + +#define HIDSPI_OUTPUT_REPORT_HEADER_SIZE sizeof(struct output_report_header) + +/** + * struct output_report - Output report definition in HIDSPI protocol + * @output_hdr: output report header + * @content: output report content + */ +struct output_report { + struct output_report_header output_hdr; + u8 content[]; +} __packed; + +#define HIDSPI_OUTPUT_REPORT_SIZE(content_len) ((content_len) + HIDSPI_OUTPUT_REPORT_HEADER_SIZE) + /** * struct hidspi_dev_descriptor - HIDSPI device descriptor definition * @dev_desc_len: The length of the complete device descriptor, fixed to 0x18 (24). @@ -34,4 +148,8 @@ struct hidspi_dev_descriptor { __le32 reserved; }; +#define HIDSPI_DEVICE_DESCRIPTOR_SIZE sizeof(struct hidspi_dev_descriptor) +#define HIDSPI_INPUT_DEVICE_DESCRIPTOR_SIZE \ + (HIDSPI_INPUT_BODY_HEADER_SIZE + HIDSPI_DEVICE_DESCRIPTOR_SIZE) + #endif /* _HID_OVER_SPI_H_ */ -- cgit v1.2.3 From ba38d7f87f159c94f947d5b8336f763ec760d4ea Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:47 +0800 Subject: HID: intel-thc-hid: intel-quicki2c: Add THC QuickI2C driver hid layer Add HID Low level driver callbacks and hid probe function to register QuickI2C as a HID driver, and external touch device as a HID device.
Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-i2c.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 include/linux/hid-over-i2c.h (limited to 'include/linux') diff --git a/include/linux/hid-over-i2c.h b/include/linux/hid-over-i2c.h new file mode 100644 index 000000000000..b70626723a38 --- /dev/null +++ b/include/linux/hid-over-i2c.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2024 Intel Corporation */ + +#ifndef _HID_OVER_I2C_H_ +#define _HID_OVER_I2C_H_ + +/** + * struct hidi2c_dev_descriptor - HIDI2C device descriptor definition + * @dev_desc_len: The length of the complete device descriptor, fixed to 0x1E (30). + * @bcd_ver: The version number of the HIDI2C protocol supported. + * In binary coded decimal (BCD) format. 
+ * @report_desc_len: The length of the report descriptor + * @report_desc_reg: The register address to retrieve report descriptor + * @input_reg: the register address to retrieve input report + * @max_input_len: The length of the largest possible HID input (or feature) report + * @output_reg: the register address to send output report + * @max_output_len: The length of the largest output (or feature) report + * @cmd_reg: the register address to send command + * @data_reg: the register address to send command data + * @vendor_id: Device manufacturers vendor ID + * @product_id: Device unique model/product ID + * @version_id: Device’s unique version + * @reserved0: Reserved and should be 0 + * @reserved1: Reserved and should be 0 + */ +struct hidi2c_dev_descriptor { + __le16 dev_desc_len; + __le16 bcd_ver; + __le16 report_desc_len; + __le16 report_desc_reg; + __le16 input_reg; + __le16 max_input_len; + __le16 output_reg; + __le16 max_output_len; + __le16 cmd_reg; + __le16 data_reg; + __le16 vendor_id; + __le16 product_id; + __le16 version_id; + __le16 reserved0; + __le16 reserved1; +} __packed; + +#endif /* _HID_OVER_I2C_H_ */ -- cgit v1.2.3 From 6fc761385bcf62b235b6b48b1db32e2558a7904a Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:49 +0800 Subject: HID: intel-thc-hid: intel-quicki2c: Add HIDI2C protocol implementation Intel QuickI2C driver uses THC hardware to accelerate HID over I2C (HIDI2C) protocol flow. This patch implements all data flows described in HID over I2C protocol SPEC by using THC hardware layer APIs. 
HID over I2C SPEC: https://learn.microsoft.com/en-us/previous-versions/windows/hardware/design/dn642101(v=vs.85) Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-i2c.h | 73 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid-over-i2c.h b/include/linux/hid-over-i2c.h index b70626723a38..3b1a0208a6b8 100644 --- a/include/linux/hid-over-i2c.h +++ b/include/linux/hid-over-i2c.h @@ -1,9 +1,80 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2024 Intel Corporation */ +#include + #ifndef _HID_OVER_I2C_H_ #define _HID_OVER_I2C_H_ +#define HIDI2C_REG_LEN sizeof(__le16) + +/* Input report type definition in HIDI2C protocol */ +enum hidi2c_report_type { + HIDI2C_RESERVED = 0, + HIDI2C_INPUT, + HIDI2C_OUTPUT, + HIDI2C_FEATURE, +}; + +/* Power state type definition in HIDI2C protocol */ +enum hidi2c_power_state { + HIDI2C_ON, + HIDI2C_SLEEP, +}; + +/* Opcode type definition in HIDI2C protocol */ +enum hidi2c_opcode { + HIDI2C_RESET = 1, + HIDI2C_GET_REPORT, + HIDI2C_SET_REPORT, + HIDI2C_GET_IDLE, + HIDI2C_SET_IDLE, + HIDI2C_GET_PROTOCOL, + HIDI2C_SET_PROTOCOL, + HIDI2C_SET_POWER, +}; + +/** + * struct hidi2c_report_packet - Report packet definition in HIDI2C protocol + * @len: data field length + * @data: HIDI2C report packet data + */ +struct hidi2c_report_packet { + __le16 len; + u8 data[]; +} __packed; + +#define HIDI2C_LENGTH_LEN sizeof(__le16) + +#define HIDI2C_PACKET_LEN(data_len) ((data_len) + HIDI2C_LENGTH_LEN) +#define HIDI2C_DATA_LEN(pkt_len) ((pkt_len) - HIDI2C_LENGTH_LEN) + +#define HIDI2C_CMD_MAX_RI 0x0F + +/** + * HIDI2C command data packet - Command packet definition in HIDI2C protocol + * @report_id: [0:3] report id (<15) for features or output reports + * 
@report_type: [4:5] indicate report type, reference to hidi2c_report_type + * @reserved0: [6:7] reserved bits + * @opcode: [8:11] command operation code, reference to hidi2c_opcode + * @reserved1: [12:15] reserved bits + * @report_id_optional: [23:16] appended 3rd byte. + * If the report_id in the low byte is set to the + * sentinel value (HIDI2C_CMD_MAX_RI), then this + * optional third byte represents the report id (>=15) + * Otherwise, not this 3rd byte. + */ + +#define HIDI2C_CMD_LEN sizeof(__le16) +#define HIDI2C_CMD_LEN_OPT (sizeof(__le16) + 1) +#define HIDI2C_CMD_REPORT_ID GENMASK(3, 0) +#define HIDI2C_CMD_REPORT_TYPE GENMASK(5, 4) +#define HIDI2C_CMD_OPCODE GENMASK(11, 8) +#define HIDI2C_CMD_OPCODE GENMASK(11, 8) +#define HIDI2C_CMD_3RD_BYTE GENMASK(23, 16) + +#define HIDI2C_HID_DESC_BCDVERSION 0x100 + /** * struct hidi2c_dev_descriptor - HIDI2C device descriptor definition * @dev_desc_len: The length of the complete device descriptor, fixed to 0x1E (30). @@ -41,4 +112,6 @@ struct hidi2c_dev_descriptor { __le16 reserved1; } __packed; +#define HIDI2C_DEV_DESC_LEN sizeof(struct hidi2c_dev_descriptor) + #endif /* _HID_OVER_I2C_H_ */ -- cgit v1.2.3 From 3675a926feefdf3afabea12f806f31ea582065e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 28 Dec 2024 09:43:41 +0100 Subject: sysfs: constify bin_attribute argument of sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users use this function through the BIN_ATTR_SIMPLE* macros, they can handle the switch transparently. Also adapt the two non-macro users in the same change. 
Signed-off-by: Thomas Weißschuh Acked-by: Madhavan Srinivasan Reviewed-by: Mahesh Salgaonkar Tested-by: Aditya Gupta Link: https://lore.kernel.org/r/20241228-sysfs-const-bin_attr-simple-v2-1-7c6f3f1767a3@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b4368377fac9..18f7e1fd093c 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -511,7 +511,7 @@ __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ @@ -774,7 +774,7 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { -- cgit v1.2.3 From 09a897432637aa0b99545ce13d57760cf0cb09d1 Mon Sep 17 00:00:00 2001 From: Shree Ramamoorthy Date: Tue, 17 Dec 2024 14:49:35 -0600 Subject: mfd: tps65219: Remove unused macros & add regmap.h These macros are not used by the driver, and the structs are accounted for with the addition of the linux/regmap.h file. 
Signed-off-by: Shree Ramamoorthy Link: https://lore.kernel.org/r/20241217204935.1012106-3-s-ramamoorthy@ti.com Signed-off-by: Lee Jones --- include/linux/mfd/tps65219.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h index e6826e34e2a6..546bceec7173 100644 --- a/include/linux/mfd/tps65219.h +++ b/include/linux/mfd/tps65219.h @@ -10,14 +10,9 @@ #include #include +#include #include -struct regmap; -struct regmap_irq_chip_data; - -#define TPS65219_1V35 1350000 -#define TPS65219_1V8 1800000 - /* TPS chip id list */ #define TPS65219 0xF0 -- cgit v1.2.3 From 26769582bf353ae613f5113a1414ff3a80e08264 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 17 Dec 2024 12:11:41 -0600 Subject: mfd: syscon: Remove the platform driver support The platform driver is dead code. It is not used by DT platforms since commit bdb0066df96e ("mfd: syscon: Decouple syscon interface from platform devices") which said: For non-DT based platforms, this patch keeps syscon platform driver structure so that syscon can be probed and such non-DT based drivers can use syscon_regmap_lookup_by_pdev API and access regmap handles. Once all users of "syscon_regmap_lookup_by_pdev" migrated to DT based, we can completely remove platform driver of syscon, and keep only helper functions to get regmap handles. The last user of syscon_regmap_lookup_by_pdevname() was removed in 2018. syscon_regmap_lookup_by_pdevname() was then removed in 2019, but that commit failed to remove the rest of the platform driver. 
Signed-off-by: Rob Herring (Arm) Tested-by: Krzysztof Kozlowski Tested-by: Will McVicker Acked-by: Liviu Dudau Reviewed-by: Pankaj Dubey Tested-by: Pankaj Dubey Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241217-syscon-fixes-v2-2-4f56d750541d@kernel.org Signed-off-by: Lee Jones --- include/linux/platform_data/syscon.h | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 include/linux/platform_data/syscon.h (limited to 'include/linux') diff --git a/include/linux/platform_data/syscon.h b/include/linux/platform_data/syscon.h deleted file mode 100644 index 2c089dd3e2bd..000000000000 --- a/include/linux/platform_data/syscon.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef PLATFORM_DATA_SYSCON_H -#define PLATFORM_DATA_SYSCON_H - -struct syscon_platform_data { - const char *label; -}; - -#endif -- cgit v1.2.3 From e61e6c415ba9ff2b32bb6780ce1b17d1d76238f1 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 7 Jan 2025 02:48:12 -0800 Subject: net/mlx5: use do_aux_work for PHC overflow checks The overflow_work is using system wq to do overflow checks and updates for PHC device timecounter, which might be overwhelmed by other tasks. But there is dedicated kthread in PTP subsystem designed for such things. This patch changes the work queue to properly align with PTP subsystem and to avoid overloading system work queue. The adjfine() function acts the same way as overflow check worker, we can postpone ptp aux worker till the next overflow period after adjfine() was called.
Reviewed-by: Dragos Tatulea Signed-off-by: Vadim Fedorenko Acked-by: Tariq Toukan Link: https://patch.msgid.link/20250107104812.380225-1-vadfed@meta.com Signed-off-by: Paolo Abeni --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ea48eb879a0f..fed666c5bd16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -691,7 +691,6 @@ struct mlx5_timer { struct timecounter tc; u32 nominal_c_mult; unsigned long overflow_period; - struct delayed_work overflow_work; }; struct mlx5_clock { -- cgit v1.2.3 From d1c444b47100d81a4b8c84aa3ac1c8159c22066a Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Tue, 17 Dec 2024 20:46:26 +0530 Subject: HID: amd_sfh: Add support to export device operating states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to export device operating states, such as laptop placement, platform types and propagate this data to AMD PMF driver for use in actions. To retrieve the device operating states data, SRA sensor support need to be enabled in AMD SFH driver. So add support to enable the SRA sensor. Also, remove explicit assignments to sensor_index enum. 
Co-developed-by: Akshata MukundShetty Signed-off-by: Akshata MukundShetty Signed-off-by: Basavaraj Natikar Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Acked-by: Jiri Kosina Link: https://lore.kernel.org/r/20241217151627.757477-2-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/amd-pmf-io.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h index b4f818205216..6fa510f419c0 100644 --- a/include/linux/amd-pmf-io.h +++ b/include/linux/amd-pmf-io.h @@ -18,10 +18,12 @@ * enum sfh_message_type - Query the SFH message type * @MT_HPD: Message ID to know the Human presence info from MP2 FW * @MT_ALS: Message ID to know the Ambient light info from MP2 FW + * @MT_SRA: Message ID to know the SRA data from MP2 FW */ enum sfh_message_type { MT_HPD, MT_ALS, + MT_SRA, }; /** @@ -40,10 +42,23 @@ enum sfh_hpd_info { * struct amd_sfh_info - get HPD sensor info from MP2 FW * @ambient_light: Populates the ambient light information * @user_present: Populates the user presence information + * @platform_type: Operating modes (clamshell, flat, tent, etc.) + * @laptop_placement: Device states (ontable, onlap, outbag) */ struct amd_sfh_info { u32 ambient_light; u8 user_present; + u32 platform_type; + u32 laptop_placement; +}; + +enum laptop_placement { + LP_UNKNOWN = 0, + ON_TABLE, + ON_LAP_MOTION, + IN_BAG, + OUT_OF_BAG, + LP_UNDEFINED, }; int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op); -- cgit v1.2.3 From 7ed6cbe0f8caa6ee38a2dc8f1b925acb904cc01f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:02 +0100 Subject: fs: add STATX_DIO_READ_ALIGN Add a separate dio read align field, as many out of place write file systems can easily do reads aligned to the device sector size, but require bigger alignment for writes. 
This is usually papered over by falling back to buffered I/O for smaller writes and doing read-modify-write cycles, but performance for this sucks, so applications benefit from knowing the actual write alignment. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-3-hch@lst.de Reviewed-by: John Garry Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/stat.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stat.h b/include/linux/stat.h index 3d900c86981c..9d8382e23a9c 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -52,6 +52,7 @@ struct kstat { u64 mnt_id; u32 dio_mem_align; u32 dio_offset_align; + u32 dio_read_offset_align; u64 change_cookie; u64 subvol; u32 atomic_write_unit_min; -- cgit v1.2.3 From 344bac8f0d73fe970cd9f5b2f132906317d29e8b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:05 +0100 Subject: fs: kill MNT_ONRB Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist instead of keeping it with mnt->mnt_list. This allows us to use RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB. This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't set in @mnt->mnt_flags even though the mount was present in the mount rbtree of the mount namespace. The root cause is the following race. When a btrfs subvolume is mounted a temporary mount is created: btrfs_get_tree_subvol() { mnt = fc_mount() // Register the newly allocated mount with sb->mounts: lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); } and registered on sb->s_mounts. 
Later it is added to an anonymous mount namespace via mount_subvol(): -> mount_subvol() -> mount_subtree() -> alloc_mnt_ns() mnt_add_to_ns() vfs_path_lookup() put_mnt_ns() The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone concurrently does a ro remount: reconfigure_super() -> sb_prepare_remount_readonly() { list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { } all mounts registered in sb->s_mounts are visited and first MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally MNT_WRITE_HOLD is removed again. The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race so MNT_ONRB might be lost. Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Cc: # v6.8+ Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1] Signed-off-by: Christian Brauner --- include/linux/mount.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..04213d8ef837 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -64,7 +64,6 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From 67d676bb135cd4de9647616e73cfd059ef57c9a6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:43 +0100 Subject: rculist: add list_bidir_{del,prev}_rcu() Currently there is no primitive for retrieving the previous list member. 
To do this we need a new deletion primitive that doesn't poison the prev pointer and a corresponding retrieval helper. Note that it is not valid to use both list_del_rcu() and list_bidir_del_rcu() on the same list. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-4-6e3cdaf9b280@kernel.org Reviewed-by: Paul E. McKenney Reviewed-by: Jeff Layton Suggested-by: Paul E. McKenney Signed-off-by: Christian Brauner --- include/linux/rculist.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 14dfa6008467..1b11926ddd47 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/* + * Return the ->prev pointer of a list_head in an rcu safe way. Don't + * access it directly. + * + * Any list traversed with list_bidir_prev_rcu() must never use + * list_del_rcu(). Doing so will poison the ->prev pointer that + * list_bidir_prev_rcu() relies on, which will result in segfaults. + * To prevent these segfaults, use list_bidir_del_rcu() instead + * of list_del_rcu(). + */ +#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) /** * list_tail_rcu - returns the prev pointer of the head of the list @@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry) entry->prev = LIST_POISON2; } +/** + * list_bidir_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. + * + * In contrast to list_del_rcu() doesn't poison the prev pointer thus + * allowing backwards traversal via list_bidir_prev_rcu().
+ * + * Note: list_empty() on entry does not return true after this because + * the entry is in a special undefined state that permits RCU-based + * lockfree reverse traversal. In particular this means that we can not + * poison the forward and backwards pointers that may still be used for + * walking the list. + * + * The caller must take whatever precautions are necessary (such as + * holding appropriate locks) to avoid racing with another list-mutation + * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on + * this same list. However, it is perfectly legal to run concurrently + * with the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + * + * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on + * the same list. + * + * Note that the caller is not permitted to immediately free + * the newly deleted entry. Instead, either synchronize_rcu() + * or call_rcu() must be used to defer freeing until an RCU + * grace period has elapsed. + */ +static inline void list_bidir_del_rcu(struct list_head *entry) +{ + __list_del_entry(entry); +} + /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. -- cgit v1.2.3 From 0fefeade90e74bc8f40ab0e460f483565c492e28 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:05:46 +0100 Subject: spi: spi-mem: Extend spi-mem operations with a per-operation maximum frequency In the spi subsystem, the bus frequency is derived as follows: - the controller may expose a minimum and maximum operating frequency - the hardware description, through the spi peripheral properties, advise what is the maximum acceptable frequency from a device/wiring point of view. Transfers must be observed at a frequency which fits both (so in practice, the lowest maximum). 
Actually, this second point mixes two pieces of information and already takes the lowest frequency among: - what the spi device is capable of (what is written in the component datasheet) - what the wiring allows (electromagnetic sensitivity, crossovers, terminations, antenna effect, etc). This logic works until spi devices are no longer capable of sustaining their highest frequency regardless of the operation. Spi memories are typically subject to such variation. Some devices are capable of spitting their internally stored data (essentially in read mode) at a very fast rate, typically up to 166MHz on Winbond SPI-NAND chips, using "fast" commands. However, some of the low-end operations, such as regular page read-from-cache commands, are more limited and can only be executed at 54MHz at most. This is currently a problem in the SPI-NAND subsystem. Another situation, even if not yet supported, will be with DTR commands, when the data is latched on both edges of the clock. The same chips as mentioned previously are in this case limited to 80MHz. Yet another example might be continuous reads, which, under certain circumstances, can also run at most at 104 or 120MHz. As a matter of fact, the "one frequency per chip" policy is outdated and more fine grain configuration is needed: we need to allow per-operation frequency limitations. So far, all datasheets I encountered advertise a maximum default frequency, which need to be lowered for certain specific operations. So based on the current infrastructure, we can still expect firmware (device trees in general) to continue advertising the same maximum speed which is a mix between the PCB limitations and the chip maximum capability, and expect per-operation lower frequencies when this is relevant. Add a `struct spi_mem_op` member to carry this information. Not providing this field explicitly from upper layers means that there is no further constraint and the default spi device maximum speed will be carried instead.
The SPI_MEM_OP() macro is also expanded with an optional frequency argument, because virtually all operations can be subject to such a limitation, and this will allow for a smooth and discrete transition. For controller drivers which do not implement the spi-mem interface, the per-transfer speed is also set accordingly to a lower (than the maximum default) speed when relevant. Acked-by: Pratyush Yadav Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-1-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index c46d2b8029be..84ec52498792 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -68,6 +68,9 @@ enum spi_mem_data_dir { SPI_MEM_DATA_OUT, }; +#define SPI_MEM_OP_MAX_FREQ(__freq) \ + .max_freq = __freq + /** * struct spi_mem_op - describes a SPI memory operation * @cmd.nbytes: number of opcode bytes (only 1 or 2 are valid). The opcode is @@ -97,6 +100,9 @@ enum spi_mem_data_dir { * operation does not involve transferring data * @data.buf.in: input buffer (must be DMA-able) * @data.buf.out: output buffer (must be DMA-able) + * @max_freq: frequency limitation wrt this operation. 0 means there is no + * specific constraint and the highest achievable frequency can be + * attempted. */ struct spi_mem_op { struct { @@ -135,14 +141,17 @@ struct spi_mem_op { const void *out; } buf; } data; + + unsigned int max_freq; }; -#define SPI_MEM_OP(__cmd, __addr, __dummy, __data) \ +#define SPI_MEM_OP(__cmd, __addr, __dummy, __data, ...) 
\ { \ .cmd = __cmd, \ .addr = __addr, \ .dummy = __dummy, \ .data = __data, \ + __VA_ARGS__ \ } /** @@ -371,6 +380,7 @@ bool spi_mem_default_supports_op(struct spi_mem *mem, #endif /* CONFIG_SPI_MEM */ int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op); +void spi_mem_adjust_op_freq(struct spi_mem *mem, struct spi_mem_op *op); bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op); -- cgit v1.2.3 From 1248c9b8d54120950fda10fbeb98fb8932b4d45c Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:05:47 +0100 Subject: spi: spi-mem: Add a new controller capability There are spi devices with multiple frequency limitations depending on the invoked command. We probably do not want to afford running at the lowest supported frequency all the time, so if we want to get the most of our hardware, we need to allow per-operation frequency limitations. Among all the SPI memory controllers, I believe all are capable of changing the spi frequency on the fly. Some of the drivers do not make any frequency setup though. And some others will derive a per chip prescaler value which will be used forever. Actually changing the frequency on the fly is something new in Linux, so we need to carefully flag the drivers which do and do not support it. A controller capability is created for that, and the presence for this capability will always be checked before accepting such pattern. 
Signed-off-by: Miquel Raynal Reviewed-by: Tudor Ambarus Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-2-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 84ec52498792..c7a7719c2648 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -311,11 +311,13 @@ struct spi_controller_mem_ops { * @ecc: Supports operations with error correction * @swap16: Supports swapping bytes on a 16 bit boundary when configured in * Octal DTR + * @per_op_freq: Supports per operation frequency switching */ struct spi_controller_mem_caps { bool dtr; bool ecc; bool swap16; + bool per_op_freq; }; #define spi_mem_controller_is_capable(ctlr, cap) \ -- cgit v1.2.3 From d1f85873d2d62d6980e68d21d3a21f20b0664cc3 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:06:03 +0100 Subject: spi: spi-mem: Reorder spi-mem macro assignments Follow the order in which all the `struct spi_mem_op` members are defined. This is purely aesthetics, there is no functional change. 
Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-18-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index c7a7719c2648..ca6ea01c40f8 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -15,16 +15,16 @@ #define SPI_MEM_OP_CMD(__opcode, __buswidth) \ { \ + .nbytes = 1, \ .buswidth = __buswidth, \ .opcode = __opcode, \ - .nbytes = 1, \ } #define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth) \ { \ .nbytes = __nbytes, \ - .val = __val, \ .buswidth = __buswidth, \ + .val = __val, \ } #define SPI_MEM_OP_NO_ADDR { } @@ -39,18 +39,18 @@ #define SPI_MEM_OP_DATA_IN(__nbytes, __buf, __buswidth) \ { \ + .buswidth = __buswidth, \ .dir = SPI_MEM_DATA_IN, \ .nbytes = __nbytes, \ .buf.in = __buf, \ - .buswidth = __buswidth, \ } #define SPI_MEM_OP_DATA_OUT(__nbytes, __buf, __buswidth) \ { \ + .buswidth = __buswidth, \ .dir = SPI_MEM_DATA_OUT, \ .nbytes = __nbytes, \ .buf.out = __buf, \ - .buswidth = __buswidth, \ } #define SPI_MEM_OP_NO_DATA { } -- cgit v1.2.3 From f0006897a96c736623ddeb9b68c3880eb5cdebe7 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:06:04 +0100 Subject: spi: spi-mem: Create macros for DTR operation We do have macros for defining command, address, dummy and data cycles. We also have a .dtr flag that implies sampling the bus on both edges, but there are currently no macros enabling it. 
We might make use of such macros, so let's create: - SPI_MEM_DTR_OP_CMD - SPI_MEM_DTR_OP_ADDR - SPI_MEM_DTR_OP_DUMMY - SPI_MEM_DTR_OP_DATA_IN - SPI_MEM_DTR_OP_DATA_OUT Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-19-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index ca6ea01c40f8..306c05dd1378 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -20,6 +20,14 @@ .opcode = __opcode, \ } +#define SPI_MEM_DTR_OP_CMD(__opcode, __buswidth) \ + { \ + .nbytes = 1, \ + .opcode = __opcode, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth) \ { \ .nbytes = __nbytes, \ @@ -27,6 +35,14 @@ .val = __val, \ } +#define SPI_MEM_DTR_OP_ADDR(__nbytes, __val, __buswidth) \ + { \ + .nbytes = __nbytes, \ + .val = __val, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_NO_ADDR { } #define SPI_MEM_OP_DUMMY(__nbytes, __buswidth) \ @@ -35,6 +51,13 @@ .buswidth = __buswidth, \ } +#define SPI_MEM_DTR_OP_DUMMY(__nbytes, __buswidth) \ + { \ + .nbytes = __nbytes, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_NO_DUMMY { } #define SPI_MEM_OP_DATA_IN(__nbytes, __buf, __buswidth) \ @@ -45,6 +68,15 @@ .buf.in = __buf, \ } +#define SPI_MEM_DTR_OP_DATA_IN(__nbytes, __buf, __buswidth) \ + { \ + .dir = SPI_MEM_DATA_IN, \ + .nbytes = __nbytes, \ + .buf.in = __buf, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_DATA_OUT(__nbytes, __buf, __buswidth) \ { \ .buswidth = __buswidth, \ @@ -53,6 +85,15 @@ .buf.out = __buf, \ } +#define SPI_MEM_DTR_OP_DATA_OUT(__nbytes, __buf, __buswidth) \ + { \ + .dir = SPI_MEM_DATA_OUT, \ + .nbytes = __nbytes, \ + .buf.out = __buf, \ + .buswidth = 
__buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_NO_DATA { } /** -- cgit v1.2.3 From 155c5bf26f983e9988333eeb0ef217138304d13b Mon Sep 17 00:00:00 2001 From: guanjing Date: Fri, 20 Dec 2024 09:33:35 +0100 Subject: firewall: remove misplaced semicolon from stm32_firewall_get_firewall Remove misplaced semicolon in stm32_firewall_get_firewall() which results in a syntax error when the code is compiled without CONFIG_STM32_FIREWALL. Fixes: 5c9668cfc6d7 ("firewall: introduce stm32_firewall framework") Signed-off-by: guanjing Reviewed-by: Gatien Chevallier Signed-off-by: Alexandre Torgue Signed-off-by: Arnd Bergmann --- include/linux/bus/stm32_firewall_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index 18e0a2fc3816..5178b72bc920 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -115,7 +115,7 @@ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 su #else /* CONFIG_STM32_FIREWALL */ int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall, - unsigned int nb_firewall); + unsigned int nb_firewall) { return -ENODEV; } -- cgit v1.2.3 From e2b6e5e4877ac898b61338dc20491cd837af79b2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:41:11 +0900 Subject: jump_label: Define guard() for jump_label_lock Link: https://lore.kernel.org/all/173371207108.480397.12818384744149153972.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/jump_label.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f5a2727ca4a9..fdb79dd1ebd8 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -75,6 +75,7 @@ #include #include +#include extern bool static_key_initialized; @@ -347,6 
+348,8 @@ static inline void static_key_disable(struct static_key *key) #endif /* CONFIG_JUMP_LABEL */ +DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) + #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled -- cgit v1.2.3 From ef5a3c92a81a1a892ae9edf949625beb68b4bd43 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Mon, 25 Nov 2024 15:24:43 -0800 Subject: hyperv: Switch from hyperv-tlfs.h to hyperv/hvhdk.h Switch to using hvhdk.h everywhere in the kernel. This header includes all the new Hyper-V headers in include/hyperv, which form a superset of the definitions found in hyperv-tlfs.h. This makes it easier to add new Hyper-V interfaces without being restricted to those in the TLFS doc (reflected in hyperv-tlfs.h). To be more consistent with the original Hyper-V code, the names of some definitions are changed slightly. Update those where needed. Update comments in mshyperv.h files to point to include/hyperv for adding new definitions. 
Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Reviewed-by: Easwar Hariharan Signed-off-by: Roman Kisel Reviewed-by: Easwar Hariharan Link: https://lore.kernel.org/r/1732577084-2122-5-git-send-email-nunodasneves@linux.microsoft.com Link: https://lore.kernel.org/r/20250108222138.1623703-3-romank@linux.microsoft.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b0dbba3b9108..4179add2864b 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #define MAX_PAGE_BUFFER_COUNT 32 #define MAX_MULTIPAGE_BUFFER_COUNT 32 /* 128K */ -- cgit v1.2.3 From cacd9ae4bf801ff4125d8961bb9a3ba955e51680 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:17 +0100 Subject: poll_wait: add mb() to fix theoretical race between waitqueue_active() and .poll() As the comment above waitqueue_active() explains, it can only be used if both waker and waiter have mb()'s that pair with each other. However __pollwait() is broken in this respect. This is not pipe-specific, but let's look at pipe_poll() for example: poll_wait(...); // -> __pollwait() -> add_wait_queue() LOAD(pipe->head); LOAD(pipe->head); In theory these LOAD()'s can leak into the critical section inside add_wait_queue() and can happen before list_add(entry, wq_head), in this case pipe_poll() can race with wakeup_pipe_readers/writers which do smp_mb(); if (waitqueue_active(wq_head)) wake_up_interruptible(wq_head); There are more __pollwait()-like functions (grep init_poll_funcptr), and it seems that at least ep_ptable_queue_proc() has the same problem, so the patch adds smp_mb() into poll_wait(). 
Link: https://lore.kernel.org/all/20250102163320.GA17691@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162717.GA18922@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index d1ea4f3714a8..fc641b50f129 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,8 +41,16 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) + if (p && p->_qproc && wait_address) { p->_qproc(filp, wait_address, p); + /* + * This memory barrier is paired in the wq_has_sleeper(). + * See the comment above prepare_to_wait(), we need to + * ensure that subsequent tests in this thread can't be + * reordered with __add_wait_queue() in _qproc() paths. + */ + smp_mb(); + } } /* -- cgit v1.2.3 From 10b02a2cfec2f106db4897ad87732db56d71e6fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:24 +0100 Subject: poll_wait: kill the obsolete wait_address check This check is historical and no longer needed, wait_address is never NULL. These days we rely on the poll_table->_qproc check. NULL if select/poll is not going to sleep, or it already has a data to report, or all waiters have already been registered after the 1st iteration. However, poll_table *p can be NULL, see p9_fd_poll() for example, so we can't remove the "p != NULL" check. 
Link: https://lore.kernel.org/all/20250106180325.GF7233@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162724.GA18926@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index fc641b50f129..57b6d1ccd8bf 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,7 +41,7 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) { + if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). -- cgit v1.2.3 From f005bf18a57aadf3af1e85a0f0151cb3688ee606 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:43 +0100 Subject: poll: kill poll_does_not_wait() It no longer has users. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162743.GA18947@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index 57b6d1ccd8bf..12bb18e8b978 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -25,14 +25,14 @@ struct poll_table_struct; -/* +/* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* - * Do not touch the structure directly, use the access functions - * poll_does_not_wait() and poll_requested_events() instead. + * Do not touch the structure directly, use the access function + * poll_requested_events() instead. 
*/ typedef struct poll_table_struct { poll_queue_proc _qproc; @@ -53,16 +53,6 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres } } -/* - * Return true if it is guaranteed that poll will not wait. This is the case - * if the poll() of another file descriptor in the set got an event, so there - * is no need for waiting. - */ -static inline bool poll_does_not_wait(const poll_table *p) -{ - return p == NULL || p->_qproc == NULL; -} - /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has -- cgit v1.2.3 From 1cd9502ee9275c6176a7312863f939cca9506114 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 29 Dec 2024 00:45:28 +0900 Subject: module: get symbol CRC back to unsigned Commit 71810db27c1c ("modversions: treat symbol CRCs as 32 bit quantities") changed the CRC fields to s32 because the __kcrctab and __kcrctab_gpl sections contained relative references to the actual CRC values stored in the .rodata section when CONFIG_MODULE_REL_CRCS=y. Commit 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS") removed this complexity. Now, the __kcrctab and __kcrctab_gpl sections directly contain the CRC values in all cases. The genksyms tool outputs unsigned 32-bit CRC values, so u32 is preferred over s32. No functional changes are intended. Regardless of this change, the CRC value is assigned to the u32 variable 'crcval' before the comparison, as seen in kernel/module/version.c: crcval = *crc; It was previously mandatory (but now optional) in order to avoid sign extension because the following line previously compared 'unsigned long' and 's32': if (versions[i].crc == crcval) return 1; versions[i].crc is still 'unsigned long' for backward compatibility. 
Signed-off-by: Masahiro Yamada Reviewed-by: Petr Pavlu --- include/linux/module.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 94acbacdcdf1..903ef8fe4c04 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -430,7 +430,7 @@ struct module { /* Exported symbols */ const struct kernel_symbol *syms; - const s32 *crcs; + const u32 *crcs; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS @@ -448,7 +448,7 @@ struct module { /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; - const s32 *gpl_crcs; + const u32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG -- cgit v1.2.3 From 896be785015c0e0ba73442f73b8d4d9f5ccfc54c Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Wed, 4 Sep 2024 11:17:23 -0300 Subject: bus: fsl-mc: constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Move all the device_type variables used in the bus to be constant structures as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Link: https://lore.kernel.org/r/20240904-class_cleanup-fsl-mc-bus-v2-1-83fa25cbdc68@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl/mc.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index c90ec889bfc2..99f30c7d6208 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -438,21 +438,21 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, extern const struct bus_type fsl_mc_bus_type; -extern struct device_type fsl_mc_bus_dprc_type; -extern struct device_type fsl_mc_bus_dpni_type; -extern struct device_type fsl_mc_bus_dpio_type; -extern struct device_type fsl_mc_bus_dpsw_type; -extern struct device_type fsl_mc_bus_dpbp_type; -extern struct device_type fsl_mc_bus_dpcon_type; -extern struct device_type fsl_mc_bus_dpmcp_type; -extern struct device_type fsl_mc_bus_dpmac_type; -extern struct device_type fsl_mc_bus_dprtc_type; -extern struct device_type fsl_mc_bus_dpseci_type; -extern struct device_type fsl_mc_bus_dpdmux_type; -extern struct device_type fsl_mc_bus_dpdcei_type; -extern struct device_type fsl_mc_bus_dpaiop_type; -extern struct device_type fsl_mc_bus_dpci_type; -extern struct device_type fsl_mc_bus_dpdmai_type; +extern const struct device_type fsl_mc_bus_dprc_type; +extern const struct device_type fsl_mc_bus_dpni_type; +extern const struct device_type fsl_mc_bus_dpio_type; +extern const struct device_type fsl_mc_bus_dpsw_type; +extern const struct device_type fsl_mc_bus_dpbp_type; +extern const struct device_type fsl_mc_bus_dpcon_type; +extern const struct device_type fsl_mc_bus_dpmcp_type; +extern const struct device_type fsl_mc_bus_dpmac_type; +extern const struct device_type fsl_mc_bus_dprtc_type; +extern const struct device_type fsl_mc_bus_dpseci_type; +extern const struct device_type fsl_mc_bus_dpdmux_type; +extern const struct device_type fsl_mc_bus_dpdcei_type; 
+extern const struct device_type fsl_mc_bus_dpaiop_type; +extern const struct device_type fsl_mc_bus_dpci_type; +extern const struct device_type fsl_mc_bus_dpdmai_type; static inline bool is_fsl_mc_bus_dprc(const struct fsl_mc_device *mc_dev) { -- cgit v1.2.3 From ab017a15fdb2222002cdc6bdf86699fb21a0721a Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:05 +0800 Subject: driver core: Rename declaration parameter name for API device_find_child() cluster For APIs: device_find_child() device_for_each_child() device_for_each_child_reverse() Their declarations use the parameter name 'dev', but their definitions use the name 'parent'. Rename the declaration parameter to 'parent', as in the definitions, so that both have the same name. Reviewed-by: Fan Ni Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250105-class_fix-v6-4-3a2f1768d4d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0e0bc9bfe0d1..a9d928398895 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1074,14 +1074,14 @@ void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) -int device_for_each_child(struct device *dev, void *data, +int device_for_each_child(struct device *parent, void *data, int (*fn)(struct device *dev, void *data)); -int device_for_each_child_reverse(struct device *dev, void *data, +int device_for_each_child_reverse(struct device *parent, void *data, int (*fn)(struct device *dev, void *data)); int device_for_each_child_reverse_from(struct device *parent, struct device *from, const void *data, int (*fn)(struct device *, const void *)); -struct device *device_find_child(struct device *dev, const void *data, +struct device *device_find_child(struct device *parent, const void *data, device_match_t match); struct device 
*device_find_child_by_name(struct device *parent, const char *name); -- cgit v1.2.3 From 523c6b3ed7702a638e0f8fd02708a7ed4f938269 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:07 +0800 Subject: driver core: Correct API device_for_each_child_reverse_from() prototype For API device_for_each_child_reverse_from(..., const void *data, int (*fn)(struct device *dev, const void *data)) - Type of @data is const pointer, and means caller's data @*data is not allowed to be modified, but that usually is not proper for such non finding device iterating API. - Types for both @data and @fn are not consistent with all other for_each device iterating APIs device_for_each_child(_reverse)(), bus_for_each_dev() and (driver|class)_for_each_device(). Correct its prototype by removing const from parameter types, then adapt for various existing usages. A dedicated typedef device_iter_t will be introduced as @fn() type for various for_each device iterating APIs later. Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250105-class_fix-v6-6-3a2f1768d4d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index a9d928398895..025bac08fca7 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1079,8 +1079,8 @@ int device_for_each_child(struct device *parent, void *data, int device_for_each_child_reverse(struct device *parent, void *data, int (*fn)(struct device *dev, void *data)); int device_for_each_child_reverse_from(struct device *parent, - struct device *from, const void *data, - int (*fn)(struct device *, const void *)); + struct device *from, void *data, + int (*fn)(struct device *, void *)); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); struct device *device_find_child_by_name(struct device 
*parent, -- cgit v1.2.3 From 767b74e0d1fc7890a94d1770acf05a442474bd87 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:08 +0800 Subject: driver core: Introduce device_iter_t for device iterating APIs There are several for_each APIs which have a parameter of the type below: int (*fn)(struct device *dev, void *data) They iterate over various device lists and call @fn() for each device with caller provided data @*data, and they usually need to modify @*data. Give the type a dedicated typedef with advantages shown below: typedef int (*device_iter_t)(struct device *dev, void *data) - Shorter API declarations and definitions - Prevent further for_each APIs from using bad parameter type So introduce device_iter_t and apply it to various existing APIs below: bus_for_each_dev() (class|driver)_for_each_device() device_for_each_child(_reverse|_reverse_from)(). Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250105-class_fix-v6-7-3a2f1768d4d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 +++--- include/linux/device/bus.h | 7 +++++-- include/linux/device/class.h | 4 ++-- include/linux/device/driver.h | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 025bac08fca7..36d1a1607712 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1075,12 +1075,12 @@ void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) int device_for_each_child(struct device *parent, void *data, - int (*fn)(struct device *dev, void *data)); + device_iter_t fn); int device_for_each_child_reverse(struct device *parent, void *data, - int (*fn)(struct device *dev, void *data)); + device_iter_t fn); int device_for_each_child_reverse_from(struct device *parent, struct device *from, void *data, - int (*fn)(struct device *, void *)); + device_iter_t fn); struct device 
*device_find_child(struct device *parent, const void *data, device_match_t match); struct device *device_find_child_by_name(struct device *parent, diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index bc3fd74bb763..3d3517da41a1 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -139,9 +139,12 @@ int device_match_acpi_dev(struct device *dev, const void *adev); int device_match_acpi_handle(struct device *dev, const void *handle); int device_match_any(struct device *dev, const void *unused); +/* Device iterating function type for various driver core for_each APIs */ +typedef int (*device_iter_t)(struct device *dev, void *data); + /* iterator helpers for buses */ -int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, - int (*fn)(struct device *dev, void *data)); +int bus_for_each_dev(const struct bus_type *bus, struct device *start, + void *data, device_iter_t fn); struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match); /** diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 518c9c83d64b..aa67d4736816 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -92,8 +92,8 @@ void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, struct device *class_dev_iter_next(struct class_dev_iter *iter); void class_dev_iter_exit(struct class_dev_iter *iter); -int class_for_each_device(const struct class *class, const struct device *start, void *data, - int (*fn)(struct device *dev, void *data)); +int class_for_each_device(const struct class *class, const struct device *start, + void *data, device_iter_t fn); struct device *class_find_device(const struct class *class, const struct device *start, const void *data, device_match_t match); diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 5c04b8e3833b..cd8e0f0a634b 100644 --- 
a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -154,7 +154,7 @@ void driver_remove_file(const struct device_driver *driver, int driver_set_override(struct device *dev, const char **override, const char *s, size_t len); int __must_check driver_for_each_device(struct device_driver *drv, struct device *start, - void *data, int (*fn)(struct device *dev, void *)); + void *data, device_iter_t fn); struct device *driver_find_device(const struct device_driver *drv, struct device *start, const void *data, device_match_t match); -- cgit v1.2.3 From 51796f5e2960130fe53e9a71d07152622d5e024c Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:09 +0800 Subject: driver core: Move two simple APIs for finding child device to header The following two APIs are for finding child device, and both only have one line code in function body. device_find_child_by_name() device_find_any_child() Move them to header as static inline function. Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250105-class_fix-v6-8-3a2f1768d4d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 36d1a1607712..1e9aded9a086 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1083,9 +1083,35 @@ int device_for_each_child_reverse_from(struct device *parent, device_iter_t fn); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); -struct device *device_find_child_by_name(struct device *parent, - const char *name); -struct device *device_find_any_child(struct device *parent); +/** + * device_find_child_by_name - device iterator for locating a child device. 
+ * @parent: parent struct device + * @name: name of the child device + * + * This is similar to the device_find_child() function above, but it + * returns a reference to a device that has the name @name. + * + * NOTE: you will need to drop the reference with put_device() after use. + */ +static inline struct device *device_find_child_by_name(struct device *parent, + const char *name) +{ + return device_find_child(parent, name, device_match_name); +} + +/** + * device_find_any_child - device iterator for locating a child device, if any. + * @parent: parent struct device + * + * This is similar to the device_find_child() function above, but it + * returns a reference to a child device, if any. + * + * NOTE: you will need to drop the reference with put_device() after use. + */ +static inline struct device *device_find_any_child(struct device *parent) +{ + return device_find_child(parent, NULL, device_match_any); +} int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, -- cgit v1.2.3 From 9c96821b44f893fb63f021a28625d3b32c68e8b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:09 +0100 Subject: block: fix docs for freezing of queue limits updates queue_limits_commit_update is the function that needs to operate on a frozen queue, not queue_limits_start_update. Update the kerneldoc comments to reflect that. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250110054726.1499538-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5d40af2ef971..e781d4e6f92d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -944,8 +944,7 @@ static inline unsigned int blk_boundary_sectors_left(sector_t offset, * the caller can modify. The caller must call queue_limits_commit_update() * to finish the update. * - * Context: process context. The caller must have frozen the queue or ensured - * that there is outstanding I/O by other means. + * Context: process context. */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) -- cgit v1.2.3 From aa427d7b73b196f657d6d2cf0e94eff6b883fdef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:10 +0100 Subject: block: add a queue_limits_commit_update_frozen helper Add a helper that freezes the queue, updates the queue limits and unfreezes the queue and convert all open coded versions of that to the new helper. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e781d4e6f92d..13d353351c37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -952,6 +952,8 @@ queue_limits_start_update(struct request_queue *q) mutex_lock(&q->limits_lock); return q->limits; } +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim); int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); -- cgit v1.2.3 From 827ed8b1590d4d29dae837283d606709ffeebe37 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 19 Dec 2024 22:48:17 +0100 Subject: drivers: core: remove device_link argument from class_compat_[create|remove]_link After 7e722083fcc3 ("i2c: Remove I2C_COMPAT config symbol and related code") there's no caller left passing a non-null device_link argument. So remove this argument to simplify the code. 
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/db49131d-fd79-4f23-93f2-0ab541a345fa@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index aa67d4736816..45ee3a634999 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -82,10 +82,8 @@ bool class_is_registered(const struct class *class); struct class_compat; struct class_compat *class_compat_register(const char *name); void class_compat_unregister(struct class_compat *cls); -int class_compat_create_link(struct class_compat *cls, struct device *dev, - struct device *device_link); -void class_compat_remove_link(struct class_compat *cls, struct device *dev, - struct device *device_link); +int class_compat_create_link(struct class_compat *cls, struct device *dev); +void class_compat_remove_link(struct class_compat *cls, struct device *dev); void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, const struct device *start, const struct device_type *type); -- cgit v1.2.3 From f1725160fd28a2e65e47166637aa44856a1a7f89 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 7 Jan 2025 13:25:10 +0100 Subject: devres: add devm_remove_action_nowarn() devm_remove_action() warns if the action to remove does not exist (anymore). The Rust devres abstraction, however, has a use-case to call devm_remove_action() at a point where it can't be guaranteed that the corresponding action hasn't been released yet. In particular, an instance of `Devres` may be dropped after the action has been released. So far, `Devres` worked around this by keeping the inner type alive. Hence, add devm_remove_action_nowarn(), which returns an error code if the action has been removed already. A subsequent patch uses devm_remove_action_nowarn() to remove the action when `Devres` is dropped. 
Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20250107122609.8135-1-dakr@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 1e9aded9a086..80a5b3268986 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -399,7 +399,23 @@ void __iomem *devm_of_iomap(struct device *dev, #endif /* allows to add/remove a custom action to devres stack */ -void devm_remove_action(struct device *dev, void (*action)(void *), void *data); +int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data); + +/** + * devm_remove_action() - removes previously added custom action + * @dev: Device that owns the action + * @action: Function implementing the action + * @data: Pointer to data passed to @action implementation + * + * Removes instance of @action previously added by devm_add_action(). + * Both action and data should match one of the existing entries. + */ +static inline +void devm_remove_action(struct device *dev, void (*action)(void *), void *data) +{ + WARN_ON(devm_remove_action_nowarn(dev, action, data)); +} + void devm_release_action(struct device *dev, void (*action)(void *), void *data); int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name); -- cgit v1.2.3 From 910ef438e93cd2ceb70c72caea418710d648feef Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 7 Jan 2025 22:33:00 +0106 Subject: serial: 8250: Provide flag for IER toggling for RS485 For RS485 mode, if SER_RS485_RX_DURING_TX is not available, the console ->write() callback needs to enable/disable Tx. It does this by calling the ->rs485_start_tx() and ->rs485_stop_tx() callbacks. However, some of these callbacks also disable/enable interrupts and makes power management calls. This causes 2 problems for console writing: 1. 
A console write can occur in contexts that are illegal for pm_runtime_*(). It is not even necessary for console writing to use pm_runtime_*() because a console already does this in serial8250_console_setup() and serial8250_console_exit(). 2. The console ->write() callback already handles disabling/enabling the interrupts by properly restoring the previous IER value. Add an argument @toggle_ier to the ->rs485_start_tx() and ->rs485_stop_tx() callbacks to specify if they may disable/enable receive interrupts while using pm_runtime_*(). Console writing will not allow the toggling. For all call sites other than console writing there is no functional change. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250107212702.169493-5-john.ogness@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index e0717c8393d7..144de7a7948d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -161,8 +161,8 @@ struct uart_8250_port { void (*dl_write)(struct uart_8250_port *up, u32 value); struct uart_8250_em485 *em485; - void (*rs485_start_tx)(struct uart_8250_port *); - void (*rs485_stop_tx)(struct uart_8250_port *); + void (*rs485_start_tx)(struct uart_8250_port *up, bool toggle_ier); + void (*rs485_stop_tx)(struct uart_8250_port *up, bool toggle_ier); /* Serial port overrun backoff */ struct delayed_work overrun_backoff; -- cgit v1.2.3 From b63e6f60eab45b16a1bf734fef9035a4c4187cd5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 7 Jan 2025 22:33:01 +0106 Subject: serial: 8250: Switch to nbcon console Implement the necessary callbacks to switch the 8250 console driver to perform as an nbcon console. 
Add implementations for the nbcon console callbacks: ->write_atomic() ->write_thread() ->device_lock() ->device_unlock() and add CON_NBCON to the initial @flags. All register access in the callbacks are within unsafe sections. The ->write_atomic() and ->write_thread() callbacks allow safe handover/takeover per byte and add a preceding newline if they take over from another context mid-line. For the ->write_atomic() callback, a new irq_work is used to defer modem control since it may be called from a context that does not allow waking up tasks. Note: A new __serial8250_clear_IER() is introduced for direct clearing of UART_IER. This will allow to restore the lockdep check to serial8250_clear_IER() in a follow-up commit. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Tested-by: Petr Mladek Link: https://lore.kernel.org/r/20250107212702.169493-6-john.ogness@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 144de7a7948d..57875c37023a 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -150,8 +150,17 @@ struct uart_8250_port { #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS u16 lsr_saved_flags; u16 lsr_save_mask; + + /* + * Track when a console line has been fully written to the + * hardware, i.e. true when the most recent byte written to + * UART_TX by the console was '\n'. 
+ */ + bool console_line_ended; + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + struct irq_work modem_status_work; struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -202,8 +211,8 @@ void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); -void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count); +void serial8250_console_write(struct uart_8250_port *up, + struct nbcon_write_context *wctxt, bool in_atomic); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -- cgit v1.2.3 From e364374369b365351ad8ad69a10b5f7861f24bcd Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Thu, 9 Jan 2025 20:38:07 +0100 Subject: VMCI: fix reference to ioctl-number.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There has never been an ioctl-number.h — this must have been a typo for ioctl-number.txt (which later become ioctl-number.rst). At the time this comment was written, the note didn't actually end up appearing anywhere, but I fixed the omission from ioctl-number.rst in 0a8e4dc1d353 ("Documentation: ioctl: document 0x07 ioctl code"). 
Fixes: 20259849bb1a ("VMCI: Some header and config files.") Signed-off-by: Alyssa Ross Link: https://lore.kernel.org/r/re3xng4uwull2cu53xnu5dtv3wlstfiv3v7rmbwtw2qbvj5mo3@q45iujse5ovc Signed-off-by: Greg Kroah-Hartman --- include/linux/vmw_vmci_defs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h index c2df94696593..60c9eacd2cf3 100644 --- a/include/linux/vmw_vmci_defs.h +++ b/include/linux/vmw_vmci_defs.h @@ -431,11 +431,11 @@ enum { ((((_p)[0] & 0xFF) << 24) | (((_p)[1] & 0xFF) << 16) | ((_p)[2])) /* - * The VMCI IOCTLs. We use identity code 7, as noted in ioctl-number.h, and - * we start at sequence 9f. This gives us the same values that our shipping - * products use, starting at 1951, provided we leave out the direction and - * structure size. Note that VMMon occupies the block following us, starting - * at 2001. + * The VMCI IOCTLs. We use identity code 7, as noted in ioctl-number.rst, + * and we start at sequence 9f. This gives us the same values that our + * shipping products use, starting at 1951, provided we leave out the + * direction and structure size. Note that VMMon occupies the block + * following us, starting at 2001. */ #define IOCTL_VMCI_VERSION _IO(7, 0x9f) /* 1951 */ #define IOCTL_VMCI_INIT_CONTEXT _IO(7, 0xa0) -- cgit v1.2.3 From 226d6cb3cb799aae46d0dd19a521133997d9db11 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:22 +0100 Subject: spi: spi-mem: Estimate the time taken by operations In the SPI-NAND layer, we currently make list of operation variants from the fastest one to the slowest and there is a bit of logic in the core to go over them and pick the first one that is supported by the controller, ie. the fastest one among the supported ops. 
This kind of logic only works if all operations run at the same frequency, but as soon as we introduce per operation max frequencies it is no longer as obvious which operation will be faster, especially since it also depends on the PCB/controller frequency limitation. One way to make this choice more clever is to go over all the variants and for each of them derive an indicator which will help derive the theoretical best. In this case, we derive a theoretical duration for the entire operation and we take the smallest one. Add a helper that parses the spi-mem operation and returns this value. Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20250110-winbond-6-11-rc1-quad-support-v3-20-7ab4bd56cf6e@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 306c05dd1378..c4830dfaff3d 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -424,6 +424,7 @@ bool spi_mem_default_supports_op(struct spi_mem *mem, int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op); void spi_mem_adjust_op_freq(struct spi_mem *mem, struct spi_mem_op *op); +u64 spi_mem_calc_op_duration(struct spi_mem_op *op); bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op); -- cgit v1.2.3 From d7476f24c9aa93d02ef3fd8d587a6114387b7667 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jan 2025 20:45:38 +0000 Subject: export: Add __gendwarfksyms_ptr_ references to exported symbols With gendwarfksyms, we need each TU where the EXPORT_SYMBOL() macro is used to also contain DWARF type information for the symbols it exports. However, as a TU can also export external symbols and compilers may choose not to emit debugging information for symbols not defined in the current TU, the missing types will result in missing symbol versions.
Stand-alone assembly code also doesn't contain type information for exported symbols, so we need to compile a temporary object file with asm-prototypes.h instead, and similarly need to ensure the DWARF in the temporary object file contains the necessary types. To always emit type information for external exports, add explicit __gendwarfksyms_ptr_ references to them in EXPORT_SYMBOL(). gendwarfksyms will use the type information for __gendwarfksyms_ptr_* if needed. Discard the pointers from the final binary to avoid further bloat. Signed-off-by: Sami Tolvanen Signed-off-by: Masahiro Yamada --- include/linux/export.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 2633df4d31e6..a8c23d945634 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -52,9 +52,24 @@ #else +#ifdef CONFIG_GENDWARFKSYMS +/* + * With CONFIG_GENDWARFKSYMS, ensure the compiler emits debugging + * information for all exported symbols, including those defined in + * different TUs, by adding a __gendwarfksyms_ptr_ pointer + * that's discarded during the final link. + */ +#define __GENDWARFKSYMS_EXPORT(sym) \ + static typeof(sym) *__gendwarfksyms_ptr_##sym __used \ + __section(".discard.gendwarfksyms") = &sym; +#else +#define __GENDWARFKSYMS_EXPORT(sym) +#endif + #define __EXPORT_SYMBOL(sym, license, ns) \ extern typeof(sym) sym; \ __ADDRESSABLE(sym) \ + __GENDWARFKSYMS_EXPORT(sym) \ asm(__stringify(___EXPORT_SYMBOL(sym, license, ns))) #endif -- cgit v1.2.3 From cf337105ad38564d7855151889a7315da73119d0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 8 Jan 2025 16:47:33 +0000 Subject: net: phy: add configuration of rx clock stop mode Add a function to allow configuration of the PCS's clock stop enable bit, used to configure whether the xMII receive clock can be stopped during LPI mode. 
Reviewed-by: Andrew Lunn Tested-by: Choong Yong Liang Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tVZDR-0002Jl-Ry@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 5bc71d59910c..4875465653ca 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2096,6 +2096,7 @@ int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); +int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); -- cgit v1.2.3 From 21520e74ba454c549f4f732d014f180f8c0c041c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 Jan 2025 16:49:24 -0800 Subject: net: hide the definition of dev_get_by_napi_id() There are no module callers of dev_get_by_napi_id(), and commit d1cacd747768 ("netdev: prevent accessing NAPI instances from another namespace") proves that getting NAPI by id needs to be done with care. So hide dev_get_by_napi_id(). 
Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250110004924.3212260-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1812564b5204..aeb4a6cff171 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3252,7 +3252,6 @@ struct net_device *netdev_get_by_index(struct net *net, int ifindex, struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); -struct net_device *dev_get_by_napi_id(unsigned int napi_id); void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 30e77e0fbec6940ecc5c79ffe0f076c54cf5a8d9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:34 +0900 Subject: nvme: Move opcode string helper functions declarations Move the declaration of all helper functions converting NVMe command opcodes and status codes into strings from drivers/nvme/host/nvme.h into include/linux/nvme.h, together with the commands definitions. This allows NVMe target drivers to call these functions without having to include a host header file. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 13377dde4527..a5a4ee56efcf 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1896,6 +1896,46 @@ static inline bool nvme_is_fabrics(const struct nvme_command *cmd) return cmd->common.opcode == nvme_fabrics_command; } +#ifdef CONFIG_NVME_VERBOSE_ERRORS +const char *nvme_get_error_status_str(u16 status); +const char *nvme_get_opcode_str(u8 opcode); +const char *nvme_get_admin_opcode_str(u8 opcode); +const char *nvme_get_fabrics_opcode_str(u8 opcode); +#else /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const char *nvme_get_error_status_str(u16 status) +{ + return "I/O Error"; +} +static inline const char *nvme_get_opcode_str(u8 opcode) +{ + return "I/O Cmd"; +} +static inline const char *nvme_get_admin_opcode_str(u8 opcode) +{ + return "Admin Cmd"; +} + +static inline const char *nvme_get_fabrics_opcode_str(u8 opcode) +{ + return "Fabrics Cmd"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ + +static inline const char *nvme_opcode_str(int qid, u8 opcode) +{ + return qid ? nvme_get_opcode_str(opcode) : + nvme_get_admin_opcode_str(opcode); +} + +static inline const char *nvme_fabrics_opcode_str( + int qid, const struct nvme_command *cmd) +{ + if (nvme_is_fabrics(cmd)) + return nvme_get_fabrics_opcode_str(cmd->fabrics.fctype); + + return nvme_opcode_str(qid, cmd->common.opcode); +} + struct nvme_error_slot { __le64 error_count; __le16 sqid; -- cgit v1.2.3 From 200adac75888182c09027e9b7852507dabd87034 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:39 +0900 Subject: nvme: Add PCI transport type Define the transport type NVMF_TRTYPE_PCI for PCI endpoint targets. 
This transport type is defined using the value 0 which is reserved in the NVMe base specifications v2.1 (Figure 294). Given that struct nvmet_port are zeroed out on creation, to avoid having this transport type becoming the new default, nvmet_referral_make() and nvmet_ports_make() are modified to initialize a port discovery address transport type field (disc_addr.trtype) to NVMF_TRTYPE_MAX. Any port using this transport type is also skipped and not reported in the discovery log page (nvmet_execute_disc_get_log_page()). The helper function nvmet_is_pci_ctrl() is also introduced to check if a target controller uses the PCI transport. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a5a4ee56efcf..42fc00dc494e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -64,6 +64,7 @@ enum { /* Transport Type codes for Discovery Log Page entry TRTYPE field */ enum { + NVMF_TRTYPE_PCI = 0, /* PCI */ NVMF_TRTYPE_RDMA = 1, /* RDMA */ NVMF_TRTYPE_FC = 2, /* Fibre Channel */ NVMF_TRTYPE_TCP = 3, /* TCP/IP */ -- cgit v1.2.3 From 2f2b20fad973d00169d24f5338eb1bf0a42e8218 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:46 +0900 Subject: nvmet: Implement host identifier set feature support The NVMe specifications mandate support for the host identifier set_features for controllers that also support reservations. Satisfy this requirement by implementing handling of the NVME_FEAT_HOST_ID feature for the nvme_set_features command. This implementation is for now effective only for PCI target controllers. For other controller types, the set features command is failed with a NVME_SC_CMD_SEQ_ERROR status as before.
As noted in the code, 128 bits host identifiers are supported since the NVMe base specifications version 2.1 indicate in section 5.1.25.1.28.1 that "The controller may support a 64-bit Host Identifier...". The RHII (Reservations and Host Identifier Interaction) bit of the controller attribute (ctratt) field of the identify controller data is also set to indicate that a host ID of "0" is supported but that the host ID must be a non-zero value to use reservations. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 42fc00dc494e..fe3b60818fdc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -276,6 +276,7 @@ enum nvme_ctrl_attr { NVME_CTRL_ATTR_HID_128_BIT = (1 << 0), NVME_CTRL_ATTR_TBKAS = (1 << 6), NVME_CTRL_ATTR_ELBAS = (1 << 15), + NVME_CTRL_ATTR_RHII = (1 << 18), }; struct nvme_id_ctrl { -- cgit v1.2.3 From 0f52b4db4f91320569311b97a1a14a18fb8ff256 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:04 +0100 Subject: rcu/kvfree: Initialize kvfree_rcu() separately Introduce a separate initialization of kvfree_rcu() functionality. For such purpose a kfree_rcu_batch_init() is renamed to a kvfree_rcu_init() and it is invoked from the main.c right after rcu_init() is done. 
Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..acb0095b4dbe 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,6 +118,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) /* Internal to kernel */ void rcu_init(void); +void __init kvfree_rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -- cgit v1.2.3 From bbe658d6580251d1832d408fa8a71ec254dc4416 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:08 +0100 Subject: mm/slab: Move kvfree_rcu() into SLAB Move kvfree_rcu() functionality to the slab_common.c file. The reason to have kvfree_rcu() functionality as part of SLAB is that there is a clear trend and need of closer integration. One of the recent examples is creating a barrier function for SLAB caches. Another reason is to prevent having several implementations of RCU machinery for reclaiming objects after a GP. As future steps, it can be more integrated (easier) with SLAB internals.
Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 1 - include/linux/slab.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index acb0095b4dbe..48e5c03df1dd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,7 +118,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) /* Internal to kernel */ void rcu_init(void); -void __init kvfree_rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); diff --git a/include/linux/slab.h b/include/linux/slab.h index 10a971c2bde3..09eedaecf120 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1099,5 +1099,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s); size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); +void __init kvfree_rcu_init(void); #endif /* _LINUX_SLAB_H */ -- cgit v1.2.3 From 387bef82d0b4afd4c7430b52c4971649a5cf3b06 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 9 Jan 2025 22:42:28 +0200 Subject: net/mlx5: Update mlx5_ifc to support FEC for 200G per lane link modes Add FEC admin and override related fields in PPLM, and the bit in PCAM to indicate those fields are supported. 
Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-2-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 43b3cb4bf8d1..c3da1581853c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10150,7 +10150,21 @@ struct mlx5_ifc_pplm_reg_bits { u8 fec_override_admin_200g_2x[0x10]; u8 fec_override_admin_100g_1x[0x10]; - u8 reserved_at_260[0x20]; + u8 reserved_at_260[0x60]; + + u8 fec_override_cap_1600g_8x[0x10]; + u8 fec_override_cap_800g_4x[0x10]; + + u8 fec_override_cap_400g_2x[0x10]; + u8 fec_override_cap_200g_1x[0x10]; + + u8 fec_override_admin_1600g_8x[0x10]; + u8 fec_override_admin_800g_4x[0x10]; + + u8 fec_override_admin_400g_2x[0x10]; + u8 fec_override_admin_200g_1x[0x10]; + + u8 reserved_at_340[0x80]; }; struct mlx5_ifc_ppcnt_reg_bits { @@ -10524,7 +10538,9 @@ struct mlx5_ifc_mtutc_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x48]; + u8 reserved_at_0[0x1d]; + u8 fec_200G_per_lane_in_pplm[0x1]; + u8 reserved_at_1e[0x2a]; u8 fec_100G_per_lane_in_pplm[0x1]; u8 reserved_at_49[0x1f]; u8 fec_50G_per_lane_in_pplm[0x1]; -- cgit v1.2.3 From e2685ef5f56295249bf98bc6603d3c092fe0ce56 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 9 Jan 2025 22:42:29 +0200 Subject: net/mlx5: Add support for MRTCQ register Management Real Time Clock Query (MRTCQ) register is used to query hardware clock identity. 
Signed-off-by: Jianbo Liu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-3-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f6fe29bc4be 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -160,6 +160,7 @@ enum { MLX5_REG_MIRC = 0x9162, MLX5_REG_MTPTM = 0x9180, MLX5_REG_MTCTR = 0x9181, + MLX5_REG_MRTCQ = 0x9182, MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, MLX5_REG_DTOR = 0xC00E, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c3da1581853c..221146278ac8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10680,7 +10680,8 @@ struct mlx5_ifc_mcam_access_reg_bits3 { u8 regs_63_to_32[0x20]; - u8 regs_31_to_2[0x1e]; + u8 regs_31_to_3[0x1d]; + u8 mrtcq[0x1]; u8 mtctr[0x1]; u8 mtptm[0x1]; }; @@ -13171,4 +13172,12 @@ struct mlx5_ifc_msees_reg_bits { u8 reserved_at_80[0x180]; }; +struct mlx5_ifc_mrtcq_reg_bits { + u8 reserved_at_0[0x40]; + + u8 rt_clock_identity[0x40]; + + u8 reserved_at_80[0x180]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From df75ad562a6f9ae6add42d56e228aa973b421421 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 9 Jan 2025 22:42:30 +0200 Subject: net/mlx5: SHAMPO: Introduce new SHAMPO specific HCA caps Read and cache SHAMPO specific caps for header data split capabilities. Will be used in downstream patch. 
Signed-off-by: Saeed Mahameed Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-4-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 4 ++++ include/linux/mlx5/mlx5_ifc.h | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index cc647992f3d1..0c48b20f818a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1245,6 +1245,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, MLX5_CAP_CRYPTO = 0x1a, + MLX5_CAP_SHAMPO = 0x1d, MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, @@ -1470,6 +1471,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_MACSEC(mdev, cap)\ MLX5_GET(macsec_cap, (mdev)->caps.hca[MLX5_CAP_MACSEC]->cur, cap) +#define MLX5_CAP_SHAMPO(mdev, cap) \ + MLX5_GET(shampo_cap, mdev->caps.hca[MLX5_CAP_SHAMPO]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 221146278ac8..d7c91f152735 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2327,7 +2327,9 @@ struct mlx5_ifc_wq_bits { u8 headers_mkey[0x20]; u8 shampo_enable[0x1]; - u8 reserved_at_1e1[0x4]; + u8 reserved_at_1e1[0x1]; + u8 shampo_mode[0x2]; + u8 reserved_at_1e4[0x1]; u8 log_reservation_size[0x3]; u8 reserved_at_1e8[0x5]; u8 log_max_num_of_packets_per_reservation[0x3]; @@ -3699,6 +3701,22 @@ struct mlx5_ifc_crypto_cap_bits { u8 reserved_at_80[0x780]; }; +struct mlx5_ifc_shampo_cap_bits { + u8 reserved_at_0[0x3]; + u8 shampo_log_max_reservation_size[0x5]; + u8 reserved_at_8[0x3]; + u8 shampo_log_min_reservation_size[0x5]; + u8 shampo_min_mss_size[0x10]; + + u8 shampo_header_split[0x1]; + u8 shampo_header_split_data_merge[0x1]; + u8 
reserved_at_22[0x1]; + u8 shampo_log_max_headers_entry_size[0x5]; + u8 reserved_at_28[0x18]; + + u8 reserved_at_40[0x7c0]; +}; + union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2; -- cgit v1.2.3 From 6ca00ec47b70acb7a06cf5c79f6bec6074cef008 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Thu, 9 Jan 2025 22:42:31 +0200 Subject: net/mlx5: Add nic_cap_reg and vhca_icm_ctrl registers Add nic_cap_reg and vhca_icm_ctrl registers interfaces for exposing ICM consumption. Signed-off-by: Akiva Goldberger Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-5-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8f6fe29bc4be..b957391529b3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -163,7 +163,9 @@ enum { MLX5_REG_MRTCQ = 0x9182, MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, + MLX5_REG_NIC_CAP = 0xC00D, MLX5_REG_DTOR = 0xC00E, + MLX5_REG_VHCA_ICM_CTRL = 0xC010, }; enum mlx5_qpts_trust_state { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d7c91f152735..2a40b1fd50e8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1830,7 +1830,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 regexp_params[0x1]; u8 uar_sz[0x6]; u8 port_selection_cap[0x1]; - u8 reserved_at_251[0x1]; + u8 nic_cap_reg[0x1]; u8 umem_uid_0[0x1]; u8 reserved_at_253[0x5]; u8 log_pg_sz[0x8]; @@ -3327,6 +3327,14 @@ struct mlx5_ifc_dropped_packet_logged_bits { u8 reserved_at_0[0xe0]; }; +struct mlx5_ifc_nic_cap_reg_bits { + u8 reserved_at_0[0x1a]; + u8 vhca_icm_ctrl[0x1]; + u8 
reserved_at_1b[0x5]; + + u8 reserved_at_20[0x60]; +}; + struct mlx5_ifc_default_timeout_bits { u8 to_multiplier[0x3]; u8 reserved_at_3[0x9]; @@ -3363,6 +3371,18 @@ struct mlx5_ifc_dtor_reg_bits { u8 reserved_at_1c0[0x20]; }; +struct mlx5_ifc_vhca_icm_ctrl_reg_bits { + u8 vhca_id_valid[0x1]; + u8 reserved_at_1[0xf]; + u8 vhca_id[0x10]; + + u8 reserved_at_20[0xa0]; + + u8 cur_alloc_icm[0x20]; + + u8 reserved_at_e0[0x120]; +}; + enum { MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN = 0x1, MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR = 0x2, -- cgit v1.2.3 From 0d2c022ffa7ccbd5bed5b18eefbda7ef2df4f81f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 19 Dec 2024 23:03:37 +0100 Subject: i3c: fix kdoc parameter description for module_i3c_i2c_driver() A typo mentioned I3C when it should have been I2C. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20241219220338.10315-1-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 0a8a44ac2f02..b674f64d0822 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -283,7 +283,7 @@ static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, * module_i3c_i2c_driver() - Register a module providing an I3C and an I2C * driver * @__i3cdrv: the I3C driver to register - * @__i2cdrv: the I3C driver to register + * @__i2cdrv: the I2C driver to register * * Provide generic init/exit functions that simply register/unregister an I3C * and an I2C driver. -- cgit v1.2.3 From c320592f3f2a1e6a4e69e5db8e76fc66934a0a78 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 7 Jan 2025 10:01:59 +0100 Subject: bitops: add generic parity calculation for u8 There are multiple open coded implementations for getting the parity of a byte in the kernel, even using different approaches. 
Take the pretty efficient version from SPD5118 driver and make it generally available by putting it into the bitops header. As long as there is just one parity calculation helper, the creation of a distinct 'parity.h' header was discarded. Also, the usage of hweight8() for architectures having a popcnt instruction is postponed until a use case within hot paths is desired. The motivation for this patch is the frequent use of odd parity in the I3C specification and to simplify drivers there. Changes compared to the original SPD5118 version are the addition of kernel documentation, switching the return type from bool to int, and renaming the argument of the function. Signed-off-by: Wolfram Sang Tested-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Acked-by: Yury Norov Reviewed-by: Kuan-Wei Chiu Tested-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20250107090204.6593-2-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- include/linux/bitops.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index ba35bbf07798..c1cb53cf2f0f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -229,6 +229,37 @@ static inline int get_count_order_long(unsigned long l) return (int)fls_long(--l); } +/** + * parity8 - get the parity of an u8 value + * @value: the value to be examined + * + * Determine the parity of the u8 argument. + * + * Returns: + * 0 for even parity, 1 for odd parity + * + * Note: This function informs you about the current parity. Example to bail + * out when parity is odd: + * + * if (parity8(val) == 1) + * return -EBADMSG; + * + * If you need to calculate a parity bit, you need to draw the conclusion from + * this result yourself. 
Example to enforce odd parity, parity bit is bit 7: + * + * if (parity8(val) == 0) + * val ^= BIT(7); + */ +static inline int parity8(u8 val) +{ + /* + * One explanation of this algorithm: + * https://funloop.org/codex/problem/parity/README.html + */ + val ^= val >> 4; + return (0x6996 >> (val & 0xf)) & 1; +} + /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word -- cgit v1.2.3 From 4bcf29741145e73440323e3e9af8b1a6f4961183 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 7 Jan 2025 16:34:57 +0100 Subject: module: fix writing of livepatch relocations in ROX text A livepatch module can contain a special relocation section .klp.rela.<objname>.<secname> to apply its relocations at the appropriate time and to additionally access local and unexported symbols. When <objname> points to another module, such relocations are processed separately from the regular module relocation process. For instance, only when the target actually becomes loaded. With CONFIG_STRICT_MODULE_RWX, when the livepatch core decides to apply these relocations, their processing results in the following bug: [ 25.827238] BUG: unable to handle page fault for address: 00000000000012ba [ 25.827819] #PF: supervisor read access in kernel mode [ 25.828153] #PF: error_code(0x0000) - not-present page [ 25.828588] PGD 0 P4D 0 [ 25.829063] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI [ 25.829742] CPU: 2 UID: 0 PID: 452 Comm: insmod Tainted: G O K 6.13.0-rc4-00078-g059dd502b263 #7820 [ 25.830417] Tainted: [O]=OOT_MODULE, [K]=LIVEPATCH [ 25.830768] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-20220807_005459-localhost 04/01/2014 [ 25.831651] RIP: 0010:memcmp+0x24/0x60 [ 25.832190] Code: [...]
[ 25.833378] RSP: 0018:ffffa40b403a3ae8 EFLAGS: 00000246 [ 25.833637] RAX: 0000000000000000 RBX: ffff93bc81d8e700 RCX: ffffffffc0202000 [ 25.834072] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 00000000000012ba [ 25.834548] RBP: ffffa40b403a3b68 R08: ffffa40b403a3b30 R09: 0000004a00000002 [ 25.835088] R10: ffffffffffffd222 R11: f000000000000000 R12: 0000000000000000 [ 25.835666] R13: ffffffffc02032ba R14: ffffffffc007d1e0 R15: 0000000000000004 [ 25.836139] FS: 00007fecef8c3080(0000) GS:ffff93bc8f900000(0000) knlGS:0000000000000000 [ 25.836519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 25.836977] CR2: 00000000000012ba CR3: 0000000002f24000 CR4: 00000000000006f0 [ 25.837442] Call Trace: [ 25.838297] <TASK> [ 25.841083] __write_relocate_add.constprop.0+0xc7/0x2b0 [ 25.841701] apply_relocate_add+0x75/0xa0 [ 25.841973] klp_write_section_relocs+0x10e/0x140 [ 25.842304] klp_write_object_relocs+0x70/0xa0 [ 25.842682] klp_init_object_loaded+0x21/0xf0 [ 25.842972] klp_enable_patch+0x43d/0x900 [ 25.843572] do_one_initcall+0x4c/0x220 [ 25.844186] do_init_module+0x6a/0x260 [ 25.844423] init_module_from_file+0x9c/0xe0 [ 25.844702] idempotent_init_module+0x172/0x270 [ 25.845008] __x64_sys_finit_module+0x69/0xc0 [ 25.845253] do_syscall_64+0x9e/0x1a0 [ 25.845498] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 25.846056] RIP: 0033:0x7fecef9eb25d [ 25.846444] Code: [...]
[ 25.847563] RSP: 002b:00007ffd0c5d6de8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 25.848082] RAX: ffffffffffffffda RBX: 000055b03f05e470 RCX: 00007fecef9eb25d [ 25.848456] RDX: 0000000000000000 RSI: 000055b001e74e52 RDI: 0000000000000003 [ 25.848969] RBP: 00007ffd0c5d6ea0 R08: 0000000000000040 R09: 0000000000004100 [ 25.849411] R10: 00007fecefac7b20 R11: 0000000000000246 R12: 000055b001e74e52 [ 25.849905] R13: 0000000000000000 R14: 000055b03f05e440 R15: 0000000000000000 [ 25.850336] </TASK> [ 25.850553] Modules linked in: deku(OK+) uinput [ 25.851408] CR2: 00000000000012ba [ 25.852085] ---[ end trace 0000000000000000 ]--- The problem is that the .klp.rela.<objname>.<secname> relocations are processed after the module was already formed and mod->rw_copy was reset. However, the code in __write_relocate_add() calls module_writable_address() which translates the target address 'loc' still to 'loc + (mem->rw_copy - mem->base)', with mem->rw_copy now being 0. Fix the problem by returning directly 'loc' in module_writable_address() when the module is already formed. Function __write_relocate_add() knows to use text_poke() in such a case.
Link: https://lkml.kernel.org/r/20250107153507.14733-1-petr.pavlu@suse.com Fixes: 0c133b1e78cd ("module: prepare to handle ROX allocations for text") Signed-off-by: Petr Pavlu Reported-by: Marek Maslanka Closes: https://lore.kernel.org/linux-modules/CAGcaFA2hdThQV6mjD_1_U+GNHThv84+MQvMWLgEuX+LVbAyDxg@mail.gmail.com/ Reviewed-by: Petr Mladek Tested-by: Petr Mladek Cc: Joe Lawrence Cc: Josh Poimboeuf Cc: Luis Chamberlain Cc: Mike Rapoport (Microsoft) Cc: Petr Mladek Signed-off-by: Andrew Morton --- include/linux/module.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 94acbacdcdf1..b3a643435357 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -773,7 +773,8 @@ void *__module_writable_address(struct module *mod, void *loc); static inline void *module_writable_address(struct module *mod, void *loc) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod) + if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || + mod->state != MODULE_STATE_UNFORMED) return loc; return __module_writable_address(mod, loc); } -- cgit v1.2.3 From 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 7 Jan 2025 14:47:52 +0000 Subject: mm: clear uffd-wp PTE/PMD state on mremap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set. 
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths. Link: https://lkml.kernel.org/r/20250107144755.1871363-2-ryan.roberts@arm.com Co-developed-by: Mikołaj Lenczewski Signed-off-by: Mikołaj Lenczewski Signed-off-by: Ryan Roberts Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.com/ Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl") Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Muchun Song Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index cb40f1a1d081..75342022d144 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, vma_is_shmem(vma); } +static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; + + return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0; +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); void dup_userfaultfd_fail(struct list_head *); @@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) return false; } +static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) -- cgit 
v1.2.3 From ec01e9d001fef6278df1900df4207c70166095b4 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 30 Nov 2024 02:12:19 +0800 Subject: lib min_heap: improve type safety in min_heap macros by using container_of Patch series "lib min_heap: Improve min_heap safety, testing, and documentation". Improve the min heap implementation by enhancing type safety with container_of, reducing the attack vector by replacing test function calls with inline variants, and adding a brief API introduction in min_heap.h. It also includes author information in Documentation/core-api/min_heap.rst. This patch (of 4): The current implementation of min_heap macros uses explicit casting to min_heap_char *, which prevents the compiler from detecting incorrect pointer types. This can lead to errors if non-min_heap pointers are passed inadvertently. To enhance safety, replace all explicit casts to min_heap_char * with the use of container_of(&(_heap)->nr, min_heap_char, nr). This approach ensures that the _heap parameter is indeed a min_heap_char-compatible structure, allowing the compiler to catch improper usages. 
Link: https://lkml.kernel.org/r/20241129181222.646855-1-visitorckw@gmail.com Link: https://lore.kernel.org/lkml/CAMuHMdVO5DPuD9HYWBFqKDHphx7+0BEhreUxtVC40A=8p6VAhQ@mail.gmail.com Link: https://lkml.kernel.org/r/20241129181222.646855-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Geert Uytterhoeven Cc: Ching-Chun (Jim) Huang Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 61 +++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index e781727c8916..456cfbc1b8f5 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -218,7 +218,7 @@ void __min_heap_init_inline(min_heap_char *heap, void *data, int size) } #define min_heap_init_inline(_heap, _data, _size) \ - __min_heap_init_inline((min_heap_char *)_heap, _data, _size) + __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) /* Get the minimum element from the heap. */ static __always_inline @@ -228,7 +228,8 @@ void *__min_heap_peek_inline(struct min_heap_char *heap) } #define min_heap_peek_inline(_heap) \ - (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) + (__minheap_cast(_heap) \ + __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr))) /* Check if the heap is full. */ static __always_inline @@ -238,7 +239,7 @@ bool __min_heap_full_inline(min_heap_char *heap) } #define min_heap_full_inline(_heap) \ - __min_heap_full_inline((min_heap_char *)_heap) + __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr)) /* Sift the element at pos down the heap. 
*/ static __always_inline @@ -277,8 +278,8 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, } #define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ - __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline @@ -304,8 +305,8 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx } #define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ - __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ - _func, _args) + __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline @@ -319,7 +320,8 @@ void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, } #define min_heapify_all_inline(_heap, _func, _args) \ - __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline @@ -340,7 +342,8 @@ bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, } #define min_heap_pop_inline(_heap, _func, _args) \ - __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. 
The @@ -356,8 +359,8 @@ void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t } #define min_heap_pop_push_inline(_heap, _element, _func, _args) \ - __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline @@ -382,8 +385,8 @@ bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t ele } #define min_heap_push_inline(_heap, _element, _func, _args) \ - __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) /* Remove ith element from the heap, O(log2(nr)). */ static __always_inline @@ -411,8 +414,8 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, } #define min_heap_del_inline(_heap, _idx, _func, _args) \ - __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ - _func, _args) + __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) void __min_heap_init(min_heap_char *heap, void *data, int size); void *__min_heap_peek(struct min_heap_char *heap); @@ -433,25 +436,31 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args); #define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) + __min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) #define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) + (__minheap_cast(_heap) __min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr))) #define min_heap_full(_heap) \ - 
__min_heap_full((min_heap_char *)_heap) + __min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr)) #define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) + __min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + __min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) #define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + __min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ - __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + __min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) #endif /* _LINUX_MIN_HEAP_H */ -- cgit v1.2.3 From 
2ad0546deb0214e385dbf33bb7a5e26c0dda3ad1 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 30 Nov 2024 02:12:21 +0800 Subject: lib min_heap: add brief introduction to Min Heap API A short description of the Min Heap API is added to the min_heap.h, explaining its purpose for managing min-heaps and emphasizing the use of macro wrappers instead of direct function calls. For more details, users are directed to the documentation at Documentation/core-api/min_heap.rst. Link: https://lkml.kernel.org/r/20241129181222.646855-4-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Geert Uytterhoeven Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 456cfbc1b8f5..55bfe670bbb9 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -6,6 +6,17 @@ #include #include +/* + * The Min Heap API provides utilities for managing min-heaps, a binary tree + * structure where each node's value is less than or equal to its children's + * values, ensuring the smallest element is at the root. + * + * Users should avoid directly calling functions prefixed with __min_heap_*(). + * Instead, use the provided macro wrappers. + * + * For further details and examples, refer to Documentation/core-api/min_heap.rst. + */ + /** * Data structure to hold a min-heap. * @nr: Number of elements currently in the heap. -- cgit v1.2.3 From 658eb5ab916ddc92f294dbce8e3d449470be9f86 Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Tue, 3 Dec 2024 16:48:48 +0800 Subject: delayacct: add delay max to record delay peak Introduce the use cases of delay max, which can help quickly detect potential abnormal delays in the system and record the types and specific details of delay spikes. Problem ======== Delay accounting can track the average delay of processes to show system workload. 
However, when a process experiences a significant delay, maybe a delay spike, which adversely affects performance, getdelays can only display the average system delay over a period of time. Yet, average delay is unhelpful for diagnosing delay peak. It is not even possible to determine which type of delay has spiked, as this information might be masked by the average delay. Solution ========= the 'delay max' can display delay peak since the system's startup, which can record potential abnormal delays over time, including the type of delay and the maximum delay. This is helpful for quickly identifying crash caused by delay. Use case ========= bash# ./getdelays -d -p 244 print delayacct stats ON PID 244 CPU count real total virtual total delay total delay average delay max 68 192000000 213676651 705643 0.010ms 0.306381ms IO count delay total delay average delay max 0 0 0.000ms 0.000000ms SWAP count delay total delay average delay max 0 0 0.000ms 0.000000ms RECLAIM count delay total delay average delay max 0 0 0.000ms 0.000000ms THRASHING count delay total delay average delay max 0 0 0.000ms 0.000000ms COMPACT count delay total delay average delay max 0 0 0.000ms 0.000000ms WPCOPY count delay total delay average delay max 235 15648284 0.067ms 0.263842ms IRQ count delay total delay average delay max 0 0 0.000ms 0.000000ms [wang.yaxin@zte.com.cn: update docs and fix some spelling errors] Link: https://lkml.kernel.org/r/20241213192700771XKZ8H30OtHSeziGqRVMs0@zte.com.cn Link: https://lkml.kernel.org/r/20241203164848805CS62CQPQWG9GLdQj2_BxS@zte.com.cn Co-developed-by: Wang Yong Signed-off-by: Wang Yong Co-developed-by: xu xin Signed-off-by: xu xin Co-developed-by: Wang Yaxin Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Cc: Balbir Singh Cc: David Hildenbrand Cc: Fan Yu Cc: Peilin He Cc: tuqiang Cc: Yang Yang Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 7 +++++++ include/linux/sched.h | 3 +++ 2 files changed, 10 
insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6639f48dac36..56fbfa2c2ac5 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -29,25 +29,32 @@ struct task_delay_info { * XXX_delay contains the accumulated delay time in nanoseconds. */ u64 blkio_start; + u64 blkio_delay_max; u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_start; + u64 swapin_delay_max; u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ u32 swapin_count; /* total count of swapin */ u64 freepages_start; + u64 freepages_delay_max; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; + u64 thrashing_delay_max; u64 thrashing_delay; /* wait for thrashing page */ u64 compact_start; + u64 compact_delay_max; u64 compact_delay; /* wait for memory compact */ u64 wpcopy_start; + u64 wpcopy_delay_max; u64 wpcopy_delay; /* wait for write-protect copy */ + u64 irq_delay_max; u64 irq_delay; /* wait for IRQ/SOFTIRQ */ u32 freepages_count; /* total count of memory reclaim */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 64934e0830af..a0ae3923b41d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -398,6 +398,9 @@ struct sched_info { /* Time spent waiting on a runqueue: */ unsigned long long run_delay; + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? */ -- cgit v1.2.3 From 7a77edf45a05615775d1e423a7309b9f06e866ec Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Dec 2024 14:20:44 +0100 Subject: include: update references to include/asm-<arch> "include/asm-<arch>" was replaced by "arch/<arch>/include/asm" a long time ago.
Link: https://lkml.kernel.org/r/541258219b0441fa1da890e2f8458a7ac18c2ef9.1733404444.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Cc: Andy Whitcroft Cc: Arnd Bergmann Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Oleg Nesterov Cc: Rasmus Villemoes Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/bitmap.h | 2 +- include/linux/types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 262b6596eca5..2026953e2c4e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -23,7 +23,7 @@ struct device; * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture - * specific are in various include/asm-<arch>/bitops.h headers + * specific are in various arch/<arch>/include/asm/bitops.h headers * and other arch/<arch> specific files. * * See lib/bitmap.c for more details. diff --git a/include/linux/types.h b/include/linux/types.h index 2d7b9ae8714c..1c509ce8f7f6 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -43,7 +43,7 @@ typedef unsigned long uintptr_t; typedef long intptr_t; #ifdef CONFIG_HAVE_UID16 -/* This is defined by include/asm-{arch}/posix_types.h */ +/* This is defined by arch/{arch}/include/asm/posix_types.h */ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; #endif /* CONFIG_UID16 */ -- cgit v1.2.3 From 78346c34d20f571d6495aa50c735653c730b94ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Karol=20Pi=C4=85tkowski?= Date: Fri, 20 Dec 2024 18:12:12 +0000 Subject: kasan: fix typo in kasan_poison_new_object documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix presumed copy-paste typo of kasan_poison_new_object documentation referring to kasan_unpoison_new_object. No functional changes.
Link: https://lkml.kernel.org/r/20241220181205.9663-1-dominik.karol.piatkowski@protonmail.com Fixes: 1ce9a0523938 ("kasan: rename and document kasan_(un)poison_object_data") Signed-off-by: Dominik Karol Piątkowski Reviewed-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..56465af31044 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -153,7 +153,7 @@ static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void __kasan_poison_new_object(struct kmem_cache *cache, void *object); /** - * kasan_unpoison_new_object - Repoison a new slab object. + * kasan_poison_new_object - Repoison a new slab object. * @cache: Cache the object belong to. * @object: Pointer to the object. * -- cgit v1.2.3 From f65c64f311ee2f1ddc1eb395ed8b20e6b9d14e85 Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Fri, 20 Dec 2024 17:31:05 +0800 Subject: delayacct: add delay min to record delay peak Delay accounting can now calculate the average delay of processes, detect the overall system load, and also record the 'delay max' to identify potential abnormal delays. However, 'delay min' can help us identify another useful delay peak. By comparing the difference between 'delay max' and 'delay min', we can understand the optimization space for latency, providing a reference for the optimization of latency performance.
Use case ========= bash-4.4# ./getdelays -d -t 242 print delayacct stats ON TGID 242 CPU count real total virtual total delay total delay average delay max delay min 39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms IO count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms SWAP count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms RECLAIM count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms THRASHING count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms COMPACT count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms WPCOPY count delay total delay average delay max delay min 156 11215873 0.072ms 0.207403ms 0.033913ms IRQ count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms Link: https://lkml.kernel.org/r/20241220173105906EOdsPhzjMLYNJJBqgz1ga@zte.com.cn Co-developed-by: Wang Yong Signed-off-by: Wang Yong Co-developed-by: xu xin Signed-off-by: xu xin Signed-off-by: Wang Yaxin Co-developed-by: Kun Jiang Signed-off-by: Kun Jiang Cc: Balbir Singh Cc: David Hildenbrand Cc: Fan Yu Cc: Peilin He Cc: tuqiang Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 7 +++++++ include/linux/sched.h | 3 +++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 56fbfa2c2ac5..800dcc360db2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -30,9 +30,11 @@ struct task_delay_info { */ u64 blkio_start; u64 blkio_delay_max; + u64 blkio_delay_min; u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_start; u64 swapin_delay_max; + u64 swapin_delay_min; u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ @@ -40,21 +42,26 @@ struct task_delay_info { u64 
freepages_start; u64 freepages_delay_max; + u64 freepages_delay_min; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; u64 thrashing_delay_max; + u64 thrashing_delay_min; u64 thrashing_delay; /* wait for thrashing page */ u64 compact_start; u64 compact_delay_max; + u64 compact_delay_min; u64 compact_delay; /* wait for memory compact */ u64 wpcopy_start; u64 wpcopy_delay_max; + u64 wpcopy_delay_min; u64 wpcopy_delay; /* wait for write-protect copy */ u64 irq_delay_max; + u64 irq_delay_min; u64 irq_delay; /* wait for IRQ/SOFTIRQ */ u32 freepages_count; /* total count of memory reclaim */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a0ae3923b41d..155012467b21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -401,6 +401,9 @@ struct sched_info { /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? */ -- cgit v1.2.3 From 26a6cc10f19a058c24cbe3be2a4a10048e66d9c9 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 12 Jan 2025 18:33:52 +0800 Subject: usb: phy: Remove API devm_usb_put_phy() Static devm_usb_phy_match() is only called by API devm_usb_put_phy(), and the API has no caller now. Remove the API and the static function. 
Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250112-remove_api-v1-1-49cc8f792ac9@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/phy.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index e4de6bc1f69b..0fa9885a1038 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -223,7 +223,6 @@ extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); -extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); @@ -259,10 +258,6 @@ static inline void usb_put_phy(struct usb_phy *x) { } -static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) -{ -} - static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } -- cgit v1.2.3 From 90dde9a13c0020ce140bc8d27c1f4c48a070cc97 Mon Sep 17 00:00:00 2001 From: "Roger L. Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:50 +1030 Subject: rbtree: add rb_find_add_cached() to rbtree.h Adds rb_find_add_cached() as a helper function for use with red-black trees. Used in btrfs to reduce boilerplate code. And since it's a new helper, the cmp() function will require both parameters to be const rb_node pointers. Suggested-by: Josef Bacik Signed-off-by: Roger L. 
Beckermeyer III Acked-by: Peter Zijlstra (Intel) Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/rbtree.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 7c173aa64e1e..8d2ba3749866 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -210,6 +210,43 @@ rb_add(struct rb_node *node, struct rb_root *tree, rb_insert_color(node, tree); } +/** + * rb_find_add_cached() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, + int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) +{ + bool leftmost = true; + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) { + link = &parent->rb_left; + } else if (c > 0) { + link = &parent->rb_right; + leftmost = false; + } else { + return parent; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); + return NULL; +} + /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert -- cgit v1.2.3 From 2bf502251b3ba0734aad81317d62e13389b89a5d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Jan 2025 07:05:27 +0200 Subject: wifi: cfg80211: check extended MLD capa/ops in assoc Check that additionally extended MLD capa/ops for the MLD is consistent, i.e. the same value is reported by all affiliated APs/links. 
Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250101070249.e29f42c7ae21.Ib2cdce608321ad154e4b13103cc315c3e3cb6b2b@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 05dedc45505c..9c0e2617fe8f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4961,6 +4961,7 @@ struct ieee80211_multi_link_elem { #define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 #define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 #define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 +#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP 0x0400 #define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff #define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 @@ -5226,6 +5227,47 @@ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) return get_unaligned_le16(common); } +/** + * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities + * and operations. + * @data: pointer to the multi-link element + * Return: the extended MLD capabilities and operations field value from + * the multi-link element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). 
+ */ +static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + + return get_unaligned_le16(common); +} + /** * ieee80211_mle_get_mld_id - returns the MLD ID * @data: pointer to the multi-link element @@ -5298,6 +5340,8 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) common += 2; if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP) + common += 2; break; case IEEE80211_ML_CONTROL_TYPE_PREQ: common += sizeof(struct ieee80211_mle_preq_common_info); -- cgit v1.2.3 From 98934687f8a871ea2bf90be6590daddd1a130cdd Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 1 Jan 2025 07:05:33 +0200 Subject: wifi: mac80211: skip all known membership selectors The GLK and EPD Selectors are also not rates, so add a new macro for the minimum value of a selector and test against that instead of the entire list. Also fix the typo in the EPD selector define. 
Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250101070249.2c19a2dc53db.If187b7d93d8b43a6c70e422c837b7636538fb358@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9c0e2617fe8f..745c3b125d97 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1542,11 +1542,13 @@ struct ieee80211_mgmt { #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 #define BSS_MEMBERSHIP_SELECTOR_GLK 125 -#define BSS_MEMBERSHIP_SELECTOR_EPS 124 +#define BSS_MEMBERSHIP_SELECTOR_EPD 124 #define BSS_MEMBERSHIP_SELECTOR_SAE_H2E 123 #define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 #define BSS_MEMBERSHIP_SELECTOR_EHT_PHY 121 +#define BSS_MEMBERSHIP_SELECTOR_MIN BSS_MEMBERSHIP_SELECTOR_EHT_PHY + /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) -- cgit v1.2.3 From fa2a71a3b9ed1a333f1bed30ffe758cc150a399a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:19:53 +0200 Subject: wifi: ieee80211: Add some missing MLO related definitions As a preparation to support ML reconfiguration request and response, add additional ML reconfiguration definitions required to support the flow. See Section 9.4.2.321.4 in Draft P802.11be_D6.0. 
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.4970ca10ebfd.Ibe7f6108cd0e04b8c739a8e35a4f485f664a17e6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 745c3b125d97..ee6bebfd041d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3885,6 +3885,16 @@ enum ieee80211_protected_eht_actioncode { WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5, + WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6, + WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12, }; /* Security key length */ @@ -5021,6 +5031,8 @@ struct ieee80211_multi_link_elem { #define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 #define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 #define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 +#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT 0x2000 +#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT 0x4000 struct ieee80211_mle_basic_common_info { u8 len; @@ -5036,6 +5048,9 @@ struct ieee80211_mle_preq_common_info { } __packed; #define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 +#define IEEE80211_MLC_RECONF_PRES_EML_CAPA 0x0020 +#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP 0x0040 +#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP 0x0080 /* no fixed fields in RECONF */ @@ -5354,6 +5369,12 @@ static inline bool 
ieee80211_mle_size_ok(const u8 *data, size_t len) case IEEE80211_ML_CONTROL_TYPE_RECONF: if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) common += ETH_ALEN; + if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP) + common += 2; break; case IEEE80211_ML_CONTROL_TYPE_TDLS: common += sizeof(struct ieee80211_mle_tdls_common_info); @@ -5504,8 +5525,13 @@ ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta #define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE 0x0010 #define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 #define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT 0x0040 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_UPDATE_TYPE 0x0780 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE 0x0780 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM 0 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK 2 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK 3 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS 4 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 /** * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link -- cgit v1.2.3 From 36e05b0b83903e2d85b3675d10ac8b5eced54377 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:19:58 +0200 Subject: wifi: mac80211: Support dynamic link addition and removal Add support for adding and removing station links: - Adding links is done asynchronously, i.e., first an ML reconfiguration action frame is sent to the AP requesting to add links, and only when the AP replies, links which were added successfully by the AP are added locally. 
- Removing links is done synchronously, i.e., the links are removed before sending the ML reconfiguration action frame to the AP (to avoid using these links after the AP MLD removed them but before the station got the ML reconfiguration response). In case the AP replies with a status indicating that a link removal was not successful, disconnect (as this should not happen and is an indication that something might be wrong on the AP MLD). Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.ec0492a8dd21.I2869686642bbc0f86c40f284ebf7e6f644b551ab@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ee6bebfd041d..b5c5b5c39d9a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1532,6 +1532,17 @@ struct ieee80211_mgmt { struct { u8 action_code; } __packed ttlm_tear_down; + struct { + u8 action_code; + u8 dialog_token; + u8 variable[]; + } __packed ml_reconf_req; + struct { + u8 action_code; + u8 dialog_token; + u8 count; + u8 variable[]; + } __packed ml_reconf_resp; } u; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ -- cgit v1.2.3 From 19aa842dcbb5860509b7e1b7745dbae0b791f6c4 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:20:00 +0200 Subject: wifi: mac80211: Fix common size calculation for ML element When the ML type is EPCS the control bitmap is reserved, the length is always 7 and is captured by the 1st octet after the control. 
Fixes: 0f48b8b88aa9 ("wifi: ieee80211: add definitions for multi-link element") Signed-off-by: Ilan Peer Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.5790376754a7.I381208cbb72b1be2a88239509294099e9337e254@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b5c5b5c39d9a..16741e542e81 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5084,28 +5084,24 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) { const struct ieee80211_multi_link_elem *mle = (const void *)data; u16 control = le16_to_cpu(mle->control); - u8 common = 0; switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: case IEEE80211_ML_CONTROL_TYPE_PREQ: case IEEE80211_ML_CONTROL_TYPE_TDLS: case IEEE80211_ML_CONTROL_TYPE_RECONF: + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: /* * The length is the first octet pointed by mle->variable so no * need to add anything */ break; - case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) - common += ETH_ALEN; - return common; default: WARN_ON(1); return 0; } - return sizeof(*mle) + common + mle->variable[0]; + return sizeof(*mle) + mle->variable[0]; } /** @@ -5392,8 +5388,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) check_common_len = true; break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) - common += ETH_ALEN; + common = ETH_ALEN + 1; break; default: /* we don't know this type */ -- cgit v1.2.3 From e0537c9f828dc9500e5ffc9211099c495b72f402 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Tue, 19 Nov 2024 13:43:22 -0800 Subject: SUNRPC: only put task on cl_tasks list after the RPC call slot is reserved. 
Under heavy write load, we've seen the cl_tasks list grow to millions of entries. Even though the list is extremely long, the system still runs fine until the user wants to get the information of all active RPC tasks by doing: # cat /sys/kernel/debug/sunrpc/rpc_clnt/N/tasks When this happens, tasks_start acquires the cl_lock to walk the cl_tasks list, returning one entry at a time to the caller. The cl_lock is held until all tasks on this list have been processed. While the cl_lock is held, completed RPC tasks have to spin wait in rpc_task_release_client for the cl_lock. If there are millions of entries in the cl_tasks list it will take a long time before tasks_stop is called and the cl_lock is released. The spin wait tasks can use up all the available CPUs in the system, preventing other jobs from running; this causes the system to temporarily lock up. This patch fixes this problem by delaying inserting the RPC task on the cl_tasks list until the RPC call slot is reserved. This limits the length of the cl_tasks to the number of call slots available in the system. Signed-off-by: Dai Ngo Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 5321585c778f..fec976e58174 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -93,6 +93,7 @@ struct rpc_clnt { const struct cred *cl_cred; unsigned int cl_max_connect; /* max number of transports not to the same IP */ struct super_block *pipefs_sb; + atomic_t cl_task_count; }; /* -- cgit v1.2.3 From bb3914101f704a8282f65238d6b021d216efc608 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 9 Jan 2025 13:42:05 -0600 Subject: device property: Split property reading bool and presence test ops The fwnode/device property API currently implements (fwnode|device)_property_read_bool() with (fwnode|device)_property_present(). That does not allow having different behavior depending on the backend. 
Specifically, the usage of (fwnode|device)_property_read_bool() on non-boolean properties is deprecated on DT. In order to add a warning on this deprecated use, these 2 APIs need separate ops for the backend. Acked-by: Greg Kroah-Hartman Reviewed-by: Krzysztof Kozlowski Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20250109-dt-type-warnings-v1-1-0150e32e716c@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/fwnode.h | 3 +++ include/linux/of.h | 4 +++- include/linux/property.h | 15 +++------------ 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 0d79070c5a70..0731994b9d7c 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -112,6 +112,7 @@ struct fwnode_reference_args { * @device_is_available: Return true if the device is available. * @device_get_match_data: Return the device driver match data. * @property_present: Return true if a property is present. + * @property_read_bool: Return a boolean property value. * @property_read_int_array: Read an array of integer properties. Return zero on * success, a negative error code otherwise. * @property_read_string_array: Read an array of string properties. 
Return zero @@ -141,6 +142,8 @@ struct fwnode_operations { (*device_get_dma_attr)(const struct fwnode_handle *fwnode); bool (*property_present)(const struct fwnode_handle *fwnode, const char *propname); + bool (*property_read_bool)(const struct fwnode_handle *fwnode, + const char *propname); int (*property_read_int_array)(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, diff --git a/include/linux/of.h b/include/linux/of.h index f921786cb8ac..1cb4eb7fc2ed 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1271,7 +1271,9 @@ static inline bool of_property_read_bool(const struct device_node *np, */ static inline bool of_property_present(const struct device_node *np, const char *propname) { - return of_property_read_bool(np, propname); + struct property *prop = of_find_property(np, propname, NULL); + + return prop ? true : false; } /** diff --git a/include/linux/property.h b/include/linux/property.h index 61fc20e5f81f..e214ecd241eb 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -37,6 +37,7 @@ struct fwnode_handle *__dev_fwnode(struct device *dev); struct device *: __dev_fwnode)(dev) bool device_property_present(const struct device *dev, const char *propname); +bool device_property_read_bool(const struct device *dev, const char *propname); int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval); int device_property_read_u16_array(const struct device *dev, const char *propname, @@ -54,6 +55,8 @@ int device_property_match_string(const struct device *dev, bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname); +bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, + const char *propname); int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval); @@ -207,12 +210,6 @@ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); 
unsigned int device_get_child_node_count(const struct device *dev); -static inline bool device_property_read_bool(const struct device *dev, - const char *propname) -{ - return device_property_present(dev, propname); -} - static inline int device_property_read_u8(const struct device *dev, const char *propname, u8 *val) { @@ -263,12 +260,6 @@ static inline int device_property_string_array_count(const struct device *dev, return device_property_read_string_array(dev, propname, NULL, 0); } -static inline bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, - const char *propname) -{ - return fwnode_property_present(fwnode, propname); -} - static inline int fwnode_property_read_u8(const struct fwnode_handle *fwnode, const char *propname, u8 *val) { -- cgit v1.2.3 From c141ecc3cecd764799e17c8251026336cab86800 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 9 Jan 2025 13:42:06 -0600 Subject: of: Warn when of_property_read_bool() is used on non-boolean properties The use of of_property_read_bool() for non-boolean properties is deprecated. The primary use of it was to test property presence, but that has been replaced in favor of of_property_present(). With those uses now fixed, add a warning to discourage new ones. 
Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250109-dt-type-warnings-v1-2-0150e32e716c@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 1cb4eb7fc2ed..0cdd58ff0a41 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -311,6 +311,7 @@ extern struct device_node *of_find_node_with_property( extern struct property *of_find_property(const struct device_node *np, const char *name, int *lenp); +extern bool of_property_read_bool(const struct device_node *np, const char *propname); extern int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size); extern int of_property_read_u32_index(const struct device_node *np, @@ -615,6 +616,12 @@ static inline struct device_node *of_find_compatible_node( return NULL; } +static inline bool of_property_read_bool(const struct device_node *np, + const char *propname) +{ + return false; +} + static inline int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size) { @@ -1242,24 +1249,6 @@ static inline int of_property_read_string_index(const struct device_node *np, return rc < 0 ? rc : 0; } -/** - * of_property_read_bool - Find a property - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * - * Search for a boolean property in a device node. Usage on non-boolean - * property types is deprecated. - * - * Return: true if the property exists false otherwise. - */ -static inline bool of_property_read_bool(const struct device_node *np, - const char *propname) -{ - const struct property *prop = of_find_property(np, propname, NULL); - - return prop ? true : false; -} - /** * of_property_present - Test if a property is present in a node * @np: device node to search for the property. 
-- cgit v1.2.3 From f835bdae716751fa20451508150e5fdd5f5b2be3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 12 Jan 2025 16:34:55 -0800 Subject: net: remove init_dummy_netdev() init_dummy_netdev() can initialize statically declared or embedded net_devices. Such netdevs did not come from alloc_netdev_mqs(). After recent work by Breno, there are only two cases where we have to do that. Switch those cases to alloc_netdev_mqs() and delete init_dummy_netdev(). Dealing with static netdevs is not worth the maintenance burden. Reviewed-by: Alexander Lobakin Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250113003456.3904110-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aeb4a6cff171..dd8f6f8991fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3238,7 +3238,6 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); -void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From aecd9d1020e3c6d29ecc9efccbcee7863e83c517 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 9 Jan 2025 18:05:36 +0200 Subject: net/mlx5: fs, add HWS packet reformat API function Add packet reformat alloc and dealloc API functions to provide packet reformat actions for steering rules. Add HWS action pools for each of the following packet reformat types: - decapl3: decapsulate l3 tunnel to l2 - encapl2: encapsulate l2 to tunnel l2 - encapl3: encapsulate l2 to tunnel l3 - insert_hdr: insert header In addition cache remove header action for remove vlan header as this is currently the only use case of remove header action in the driver. 
Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109160546.1733647-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 370f533da107..bb99a35fc6a2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7025,6 +7025,7 @@ struct mlx5_ifc_alloc_packet_reformat_context_out_bits { enum { MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START = 0x1, + MLX5_REFORMAT_CONTEXT_ANCHOR_VLAN_START = 0x2, MLX5_REFORMAT_CONTEXT_ANCHOR_IP_START = 0x7, MLX5_REFORMAT_CONTEXT_ANCHOR_TCP_UDP_START = 0x9, }; -- cgit v1.2.3 From 061b27e37238d374d8a6954b22d9c5d07c5db574 Mon Sep 17 00:00:00 2001 From: Yang Shen Date: Fri, 3 Jan 2025 18:21:38 +0800 Subject: crypto: hisilicon/qm - support new function communication In the HiSilicon accelerator drivers, the PF/VFs driver can send messages to the VFs/PF by writing hardware registers, and the VFs/PF driver receives messages from the PF/VFs by reading hardware registers. To support this feature, a new version id is added; different communication mechanisms are used depending on the version id. 
Signed-off-by: Yang Shen Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c1dafbabbd6b..99fcf65d575f 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -124,6 +124,7 @@ enum qm_hw_ver { QM_HW_V1 = 0x20, QM_HW_V2 = 0x21, QM_HW_V3 = 0x30, + QM_HW_V4 = 0x50, }; enum qm_fun_type { @@ -397,6 +398,8 @@ struct hisi_qm { struct mutex mailbox_lock; + struct mutex ifc_lock; + const struct hisi_qm_hw_ops *ops; struct qm_debug debug; -- cgit v1.2.3 From bfc1d1782984903457b2707d05a35b24ce5fbea1 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 26 Nov 2024 09:56:54 -0600 Subject: mm: migrate: remove unused argument vma from migrate_misplaced_folio() Commit ee86814b0562 ("mm/migrate: move NUMA hinting fault folio isolation + checks under PTL") removed the code that had used the vma argument in migrate_misplaced_folio. Since the vma argument was no longer used in migrate_misplaced_folio, this patch removes it. 
Link: https://lkml.kernel.org/r/20241126155655.466186-1-donettom@linux.ibm.com Signed-off-by: Donet Tom Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Acked-by: David Hildenbrand Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 002e49b2ebd9..29919faea2f1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page) #ifdef CONFIG_NUMA_BALANCING int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node); -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node); +int migrate_misplaced_folio(struct folio *folio, int node); #else static inline int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -static inline int migrate_misplaced_folio(struct folio *folio, - struct vm_area_struct *vma, int node) +static inline int migrate_misplaced_folio(struct folio *folio, int node) { return -EAGAIN; /* can't migrate now */ } -- cgit v1.2.3 From 38558b2460d7881a3de3bdc31a23fa7034384d00 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:34 +0000 Subject: mm: make alloc_pages_mpol() static All callers outside mempolicy.c now use folio_alloc_mpol() thanks to Kefeng's cleanups, so we can remove this as a visible symbol. And also remove the alloc_hooks for alloc_pages_mpol(), since all users in mempolicy.c are using the nonprof version. 
Link: https://lkml.kernel.org/r/20241125210149.2976098-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b0fe9f62d15b..c96d5d7f7b89 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -300,8 +300,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); @@ -312,11 +310,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid) -{ - return alloc_pages_noprof(gfp, order); -} static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); @@ -331,7 +324,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) 
alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 9023691d75f29fde884f6e243bcdad6a9dbadb19 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 Nov 2024 09:16:17 -0800 Subject: mm: mmap_lock: optimize mmap_lock tracepoints We are starting to deploy mmap_lock tracepoint monitoring across our fleet and the early results showed that these tracepoints are consuming significant amount of CPUs in kernfs_path_from_node when enabled. It seems like the kernel is trying to resolve the cgroup path in the fast path of the locking code path when the tracepoints are enabled. In addition for some application their metrics are regressing when monitoring is enabled. The cgroup path resolution can be slow and should not be done in the fast path. Most userspace tools, like bpftrace, provides functionality to get the cgroup path from cgroup id, so let's just trace the cgroup id and the users can use better tools to get the path in the slow path. Link: https://lkml.kernel.org/r/20241125171617.113892-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Yosry Ahmed Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Axel Rasmussen Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5502aa8e138e..b28180269e75 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + u64 id; + + if (mem_cgroup_disabled()) + return 0; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + 
if (!memcg) + memcg = root_mem_cgroup; + id = cgroup_id(memcg->css.cgroup); + rcu_read_unlock(); + return id; +} + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } + +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_MEMCG */ /* -- cgit v1.2.3 From 20f3ab257211594c110c43e71c31bd25ba31e851 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 22 Nov 2024 15:36:52 +0800 Subject: mm: pgtable: make ptep_clear() non-atomic In the generic ptep_get_and_clear() implementation, it is just a simple combination of ptep_get() and pte_clear(). But for some architectures (such as x86 and arm64, etc), the hardware will modify the A/D bits of the page table entry, so the ptep_get_and_clear() needs to be overwritten and implemented as an atomic operation to avoid contention, which has a performance cost. The commit d283d422c6c4 ("x86: mm: add x86_64 support for page table check") adds the ptep_clear() on the x86, and makes it call ptep_get_and_clear() when CONFIG_PAGE_TABLE_CHECK is enabled. The page table check feature does not actually care about the A/D bits, so only ptep_get() + pte_clear() should be called. But considering that the page table check is a debug option, this should not have much of an impact. But then the commit de8c8e52836d ("mm: page_table_check: add hooks to public helpers") changed ptep_clear() to unconditionally call ptep_get_and_clear(), so that the CONFIG_PAGE_TABLE_CHECK check can be put into the page table check stubs (in include/linux/page_table_check.h). This also cause performance loss to the kernel without CONFIG_PAGE_TABLE_CHECK enabled, which doesn't make sense. Currently ptep_clear() is only used in debug code and in khugepaged collapse paths, which are fairly expensive. 
So the cost of an extra atomic RMW operation does not matter. But this may be used for other paths in the future. After all, for the present pte entry, we need to call ptep_clear() instead of pte_clear() to ensure that PAGE_TABLE_CHECK works properly. So to be more precise, just calling ptep_get() and pte_clear() in the ptep_clear(). Link: https://lkml.kernel.org/r/20241122073652.54030-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Pasha Tatashin Reviewed-by: Jann Horn Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Lorenzo Stoakes Cc: Peter Xu Cc: Ryan Roberts Cc: Tong Tiangen Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index adef9d6e9b1b..94d267d02372 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -533,7 +533,14 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep_get_and_clear(mm, addr, ptep); + pte_t pte = ptep_get(ptep); + + pte_clear(mm, addr, ptep); + /* + * No need for ptep_get_and_clear(): page table check doesn't care about + * any bits that could have been set by HW concurrently. + */ + page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH -- cgit v1.2.3 From d40797d6720e861196e848f3615bb09dae5be7ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Nov 2024 16:54:51 +0100 Subject: kasan: make kasan_record_aux_stack_noalloc() the default behaviour kasan_record_aux_stack_noalloc() was introduced to record a stack trace without allocating memory in the process. It has been added to callers which were invoked while a raw_spinlock_t was held. More and more callers were identified and changed over time. 
Is it a good thing to have this while functions try their best to do a lockless setup? The only downside of having kasan_record_aux_stack() not allocate any memory is that we end up without a stacktrace if stackdepot runs out of memory and at the same time the stacktrace was not recorded before. To quote Marco Elver from https://lore.kernel.org/all/CANpmjNPmQYJ7pv1N3cuU8cP18u7PP_uoZD8YxwZd4jtbof9nVQ@mail.gmail.com/ | I'd be in favor, it simplifies things. And stack depot should be | able to replenish its pool sufficiently in the "non-aux" cases | i.e. regular allocations. Worst case we fail to record some | aux stacks, but I think that's only really bad if there's a bug | around one of these allocations. In general the probabilities | of this being a regression are extremely small [...] Make the kasan_record_aux_stack_noalloc() behaviour default as kasan_record_aux_stack(). [bigeasy@linutronix.de: dressed the diff as patch] Link: https://lkml.kernel.org/r/20241122155451.Mb2pmeyJ@linutronix.de Fixes: 7cb3007ce2da ("kasan: generic: introduce kasan_record_aux_stack_noalloc()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Reported-by: syzbot+39f85d612b7c20d8db48@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67275485.050a0220.3c8d68.0a37.GAE@google.com Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Waiman Long Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ben Segall Cc: Boqun Feng Cc: Christoph Lameter Cc: David Rientjes Cc: Dietmar Eggemann Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes (Google) Cc: Joonsoo Kim Cc: Josh Triplett Cc: Juri Lelli Cc: Cc: Lai Jiangshan Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Neeraj Upadhyay Cc: Paul E.
McKenney Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt Cc: syzkaller-bugs@googlegroups.com Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 -- include/linux/task_work.h | 3 --- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..1c1b3d39e7b6 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); -void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ @@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 2964171856e0..0646804860ff 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -19,9 +19,6 @@ enum task_work_notify_mode { TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, - - TWA_FLAGS = 0xff00, - TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From da243c5479add600bdd58c910c9fae3355b4f026 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 28 Nov 2024 15:40:39 +0800 Subject: mm: factor out the order calculation into a new helper Patch series "Support large folios for tmpfs", v3. Traditionally, tmpfs only supported PMD-sized large folios. 
However nowadays with other file systems supporting any sized large folios, and anonymous memory being extended to support mTHP, we should not restrict tmpfs to allocating only PMD-sized large folios, making it more special. Instead, we should allow tmpfs to allocate any sized large folios. Considering that tmpfs already has the 'huge=' option to control the PMD-sized large folios allocation, we can extend the 'huge=' option to allow any sized large folios. The semantics of the 'huge=' mount option are: huge=never: no large folios of any size huge=always: any sized large folios huge=within_size: like 'always' but respect the i_size huge=advise: like 'always' if requested with madvise() Note: for tmpfs mmap() faults, due to the lack of a write size hint, still allocate the PMD-sized large folios if huge=always/within_size/advise is set. Moreover, the 'deny' and 'force' testing options controlled by '/sys/kernel/mm/transparent_hugepage/shmem_enabled', still retain the same semantics. The 'deny' can disable any sized large folios for tmpfs, while the 'force' can enable PMD sized large folios for tmpfs. This patch (of 6): Factor out the order calculation into a new helper, which can be reused by shmem in the following patch.
Link: https://lkml.kernel.org/r/cover.1732779148.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/5505f9ea50942820c1924d1803bfdd3a524e54f6.1732779148.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Suggested-by: Matthew Wilcox Reviewed-by: Barry Song Reviewed-by: David Hildenbrand Reviewed-by: Daniel Gomez Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf0865a38ae..d796c8a33647 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -727,6 +727,16 @@ typedef unsigned int __bitwise fgf_t; #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) +static inline unsigned int filemap_get_order(size_t size) +{ + unsigned int shift = ilog2(size); + + if (shift <= PAGE_SHIFT) + return 0; + + return shift - PAGE_SHIFT; +} + /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. @@ -740,11 +750,11 @@ typedef unsigned int __bitwise fgf_t; */ static inline fgf_t fgf_set_order(size_t size) { - unsigned int shift = ilog2(size); + unsigned int order = filemap_get_order(size); - if (shift <= PAGE_SHIFT) + if (!order) return 0; - return (__force fgf_t)((shift - PAGE_SHIFT) << 26); + return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3 From da80f4ffb0dbee2419ac04f23aad0533658f1523 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 29 Nov 2024 14:58:25 +0000 Subject: list_lru: expand list_lru_add() docs with info about sublists The documentation for list_lru_add() and list_lru_del() has not been updated since lru lists were originally introduced by commit a38e40824844 ("list: add a new LRU list type"). 
Back then, list_lru stored all of the items in a single list, but the implementation has since been expanded to use many sublists internally. Thus, update the docs to mention that the requirements about not using the item with several lists at the same time also apply to not using different sublists. Also mention that list_lru items are reparented when the memcg is deleted as discussed on the LKML [1]. Also fix incorrect use of 'Return value:' which should be 'Return:'. Link: https://lore.kernel.org/all/Z0eXrllVhRI9Ag5b@dread.disaster.area/ [1] Link: https://lkml.kernel.org/r/20241129-list_lru_memcg_docs-v2-1-e285ff1c481b@google.com Signed-off-by: Alice Ryhl Reviewed-by: Dave Chinner Acked-by: Muchun Song Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 05c166811f6b..fe739d35a864 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -91,13 +91,24 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing - * nothing. Therefore the caller does not need to keep state about whether or - * not the element already belongs in the list and is allowed to lazy update - * it. Note however that this is valid for *a* list, not *this* list. If - * the caller organize itself in a way that elements can be in more than - * one type of list, it is up to the caller to fully remove the item from - * the previous list (with list_lru_del() for instance) before moving it - * to @lru. + * nothing.
This means that it is not necessary to keep state about whether or + * not the element already belongs in the list. That said, this logic only + * works if the item is in *this* list. If the item might be in some other + * list, then you cannot rely on this check and you must remove it from the + * other list before trying to insert it. + * + * The lru list consists of many sublists internally; the @nid and @memcg + * parameters are used to determine which sublist to insert the item into. + * It's important to use the right value of @nid and @memcg when deleting the + * item, since it might otherwise get deleted from the wrong sublist. + * + * This also applies when attempting to insert the item multiple times - if + * the item is currently in one sublist and you call list_lru_add() again, you + * must pass the right @nid and @memcg parameters so that the same sublist is + * used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). * * Return: true if the list was updated, false otherwise */ @@ -113,7 +124,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); @@ -125,8 +136,19 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list - * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del(). + * manipulation. 
+ * + * The comments in list_lru_add() about an element already being in a list are + * also valid for list_lru_del(), that is, you can delete an item that has + * already been removed or never been added. However, if the item is in a + * list, it must be in *this* list, and you must pass the right value of @nid + * and @memcg so that the right sublist is used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries + * are automatically moved to the parent memcg. This is done in a race-free + * way, so during deletion of an memcg both the old and new memcg will resolve + * to the same sublist internally. * * Return: true if the list was updated, false otherwise */ @@ -142,7 +164,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise. + * Return: true if the list was updated, false otherwise. */ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); -- cgit v1.2.3 From 1168b2bec7660f5146de7b14c14b52417e900d18 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 16 Nov 2024 15:14:46 +0000 Subject: filemap: remove unused folio_add_wait_queue folio_add_wait_queue() has been unused since 2021's commit 850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite") Remove it. Link: https://lkml.kernel.org/r/20241116151446.95555-1-linux@treblig.org Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d796c8a33647..fc2e1319c7bb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1280,11 +1280,6 @@ void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -/* - * Add an arbitrary waiter to a page's wait queue - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); - /* * Fault in userspace address range. */ -- cgit v1.2.3 From 21641bd9a7a7ce0360106a5a8e5b89a4fc74529d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 4 Nov 2024 11:23:18 -0300 Subject: lazy tlb: fix hotplug exit race with MMU_LAZY_TLB_SHOOTDOWN CPU unplug first calls __cpu_disable(), and that's where powerpc calls cleanup_cpu_mmu_context(), which clears this CPU from mm_cpumask() of all mms in the system. However this CPU may still be using a lazy tlb mm, and its mm_cpumask bit will be cleared from it. The CPU does not switch away from the lazy tlb mm until arch_cpu_idle_dead() calls idle_task_exit(). If that user mm exits in this window, it will not be subject to the lazy tlb mm shootdown and may be freed while in use as a lazy mm by the CPU that is being unplugged. cleanup_cpu_mmu_context() could be moved later, but it looks better to move the lazy tlb mm switching earlier. The problem with doing the lazy mm switching in idle_task_exit() is explained in commit bf2c59fce4074 ("sched/core: Fix illegal RCU from offline CPUs"), which added a wart to switch away from the mm but leave it set in active_mm to be cleaned up later. So instead, switch away from the lazy tlb mm at sched_cpu_wait_empty(), which is the last hotplug state before teardown (CPUHP_AP_SCHED_WAIT_EMPTY). 
This CPU will never switch to a user thread from this point, so it has no chance to pick up a new lazy tlb mm. This removes the lazy tlb mm handling wart in CPU unplug. With this, idle_task_exit() is not needed anymore and can be cleaned up. This leaves the prototype alone, to be cleaned after this change. herton: took the suggestions from https://lore.kernel.org/all/87jzvyprsw.ffs@tglx/ and made adjustments on the initial patch proposed by Nicholas. Link: https://lkml.kernel.org/r/20230524060455.147699-1-npiggin@gmail.com Link: https://lore.kernel.org/all/20230525205253.E2FAEC433EF@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20241104142318.3295663-1-herton@redhat.com Fixes: 2655421ae69f ("lazy tlb: shoot lazies, non-refcounting lazy tlb mm reference handling scheme") Signed-off-by: Nicholas Piggin Signed-off-by: Herton R. Krzesinski Suggested-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/sched/hotplug.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 412cdaba33eb..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu); # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ -- cgit v1.2.3 From 7a5714991872f0a4805cc6004a5bff19a71d0459 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Dec 2024 18:05:10 +0000 Subject: mm: abstract get_arg_page() stack expansion and mmap read lock Right now fs/exec.c invokes expand_downwards(), an otherwise internal implementation detail of the VMA logic in order to ensure that an arg page can be obtained by get_user_pages_remote(). 
In order to be able to move the stack expansion logic into mm/vma.c to make it available to userland testing we need to find an alternative approach here. We do so by providing the mmap_read_lock_maybe_expand() function which also helpfully documents what get_arg_page() is doing here and adds an additional check against VM_GROWSDOWN to make explicit that the stack expansion logic is only invoked when the VMA is indeed a downward-growing stack. This allows expand_downwards() to become a static function. Importantly, the VMA referenced by mmap_read_lock_maybe_expand() must NOT be currently user-visible in any way, that is placed within an rmap or VMA tree. It must be a newly allocated VMA. This is the case when exec invokes this function. Link: https://lkml.kernel.org/r/5295d1c70c58e6aa63d14be68d4e1de9fa1c8e6d.1733248985.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jan Kara Cc: Jann Horn Cc: Kees Cook Cc: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..2e5ef71b8629 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3324,6 +3324,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3437,9 +3439,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned
long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, -- cgit v1.2.3 From b9e40605daa94ae1817ceb5ce9e9b34093c6d850 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:28 +0100 Subject: mm/page_isolation: don't pass gfp flags to start_isolate_page_range() The parameter is unused, so let's stop passing it. Link: https://lkml.kernel.org/r/20241203094732.200195-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Vishal Moola (Oracle) Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 73dc2c1841ec..898bb788243b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); + int migratetype, int flags); void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype); -- cgit v1.2.3 From 735fad44b5a86edf0fe65a8e8d43595bd1cf1d58 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:46 +0800 Subject: mm: zap_install_uffd_wp_if_needed: return whether uffd-wp pte has been re-installed In some cases, we'll replace the none pte with an uffd-wp swap 
special pte marker when necessary. Let's expose this information to the caller through the return value, so that subsequent commits can use this information to detect whether the PTE page is empty. Link: https://lkml.kernel.org/r/9d4516554724eda87d6576468042a1741c475413.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1b6a917fffa4..34e5097182a0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -564,9 +564,9 @@ static inline pte_marker copy_pte_marker( * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * - * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + * Returns true if an uffd-wp pte was installed, false otherwise. */ -static inline void +static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { @@ -583,7 +583,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, * with a swap pte. There's no way of leaking the bit. 
*/ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) - return; + return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) @@ -596,10 +596,13 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; - if (unlikely(arm_uffd_pte)) + if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + return true; + } #endif + return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) -- cgit v1.2.3 From 6375e95f381e3dc85065b6f74263a61522736203 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:49 +0800 Subject: mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED) Now in order to pursue high performance, applications mostly use some high-performance user-mode memory allocators, such as jemalloc or tcmalloc. These memory allocators use madvise(MADV_DONTNEED or MADV_FREE) to release physical memory, but neither MADV_DONTNEED nor MADV_FREE will release page table memory, which may cause huge page table memory usage. The following are a memory usage snapshot of one process which actually happened on our server: VIRT: 55t RES: 590g VmPTE: 110g In this case, most of the page table entries are empty. For such a PTE page where all entries are empty, we can actually free it back to the system for others to use. As a first step, this commit aims to synchronously free the empty PTE pages in madvise(MADV_DONTNEED) case. We will detect and free empty PTE pages in zap_pte_range(), and will add zap_details.reclaim_pt to exclude cases other than madvise(MADV_DONTNEED). Once an empty PTE is detected, we first try to hold the pmd lock within the pte lock. If successful, we clear the pmd entry directly (fast path). 
Otherwise, we wait until the pte lock is released, then re-hold the pmd and pte locks and loop PTRS_PER_PTE times to check pte_none() to re-detect whether the PTE page is empty and free it (slow path). For other cases such as madvise(MADV_FREE), consider scanning and freeing empty PTE pages asynchronously in the future. The following code snippet can show the effect of optimization: mmap 50G while (1) { for (; i < 1024 * 25; i++) { touch 2M memory madvise MADV_DONTNEED 2M } } As we can see, the memory usage of VmPTE is reduced: before after VIRT 50.0 GB 50.0 GB RES 3.1 MB 3.1 MB VmPTE 102640 KB 240 KB [zhengqi.arch@bytedance.com: fix uninitialized symbol 'ptl'] Link: https://lkml.kernel.org/r/20241206112348.51570-1-zhengqi.arch@bytedance.com Link: https://lore.kernel.org/linux-mm/224e6a4e-43b5-4080-bdd8-b0a6fb2f0853@stanley.mountain/ Link: https://lkml.kernel.org/r/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e5ef71b8629..9372bc058b43 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2320,6 +2320,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? 
*/ zap_flags_t zap_flags; /* Extra flags for zapping */ }; -- cgit v1.2.3 From 718b13861d2256ac95d65b892953282a63faf240 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:50 +0800 Subject: x86: mm: free page table pages by RCU instead of semi RCU Now, if CONFIG_MMU_GATHER_RCU_TABLE_FREE is selected, the page table pages will be freed by semi RCU, that is: - batch table freeing: asynchronous free by RCU - single table freeing: IPI + synchronous free In this way, the page table can be locklessly traversed by disabling IRQ in paths such as fast GUP. But this is not enough to free the empty PTE page table pages in paths other than the munmap and exit_mmap paths, because IPI cannot be synchronized with rcu_read_lock() in pte_offset_map{_lock}(). In preparation for supporting empty PTE page table pages reclamation, let single table also be freed by RCU like batch table freeing. Then we can also use pte_offset_map() etc to prevent PTE page from being freed. Like pte_free_defer(), we can also safely use ptdesc->pt_rcu_head to free the page table pages: - The pt_rcu_head is unioned with pt_list and pmd_huge_pte. - For pt_list, it is used to manage the PGD page in x86. Fortunately tlb_remove_table() will not be used to free PGD pages, so it is safe to use pt_rcu_head. - For pmd_huge_pte, it is used for THPs, so it is safe.
After applying this patch, if CONFIG_PT_RECLAIM is enabled, the function call of free_pte() is as follows: free_pte pte_free_tlb __pte_free_tlb ___pte_free_tlb paravirt_tlb_remove_table tlb_remove_table [!CONFIG_PARAVIRT, Xen PV, Hyper-V, KVM] [no-free-memory slowpath:] tlb_table_invalidate tlb_remove_table_one __tlb_remove_table_one [frees via RCU] [fastpath:] tlb_table_flush tlb_remove_table_free [frees via RCU] native_tlb_remove_table [CONFIG_PARAVIRT on native] tlb_remove_table [see above] Link: https://lkml.kernel.org/r/0287d442a973150b0e1019cc406e6322d148277a.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..7490d84af310 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -438,7 +438,9 @@ FOLIO_MATCH(compound_head, _head_2a); * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. - * @pt_list: List of used page tables. Used for s390 and x86. + * @pt_list: List of used page tables. Used for s390 gmap shadow pages + * (which are not linked into the user page tables) and x86 + * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. 
-- cgit v1.2.3 From 67c8b11bd58aee4644c9a6e495d0c234771e9175 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Mon, 2 Dec 2024 20:47:30 +0800 Subject: mm: add per-order mTHP swap-in fallback/fallback_charge counters Currently, large folio swap-in is supported, but we lack a method to analyze their success ratio. Similar to anon_fault_fallback, we introduce per-order mTHP swpin_fallback and swpin_fallback_charge counters for calculating their success ratio. The new counters are located at: /sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats/ swpin_fallback swpin_fallback_charge Link: https://lkml.kernel.org/r/20241202124730.2407037-1-haowenchao22@gmail.com Signed-off-by: Wenchao Hao Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Xu Cc: Ryan Roberts Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b94c2e8ee918..93e509b6c00e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, + MTHP_STAT_SWPIN_FALLBACK, + MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, -- cgit v1.2.3 From dba4761a3e40433a8d9e434d515ecbae19b3dcb1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin() to open a read critical section of the given seqcount_t if the counter is even. This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Liam R. 
Howlett Suggested-by: Peter Zijlstra Cc: Christian Brauner Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. 
+ */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization -- cgit v1.2.3 From e5e7fb278e5924f29ceab42bbbb891cde528f7cc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking the mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also changed to be unsigned to match the type of mm_lock_seq.sequence. Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. 
McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++++------ include/linux/mm_types.h | 7 ++++-- include/linux/mmap_lock.h | 55 +++++++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9372bc058b43..a3a50c37603e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. 
*/ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7490d84af310..5f1b2dc788e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,7 +729,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -923,6 +923,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -931,7 +934,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. 
*/ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,39 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) +static inline void mm_lock_seqcount_init(struct mm_struct *mm) { - mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). 
- */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); } + #else -static inline void vma_end_write_all(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +111,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +121,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. 
+ */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); -- cgit v1.2.3 From 6f030e32e4499942a223677169d006085d8c57ce Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. [akpm@linux-foundation.org: use read_seqcount_retry() in mmap_lock_speculate_retry(), per Wei Yang] Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. 
McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. 
+ */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { -- cgit v1.2.3 From fa00b8ef1803fe133b4897c25227aa0d298dd093 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:28:46 +0000 Subject: mm: perform all memfd seal checks in a single place We no longer actually need to perform these checks in the f_op->mmap() hook. We already moved the operation which clears VM_MAYWRITE on a read-only mapping of a write-sealed memfd in order to work around the restrictions imposed by commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). There is no reason for us not to simply go ahead and additionally check to see if any pre-existing seals are in place here rather than defer this to the f_op->mmap() hook. By doing this we remove more logic from shmem_mmap() which doesn't belong there, as well as doing the same for hugetlbfs_file_mmap(). We also remove dubious shared logic in mm.h which simply does not belong there either. It makes sense to do these checks at the earliest opportunity: we know these are shmem (or hugetlbfs) mappings whose relevant VMA flags will not change from the invoking do_mmap() so there is simply no need to wait. 
This also means the implementation of further memfd seal flags can be done within mm/memfd.c and also have the opportunity to modify VMA flags as necessary early in the mapping logic. [lorenzo.stoakes@oracle.com: fix typos in !memfd inline stub] Link: https://lkml.kernel.org/r/7dee6c5d-480b-4c24-b98e-6fa47dbd8a23@lucifer.local Link: https://lkml.kernel.org/r/20241206212846.210835-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Isaac J. Manjarres Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Muchun Song Cc: Vlastimil Babka Cc: Jeff Xu Signed-off-by: Andrew Morton --- include/linux/memfd.h | 23 +++++++++++---------- include/linux/mm.h | 55 --------------------------------------------------- 2 files changed, 11 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. 
+ */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a3a50c37603e..e7c54b9aac6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4102,61 +4102,6 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. - * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. 
- */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From 991135774c0e05a4734e6d32aa03b00355e4cac9 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 11 Dec 2024 12:39:50 -0800 Subject: memcg/hugetlb: introduce mem_cgroup_charge_hugetlb This patch introduces mem_cgroup_charge_hugetlb which combines the logic of mem_cgroup_hugetlb_try_charge / mem_cgroup_hugetlb_commit_charge and removes the need for mem_cgroup_hugetlb_cancel_charge. It also reduces the footprint of memcg in hugetlb code and consolidates all memcg related error paths into one. 
Link: https://lkml.kernel.org/r/20241211203951.764733-3-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b28180269e75..387470bed399 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -649,6 +649,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); @@ -1169,6 +1171,11 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, return 0; } +static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { -- cgit v1.2.3 From 1d8f136a421f26747e58c01281cba5bffae8d289 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 11 Dec 2024 12:39:51 -0800 Subject: memcg/hugetlb: remove memcg hugetlb try-commit-cancel protocol This patch fully removes the mem_cgroup_{try, commit, cancel}_charge functions, as well as their hugetlb variants. 
Link: https://lkml.kernel.org/r/20241211203951.764733-4-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 387470bed399..6e74b8254d9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); - int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -646,9 +644,6 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages); - int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, @@ -679,7 +674,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) __mem_cgroup_uncharge_folios(folios); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); @@ -1154,23 +1148,12 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } -static inline void mem_cgroup_commit_charge(struct folio *folio, - struct mem_cgroup *memcg) -{ -} - static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } -static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, - gfp_t gfp, long nr_pages) -{ - return 0; -} - 
static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) { return 0; @@ -1194,11 +1177,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } -static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ -} - static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { -- cgit v1.2.3 From 144d52dd8fc83a082a275e1b663e7454d2b616a4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 19 Dec 2024 10:27:08 +0100 Subject: x86/efistub: Drop long obsolete UGA support UGA is the EFI graphical output protocol that preceded GOP, and has been long obsolete. Drop support for it from the x86 implementation of the EFI stub - other architectures never bothered to implement it (save for ia64) Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index e5815867aba9..053c57e61869 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -363,7 +363,6 @@ void efi_native_runtime_setup(void); #define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) #define SMBIOS_TABLE_GUID EFI_GUID(0xeb9d2d31, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) #define SMBIOS3_TABLE_GUID EFI_GUID(0xf2fd1544, 0x9794, 0x4a2c, 0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94) -#define UGA_IO_PROTOCOL_GUID EFI_GUID(0x61a4d49e, 0x6f68, 0x4f1b, 0xb9, 0x22, 0xa8, 0x6e, 0xed, 0x0b, 0x07, 0xa2) #define EFI_GLOBAL_VARIABLE_GUID EFI_GUID(0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c) #define UV_SYSTEM_TABLE_GUID EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93) #define LINUX_EFI_CRASH_GUID EFI_GUID(0xcfc8fc79, 0xbe2e, 0x4ddc, 0x97, 0xf0, 0x9f, 0x98, 0xbf, 0xe2, 0x98, 0xa0) @@ -373,7 +372,6 @@ void efi_native_runtime_setup(void); #define 
EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c) #define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID EFI_GUID(0x05c99a21, 0xc70f, 0x4ad2, 0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e) #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a) -#define EFI_UGA_PROTOCOL_GUID EFI_GUID(0x982c298b, 0xf4fa, 0x41cb, 0xb8, 0x38, 0x77, 0xaa, 0x68, 0x8f, 0xb8, 0x39) #define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a) #define EFI_FILE_INFO_ID EFI_GUID(0x09576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_SYSTEM_RESOURCE_TABLE_GUID EFI_GUID(0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80) @@ -1286,8 +1284,6 @@ struct linux_efi_memreserve { void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size); -char *efi_systab_show_arch(char *str); - /* * The LINUX_EFI_MOK_VARIABLE_TABLE_GUID config table can be provided * to the kernel by an EFI boot loader. The table contains a packed -- cgit v1.2.3 From b7a2c1fe6b55364e61b4b54b991eb43a47bb1104 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 10 Jan 2025 07:05:12 +0100 Subject: net: ethtool: plumb PHY stats to PHY drivers Introduce support for standardized PHY statistics reporting in ethtool by extending the PHYLIB framework. Add the functions phy_ethtool_get_phy_stats() and phy_ethtool_get_link_ext_stats() to provide a consistent interface for retrieving PHY-level and link-specific statistics. These functions are used within the ethtool implementation to avoid direct access to the phy_device structure outside of the PHYLIB framework. A new structure, ethtool_phy_stats, is introduced to standardize PHY statistics such as packet counts, byte counts, and error counters. 
Drivers are updated to include callbacks for retrieving PHY and link-specific statistics, ensuring values are explicitly set only for supported fields, initialized with ETHTOOL_STAT_NOT_SET to avoid ambiguity. Signed-off-by: Jakub Kicinski Signed-off-by: Oleksij Rempel Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 23 +++++++++++++++++++++++ include/linux/phy.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/phylib_stubs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index f711bfd75c4d..4bf70cfec826 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -412,6 +412,29 @@ struct ethtool_eth_phy_stats { ); }; +/** + * struct ethtool_phy_stats - PHY-level statistics counters + * @rx_packets: Total successfully received frames + * @rx_bytes: Total successfully received bytes + * @rx_errors: Total received frames with errors (e.g., CRC errors) + * @tx_packets: Total successfully transmitted frames + * @tx_bytes: Total successfully transmitted bytes + * @tx_errors: Total transmitted frames with errors + * + * This structure provides a standardized interface for reporting + * PHY-level statistics counters. It is designed to expose statistics + * commonly provided by PHYs but not explicitly defined in the IEEE + * 802.3 standard. + */ +struct ethtool_phy_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_errors; +}; + /* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed * via a more targeted API. 
*/ diff --git a/include/linux/phy.h b/include/linux/phy.h index 4875465653ca..81d1612e7d35 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1144,6 +1144,35 @@ struct phy_driver { int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ + /** + * @get_phy_stats: Retrieve PHY statistics. + * @dev: The PHY device for which the statistics are retrieved. + * @eth_stats: structure where Ethernet PHY stats will be stored. + * @stats: structure where additional PHY-specific stats will be stored. + * + * Retrieves the supported PHY statistics and populates the provided + * structures. The input structures are pre-initialized with + * `ETHTOOL_STAT_NOT_SET`, and the driver must only modify members + * corresponding to supported statistics. Unmodified members will remain + * set to `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. + */ + void (*get_phy_stats)(struct phy_device *dev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats); + + /** + * @get_link_stats: Retrieve link statistics. + * @dev: The PHY device for which the statistics are retrieved. + * @link_stats: structure where link-specific stats will be stored. + * + * Retrieves link-related statistics for the given PHY device. The input + * structure is pre-initialized with `ETHTOOL_STAT_NOT_SET`, and the + * driver must only modify members corresponding to supported + * statistics. Unmodified members will remain set to + * `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. 
+ */ + void (*get_link_stats)(struct phy_device *dev, + struct ethtool_link_ext_stats *link_stats); /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ @@ -2124,6 +2153,13 @@ int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); + +void __phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats); +void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats); + int phy_ethtool_get_plca_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int phy_ethtool_set_plca_cfg(struct phy_device *phydev, diff --git a/include/linux/phylib_stubs.h b/include/linux/phylib_stubs.h index 1279f48c8a70..9d2d6090c86d 100644 --- a/include/linux/phylib_stubs.h +++ b/include/linux/phylib_stubs.h @@ -5,6 +5,9 @@ #include +struct ethtool_eth_phy_stats; +struct ethtool_link_ext_stats; +struct ethtool_phy_stats; struct kernel_hwtstamp_config; struct netlink_ext_ack; struct phy_device; @@ -19,6 +22,11 @@ struct phylib_stubs { int (*hwtstamp_set)(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); + void (*get_phy_stats)(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats); + void (*get_link_ext_stats)(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats); }; static inline int phy_hwtstamp_get(struct phy_device *phydev, @@ -50,6 +58,29 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev, return phylib_stubs->hwtstamp_set(phydev, config, extack); } +static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + 
struct ethtool_phy_stats *phydev_stats) +{ + ASSERT_RTNL(); + + if (!phylib_stubs) + return; + + phylib_stubs->get_phy_stats(phydev, phy_stats, phydev_stats); +} + +static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ + ASSERT_RTNL(); + + if (!phylib_stubs) + return; + + phylib_stubs->get_link_ext_stats(phydev, link_stats); +} + #else static inline int phy_hwtstamp_get(struct phy_device *phydev, @@ -65,4 +96,15 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev, return -EOPNOTSUPP; } +static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats) +{ +} + +static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ +} + #endif -- cgit v1.2.3 From f2bc1c2655728ac00c35cfb992bdb3243ca17e7e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 10 Jan 2025 07:05:15 +0100 Subject: net: phy: introduce optional polling interface for PHY statistics Add an optional polling interface for PHY statistics to simplify driver implementation. Signed-off-by: Oleksij Rempel Signed-off-by: Paolo Abeni --- include/linux/phy.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 81d1612e7d35..afaae74d0949 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1173,6 +1173,24 @@ struct phy_driver { */ void (*get_link_stats)(struct phy_device *dev, struct ethtool_link_ext_stats *link_stats); + + /** + * @update_stats: Trigger periodic statistics updates. + * @dev: The PHY device for which statistics updates are triggered. + * + * Periodically gathers statistics from the PHY device to update locally + * maintained 64-bit counters. 
This is necessary for PHYs that implement + * reduced-width counters (e.g., 16-bit or 32-bit) which can overflow + * more frequently compared to 64-bit counters. By invoking this + * callback, drivers can fetch the current counter values, handle + * overflow detection, and accumulate the results into local 64-bit + * counters for accurate reporting through the `get_phy_stats` and + * `get_link_stats` interfaces. + * + * Return: 0 on success or a negative error code on failure. + */ + int (*update_stats)(struct phy_device *dev); + /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ @@ -1663,6 +1681,9 @@ static inline bool phy_polling_mode(struct phy_device *phydev) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; + if (phydev->drv->update_stats) + return true; + return phydev->irq == PHY_POLL; } -- cgit v1.2.3 From 04508d20b017326e116c6e8ef953839507c73b6d Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 10 Jan 2025 13:58:50 +0530 Subject: net: ti: icssg-prueth: Add Multicast Filtering support for VLAN in MAC mode Add multicast filtering support for VLAN interfaces in dual EMAC mode for ICSSG driver. The driver uses vlan_for_each() API to get the list of available vlans. The driver then sync mc addr of vlan interface with a locally maintained list emac->vlan_mcast_list[vid] using __hw_addr_sync_multiple() API. __hw_addr_sync_multiple() is used instead of __hw_addr_sync() to sync vdev->mc with local list because the sync_cnt for addresses in vdev->mc will already be set by the vlan_dev_set_rx_mode() [net/8021q/vlan_dev.c] and __hw_addr_sync() only syncs when the sync_cnt == 0. Whereas __hw_addr_sync_multiple() can sync addresses even if sync_cnt is not 0. Export __hw_addr_sync_multiple() so that driver can use it. Once the local list is synced, driver calls __hw_addr_sync_dev() with the local list, vdev, sync and unsync callbacks.
__hw_addr_sync_dev() is used with the local maintained list as the list to synchronize instead of using __dev_mc_sync() on vdev because __dev_mc_sync() on vdev will call __hw_addr_sync_dev() on vdev->mc and sync_cnt for addresses in vdev->mc will already be set by the vlan_dev_set_rx_mode() [net/8021q/vlan_dev.c] and __hw_addr_sync_dev() only syncs if the sync_cnt of addresses in the list (vdev->mc in this case) is 0. Whereas __hw_addr_sync_dev() on local list will work fine as the sync_cnt for addresses in the local list will still be 0. Based on change in addresses in the local list, sync / unsync callbacks are invoked. In the sync / unsync API in driver, based on whether the ndev is vlan or not, driver passes appropriate vid to FDB helper functions. Signed-off-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd8f6f8991fe..bced03fb349e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4687,6 +4687,9 @@ int devm_register_netdev(struct device *dev, struct net_device *ndev); /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); +int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, -- cgit v1.2.3 From 9c10dd8eed74de9e8adeb820939f8745cd566d4a Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 10 Jan 2025 13:58:51 +0530 Subject: net: hsr: Create and export hsr_get_port_ndev() Create an API to get the net_device to the slave port of HSR device. 
The API will take hsr net_device and enum hsr_port_type for which we want the net_device as arguments. This API can be used by client drivers who support HSR and want to get the net_device of slave ports from the hsr device. Export this API for the same. This API needs the enum hsr_port_type to be accessible by the drivers using hsr. Move the enum hsr_port_type from net/hsr/hsr_main.h to include/linux/if_hsr.h for the same. Signed-off-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- include/linux/if_hsr.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h index 0404f5bf4f30..d7941fd88032 100644 --- a/include/linux/if_hsr.h +++ b/include/linux/if_hsr.h @@ -13,6 +13,15 @@ enum hsr_version { PRP_V1, }; +enum hsr_port_type { + HSR_PT_NONE = 0, /* Must be 0, used by framereg */ + HSR_PT_SLAVE_A, + HSR_PT_SLAVE_B, + HSR_PT_INTERLINK, + HSR_PT_MASTER, + HSR_PT_PORTS, /* This must be the last item in the enum */ +}; + /* HSR Tag. * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, * path, LSDU_size, sequence Nr }.
But we let eth_header() create { h_dest, @@ -32,6 +41,8 @@ struct hsr_tag { #if IS_ENABLED(CONFIG_HSR) extern bool is_hsr_master(struct net_device *dev); extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver); +struct net_device *hsr_get_port_ndev(struct net_device *ndev, + enum hsr_port_type pt); #else static inline bool is_hsr_master(struct net_device *dev) { @@ -42,6 +53,12 @@ static inline int hsr_get_version(struct net_device *dev, { return -EINVAL; } + +static inline struct net_device *hsr_get_port_ndev(struct net_device *ndev, + enum hsr_port_type pt) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_HSR */ #endif /*_LINUX_IF_HSR_H_*/ -- cgit v1.2.3 From d06905d686107c8343ff71aa4f3c881cc0a9a7b9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 9 Jan 2025 13:21:10 +0100 Subject: i2c: add core-managed per-client directory in debugfs More and more I2C client drivers use debugfs entries and currently they need to manage a subdir for their files on their own. This means inconsistent naming for these subdirs and they are scattered all over the debugfs-tree as well. Not to mention the duplicated code. Let the I2C core provide and maintain a proper directory per client. Note: It was considered to save the additional pointer in 'struct i2c_client' and only provide a subdir when requested via a helper function. When sketching this approach, more and more corner cases appeared, though, so the current solution with its simple and unambiguous code was chosen.
Signed-off-by: Wolfram Sang Reviewed-by: Guenter Roeck --- include/linux/i2c.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 66fb3d6cf686..36de788dc7fe 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -347,6 +347,7 @@ struct i2c_client { i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif void *devres_group_id; /* ID of probe devres group */ + struct dentry *debugfs; /* per-client debugfs dir */ }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) -- cgit v1.2.3 From a9ff94477836cb43d94efbd9a851213944800177 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 14 Jan 2025 12:54:52 +0100 Subject: ARM: 9433/2: implement cacheinfo support On ARMv7 / v7m machines read CTR and CLIDR registers to provide information regarding the cache topology. Earlier machines should describe full cache topology in the device tree. Note, this follows the ARM64 cacheinfo support and provides only minimal support required to bootstrap cache info. All useful properties should be described in Device Tree. Reviewed-by: Linus Walleij Signed-off-by: Dmitry Baryshkov Signed-off-by: Russell King (Oracle) --- include/linux/cacheinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..1e7061549fc7 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -147,7 +147,7 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) return ci ?
ci->id : -1; } -#ifdef CONFIG_ARM64 +#if defined(CONFIG_ARM64) || defined(CONFIG_ARM) #define use_arch_cache_info() (true) #else #define use_arch_cache_info() (false) -- cgit v1.2.3 From 514dcf78afe6b99fba5e885be295356856dd424b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:20 +0100 Subject: net: pse-pd: Remove unused pse_ethtool_get_pw_limit function declaration Removed the unused pse_ethtool_get_pw_limit() function declaration from pse.h. This function was declared but never implemented or used, making the declaration unnecessary. Reviewed-by: Kalesh AP Reviewed-by: Andrew Lunn Acked-by: Oleksij Rempel Reviewed-by: Kyle Swenson Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 591a53e082e6..85a08c349256 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -184,8 +184,6 @@ int pse_ethtool_set_config(struct pse_control *psec, int pse_ethtool_set_pw_limit(struct pse_control *psec, struct netlink_ext_ack *extack, const unsigned int pw_limit); -int pse_ethtool_get_pw_limit(struct pse_control *psec, - struct netlink_ext_ack *extack); bool pse_has_podl(struct pse_control *psec); bool pse_has_c33(struct pse_control *psec); @@ -222,12 +220,6 @@ static inline int pse_ethtool_set_pw_limit(struct pse_control *psec, return -EOPNOTSUPP; } -static inline int pse_ethtool_get_pw_limit(struct pse_control *psec, - struct netlink_ext_ack *extack) -{ - return -EOPNOTSUPP; -} - static inline bool pse_has_podl(struct pse_control *psec) { return false; -- cgit v1.2.3 From 6e56a6d47a7fad705a1a1d088237b0858c01a770 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:22 +0100 Subject: net: pse-pd: Add power limit check Checking only the current limit is not sufficient. 
According to the standard, voltage can reach up to 57V and current up to 1.92A, which exceeds the power limit described in the standard (99.9W). Add a power limit check to prevent this. Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 85a08c349256..bc5addccbf32 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -11,6 +11,8 @@ /* Maximum current in uA according to IEEE 802.3-2022 Table 145-1 */ #define MAX_PI_CURRENT 1920000 +/* Maximum power in mW according to IEEE 802.3-2022 Table 145-16 */ +#define MAX_PI_PW 99900 struct phy_device; struct pse_controller_dev; -- cgit v1.2.3 From e0a5e2bba38aa61a900934b45d6e846e0a6d7524 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:26 +0100 Subject: net: pse-pd: Use power limit at driver side instead of current limit The regulator framework uses current limits, but the PSE standard and known PSE controllers rely on power limits. Instead of converting current to power within each driver, perform the conversion in the PSE core. This avoids redundancy in driver implementation and aligns better with the standard, simplifying driver development. Remove at the same time the _pse_ethtool_get_status() function which is not needed anymore. Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index bc5addccbf32..a721651cd1e0 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -77,12 +77,8 @@ struct pse_control_status { * @pi_disable: Configure the PSE PI as disabled. 
* @pi_get_voltage: Return voltage similarly to get_voltage regulator * callback. - * @pi_get_current_limit: Get the configured current limit similarly to - * get_current_limit regulator callback. - * @pi_set_current_limit: Configure the current limit similarly to - * set_current_limit regulator callback. - * Should not return an error in case of MAX_PI_CURRENT - * current value set. + * @pi_get_pw_limit: Get the configured power limit of the PSE PI. + * @pi_set_pw_limit: Configure the power limit of the PSE PI. */ struct pse_controller_ops { int (*ethtool_get_status)(struct pse_controller_dev *pcdev, @@ -93,10 +89,10 @@ struct pse_controller_ops { int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); int (*pi_get_voltage)(struct pse_controller_dev *pcdev, int id); - int (*pi_get_current_limit)(struct pse_controller_dev *pcdev, - int id); - int (*pi_set_current_limit)(struct pse_controller_dev *pcdev, - int id, int max_uA); + int (*pi_get_pw_limit)(struct pse_controller_dev *pcdev, + int id); + int (*pi_set_pw_limit)(struct pse_controller_dev *pcdev, + int id, int max_mW); }; struct module; -- cgit v1.2.3 From 3e9dbfec499807767d03592ebdf19d9c15fd495b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:27 +0100 Subject: net: pse-pd: Split ethtool_get_status into multiple callbacks The ethtool_get_status callback currently handles all status and PSE information within a single function. This approach has two key drawbacks: 1. If the core requires some information for purposes other than ethtool_get_status, redundant code will be needed to fetch the same data from the driver (like is_enabled). 2. Drivers currently have access to all information passed to ethtool. New variables will soon be added to ethtool status, such as PSE ID, power domain IDs, and budget evaluation strategies, which are meant to be managed solely by the core. 
Drivers should not have the ability to modify these variables. To resolve these issues, ethtool_get_status has been split into multiple callbacks, with each handling a specific piece of information required by ethtool or the core. Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 87 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index a721651cd1e0..3c544aff58b9 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -31,7 +31,52 @@ struct pse_control_config { }; /** - * struct pse_control_status - PSE control/channel status. + * struct pse_admin_state - PSE operational state + * + * @podl_admin_state: operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @c33_admin_state: operational state of the PSE + * functions. IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState + */ +struct pse_admin_state { + enum ethtool_podl_pse_admin_state podl_admin_state; + enum ethtool_c33_pse_admin_state c33_admin_state; +}; + +/** + * struct pse_pw_status - PSE power detection status + * + * @podl_pw_status: power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @c33_pw_status: power detection status of the PSE. + * IEEE 802.3-2022 30.9.1.1.5 aPSEPowerDetectionStatus: + */ +struct pse_pw_status { + enum ethtool_podl_pse_pw_d_status podl_pw_status; + enum ethtool_c33_pse_pw_d_status c33_pw_status; +}; + +/** + * struct pse_ext_state_info - PSE extended state information + * + * @c33_ext_state_info: extended state information of the PSE + */ +struct pse_ext_state_info { + struct ethtool_c33_pse_ext_state_info c33_ext_state_info; +}; + +/** + * struct pse_pw_limit_ranges - PSE power limit configuration range + * + * @c33_pw_limit_ranges: supported power limit configuration range. 
The driver + * is in charge of the memory allocation. + */ +struct pse_pw_limit_ranges { + struct ethtool_c33_pse_pw_limit_range *c33_pw_limit_ranges; +}; + +/** + * struct ethtool_pse_control_status - PSE control/channel status. * * @podl_admin_state: operational state of the PoDL PSE * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState @@ -49,11 +94,11 @@ struct pse_control_config { * @c33_avail_pw_limit: available power limit of the PSE in mW * IEEE 802.3-2022 145.2.5.4 pse_avail_pwr * @c33_pw_limit_ranges: supported power limit configuration range. The driver - * is in charge of the memory allocation. + * is in charge of the memory allocation * @c33_pw_limit_nb_ranges: number of supported power limit configuration * ranges */ -struct pse_control_status { +struct ethtool_pse_control_status { enum ethtool_podl_pse_admin_state podl_admin_state; enum ethtool_podl_pse_pw_d_status podl_pw_status; enum ethtool_c33_pse_admin_state c33_admin_state; @@ -69,22 +114,37 @@ struct pse_control_status { /** * struct pse_controller_ops - PSE controller driver callbacks * - * @ethtool_get_status: get PSE control status for ethtool interface * @setup_pi_matrix: setup PI matrix of the PSE controller + * @pi_get_admin_state: Get the operational state of the PSE PI. This ops + * is mandatory. + * @pi_get_pw_status: Get the power detection status of the PSE PI. This + * ops is mandatory. + * @pi_get_ext_state: Get the extended state of the PSE PI. + * @pi_get_pw_class: Get the power class of the PSE PI. + * @pi_get_actual_pw: Get actual power of the PSE PI in mW. * @pi_is_enabled: Return 1 if the PSE PI is enabled, 0 if not. * May also return negative errno. * @pi_enable: Configure the PSE PI as enabled. * @pi_disable: Configure the PSE PI as disabled. * @pi_get_voltage: Return voltage similarly to get_voltage regulator - * callback. - * @pi_get_pw_limit: Get the configured power limit of the PSE PI. - * @pi_set_pw_limit: Configure the power limit of the PSE PI. 
+ * callback in uV. + * @pi_get_pw_limit: Get the configured power limit of the PSE PI in mW. + * @pi_set_pw_limit: Configure the power limit of the PSE PI in mW. + * @pi_get_pw_limit_ranges: Get the supported power limit configuration + * range. The driver is in charge of the memory + * allocation and should return the number of + * ranges. */ struct pse_controller_ops { - int (*ethtool_get_status)(struct pse_controller_dev *pcdev, - unsigned long id, struct netlink_ext_ack *extack, - struct pse_control_status *status); int (*setup_pi_matrix)(struct pse_controller_dev *pcdev); + int (*pi_get_admin_state)(struct pse_controller_dev *pcdev, int id, + struct pse_admin_state *admin_state); + int (*pi_get_pw_status)(struct pse_controller_dev *pcdev, int id, + struct pse_pw_status *pw_status); + int (*pi_get_ext_state)(struct pse_controller_dev *pcdev, int id, + struct pse_ext_state_info *ext_state_info); + int (*pi_get_pw_class)(struct pse_controller_dev *pcdev, int id); + int (*pi_get_actual_pw)(struct pse_controller_dev *pcdev, int id); int (*pi_is_enabled)(struct pse_controller_dev *pcdev, int id); int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); @@ -93,12 +153,15 @@ struct pse_controller_ops { int id); int (*pi_set_pw_limit)(struct pse_controller_dev *pcdev, int id, int max_mW); + int (*pi_get_pw_limit_ranges)(struct pse_controller_dev *pcdev, int id, + struct pse_pw_limit_ranges *pw_limit_ranges); }; struct module; struct device_node; struct of_phandle_args; struct pse_control; +struct ethtool_pse_control_status; /* PSE PI pairset pinout can either be Alternative A or Alternative B */ enum pse_pi_pairset_pinout { @@ -175,7 +238,7 @@ void pse_control_put(struct pse_control *psec); int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, - struct pse_control_status *status); + struct ethtool_pse_control_status *status); int pse_ethtool_set_config(struct pse_control 
*psec, struct netlink_ext_ack *extack, const struct pse_control_config *config); @@ -199,7 +262,7 @@ static inline void pse_control_put(struct pse_control *psec) static inline int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, - struct pse_control_status *status) + struct ethtool_pse_control_status *status) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 4640a1f0d8f2246f34d6e74330d7e7d2cf75605b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:28 +0100 Subject: net: pse-pd: Remove is_enabled callback from drivers The is_enabled callback is now redundant as the admin_state can be obtained directly from the driver and provides the same information. To simplify functionality, the core will handle this internally, making the is_enabled callback unnecessary at the driver level. Remove the callback from all drivers. Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 3c544aff58b9..b5ae3dcee550 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -122,8 +122,6 @@ struct ethtool_pse_control_status { * @pi_get_ext_state: Get the extended state of the PSE PI. * @pi_get_pw_class: Get the power class of the PSE PI. * @pi_get_actual_pw: Get actual power of the PSE PI in mW. - * @pi_is_enabled: Return 1 if the PSE PI is enabled, 0 if not. - * May also return negative errno. * @pi_enable: Configure the PSE PI as enabled. * @pi_disable: Configure the PSE PI as disabled. 
* @pi_get_voltage: Return voltage similarly to get_voltage regulator @@ -145,7 +143,6 @@ struct pse_controller_ops { struct pse_ext_state_info *ext_state_info); int (*pi_get_pw_class)(struct pse_controller_dev *pcdev, int id); int (*pi_get_actual_pw)(struct pse_controller_dev *pcdev, int id); - int (*pi_is_enabled)(struct pse_controller_dev *pcdev, int id); int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); int (*pi_get_voltage)(struct pse_controller_dev *pcdev, int id); -- cgit v1.2.3 From 5385f1e1923ca8131eb143567d509b101a344e06 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:31 +0100 Subject: net: pse-pd: Clean ethtool header of PSE structures Remove PSE-specific structures from the ethtool header to improve code modularity, maintain independent headers, and reduce incremental build time. Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 20 -------------------- include/linux/pse-pd/pse.h | 22 +++++++++++++++++++++- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4bf70cfec826..20a86bd5f4e3 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1326,24 +1326,4 @@ struct ethtool_forced_speed_map { void ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size); - -/* C33 PSE extended state and substate. 
*/ -struct ethtool_c33_pse_ext_state_info { - enum ethtool_c33_pse_ext_state c33_pse_ext_state; - union { - enum ethtool_c33_pse_ext_substate_error_condition error_condition; - enum ethtool_c33_pse_ext_substate_mr_pse_enable mr_pse_enable; - enum ethtool_c33_pse_ext_substate_option_detect_ted option_detect_ted; - enum ethtool_c33_pse_ext_substate_option_vport_lim option_vport_lim; - enum ethtool_c33_pse_ext_substate_ovld_detected ovld_detected; - enum ethtool_c33_pse_ext_substate_power_not_available power_not_available; - enum ethtool_c33_pse_ext_substate_short_detected short_detected; - u32 __c33_pse_ext_substate; - }; -}; - -struct ethtool_c33_pse_pw_limit_range { - u32 min; - u32 max; -}; #endif /* _LINUX_ETHTOOL_H */ diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index b5ae3dcee550..c773eeb92d04 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -5,7 +5,6 @@ #ifndef _LINUX_PSE_CONTROLLER_H #define _LINUX_PSE_CONTROLLER_H -#include #include #include @@ -16,6 +15,27 @@ struct phy_device; struct pse_controller_dev; +struct netlink_ext_ack; + +/* C33 PSE extended state and substate. */ +struct ethtool_c33_pse_ext_state_info { + enum ethtool_c33_pse_ext_state c33_pse_ext_state; + union { + enum ethtool_c33_pse_ext_substate_error_condition error_condition; + enum ethtool_c33_pse_ext_substate_mr_pse_enable mr_pse_enable; + enum ethtool_c33_pse_ext_substate_option_detect_ted option_detect_ted; + enum ethtool_c33_pse_ext_substate_option_vport_lim option_vport_lim; + enum ethtool_c33_pse_ext_substate_ovld_detected ovld_detected; + enum ethtool_c33_pse_ext_substate_power_not_available power_not_available; + enum ethtool_c33_pse_ext_substate_short_detected short_detected; + u32 __c33_pse_ext_substate; + }; +}; + +struct ethtool_c33_pse_pw_limit_range { + u32 min; + u32 max; +}; /** * struct pse_control_config - PSE control/channel configuration. 
-- cgit v1.2.3 From f4757d84abf523ea831dba0c136db4050d55c99f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jan 2025 17:19:22 +0100 Subject: ACPI: PRM: Fix missing guid_t declaration in linux/prmt.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seen the following build error: ./include/linux/prmt.h:5:27: error: unknown type name ‘guid_t’ 5 | int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); | ^~~~~~ The include file uses guid_t but it is not declared. Include linux/uuid.h to fix this. Signed-off-by: Robert Richter Reviewed-by: Yazen Ghannam Link: https://patch.msgid.link/20250107161923.3387552-1-rrichter@amd.com Signed-off-by: Rafael J. Wysocki --- include/linux/prmt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prmt.h b/include/linux/prmt.h index 9c094294403f..c53ab287e932 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#include <linux/uuid.h> + #ifdef CONFIG_ACPI_PRMT void init_prmt(void); int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); -- cgit v1.2.3 From ee9c69388e3bad6c595fe38f34aa1126d2d07a11 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 14:49:07 +0000 Subject: kobject: Remove unused functions kobj_ns_initial() and kobj_ns_netlink() were added in 2010 by commit bc451f205823 ("kobj: Add basic infrastructure for dealing with namespaces.") but have remained unused. Remove them. Signed-off-by: Dr.
David Alan Gilbert Link: https://lore.kernel.org/r/20250112144907.270272-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject_ns.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index be707748e7ce..150fe2ae1b6b 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -52,8 +52,6 @@ const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj); bool kobj_ns_current_may_mount(enum kobj_ns_type type); void *kobj_ns_grab_current(enum kobj_ns_type type); -const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk); -const void *kobj_ns_initial(enum kobj_ns_type type); void kobj_ns_drop(enum kobj_ns_type type, void *ns); #endif /* _LINUX_KOBJECT_NS_H */ -- cgit v1.2.3 From 3feec68563dda59517f83d19123aa287a1dfd068 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:53 -0500 Subject: nfs/localio: add direct IO enablement with sync and async IO support This commit simply adds the required O_DIRECT plumbing. It doesn't address the fact that NFS doesn't ensure all writes are page aligned (nor device logical block size aligned as required by O_DIRECT). Because NFS will read-modify-write for IO that isn't aligned, LOCALIO will not use O_DIRECT semantics by default if/when an application requests the use of O_DIRECT. Allow the use of O_DIRECT semantics by: 1: Adding a flag to the nfs_pgio_header struct to allow the NFS O_DIRECT layer to signal that O_DIRECT was used by the application 2: Adding a 'localio_O_DIRECT_semantics' NFS module parameter that when enabled will cause LOCALIO to use O_DIRECT semantics (this may cause IO to fail if applications do not properly align their IO). This commit is derived from code developed by Weston Andros Adamson. 
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..80766ff0a47c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1637,6 +1637,7 @@ enum { NFS_IOHDR_RESEND_PNFS, NFS_IOHDR_RESEND_MDS, NFS_IOHDR_UNSTABLE_WRITES, + NFS_IOHDR_ODIRECT, }; struct nfs_io_completion; -- cgit v1.2.3 From a61466315d7afca032342183a57e62d5e3a3157c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:54 -0500 Subject: nfsd: add nfsd_file_{get,put} to 'nfs_to' nfsd_localio_operations In a later commit LOCALIO must call both nfsd_file_get and nfsd_file_put to manage extra nfsd_file references. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 9202f4b24343..ab6a2a53f505 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -56,6 +56,8 @@ struct nfsd_localio_operations { const struct nfs_fh *, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file *); + struct nfsd_file *(*nfsd_file_get)(struct nfsd_file *); + void (*nfsd_file_put)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -- cgit v1.2.3 From b49f049a22227df701bfbca083d6cc859496e615 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:55 -0500 Subject: nfs_common: rename functions that invalidate LOCALIO nfs_clients Rename nfs_uuid_invalidate_one_client to nfs_localio_disable_client. Rename nfs_uuid_invalidate_clients to nfs_localio_invalidate_clients.
Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index ab6a2a53f505..a05d1043f2b0 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -37,8 +37,9 @@ bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); -void nfs_uuid_invalidate_clients(struct list_head *list); -void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); + +void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid); +void nfs_localio_invalidate_clients(struct list_head *list); /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * -- cgit v1.2.3 From 4ee7ba40007357a48447a8cbc667480acf9a006a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:56 -0500 Subject: nfs_common: move localio_lock to new lock member of nfs_uuid_t Remove cl_localio_lock from 'struct nfs_client' in favor of adding a lock to the nfs_uuid_t struct (which is embedded in each nfs_client). Push nfs_local_{enable,disable} implementation down to nfs_common. Those methods now call nfs_localio_{enable,disable}_client. This allows implementing nfs_localio_invalidate_clients in terms of nfs_localio_disable_client. 
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 - include/linux/nfslocalio.h | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b804346a9741..239d86ef166c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -132,7 +132,6 @@ struct nfs_client { struct timespec64 cl_nfssvc_boot; seqlock_t cl_boot_lock; nfs_uuid_t cl_uuid; - spinlock_t cl_localio_lock; #endif /* CONFIG_NFS_LOCALIO */ }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index a05d1043f2b0..4d5583873f41 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,6 +6,7 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H + /* nfsd_file structure is purposely kept opaque to NFS client */ struct nfsd_file; @@ -19,6 +20,8 @@ struct nfsd_file; #include #include +struct nfs_client; + /* * Useful to allow a client to negotiate if localio * possible with its server. 
@@ -27,6 +30,8 @@ struct nfsd_file; */ typedef struct { uuid_t uuid; + /* sadly this struct is just over a cacheline, avoid bouncing */ + spinlock_t ____cacheline_aligned lock; struct list_head list; struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ @@ -38,7 +43,8 @@ void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); -void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid); +void nfs_localio_enable_client(struct nfs_client *clp); +void nfs_localio_disable_client(struct nfs_client *clp); void nfs_localio_invalidate_clients(struct list_head *list); /* localio needs to map filehandle -> struct nfsd_file */ -- cgit v1.2.3 From 86e00412254a717ffd5d38dc5ec0ee1cce6281b3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:57 -0500 Subject: nfs: cache all open LOCALIO nfsd_file(s) in client This commit switches from leaning heavily on NFSD's filecache (in terms of GC'd nfsd_files) back to caching nfsd_files in the client. A later commit will add the callback mechanism needed to allow NFSD to force the NFS client to cleanup all cached nfsd_files. Add nfs_fh_localio_init() and 'struct nfs_fh_localio' to cache opened nfsd_file(s) (both a RO and RW nfsd_file is able to be opened and cached for a given nfs_fh). Update nfs_local_open_fh() to cache the nfsd_file once it is opened using __nfs_local_open_fh(). Introduce nfs_close_local_fh() to clear the cached open nfsd_files and call nfs_to_nfsd_file_put_local(). Refcounting is such that: - nfs_local_open_fh() is paired with nfs_close_local_fh(). - __nfs_local_open_fh() is paired with nfs_to_nfsd_file_put_local(). - nfs_local_file_get() is paired with nfs_local_file_put(). 
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 22 ++++++++++++++++++++-- include/linux/nfslocalio.h | 16 +++++++++------- 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 039898d70954..67ae2c3f41d2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -77,6 +77,23 @@ struct nfs_lock_context { struct rcu_head rcu_head; }; +struct nfs_file_localio { + struct nfsd_file __rcu *ro_file; + struct nfsd_file __rcu *rw_file; + struct list_head list; + void __rcu *nfs_uuid; /* opaque pointer to 'nfs_uuid_t' */ +}; + +static inline void nfs_localio_file_init(struct nfs_file_localio *nfl) +{ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + nfl->ro_file = NULL; + nfl->rw_file = NULL; + INIT_LIST_HEAD(&nfl->list); + nfl->nfs_uuid = NULL; +#endif +} + struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; @@ -87,15 +104,16 @@ struct nfs_open_context { struct nfs4_state *state; fmode_t mode; + int error; unsigned long flags; #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) - int error; - struct list_head list; struct nfs4_threshold *mdsthreshold; + struct list_head list; struct rcu_head rcu_head; + struct nfs_file_localio nfl; }; struct nfs_open_dir_context { diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 4d5583873f41..7cfc6720ed26 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,10 +6,6 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H - -/* nfsd_file structure is purposely kept opaque to NFS client */ -struct nfsd_file; - #if IS_ENABLED(CONFIG_NFS_LOCALIO) #include @@ -21,6 +17,7 @@ struct nfsd_file; #include struct nfs_client; +struct nfs_file_localio; /* * Useful to allow a client to negotiate if localio @@ -52,6 +49,7 @@ extern struct nfsd_file * nfsd_open_local_fh(struct net 
*, struct auth_domain *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, const fmode_t) __must_hold(rcu); +void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { bool (*nfsd_serv_try_get)(struct net *); @@ -73,7 +71,8 @@ extern const struct nfsd_localio_operations *nfs_to; struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, - const struct nfs_fh *, const fmode_t); + const struct nfs_fh *, struct nfs_file_localio *, + const fmode_t); static inline void nfs_to_nfsd_net_put(struct net *net) { @@ -100,12 +99,15 @@ static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) } #else /* CONFIG_NFS_LOCALIO */ -static inline void nfsd_localio_ops_init(void) + +struct nfs_file_localio; +static inline void nfs_close_local_fh(struct nfs_file_localio *nfl) { } -static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +static inline void nfsd_localio_ops_init(void) { } + #endif /* CONFIG_NFS_LOCALIO */ #endif /* __LINUX_NFSLOCALIO_H */ -- cgit v1.2.3 From b33f7dec3a67216123312c7bb752b8f6faa1c465 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:59 -0500 Subject: nfsd: rename nfsd_serv_ prefixed methods and variables with nfsd_net_ Also update Documentation/filesystems/nfs/localio.rst accordingly and reduce the technical documentation debt that was previously captured in that document. 
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 7cfc6720ed26..aa2b5c6561ab 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -52,8 +52,8 @@ nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { - bool (*nfsd_serv_try_get)(struct net *); - void (*nfsd_serv_put)(struct net *); + bool (*nfsd_net_try_get)(struct net *); + void (*nfsd_net_put)(struct net *); struct nfsd_file *(*nfsd_open_local_fh)(struct net *, struct auth_domain *, struct rpc_clnt *, @@ -77,12 +77,12 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, static inline void nfs_to_nfsd_net_put(struct net *net) { /* - * Once reference to nfsd_serv is dropped, NFSD could be - * unloaded, so ensure safe return from nfsd_file_put_local() - * by always taking RCU. + * Once reference to net (and associated nfsd_serv) is dropped, NFSD + * could be unloaded, so ensure safe return from nfsd_net_put() by + * always taking RCU. */ rcu_read_lock(); - nfs_to->nfsd_serv_put(net); + nfs_to->nfsd_net_put(net); rcu_read_unlock(); } -- cgit v1.2.3 From 085804110aa13eac7f763d8d5cfe3a8220e35222 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:02 -0500 Subject: nfs_common: track all open nfsd_files per LOCALIO nfs_client This tracking enables __nfsd_file_cache_purge() to call nfs_localio_invalidate_clients(), upon shutdown or export change, to nfs_close_local_fh() all open nfsd_files that are still cached by the LOCALIO nfs clients associated with nfsd_net that is being shutdown. Now that the client must track all open nfsd_files there was more work than necessary being done with the global nfs_uuids_lock contended. 
This manifested in various RCU issues, e.g.: hrtimer: interrupt took 47969440 ns rcu: INFO: rcu_sched detected stalls on CPUs/tasks: Use nfs_uuid->lock to protect all nfs_uuid_t members, instead of nfs_uuids_lock, once nfs_uuid_is_local() adds the client to nn->local_clients. Also add 'local_clients_lock' to 'struct nfsd_net' to protect nn->local_clients. And store a pointer to spinlock in the 'list_lock' member of nfs_uuid_t so nfs_localio_disable_client() can use it to avoid taking the global nfs_uuids_lock. In combination, these split out locks eliminate the use of the single nfslocalio.c global nfs_uuids_lock in the IO paths (open and close). Also refactored associated fs/nfs_common/nfslocalio.c methods' locking to reduce work performed with spinlocks held in general. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index aa2b5c6561ab..c68a529230c1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -30,19 +30,23 @@ typedef struct { /* sadly this struct is just over a cacheline, avoid bouncing */ spinlock_t ____cacheline_aligned lock; struct list_head list; + spinlock_t *list_lock; /* nn->local_clients_lock */ struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ + /* Local files to close when net is shut down or exports change */ + struct list_head files; } nfs_uuid_t; void nfs_uuid_init(nfs_uuid_t *); bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); -void nfs_uuid_is_local(const uuid_t *, struct list_head *, +void nfs_uuid_is_local(const uuid_t *, struct list_head *, spinlock_t *, struct net *, struct auth_domain *, struct module *); void nfs_localio_enable_client(struct nfs_client *clp); void nfs_localio_disable_client(struct nfs_client 
*clp); -void nfs_localio_invalidate_clients(struct list_head *list); +void nfs_localio_invalidate_clients(struct list_head *nn_local_clients, + spinlock_t *nn_local_clients_lock); /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * -- cgit v1.2.3 From 779a395189c692eec0246e7df63e2a3c0f0c8508 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:04 -0500 Subject: nfs/localio: remove redundant code and simplify LOCALIO enablement Remove nfs_local_enable and nfs_local_disable, instead use nfs_localio_enable_client and nfs_localio_disable_client. Discontinue use of the NFS_CS_LOCAL_IO bit in the nfs_client struct's cl_flags to reflect that LOCALIO is enabled; instead just test if the net member of the nfs_uuid_t struct is set. Also remove NFS_CS_LOCAL_IO. Lastly, remove trace_nfs_local_enable and trace_nfs_local_disable because comparable traces are available from nfs_localio.ko. Suggested-by: NeilBrown Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 - include/linux/nfslocalio.h | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 239d86ef166c..ed66df1093e8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -50,7 +50,6 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ -#define NFS_CS_LOCAL_IO 10 /* - client is local */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index c68a529230c1..05817d6ef3d1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -111,6 +111,10 @@ static inline void nfs_close_local_fh(struct nfs_file_localio *nfl) 
static inline void nfsd_localio_ops_init(void) { } +struct nfs_client; +static inline void nfs_localio_disable_client(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_LOCALIO */ -- cgit v1.2.3 From 76d4cb6345da0f2cd505e552157258325bcc8bcd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:05 -0500 Subject: nfs: probe for LOCALIO when v4 client reconnects to server Introduce nfs_local_probe_async() for the NFS client to initiate if/when it reconnects with server. For NFSv4 it is a simple matter to call nfs_local_probe_async() from nfs4_do_reclaim (during NFSv4 grace). Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ed66df1093e8..f00bfcee7120 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -131,6 +131,7 @@ struct nfs_client { struct timespec64 cl_nfssvc_boot; seqlock_t cl_boot_lock; nfs_uuid_t cl_uuid; + struct work_struct cl_local_probe_work; #endif /* CONFIG_NFS_LOCALIO */ }; -- cgit v1.2.3 From 4a489220aa8c9daf5f02396c28cebade9f9ab563 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:06 -0500 Subject: nfs: probe for LOCALIO when v3 client reconnects to server Re-enabling NFSv3 LOCALIO is made more complex (than NFSv4) because v3 is stateless. As such, the heuristic used to identify a LOCALIO probe point is more ad hoc by nature: if/when NFSv3 client IO begins to complete again in terms of normal RPC-based NFSv3 server IO, attempt nfs_local_probe_async(). Care is taken to throttle the frequency of nfs_local_probe_async(), otherwise there could be a flood of repeat calls to nfs_local_probe_async().
The throttle is admin controlled using a new module parameter for nfsv3, e.g.: echo 512 > /sys/module/nfsv3/parameters/nfs3_localio_probe_throttle Probe for NFSv3 LOCALIO every N IO requests (512 in this case). Must be power-of-2, defaults to 0 (probing disabled). On systems that expect to use LOCALIO with NFSv3 the admin should configure the 'nfs3_localio_probe_throttle' module parameter. This commit backfills module parameter documentation in localio.rst Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 05817d6ef3d1..9aa8a43843d7 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -27,7 +27,8 @@ struct nfs_file_localio; */ typedef struct { uuid_t uuid; - /* sadly this struct is just over a cacheline, avoid bouncing */ + unsigned nfs3_localio_probe_count; + /* this struct is over a cacheline, avoid bouncing */ spinlock_t ____cacheline_aligned lock; struct list_head list; spinlock_t *list_lock; /* nn->local_clients_lock */ -- cgit v1.2.3 From 013525583fdd79b6b83547b937535dfd406d3cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:33 +0200 Subject: PCI: Don't expose pcie_read_tlp_log() outside PCI subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcie_read_tlp_log() was exposed by the commit 0a5a46a6a61b ("PCI/AER: Generalize TLP Header Log reading") with the intent that drivers could use it, but the PCI maintainer later decided that drivers should be encouraged to use PCI core diagnostic logging of generic AER registers rather than building their own. Drivers that currently implement their own diagnostic logging include ixgbe (ixgbe_io_error_detected()) and iwlwifi (iwl_trans_pcie_dump_regs()). 
Remove the unwanted EXPORT of pcie_read_tlp_log() and remove it from include/linux/aer.h. Link: https://lore.kernel.org/r/20250114170840.1633-2-ilpo.jarvinen@linux.intel.com Link: https://lore.kernel.org/all/20240322193011.GA701027@bhelgaas/ Signed-off-by: Ilpo Järvinen [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Yazen Ghannam --- include/linux/aer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index 4b97f38f3fcf..190a0a2061cd 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -37,8 +37,6 @@ struct aer_capability_regs { u16 uncor_err_source; }; -int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log *log); - #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); int pcie_aer_is_native(struct pci_dev *dev); -- cgit v1.2.3 From ede5d5dbef6f3a1f25ee4e9d1999750827aa0640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:35 +0200 Subject: PCI: Add defines for TLP Header/Prefix log sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add defines for AER and DPC capabilities TLP Header Logging register sizes (PCIe r6.2, sec 7.8.4 / 7.9.14) and replace literals with them. Link: https://lore.kernel.org/r/20250114170840.1633-4-ilpo.jarvinen@linux.intel.com Suggested-by: Yazen Ghannam Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas --- include/linux/aer.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index 190a0a2061cd..4ef6515c3205 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -16,10 +16,17 @@ #define AER_CORRECTABLE 2 #define DPC_FATAL 3 +/* + * AER and DPC capabilities TLP Logging register sizes (PCIe r6.2, sec 7.8.4 + * & 7.9.14). 
+ */ +#define PCIE_STD_NUM_TLP_HEADERLOG 4 +#define PCIE_STD_MAX_TLP_PREFIXLOG 4 + struct pci_dev; struct pcie_tlp_log { - u32 dw[4]; + u32 dw[PCIE_STD_NUM_TLP_HEADERLOG]; }; struct aer_capability_regs { -- cgit v1.2.3 From e5321ae10e1323359a5067a26dfe98b5f44cc5e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:38 +0200 Subject: PCI: Store number of supported End-End TLP Prefixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eetlp_prefix_path in the struct pci_dev tells if End-End TLP Prefixes are supported by the path or not, and the value is only calculated if CONFIG_PCI_PASID is set. The Max End-End TLP Prefixes field in the Device Capabilities Register 2 also tells how many (1-4) End-End TLP Prefixes are supported (PCIe r6.2 sec 7.5.3.15). The number of supported End-End Prefixes is useful for reading correct number of DWORDs from TLP Prefix Log register in AER capability (PCIe r6.2 sec 7.8.4.12). Replace eetlp_prefix_path with eetlp_prefix_max and determine the number of supported End-End Prefixes regardless of CONFIG_PCI_PASID so that an upcoming commit generalizing TLP Prefix Log register reading does not have to read extra DWORDs for End-End Prefixes that never will be there. The value stored into eetlp_prefix_max is directly derived from device's Max End-End TLP Prefixes and does not consider limitations imposed by bridges or the Root Port beyond supported/not supported flags. This is intentional for two reasons: 1) PCIe r6.2 spec sections 2.2.10.4 & 6.2.4.4 indicate that a TLP is malformed only if the number of prefixes exceed the number of Max End-End TLP Prefixes, which seems to be the case even if the device could never receive that many prefixes due to smaller maximum imposed by a bridge or the Root Port. 
If TLP parsing is later added, this distinction is significant in interpreting what is logged by the TLP Prefix Log registers and the value matching to the Malformed TLP threshold is going to be more useful. 2) TLP Prefix handling happens autonomously on a low layer and the value in eetlp_prefix_max is not programmed anywhere by the kernel (i.e., there is no limiter OS can control to prevent sending more than N TLP Prefixes). Link: https://lore.kernel.org/r/20250114170840.1633-7-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Yazen Ghannam --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..21be5a1edf1a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -407,7 +407,7 @@ struct pci_dev { supported from root to here */ #endif unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */ - unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ + unsigned int eetlp_prefix_max:3; /* Max # of End-End TLP Prefixes, 0=not supported */ pci_channel_state_t error_state; /* Current connectivity state */ struct device dev; /* Generic device interface */ -- cgit v1.2.3 From b6be5ba8f1c6b28c2daa039fd9e9df32f62852bd Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 13:13:18 +0000 Subject: socket: Remove unused kernel_sendmsg_locked The last use of kernel_sendmsg_locked() was removed in 2023 by commit dc97391e6610 ("sock: Remove ->sendpage*() in favour of sendmsg(MSG_SPLICE_PAGES)") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250112131318.63753-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/linux/net.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index b75bc534c1b3..0ff950eecc6b 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -343,8 +343,6 @@ static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); -int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); -- cgit v1.2.3 From f81a6d12bf8b262f4c8ce5e856a4d399d97612ee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jan 2025 16:20:18 -0800 Subject: KVM: Open code kvm_set_memory_region() into its sole caller (ioctl() API) Open code kvm_set_memory_region() into its sole caller in preparation for adding a dedicated API for setting internal memslots. Opportunistically use the fancy new guard(mutex) to avoid a local 'r' variable.
Cc: Tao Su Reviewed-by: Xiaoyao Li Reviewed-by: Claudio Imbrenda Acked-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250111002022.1230573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 401439bb21e3..7443de24b1d9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1192,8 +1192,6 @@ enum kvm_mr_change { KVM_MR_FLAGS_ONLY, }; -int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -- cgit v1.2.3 From 156bffdb2b49fc0c869bf160a57378886f5fa92d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jan 2025 16:20:20 -0800 Subject: KVM: Add a dedicated API for setting KVM-internal memslots Add a dedicated API for setting internal memslots, and have it explicitly disallow setting userspace memslots. Setting a userspace memslot without a direct command from userspace would result in all manner of issues. No functional change intended.
Cc: Tao Su Cc: Claudio Imbrenda Cc: Christian Borntraeger Reviewed-by: Xiaoyao Li Reviewed-by: Claudio Imbrenda Acked-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250111002022.1230573-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7443de24b1d9..8707d25a2e5b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1192,8 +1192,8 @@ enum kvm_mr_change { KVM_MR_FLAGS_ONLY, }; -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem); +int kvm_set_internal_memslot(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, -- cgit v1.2.3 From 344315e93dbc9c4733d5c9fd605ecfc46ef97180 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jan 2025 16:20:21 -0800 Subject: KVM: x86: Drop double-underscores from __kvm_set_memory_region() Now that there's no outer wrapper for __kvm_set_memory_region() and it's static, drop its double-underscore prefix. No functional change intended. 
Cc: Tao Su Reviewed-by: Xiaoyao Li Reviewed-by: Claudio Imbrenda Acked-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250111002022.1230573-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8707d25a2e5b..dcb59d6e8acb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1183,7 +1183,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn * -- just change its flags * * Since flags can be changed by some of these operations, the following - * differentiation is the best we can do for __kvm_set_memory_region(): + * differentiation is the best we can do for kvm_set_memory_region(): */ enum kvm_mr_change { KVM_MR_CREATE, -- cgit v1.2.3 From a648eb3a3f79e9736a59b28783700c2c691db419 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Dec 2024 11:34:14 +0100 Subject: genirq: Provide IRQCHIP_MOVE_DEFERRED The logic of GENERIC_PENDING_IRQ is backwards for historical reasons. Most interrupt controllers allow to move the interrupt from arbitrary contexts. If GENERIC_PENDING_IRQ is enabled by an architecture to support a chip, which requires the affinity change to happen in interrupt context, all other chips have to be marked with IRQF_MOVE_PCNTXT. That's tedious and there is no real good reason for the extra flags in the irq descriptor and the irq data status fields. In fact the decision whether interrupts can be moved in arbitrary context or not is a property of the interrupt chip. To simplify adoption for RISC-V provide a new mechanism which is enabled via a config switch and allows to add a flag to irq_chip::flags to request that interrupt affinity changes are deferred. Setting the top level chip of an interrupt evaluates the flag and maps it into the existing logic. 
The config switch and the various PCNTXT flags are temporary until x86 is converted over to this scheme. This intermediate step also allows trivial backporting of the mechanism to plug the affinity change race of various RISC-V interrupt controllers. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241210103335.500314436@linutronix.de --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 25f51bf3c351..6e021548fa0a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -567,6 +567,7 @@ struct irq_chip { * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip + * IRQCHIP_MOVE_DEFERRED: Move the interrupt in actual interrupt context */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -581,6 +582,7 @@ enum { IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), IRQCHIP_IMMUTABLE = (1 << 11), + IRQCHIP_MOVE_DEFERRED = (1 << 12), }; #include -- cgit v1.2.3 From 12c92098932b4bbf38396e9aed0a343d35437a21 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:49 +0000 Subject: debugfs: allow to store an additional opaque pointer at file creation Set by debugfs_create_file_aux(name, mode, parent, data, aux, fops). Plain debugfs_create_file() has it set to NULL. Accessed by debugfs_get_aux(file). Convenience macros for numeric opaque data - debugfs_create_file_aux_num and debugfs_get_aux_num, resp. 
Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-5-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 59444b495d49..7c97417d73b5 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -79,9 +79,11 @@ struct debugfs_short_fops { struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct file_operations *fops); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct debugfs_short_fops *fops); /** @@ -126,7 +128,15 @@ struct dentry *debugfs_create_file_short(const char *name, umode_t mode, const struct debugfs_short_fops *: debugfs_create_file_short, \ struct file_operations *: debugfs_create_file_full, \ struct debugfs_short_fops *: debugfs_create_file_short) \ - (name, mode, parent, data, fops) + (name, mode, parent, data, NULL, fops) + +#define debugfs_create_file_aux(name, mode, parent, data, aux, fops) \ + _Generic(fops, \ + const struct file_operations *: debugfs_create_file_full, \ + const struct debugfs_short_fops *: debugfs_create_file_short, \ + struct file_operations *: debugfs_create_file_full, \ + struct debugfs_short_fops *: debugfs_create_file_short) \ + (name, mode, parent, data, aux, fops) struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -153,6 +163,7 @@ void debugfs_remove(struct dentry *dentry); void debugfs_lookup_and_remove(const char *name, struct dentry *parent); const struct file_operations *debugfs_real_fops(const struct file *filp); +const void *debugfs_get_aux(const struct file *file); int debugfs_file_get(struct dentry *dentry); void 
debugfs_file_put(struct dentry *dentry); @@ -259,6 +270,14 @@ static inline struct dentry *debugfs_lookup(const char *name, return ERR_PTR(-ENODEV); } +static inline struct dentry *debugfs_create_file_aux(const char *name, + umode_t mode, struct dentry *parent, + void *data, void *aux, + const void *fops) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const void *fops) @@ -312,6 +331,7 @@ static inline void debugfs_lookup_and_remove(const char *name, { } const struct file_operations *debugfs_real_fops(const struct file *filp); +void *debugfs_get_aux(const struct file *file); static inline int debugfs_file_get(struct dentry *dentry) { @@ -452,6 +472,11 @@ static inline ssize_t debugfs_read_file_str(struct file *file, #endif +#define debugfs_create_file_aux_num(name, mode, parent, data, n, fops) \ + debugfs_create_file_aux(name, mode, parent, data, \ + (void *)(unsigned long)n, fops) +#define debugfs_get_aux_num(f) (unsigned long)debugfs_get_aux(f) + /** * debugfs_create_xul - create a debugfs file that is used to read and write an * unsigned long value, formatted in hexadecimal -- cgit v1.2.3 From d1433c7ba289319983ec0086dd22524721a797ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:50 +0000 Subject: debugfs: take debugfs_short_fops definition out of ifdef Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-6-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 7c97417d73b5..68e9c6cbd835 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -67,16 +67,16 @@ static const struct file_operations __fops = { \ typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); -#if 
defined(CONFIG_DEBUG_FS) - -struct dentry *debugfs_lookup(const char *name, struct dentry *parent); - struct debugfs_short_fops { ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); loff_t (*llseek) (struct file *, loff_t, int); }; +#if defined(CONFIG_DEBUG_FS) + +struct dentry *debugfs_lookup(const char *name, struct dentry *parent); + struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, -- cgit v1.2.3 From f7862dfef6612b87b2ad8352c4d73886f09456d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:07:05 +0000 Subject: saner replacement for debugfs_rename() Existing primitive has several problems: 1) calling conventions are clumsy - it returns a dentry reference that is either identical to its second argument or is an ERR_PTR(-E...); in both cases no refcount changes happen. Inconvenient for users and bug-prone; it would be better to have it return 0 on success and -E... on failure. 2) it allows cross-directory moves; however, no such caller has ever materialized and considering the way debugfs is used, it's unlikely to happen in the future. What's more, any such caller would have fun issues to deal with wrt interplay with recursive removal. It also makes the calling conventions clumsier... 3) tautological rename fails; the callers have no race-free way to deal with that. 4) new name must have been formed by the caller; quite a few callers have it done by sprintf/kasprintf/etc., ending up with considerable boilerplate. Proposed replacement: int debugfs_change_name(dentry, fmt, ...). All callers convert to that easily, and it's simpler internally. IMO debugfs_rename() should go; if we ever get a real-world use case for cross-directory moves in debugfs, we can always look into the right way to handle that.
Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20250112080705.141166-21-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 68e9c6cbd835..fa2568b4380d 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -175,8 +175,7 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); -struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, const char *new_name); +int debugfs_change_name(struct dentry *dentry, const char *fmt, ...) __printf(2, 3); void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value); @@ -361,10 +360,10 @@ static inline ssize_t debugfs_attr_write_signed(struct file *file, return -ENODEV; } -static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, char *new_name) +static inline int __printf(2, 3) debugfs_change_name(struct dentry *dentry, + const char *fmt, ...) { - return ERR_PTR(-ENODEV); + return -ENODEV; } static inline void debugfs_create_u8(const char *name, umode_t mode, -- cgit v1.2.3 From cec8c359f87c0f7c9cf63b570c0ce968b5ef62a4 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 13 Jan 2025 23:13:14 +0100 Subject: Input: i8042 - Add support for platform filter contexts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the platform filter cannot access any driver-specific state which forces drivers installing a i8042 filter to have at least some kind of global pointer for their filter. Allow callers of i8042_install_filter() to submit a context pointer which is then passed to the i8042 filter. 
This frees drivers from the responsibility of having to manage this global pointer themselves. Also introduce a separate type for the i8042 filter (i8042_filter_t) so that the function definitions can stay compact. Tested on a Dell Inspiron 3505. Reviewed-by: Ilpo Järvinen Acked-by: Dmitry Torokhov Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250113221314.435812-1-W_Armin@gmx.de Signed-off-by: Ilpo Järvinen --- include/linux/i8042.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i8042.h b/include/linux/i8042.h index 95b07f8b77fe..00037c13abc8 100644 --- a/include/linux/i8042.h +++ b/include/linux/i8042.h @@ -54,15 +54,29 @@ struct serio; +/** + * typedef i8042_filter_t - i8042 filter callback + * @data: Data received by the i8042 controller + * @str: Status register of the i8042 controller + * @serio: Serio of the i8042 controller + * @context: Context pointer associated with this callback + * + * This represents a i8042 filter callback which can be used with i8042_install_filter() + * and i8042_remove_filter() to filter the i8042 input for platform-specific key codes. + * + * Context: Interrupt context. + * Returns: true if the data should be filtered out, false if otherwise.
+ */ +typedef bool (*i8042_filter_t)(unsigned char data, unsigned char str, struct serio *serio, + void *context); + #if defined(CONFIG_SERIO_I8042) || defined(CONFIG_SERIO_I8042_MODULE) void i8042_lock_chip(void); void i8042_unlock_chip(void); int i8042_command(unsigned char *param, int command); -int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)); -int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)); +int i8042_install_filter(i8042_filter_t filter, void *context); +int i8042_remove_filter(i8042_filter_t filter); #else @@ -79,14 +93,12 @@ static inline int i8042_command(unsigned char *param, int command) return -ENODEV; } -static inline int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)) +static inline int i8042_install_filter(i8042_filter_t filter, void *context) { return -ENODEV; } -static inline int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)) +static inline int i8042_remove_filter(i8042_filter_t filter) { return -ENODEV; } -- cgit v1.2.3 From 6564862d646e7d630929ba1ff330740bb215bdac Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 9 Jan 2025 11:39:59 +0000 Subject: block: Ensure start sector is aligned for stacking atomic writes For stacking atomic writes, ensure that the start sector is aligned with the device atomic write unit min and any boundary. Otherwise, we may permit misaligned atomic writes. Rework bdev_can_atomic_write() into a common helper to reuse the alignment check. Also use atomic_write_hw_unit_min there, which is more appropriate than atomic_write_unit_min. Fixes: d7f36dc446e89 ("block: Support atomic writes limits for stacked devices") Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20250109114000.2299896-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13d353351c37..7ac153e4423a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1706,6 +1706,15 @@ struct io_comp_batch { void (*complete)(struct io_comp_batch *); }; +static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, + struct queue_limits *limits) +{ + unsigned int alignment = max(limits->atomic_write_hw_unit_min, + limits->atomic_write_hw_boundary); + + return IS_ALIGNED(sector, alignment >> SECTOR_SHIFT); +} + static inline bool bdev_can_atomic_write(struct block_device *bdev) { struct request_queue *bd_queue = bdev->bd_queue; @@ -1714,15 +1723,9 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev) if (!limits->atomic_write_unit_min) return false; - if (bdev_is_partition(bdev)) { - sector_t bd_start_sect = bdev->bd_start_sect; - unsigned int alignment = - max(limits->atomic_write_unit_min, - limits->atomic_write_hw_boundary); - - if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) - return false; - } + if (bdev_is_partition(bdev)) + return blk_atomic_write_start_sect_aligned(bdev->bd_start_sect, + limits); return true; } -- cgit v1.2.3 From a3751212a8eeece59d2018c455000f30ed7e5bb7 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 14 Jan 2025 15:37:08 -0500 Subject: PCI: Add enable_device() and disable_device() callbacks for bridges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PCI host bridges require special handling when enabling or disabling PCI devices. For example, the i.MX95 platform has a lookup table to map Requester IDs to StreamIDs, which the SMMU and MSI controller use to identify the source of DMA accesses. 
Without this mapping, DMA accesses may target unintended memory, which would corrupt memory or read the wrong data. Add a host bridge enable_device() hook the imx6 driver can use to configure the Requester ID to StreamID mapping. The hardware table isn't big enough to map all possible Requester IDs, so this hook may fail if no table space is available. In that case, return failure from pci_enable_device(). It might make more sense to make pci_set_master() decline to enable bus mastering and return failure, but it currently doesn't have a way to return failure. Link: https://lore.kernel.org/r/20250114-imx95_lut-v9-1-39f58dbed03a@nxp.com Tested-by: Marc Zyngier Signed-off-by: Frank Li Signed-off-by: Bjorn Helgaas [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Reviewed-by: Marc Zyngier Acked-by: Manivannan Sadhasivam --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..bcbef004dd56 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -595,6 +595,8 @@ struct pci_host_bridge { u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); + int (*enable_device)(struct pci_host_bridge *bridge, struct pci_dev *dev); + void (*disable_device)(struct pci_host_bridge *bridge, struct pci_dev *dev); void *release_data; unsigned int ignore_reset_delay:1; /* For entire hierarchy */ unsigned int no_ext_tags:1; /* No Extended Tags */ -- cgit v1.2.3 From 042087247835dad1ec5e39052abf022fd13c6326 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:23 +0100 Subject: mtd: spinand: Create distinct fast and slow read from cache variants So far, the SPINAND_PAGE_READ_FROM_CACHE_OP macro was taking a first argument, "fast", which was inducing the possibility to support higher bus frequencies than with the normal (slower) read from 
cache alternative. In practice, without frequency change on the bus, this was likely without effect, besides perhaps allowing another variant of the same command, that could run at the default highest speed. If we want to support this fully, we need to add a frequency parameter to the slowest command. But before we do that, let's drop the "fast" boolean from the macro and duplicate it, this will further help supporting having different frequencies allowed for each variant. The change is also of course propagated to all users. It has the nice effect to have all macros aligned on the same pattern. Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index cbbcd44ac225..0d80fa9400d5 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -62,14 +62,26 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PAGE_READ_FROM_CACHE_OP(fast, addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 0x0b : 0x03, 1), \ +#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(fast, addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 
0x0b : 0x03, 1), \ +#define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ + SPI_MEM_OP_ADDR(2, addr, 1), \ + SPI_MEM_OP_DUMMY(ndummy, 1), \ + SPI_MEM_OP_DATA_IN(len, buf, 1)) + +#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ + SPI_MEM_OP_ADDR(3, addr, 1), \ + SPI_MEM_OP_DUMMY(ndummy, 1), \ + SPI_MEM_OP_DATA_IN(len, buf, 1)) + +#define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP_3A(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -- cgit v1.2.3 From 7ce0d16d5802bfde4209e52ee8ad644ca1eab423 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:24 +0100 Subject: mtd: spinand: Add an optional frequency to read from cache macros While the SPINAND_PAGE_READ_FROM_CACHE_FAST_OP macro is supposed to be able to run at the flash highest supported frequency, it is not the case of the regular read from cache, which may be limited in terms of maximum frequency. Add an optional argument to this macro, which will be used to set the maximum frequency, if any. Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 0d80fa9400d5..2f5cce23b3ec 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -62,11 +62,12 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len, ...) 
\ SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ - SPI_MEM_OP_DATA_IN(len, buf, 1)) + SPI_MEM_OP_DATA_IN(len, buf, 1), \ + __VA_OPT__(SPI_MEM_OP_MAX_FREQ(__VA_ARGS__))) #define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ -- cgit v1.2.3 From 8586bc8d95488dfaadbc1af89ba59900d2c39119 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:26 +0100 Subject: mtd: spinand: Add support for read DTR operations Advanced SPI-NAND chips are capable of reading data much faster by leveraging DTR support. This support extends to dual and quad configurations. Create macros defining all possible read from cache DTR variants: - SPINAND_PAGE_READ_FROM_CACHE_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_X2_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_X4_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_DUALIO_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_QUADIO_DTR_OP Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 2f5cce23b3ec..0da8a1c7740e 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -87,6 +87,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) +#define SPINAND_PAGE_READ_FROM_CACHE_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0d, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 1), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_X2_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ @@ -99,6 +106,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 2)) +#define SPINAND_PAGE_READ_FROM_CACHE_X2_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x3d, 1), \ + 
SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ @@ -111,6 +125,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) +#define SPINAND_PAGE_READ_FROM_CACHE_X4_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x6d, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ SPI_MEM_OP_ADDR(2, addr, 2), \ @@ -123,6 +144,13 @@ SPI_MEM_OP_DUMMY(ndummy, 2), \ SPI_MEM_OP_DATA_IN(len, buf, 2)) +#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xbd, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 2), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 2), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(2, addr, 4), \ @@ -135,6 +163,13 @@ SPI_MEM_OP_DUMMY(ndummy, 4), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) +#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xed, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 4), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 4), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PROG_EXEC_OP(addr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ -- cgit v1.2.3 From 2d2a46cf23788a19e5450c6f9c86ab17f596c708 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Sun, 12 Jan 2025 16:01:32 +0000 Subject: timekeeping: Remove unused ktime_get_fast_timestamps() ktime_get_fast_timestamps() was added in 2020 by commit e2d977c9f1ab ("timekeeping: Provide multi-timestamp accessor to NMI safe timekeeper") but has remained unused. Remove it. [ tglx: Fold the inline as David suggested in the submission ] Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250112160132.450209-1-linux@treblig.org --- include/linux/timekeeping.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 0e035f675efe..542773650200 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -263,18 +263,6 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/** - * struct ktime_timestamps - Simultaneous mono/boot/real timestamps - * @mono: Monotonic timestamp - * @boot: Boottime timestamp - * @real: Realtime timestamp - */ -struct ktime_timestamps { - u64 mono; - u64 boot; - u64 real; -}; - /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value @@ -345,9 +333,6 @@ extern int get_device_system_crosststamp( */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); -/* NMI safe mono/boot/realtime timestamps */ -extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap); - /* * Persistent clock related interfaces */ -- cgit v1.2.3 From f94a18249b7f9131f3ca8eacf07f21050747ebd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Dec 2024 11:34:17 +0100 Subject: genirq: Remove IRQ_MOVE_PCNTXT and related code Now that x86 is converted over to use the IRQCHIP_MOVE_DEFERRED flags, remove IRQ*_MOVE_PCNTXT and related code. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241210103335.626707225@linutronix.de --- include/linux/irq.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 6e021548fa0a..8daa17f0107a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -64,7 +64,6 @@ enum irqchip_irq_state; * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) - * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. Exclude @@ -93,7 +92,6 @@ enum { IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), - IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), @@ -105,7 +103,7 @@ enum { #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ - IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ + IRQ_NOAUTOEN | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN) @@ -201,8 +199,6 @@ struct irq_data { * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend - * IRQD_MOVE_PCNTXT - Interrupt can be moved in process - * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt @@ -233,7 +229,6 @@ enum { IRQD_AFFINITY_SET = BIT(12), IRQD_LEVEL = BIT(13), IRQD_WAKEUP_STATE = BIT(14), - IRQD_MOVE_PCNTXT = BIT(15), IRQD_IRQ_DISABLED = BIT(16), IRQD_IRQ_MASKED = BIT(17), IRQD_IRQ_INPROGRESS = BIT(18), @@ -338,11 +333,6 
@@ static inline bool irqd_is_wakeup_set(struct irq_data *d) return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } -static inline bool irqd_can_move_in_process_context(struct irq_data *d) -{ - return __irqd_to_state(d) & IRQD_MOVE_PCNTXT; -} - static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; -- cgit v1.2.3 From 477ac7b0a7984e12e8db07fe54ad64443c5f8928 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Dec 2024 15:01:44 +0000 Subject: PCI: host-generic: Allow {en,dis}able_device() to be provided via pci_ecam_ops In order to let host controller drivers using the host-generic infrastructure use the {en,dis}able_device() callbacks that can be used to configure sideband RID mapping hardware, provide these two callbacks as part of the pci_ecam_ops structure. Link: https://lore.kernel.org/r/20241204150145.800408-2-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Bjorn Helgaas Reviewed-by: Frank Li Reviewed-by: Manivannan Sadhasivam --- include/linux/pci-ecam.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 3a4860bd2758..3a10f8cfc3ad 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -45,6 +45,10 @@ struct pci_ecam_ops { unsigned int bus_shift; struct pci_ops pci_ops; int (*init)(struct pci_config_window *); + int (*enable_device)(struct pci_host_bridge *, + struct pci_dev *); + void (*disable_device)(struct pci_host_bridge *, + struct pci_dev *); }; /* -- cgit v1.2.3 From 27c3f0e61f19d2306527406cad233d5f5915ca1e Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 15 Jan 2025 08:39:18 +0100 Subject: i2c: add kdoc for the new debugfs entry of clients When adding the new debugfs entry, its kdoc equivalent was forgotten. Add it now. 
Fixes: d06905d68610 ("i2c: add core-managed per-client directory in debugfs") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20250115163146.6c48f066@canb.auug.org.au Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 36de788dc7fe..c31fd1dba3bd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -317,6 +317,8 @@ struct i2c_driver { * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources * acquired when probing this device. + * @debugfs: pointer to the debugfs subdirectory which the I2C core created + * for this client. * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver -- cgit v1.2.3 From c6739623c91bb3d6e9b20e05afbe69a2664f2d70 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 13 Jan 2025 09:22:29 +0000 Subject: net: phylink: pass neg_mode into .pcs_get_state() method Pass the current neg_mode into the .pcs_get_state() method. Update all users of phylink PCS. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1tXGeT-000Et3-4L@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 4b7a20620b49..0bbcb4898e93 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -446,7 +446,7 @@ struct phylink_pcs_ops { phy_interface_t interface); int (*pcs_post_config)(struct phylink_pcs *pcs, phy_interface_t interface); - void (*pcs_get_state)(struct phylink_pcs *pcs, + void (*pcs_get_state)(struct phylink_pcs *pcs, unsigned int neg_mode, struct phylink_link_state *state); int (*pcs_config)(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, @@ -505,6 +505,7 @@ void pcs_disable(struct phylink_pcs *pcs); /** * pcs_get_state() - Read the current inband link state from the hardware * @pcs: a pointer to a &struct phylink_pcs. + * @neg_mode: link negotiation mode (PHYLINK_PCS_NEG_xxx) * @state: a pointer to a &struct phylink_link_state. * * Read the current inband link state from the MAC PCS, reporting the @@ -513,8 +514,11 @@ void pcs_disable(struct phylink_pcs *pcs); * negotiation completion state in @state->an_complete, and link up state * in @state->link. If possible, @state->lp_advertising should also be * populated. + * + * Note that the @neg_mode parameter is always the PHYLINK_PCS_NEG_xxx + * state, not MLO_AN_xxx. 
*/ -void pcs_get_state(struct phylink_pcs *pcs, +void pcs_get_state(struct phylink_pcs *pcs, unsigned int neg_mode, struct phylink_link_state *state); /** -- cgit v1.2.3 From 7e3cb4e874ab0dcf8b10e43e5068824bf0adcb4c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 13 Jan 2025 09:22:34 +0000 Subject: net: phylink: pass neg_mode into c22 state decoder Pass the current neg_mode into phylink_mii_c22_pcs_get_state() and phylink_mii_c22_pcs_decode_state(). Update all users of phylink PCS that use these functions. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1tXGeY-000Et9-8g@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 0bbcb4898e93..f19b7108c840 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -693,8 +693,9 @@ static inline int phylink_get_link_timer_ns(phy_interface_t interface) } void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, - u16 bmsr, u16 lpa); + unsigned int neg_mode, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, + unsigned int neg_mode, struct phylink_link_state *state); int phylink_mii_c22_pcs_encode_advertisement(phy_interface_t interface, const unsigned long *advertising); -- cgit v1.2.3 From df998c22321dde3f70cd3cf8c183dfd6bf64c759 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Wed, 8 Jan 2025 17:13:45 +0300 Subject: power: supply: add undervoltage health status property Add POWER_SUPPLY_HEALTH_UNDERVOLTAGE status for power supply to report under voltage lockout failures. 
Signed-off-by: Dzmitry Sankouski Link: https://lore.kernel.org/r/20250108-starqltechn_integration_upstream-v14-1-f6e84ec20d96@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index c3ce9f2b17d4..6ed53b292162 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -60,6 +60,7 @@ enum { POWER_SUPPLY_HEALTH_OVERHEAT, POWER_SUPPLY_HEALTH_DEAD, POWER_SUPPLY_HEALTH_OVERVOLTAGE, + POWER_SUPPLY_HEALTH_UNDERVOLTAGE, POWER_SUPPLY_HEALTH_UNSPEC_FAILURE, POWER_SUPPLY_HEALTH_COLD, POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE, -- cgit v1.2.3 From 197258f0ef685ddbd534254dc79f49faa47dc93d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:43 +0000 Subject: net: ethtool: add hds_config member in ethtool_netdev_state When tcp-data-split is UNKNOWN mode, drivers arbitrarily handle it. For example, bnxt_en driver automatically enables if at least one of LRO/GRO/JUMBO is enabled. If tcp-data-split is UNKNOWN and LRO is enabled, a driver returns ENABLES of tcp-data-split, not UNKNOWN. So, `ethtool -g eth0` shows tcp-data-split is enabled. The problem is in the setting situation. In the ethnl_set_rings(), it first calls get_ringparam() to get the current driver's config. At that moment, if driver's tcp-data-split config is UNKNOWN, it returns ENABLE if LRO/GRO/JUMBO is enabled. Then, it sets values from the user and driver's current config to kernel_ethtool_ringparam. Last it calls .set_ringparam(). The driver, especially bnxt_en driver receives ETHTOOL_TCP_DATA_SPLIT_ENABLED. But it can't distinguish whether it is set by the user or just the current config. When user updates ring parameter, the new hds_config value is updated and current hds_config value is stored to old_hdsconfig. 
Driver's .set_ringparam() callback can distinguish whether a passed tcp-data-split value came from the user explicitly. If .set_ringparam() fails, hds_config is rolled back immediately. Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-2-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ include/linux/netdevice.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 20a86bd5f4e3..d79bd201c1c8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1157,12 +1157,14 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. + * @hds_config: HDS value from userspace. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress.
*/ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; + u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bced03fb349e..3e6336775baf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4082,6 +4082,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); +u8 dev_xdp_sb_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); u32 dev_get_min_mp_channel_count(const struct net_device *dev); -- cgit v1.2.3 From eec8359f0797ef87c6ef6cbed6de08b02073b833 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:44 +0000 Subject: net: ethtool: add support for configuring hds-thresh The hds-thresh option configures the threshold value of the header-data-split. If a received packet size is larger than this threshold value, a packet will be split into header and payload. The header indicates TCP and UDP header, but it depends on driver spec. The bnxt_en driver supports HDS(Header-Data-Split) configuration at FW level, affecting TCP and UDP too. So, If hds-thresh is set, it affects UDP and TCP packets. Example: # ethtool -G hds-thresh # ethtool -G enp14s0f0np0 tcp-data-split on hds-thresh 256 # ethtool -g enp14s0f0np0 Ring parameters for enp14s0f0np0: Pre-set maximums: ... HDS thresh: 1023 Current hardware settings: ... TCP data split: on HDS thresh: 256 The default/min/max values are not defined in the ethtool so the drivers should define themself. The 0 value means that all TCP/UDP packets' header and payload will be split. 
Tested-by: Stanislav Fomichev Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-3-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d79bd201c1c8..e4136b0df892 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -78,6 +78,9 @@ enum { * @cqe_size: Size of TX/RX completion queue event * @tx_push_buf_len: Size of TX push buffer * @tx_push_buf_max_len: Maximum allowed size of TX push buffer + * @hds_thresh: Packet size threshold for header data split (HDS) + * @hds_thresh_max: Maximum supported setting for @hds_threshold + * */ struct kernel_ethtool_ringparam { u32 rx_buf_len; @@ -87,6 +90,8 @@ struct kernel_ethtool_ringparam { u32 cqe_size; u32 tx_push_buf_len; u32 tx_push_buf_max_len; + u32 hds_thresh; + u32 hds_thresh_max; }; /** @@ -97,6 +102,7 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split + * @ETHTOOL_RING_USE_HDS_THRS: capture for setting header-data-split-thresh */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), @@ -105,6 +111,7 @@ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), + ETHTOOL_RING_USE_HDS_THRS = BIT(6), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) @@ -1157,6 +1164,7 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. + * @hds_thresh: HDS Threshold value. * @hds_config: HDS value from userspace. 
* @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. @@ -1164,6 +1172,7 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; + u32 hds_thresh; u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; -- cgit v1.2.3 From 3440fa34ad99d471f1085bc2f4dedeaebc310261 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 22:10:49 +0000 Subject: inet: ipmr: fix data-races Following fields of 'struct mr_mfc' can be updated concurrently (no lock protection) from ip_mr_forward() and ip6_mr_forward() - bytes - pkt - wrong_if - lastuse They also can be read from other functions. Convert bytes, pkt and wrong_if to atomic_long_t, and use READ_ONCE()/WRITE_ONCE() for lastuse. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250114221049.1190631-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 9dd4bf157255..58a2401e4b55 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -146,9 +146,9 @@ struct mr_mfc { unsigned long last_assert; int minvif; int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; + atomic_long_t bytes; + atomic_long_t pkt; + atomic_long_t wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; -- cgit v1.2.3 From ebda2f0bbde540ff7da168d2837f8cfb14581e2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:09 -0800 Subject: net: add netdev_lock() / netdev_unlock() helpers Add helpers for locking the netdev instance, use it in drivers and the shaper code. 
This will make grepping for the lock usage much easier, as we extend the lock to cover more fields. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250115035319.559603-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3e6336775baf..6d440db35d5f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2444,8 +2444,12 @@ struct net_device { u32 napi_defer_hard_irqs; /** - * @lock: protects @net_shaper_hierarchy, feel free to use for other - * netdev-scope protection. Ordering: take after rtnl_lock. + * @lock: netdev-scope lock, protects a small selection of fields. + * Should always be taken using netdev_lock() / netdev_unlock() helpers. + * Drivers are free to use it for other protection. + * + * Protects: @net_shaper_hierarchy. + * Ordering: take after rtnl_lock. */ struct mutex lock; @@ -2671,6 +2675,21 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); +static inline void netdev_lock(struct net_device *dev) +{ + mutex_lock(&dev->lock); +} + +static inline void netdev_unlock(struct net_device *dev) +{ + mutex_unlock(&dev->lock); +} + +static inline void netdev_assert_locked(struct net_device *dev) +{ + lockdep_assert_held(&dev->lock); +} + static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { napi->irq = irq; -- cgit v1.2.3 From 5fda3f35349b6b7f22f5f5095a3821261d515075 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:10 -0800 Subject: net: make netdev_lock() protect netdev->reg_state Protect writes to netdev->reg_state with netdev_lock(). 
From now on holding netdev_lock() is sufficient to prevent the net_device from getting unregistered, so code which wants to hold just a single netdev around no longer needs to hold rtnl_lock. We do not protect the NETREG_UNREGISTERED -> NETREG_RELEASED transition. We'd need to move mutex_destroy(netdev->lock) to .release, but the real reason is that trying to stop the unregistration process mid-way would be unsafe / crazy. Taking references on such devices is not safe, either. So the intended semantics are to lock REGISTERED devices. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6d440db35d5f..007bcfa383c9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2448,7 +2448,7 @@ struct net_device { * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * - * Protects: @net_shaper_hierarchy. + * Protects: @reg_state, @net_shaper_hierarchy. * Ordering: take after rtnl_lock. */ struct mutex lock; -- cgit v1.2.3 From 5112457f3d8e41f987908266068af88ef9f3ab78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:12 -0800 Subject: net: add netdev->up protected by netdev_lock() Some uAPI (netdev netlink) hide net_device's sub-objects while the interface is down to ensure uniform behavior across drivers. To remove the rtnl_lock dependency from those uAPIs we need a way to safely tell if the device is down or up. Add an indication of whether device is open or closed, protected by netdev->lock. The semantics are the same as IFF_UP, but taking netdev_lock around every write to ->flags would be a lot of code churn. 
We don't want to blanket the entire open / close path by netdev_lock, because it will prevent us from applying it to specific structures - core helpers won't be able to take that lock from any function called by the drivers on open/close paths. So the state of the flag is "pessimistic", as in it may report false negatives, but never false positives. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 007bcfa383c9..cac81b0a166f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2443,12 +2443,24 @@ struct net_device { unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; + /** + * @up: copy of @state's IFF_UP, but safe to read with just @lock. + * May report false negatives while the device is being opened + * or closed (@lock does not protect .ndo_open, or .ndo_close). + */ + bool up; + /** * @lock: netdev-scope lock, protects a small selection of fields. * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * - * Protects: @reg_state, @net_shaper_hierarchy. + * Protects: + * @net_shaper_hierarchy, @reg_state + * + * Partially protects (writers must hold both @lock and rtnl_lock): + * @up + * * Ordering: take after rtnl_lock. */ struct mutex lock; -- cgit v1.2.3 From 1b23cdbd2bbc4b40e21c12ae86c2781e347ff0f8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:13 -0800 Subject: net: protect netdev->napi_list with netdev_lock() Hold netdev->lock when NAPIs are getting added or removed. This will allow safe access to NAPI instances of a net_device without rtnl_lock. 
Create a family of helpers which assume the lock is already taken. Switch iavf to them, as it makes extensive use of netdev->lock, already. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 54 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cac81b0a166f..3130a8c807dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2456,7 +2456,7 @@ struct net_device { * Drivers are free to use it for other protection. * * Protects: - * @net_shaper_hierarchy, @reg_state + * @napi_list, @net_shaper_hierarchy, @reg_state * * Partially protects (writers must hold both @lock and rtnl_lock): * @up @@ -2712,8 +2712,19 @@ static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) */ #define NAPI_POLL_WEIGHT 64 -void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight); + +static inline void +netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netdev_lock(dev); + netif_napi_add_weight_locked(dev, napi, poll, weight); + netdev_unlock(dev); +} /** * netif_napi_add() - initialize a NAPI context @@ -2731,6 +2742,13 @@ netif_napi_add(struct net_device *dev, struct napi_struct *napi, netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } +static inline void +netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int)) +{ + netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); +} + static 
inline void netif_napi_add_tx_weight(struct net_device *dev, struct napi_struct *napi, @@ -2741,6 +2759,15 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } +static inline void +netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int index) +{ + napi->index = index; + napi->config = &dev->napi_config[index]; + netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); +} + /** * netif_napi_add_config - initialize a NAPI context with persistent config * @dev: network device @@ -2752,9 +2779,9 @@ static inline void netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { - napi->index = index; - napi->config = &dev->napi_config[index]; - netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); + netdev_lock(dev); + netif_napi_add_config_locked(dev, napi, poll, index); + netdev_unlock(dev); } /** @@ -2774,6 +2801,8 @@ static inline void netif_napi_add_tx(struct net_device *dev, netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } +void __netif_napi_del_locked(struct napi_struct *napi); + /** * __netif_napi_del - remove a NAPI context * @napi: NAPI context @@ -2782,7 +2811,18 @@ static inline void netif_napi_add_tx(struct net_device *dev, * containing @napi. Drivers might want to call this helper to combine * all the needed RCU grace periods into a single one. 
*/ -void __netif_napi_del(struct napi_struct *napi); +static inline void __netif_napi_del(struct napi_struct *napi) +{ + netdev_lock(napi->dev); + __netif_napi_del_locked(napi); + netdev_unlock(napi->dev); +} + +static inline void netif_napi_del_locked(struct napi_struct *napi) +{ + __netif_napi_del_locked(napi); + synchronize_net(); +} /** * netif_napi_del - remove a NAPI context -- cgit v1.2.3 From 413f0271f3966e0c73d4937963f19335af19e628 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:14 -0800 Subject: net: protect NAPI enablement with netdev_lock() Wrap napi_enable() / napi_disable() with netdev_lock(). Provide the "already locked" flavor of the API. iavf needs the usual adjustment. A number of drivers call napi_enable() under a spin lock, so they have to be modified to take netdev_lock() first, then spin lock then call napi_enable_locked(). Protecting napi_enable() implies that napi->napi_id is protected by netdev_lock(). Acked-by: Francois Romieu # via-velocity Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3130a8c807dd..3941e4d0073e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -382,7 +382,7 @@ struct napi_struct { struct sk_buff *skb; struct list_head rx_list; /* Pending GRO_NORMAL skbs */ int rx_count; /* length of rx_list */ - unsigned int napi_id; + unsigned int napi_id; /* protected by netdev_lock */ struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; @@ -570,16 +570,11 @@ static inline bool napi_complete(struct napi_struct *n) int dev_set_threaded(struct net_device *dev, bool threaded); -/** - * napi_disable - prevent NAPI from scheduling - * @n: NAPI context - * - 
* Stop NAPI from being scheduled on this context. - * Waits till any outstanding processing completes. - */ void napi_disable(struct napi_struct *n); +void napi_disable_locked(struct napi_struct *n); void napi_enable(struct napi_struct *n); +void napi_enable_locked(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running -- cgit v1.2.3 From 1bb86cf8f44b1c1a320566558250b1f5121f6fd3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:16 -0800 Subject: net: protect threaded status of NAPI with netdev_lock() Now that NAPI instances can't come and go without holding netdev->lock we can trivially switch from rtnl_lock() to netdev_lock() for setting netdev->threaded via sysfs. Note that since we do not lock netdev_lock around sysfs calls in the core we don't have to "trylock" like we do with rtnl_lock. Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3941e4d0073e..20e773bbd181 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -384,7 +384,7 @@ struct napi_struct { int rx_count; /* length of rx_list */ unsigned int napi_id; /* protected by netdev_lock */ struct hrtimer timer; - struct task_struct *thread; + struct task_struct *thread; /* protected by netdev_lock */ unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; @@ -2451,11 +2451,13 @@ struct net_device { * Drivers are free to use it for other protection. * * Protects: - * @napi_list, @net_shaper_hierarchy, @reg_state + * @napi_list, @net_shaper_hierarchy, @reg_state, @threaded * * Partially protects (writers must hold both @lock and rtnl_lock): * @up * + * Also protects some fields in struct napi_struct. 
+ * * Ordering: take after rtnl_lock. */ struct mutex lock; @@ -2697,6 +2699,13 @@ static inline void netdev_assert_locked(struct net_device *dev) lockdep_assert_held(&dev->lock); } +static inline void netdev_assert_locked_or_invisible(struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_assert_locked(dev); +} + static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { napi->irq = irq; -- cgit v1.2.3 From 53ed30800d3fd36e1e9f7ba8014b150632f714b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:17 -0800 Subject: net: protect napi->irq with netdev_lock() Take netdev_lock() in netif_napi_set_irq(). All NAPI "control fields" are now protected by that lock (most of the other ones are set during napi add/del). The napi_hash_node is fully protected by the hash spin lock, but close enough for the kdoc... Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20e773bbd181..a47ff20365f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -388,6 +388,7 @@ struct napi_struct { unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; + /* all fields past this point are write-protected by netdev_lock */ /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2706,11 +2707,18 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) netdev_assert_locked(dev); } -static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +static inline void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) { napi->irq = irq; 
} +static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +{ + netdev_lock(napi->dev); + netif_napi_set_irq_locked(napi, irq); + netdev_unlock(napi->dev); +} + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ -- cgit v1.2.3 From e7ed2ba757bf86a4f90ae9c4080235fc9c74d8a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:18 -0800 Subject: net: protect NAPI config fields with netdev_lock() Protect the following members of netdev and napi by netdev_lock: - defer_hard_irqs, - gro_flush_timeout, - irq_suspend_timeout. The first two are written via sysfs (which this patch switches to new lock), and netdev genl which holds both netdev and rtnl locks. irq_suspend_timeout is only written by netdev genl. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a47ff20365f9..8308d9c75918 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -384,11 +384,11 @@ struct napi_struct { int rx_count; /* length of rx_list */ unsigned int napi_id; /* protected by netdev_lock */ struct hrtimer timer; - struct task_struct *thread; /* protected by netdev_lock */ + /* all fields past this point are write-protected by netdev_lock */ + struct task_struct *thread; unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; - /* all fields past this point are write-protected by netdev_lock */ /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2452,7 +2452,8 @@ struct net_device { * Drivers are free to use it for other protection. 
* * Protects: - * @napi_list, @net_shaper_hierarchy, @reg_state, @threaded + * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, + * @net_shaper_hierarchy, @reg_state, @threaded * * Partially protects (writers must hold both @lock and rtnl_lock): * @up -- cgit v1.2.3 From 53078a736fbc60e5d3a1e14f4cd4214003815026 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 11 Jan 2025 14:01:53 +1300 Subject: HID: hid-asus: Disable OOBE mode on the ProArt P16 The new ASUS ProArt 16" laptop series come with their keyboards stuck in an Out-Of-Box-Experience mode. While in this mode most functions will not work such as LED control or Fn key combos. The correct init sequence is now done to disable this OOBE. This patch addresses only the ProArt series so far and it is unknown if there may be others, in which case a new quirk may be required. Signed-off-by: Luke D. Jones Co-developed-by: Connor Belli Signed-off-by: Connor Belli Tested-by: Jan Schmidt Signed-off-by: Jiri Kosina --- include/linux/platform_data/x86/asus-wmi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 365e119bebaa..783e2a336861 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -184,6 +184,11 @@ static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), }, }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), + }, + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "GA403U"), -- cgit v1.2.3 From 4f3b63e8a8a28e3dcdcf3ff260f57a732a20b92b Mon Sep 17 00:00:00 2001 From: Sentaro Onizuka Date: Tue, 14 Jan 2025 00:14:00 +0900 Subject: fs: Fix return type of do_mount() from long to int Fix the return type of do_mount() function from long to int to match its actual behavior.
The function only returns int values, and all callers, including those in fs/namespace.c and arch/alpha/kernel/osf_sys.c, already treat the return value as int. This change improves type consistency across the filesystem code and aligns the function signature with its existing implementation and usage. Signed-off-by: Sentaro Onizuka Link: https://lore.kernel.org/r/20250113151400.55512-1-sentaro@amazon.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 33f17b6e8732..a7b472faec2c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -114,7 +114,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, +int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); -- cgit v1.2.3 From 4b193fa75efffd90c054d1a7f2b5dbe29a461c14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:37 +0100 Subject: lockref: remove lockref_put_not_zero lockref_put_not_zero is not used anywhere, and unless I'm missing something didn't end up being used at all. Remove it.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-2-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c3a1f78bc884..e5aa0347f274 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,7 +37,6 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_not_zero(struct lockref *); extern int lockref_put_or_lock(struct lockref *); extern void lockref_mark_dead(struct lockref *); -- cgit v1.2.3 From 6d2868d5b6fca7534641440efe432cf268bd8e1b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:39 +0100 Subject: lockref: use bool for false/true returns Replace int used as bool with the actual bool type for return values that can only be true or false. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-4-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index e5aa0347f274..3d770e1bdbad 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -36,11 +36,11 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); -extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_or_lock(struct lockref *); +bool lockref_get_not_zero(struct lockref *lockref); +bool lockref_put_or_lock(struct lockref *lockref); extern void lockref_mark_dead(struct lockref *); -extern int lockref_get_not_dead(struct lockref *); +bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ static inline bool __lockref_is_dead(const struct lockref *l) -- cgit v1.2.3 From 25d8060418b4e83e109b20f3b3931301e254b8f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:40 +0100 Subject: lockref: drop superfluous externs Drop the superfluous externs from the remaining prototypes in lockref.h. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-5-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 3d770e1bdbad..f821f46e9fb4 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,12 +34,12 @@ struct lockref { }; }; -extern void lockref_get(struct lockref *); -extern int lockref_put_return(struct lockref *); +void lockref_get(struct lockref *lockref); +int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); bool lockref_put_or_lock(struct lockref *lockref); -extern void lockref_mark_dead(struct lockref *); +void lockref_mark_dead(struct lockref *lockref); bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ -- cgit v1.2.3 From 63440d1c6dd1fc782db905319dbfb4db354e54b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:41 +0100 Subject: lockref: add a lockref_init helper Add a helper to initialize the lockref, that is, initialize the spinlock and set a value. Having to open code them isn't a big deal, but having an initializer feels right for a proper primitive.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-6-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index f821f46e9fb4..c39f119659ba 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,6 +34,17 @@ struct lockref { }; }; +/** + * lockref_init - Initialize a lockref + * @lockref: pointer to lockref structure + * @count: initial count + */ +static inline void lockref_init(struct lockref *lockref, unsigned int count) +{ + spin_lock_init(&lockref->lock); + lockref->count = count; +} + void lockref_get(struct lockref *lockref); int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); -- cgit v1.2.3 From 2f8dea1692eef2b7ba6a256246ed82c365fdc686 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 20 Dec 2024 22:44:21 +0900 Subject: hrtimers: Handle CPU state correctly on hotplug Consider a scenario where a CPU transitions from CPUHP_ONLINE to halfway through a CPU hotunplug down to CPUHP_HRTIMERS_PREPARE, and then back to CPUHP_ONLINE: Since hrtimers_prepare_cpu() does not run, cpu_base.hres_active remains set to 1 throughout. However, during a CPU unplug operation, the tick and the clockevents are shut down at CPUHP_AP_TICK_DYING. On return to the online state, for instance CFS incorrectly assumes that the hrtick is already active, and the chance of the clockevent device to transition to oneshot mode is also lost forever for the CPU, unless it goes back to a lower state than CPUHP_HRTIMERS_PREPARE once. This round-trip reveals another issue; cpu_base.online is not set to 1 after the transition, which appears as a WARN_ON_ONCE in enqueue_hrtimer(). Aside of that, the bulk of the per CPU state is not reset either, which means there are dangling pointers in the worst case. 
Address this by adding a corresponding startup() callback, which resets the stale per CPU state and sets the online flag. [ tglx: Make the new callback unconditionally available, remove the online modification in the prepare() callback and clear the remaining state in the starting callback instead of the prepare callback ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Signed-off-by: Koichiro Den Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241220134421.3809834-1-koichiro.den@canonical.com --- include/linux/hrtimer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7ef5f7ef31a9..f7bfdcf0dda3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -386,6 +386,7 @@ extern void __init hrtimers_init(void); extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); +int hrtimers_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_cpu_dying(unsigned int cpu); #else -- cgit v1.2.3 From d960f14800b581d79e1c3df4db524d9d4b3aac9a Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:03 -0500 Subject: ACPI: platform_profile: Replace *class_dev member with class_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of holding a reference to the class device, embed it in the platform_profile_handler. This involves manually creating and registering the device and replacing dev_get_drvdata() with the newly created to_pprof_handler() macro.
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-2-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index f1cd4b65e351..8a9b8754f9ac 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -9,6 +9,7 @@ #ifndef _PLATFORM_PROFILE_H_ #define _PLATFORM_PROFILE_H_ +#include <linux/device.h> #include <linux/bitops.h> /* @@ -30,7 +31,7 @@ enum platform_profile_option { struct platform_profile_handler { const char *name; struct device *dev; - struct device *class_dev; + struct device class_dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, -- cgit v1.2.3 From 249c576f0f9d0556cb7473b8a437b30239afbd16 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:04 -0500 Subject: ACPI: platform_profile: Let drivers set drvdata to the class device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add *drvdata to platform_profile_register() signature and assign it to the class device. While at it, pass specific driver state as drvdata to replace uses of container_of() with dev_get_drvdata().
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-3-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8a9b8754f9ac..1c8fdda51eaa 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -40,9 +40,9 @@ struct platform_profile_handler { enum platform_profile_option profile); }; -int platform_profile_register(struct platform_profile_handler *pprof); +int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); int platform_profile_remove(struct platform_profile_handler *pprof); -int devm_platform_profile_register(struct platform_profile_handler *pprof); +int devm_platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); int platform_profile_cycle(void); void platform_profile_notify(struct platform_profile_handler *pprof); -- cgit v1.2.3 From cf3ea098dd3af415f079bc0b999055f213dd4a83 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:05 -0500 Subject: ACPI: platform_profile: Remove platform_profile_handler from callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices can now set drvdata to the class device, thus passing the platform_profile_handler to callbacks is unnecessary. Instead pass the class device. 
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 1c8fdda51eaa..5296d886c243 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -34,10 +34,8 @@ struct platform_profile_handler { struct device class_dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - int (*profile_get)(struct platform_profile_handler *pprof, - enum platform_profile_option *profile); - int (*profile_set)(struct platform_profile_handler *pprof, - enum platform_profile_option profile); + int (*profile_get)(struct device *dev, enum platform_profile_option *profile); + int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); -- cgit v1.2.3 From b5ca1a4488a5e6dfb9962e2319c03c7414e50ec3 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:06 -0500 Subject: ACPI: platform_profile: Add `ops` member to handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace *profile_get and *profile_set members with a general *ops member. 
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-5-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 5296d886c243..6013c05d7b86 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,14 +28,20 @@ enum platform_profile_option { PLATFORM_PROFILE_LAST, /*must always be last */ }; +struct platform_profile_handler; + +struct platform_profile_ops { + int (*profile_get)(struct device *dev, enum platform_profile_option *profile); + int (*profile_set)(struct device *dev, enum platform_profile_option profile); +}; + struct platform_profile_handler { const char *name; struct device *dev; struct device class_dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - int (*profile_get)(struct device *dev, enum platform_profile_option *profile); - int (*profile_set)(struct device *dev, enum platform_profile_option profile); + const struct platform_profile_ops *ops; }; int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); -- cgit v1.2.3 From 58d5629dc8b8b8d9928fc649d9f2aaa361a8a5c5 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:07 -0500 Subject: ACPI: platform_profile: Add `probe` to platform_profile_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `probe` callback to platform_profile_ops, which lets drivers initialize the choices member manually. This is a step towards unexposing the struct platform_profile_handler from the consumer drivers. 
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-6-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 6013c05d7b86..5ad1ab7b75e4 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -31,6 +31,7 @@ enum platform_profile_option { struct platform_profile_handler; struct platform_profile_ops { + int (*probe)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -- cgit v1.2.3 From ad41ddeeac216417a52fbc1060577f3098f4e90e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:39 +0200 Subject: PCI: Add TLP Prefix reading to pcie_read_tlp_log() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcie_read_tlp_log() handles only 4 Header Log DWORDs but TLP Prefix Log (PCIe r6.1 secs 7.8.4.12 & 7.9.14.13) may also be present. Generalize pcie_read_tlp_log() and struct pcie_tlp_log to also handle TLP Prefix Log. The relevant registers are formatted identically in AER and DPC Capability, but have these variations: a) The offsets of TLP Prefix Log registers vary. b) DPC RP PIO TLP Prefix Log register can be < 4 DWORDs. c) AER TLP Prefix Log Present (PCIe r6.1 sec 7.8.4.7) can indicate Prefix Log is not present. Therefore callers must pass the offset of the TLP Prefix Log register and the entire length to pcie_read_tlp_log() to be able to read the correct number of TLP Prefix DWORDs from the correct offset.
Link: https://lore.kernel.org/r/20250114170840.1633-8-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: squash ternary fix from https://lore.kernel.org/r/20250116172019.88116-1-colin.i.king@gmail.com] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/aer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index 4ef6515c3205..947b63091902 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -27,6 +27,7 @@ struct pci_dev; struct pcie_tlp_log { u32 dw[PCIE_STD_NUM_TLP_HEADERLOG]; + u32 prefix[PCIE_STD_MAX_TLP_PREFIXLOG]; }; struct aer_capability_regs { -- cgit v1.2.3 From d4679b79ffae994fea08bc7751ff6550ad057f05 Mon Sep 17 00:00:00 2001 From: Konrad Knitter Date: Wed, 6 Nov 2024 10:36:41 +0100 Subject: pldmfw: enable selected component update This patch enables updating a selected component from a PLDM image containing multiple components. Example usage: struct pldmfw data; data.mode = PLDMFW_UPDATE_MODE_SINGLE_COMPONENT; data.component_identifier = DRIVER_FW_MGMT_COMPONENT_ID; Reviewed-by: Jacob Keller Reviewed-by: Marcin Szycik Reviewed-by: Przemek Kitszel Signed-off-by: Konrad Knitter Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- include/linux/pldmfw.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pldmfw.h b/include/linux/pldmfw.h index 0fc831338226..f5047983004f 100644 --- a/include/linux/pldmfw.h +++ b/include/linux/pldmfw.h @@ -125,9 +125,17 @@ struct pldmfw_ops; * a pointer to their own data, used to implement the device specific * operations.
*/ + +enum pldmfw_update_mode { + PLDMFW_UPDATE_MODE_FULL, + PLDMFW_UPDATE_MODE_SINGLE_COMPONENT, +}; + struct pldmfw { const struct pldmfw_ops *ops; struct device *dev; + u16 component_identifier; + enum pldmfw_update_mode mode; }; bool pldmfw_op_pci_match_record(struct pldmfw *context, struct pldmfw_record *record); -- cgit v1.2.3 From a00e0d34c0362a69369f212b8be1be1f6f4c365d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Jan 2025 20:42:32 +0000 Subject: net: phy: add support for querying PHY clock stop capability Add support for querying whether the PHY allows the transmit xMII clock to be stopped while in LPI mode. This will be used by phylink to pass to the MAC driver so it can configure the generation of the xMII clock appropriately. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Reviewed-by: Jacob Keller Link: https://patch.msgid.link/E1tYADg-0014Pb-AJ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index afaae74d0949..244f747b3cd9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2146,6 +2146,7 @@ int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); +int phy_eee_tx_clock_stop_capable(struct phy_device *phydev); int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); -- cgit v1.2.3 From 03abf2a7c65451e663b078b0ed1bfa648cd9380f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Jan 2025 20:42:42 +0000 Subject: net: phylink: add EEE management Add EEE management to phylink, making use of the phylib implementation. 
This will only be used where a MAC driver populates the methods and capabilities bitfield, otherwise we keep our old behaviour. Phylink will keep track of the EEE configuration, including the clock stop abilities at each end of the MAC to PHY link, programming the PHY appropriately and preserving the LPI configuration should the PHY go away. Phylink will call into the MAC driver when LPI needs to be enabled or disabled, with the requirement that the MAC have LPI disabled prior to the netdev being brought up (in other words, it will only call mac_disable_tx_lpi() if it has already called mac_enable_tx_lpi().) Support for phylink managed EEE is enabled by populating both tx_lpi MAC operations method pointers, and filling in both LPI interfaces and capabilities. If the methods are provided but the LPI interfaces or capabilities remain empty, this indicates to phylink that EEE is implemented by the driver but the hardware it is driving does not support EEE, and thus the ethtool set_eee() and get_eee() methods will return EOPNOTSUPP. No validation of the LPI timer value is performed by this patch. For interface modes which do not support LPI, we make no attempt to manipulate the phylib EEE advertisement, but instead refuse to activate LPI at the MAC, noting it at debug message level. We also restrict the advertisement and reported userspace support linkmode masks according to the lpi_capabilities provided to phylink by the MAC driver. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Jacob Keller Link: https://patch.msgid.link/E1tYADq-0014Pn-J1@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index f19b7108c840..898b00451bbf 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -5,6 +5,8 @@ #include #include +#include + struct device_node; struct ethtool_cmd; struct fwnode_handle; @@ -143,11 +145,17 @@ enum phylink_op_type { * possible and avoid stopping it during suspend events. * @default_an_inband: if true, defaults to MLO_AN_INBAND rather than * MLO_AN_PHY. A fixed-link specification will override. + * @eee_rx_clk_stop_enable: if true, PHY can stop the receive clock during LPI * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx * are supported by the MAC/PCS. + * @lpi_interfaces: bitmap describing which PHY interface modes can support + * LPI signalling. * @mac_capabilities: MAC pause/speed/duplex capabilities. + * @lpi_capabilities: MAC speeds which can support LPI signalling + * @lpi_timer_default: Default EEE LPI timer setting. 
+ * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time */ struct phylink_config { struct device *dev; @@ -156,10 +164,15 @@ struct phylink_config { bool mac_managed_pm; bool mac_requires_rxc; bool default_an_inband; + bool eee_rx_clk_stop_enable; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); DECLARE_PHY_INTERFACE_MASK(supported_interfaces); + DECLARE_PHY_INTERFACE_MASK(lpi_interfaces); unsigned long mac_capabilities; + unsigned long lpi_capabilities; + u32 lpi_timer_default; + bool eee_enabled_default; }; void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); @@ -173,6 +186,8 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); * @mac_finish: finish a major reconfiguration of the interface. * @mac_link_down: take the link down. * @mac_link_up: allow the link to come up. + * @mac_disable_tx_lpi: disable LPI. + * @mac_enable_tx_lpi: enable and configure LPI. * * The individual methods are described more fully below. */ @@ -193,6 +208,9 @@ struct phylink_mac_ops { struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, int duplex, bool tx_pause, bool rx_pause); + void (*mac_disable_tx_lpi)(struct phylink_config *config); + int (*mac_enable_tx_lpi)(struct phylink_config *config, u32 timer, + bool tx_clk_stop); }; #if 0 /* For kernel-doc purposes only. */ @@ -387,6 +405,33 @@ void mac_link_down(struct phylink_config *config, unsigned int mode, void mac_link_up(struct phylink_config *config, struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, int duplex, bool tx_pause, bool rx_pause); + +/** + * mac_disable_tx_lpi() - disable LPI generation at the MAC + * @config: a pointer to a &struct phylink_config. + * + * Disable generation of LPI at the MAC, effectively preventing the MAC + * from indicating that it is idle. 
+ */ +void mac_disable_tx_lpi(struct phylink_config *config); + +/** + * mac_enable_tx_lpi() - configure and enable LPI generation at the MAC + * @config: a pointer to a &struct phylink_config. + * @timer: LPI timeout in microseconds. + * @tx_clk_stop: allow xMII transmit clock to be stopped during LPI + * + * Configure the LPI timeout accordingly. This will only be called when + * the link is already up, to cater for situations where the hardware + * needs to be programmed according to the link speed. + * + * Enable LPI generation at the MAC, and configure whether the xMII transmit + * clock may be stopped. + * + * Returns: 0 on success. Please consult with rmk before returning an error. + */ +int mac_enable_tx_lpi(struct phylink_config *config, u32 timer, + bool tx_clk_stop); #endif struct phylink_pcs_ops; -- cgit v1.2.3 From 4d27afbf256028a1f54363367f30efc8854433c3 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Tue, 14 Jan 2025 22:24:35 +0800 Subject: usb: typec: tcpci: Prevent Sink disconnection before vPpsShutdown in SPR PPS The Source can drop its output voltage to the minimum of the requested PPS APDO voltage range when it is in Current Limit Mode. If this voltage falls within the range of vPpsShutdown, the Source initiates a Hard Reset and discharges Vbus. However, currently the Sink may disconnect before the voltage reaches vPpsShutdown, leading to unexpected behavior. Prevent premature disconnection by setting the Sink's disconnect threshold to the minimum vPpsShutdown value. Additionally, consider the voltage drop due to IR drop when calculating the appropriate threshold. This ensures a robust and reliable interaction between the Source and Sink during SPR PPS Current Limit Mode operation. 
Fixes: 4288debeaa4e ("usb: typec: tcpci: Fix up sink disconnect thresholds for PD") Cc: stable Signed-off-by: Kyle Tso Reviewed-by: Heikki Krogerus Reviewed-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20250114142435.2093857-1-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 061da9546a81..b22e659f81ba 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -163,7 +163,8 @@ struct tcpc_dev { void (*frs_sourcing_vbus)(struct tcpc_dev *dev); int (*enable_auto_vbus_discharge)(struct tcpc_dev *dev, bool enable); int (*set_auto_vbus_discharge_threshold)(struct tcpc_dev *dev, enum typec_pwr_opmode mode, - bool pps_active, u32 requested_vbus_voltage); + bool pps_active, u32 requested_vbus_voltage, + u32 pps_apdo_min_voltage); bool (*is_vbus_vsafe0v)(struct tcpc_dev *dev); void (*set_partner_usb_comm_capable)(struct tcpc_dev *dev, bool enable); void (*check_contaminant)(struct tcpc_dev *dev); -- cgit v1.2.3 From 39d0be87438a0cc29151898c7fba24b43f2f3df8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 13:57:59 +0000 Subject: serial: kgdb_nmi: Remove unused knock code kgdb_nmi_poll_knock() has been unused since it was added in 2013 in commit 0c57dfcc6c1d ("tty/serial: Add kgdb_nmi driver") Remove it, the static helpers, and module parameters it used. (The comment explaining why it might be used sounds sensible, but it's never been wired up, perhaps it's worth doing somewhere?) Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20250112135759.105541-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 76e891ee9e37..51ef131e66b7 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -309,11 +309,9 @@ extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); #ifdef CONFIG_SERIAL_KGDB_NMI extern int kgdb_register_nmi_console(void); extern int kgdb_unregister_nmi_console(void); -extern bool kgdb_nmi_poll_knock(void); #else static inline int kgdb_register_nmi_console(void) { return 0; } static inline int kgdb_unregister_nmi_console(void) { return 0; } -static inline bool kgdb_nmi_poll_knock(void) { return true; } #endif extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); -- cgit v1.2.3 From 72d1c18262dd5a18d835a94391c31cf04252c748 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 14 Jan 2025 23:23:03 +0800 Subject: of: Do not expose of_alias_scan() and correct its comments of_alias_scan() has no external callers and returns void. Do not expose it and delete return value descriptions in its comments. 
Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250114-of_core_fix-v5-1-b8bafd00a86f@quicinc.com Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 0cdd58ff0a41..eaf0e2a2b75c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -398,7 +398,6 @@ extern int of_phandle_iterator_args(struct of_phandle_iterator *it, uint32_t *args, int size); -extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); -- cgit v1.2.3 From 07f531b395db3cd1776ef0f7191abf4b077fcf21 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:17 -0500 Subject: ACPI: platform_profile: Remove platform_profile_handler from exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to protect the platform_profile_handler from API consumers, allocate it in platform_profile_register() and modify its signature accordingly. Remove the platform_profile_handler from all consumer drivers and replace them with a pointer to the class device, which is now returned from platform_profile_register(). Replace *pprof with a pointer to the class device in the rest of exported symbols.
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-16-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 5ad1ab7b75e4..778d4c661c3c 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -45,10 +45,14 @@ struct platform_profile_handler { const struct platform_profile_ops *ops; }; -int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); -int platform_profile_remove(struct platform_profile_handler *pprof); -int devm_platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); +struct device *platform_profile_register(struct device *dev, const char *name, + void *drvdata, + const struct platform_profile_ops *ops); +int platform_profile_remove(struct device *dev); +struct device *devm_platform_profile_register(struct device *dev, const char *name, + void *drvdata, + const struct platform_profile_ops *ops); int platform_profile_cycle(void); -void platform_profile_notify(struct platform_profile_handler *pprof); +void platform_profile_notify(struct device *dev); #endif /*_PLATFORM_PROFILE_H_*/ -- cgit v1.2.3 From 6ef33895503583d0741a0d8faf820ca8143b9cf2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:18 -0500 Subject: ACPI: platform_profile: Move platform_profile_handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit platform_profile_handler is now an internal structure. Move it to platform_profile.c. 
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-17-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 778d4c661c3c..eea1daf85616 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,23 +28,12 @@ enum platform_profile_option { PLATFORM_PROFILE_LAST, /*must always be last */ }; -struct platform_profile_handler; - struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -struct platform_profile_handler { - const char *name; - struct device *dev; - struct device class_dev; - int minor; - unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - const struct platform_profile_ops *ops; -}; - struct device *platform_profile_register(struct device *dev, const char *name, void *drvdata, const struct platform_profile_ops *ops); -- cgit v1.2.3 From ee7f3e2b4942e3f6d8837780d0d3d5d58de8801a Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:20 -0500 Subject: ACPI: platform_profile: Add documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kerneldoc and sysfs class documentation. 
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-19-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index eea1daf85616..8ab5b0e8eb2c 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,6 +28,16 @@ enum platform_profile_option { PLATFORM_PROFILE_LAST, /*must always be last */ }; +/** + * struct platform_profile_ops - platform profile operations + * @probe: Callback to setup choices available to the new class device. These + * choices will only be enforced when setting a new profile, not when + * getting the current one. + * @profile_get: Callback that will be called when showing the current platform + * profile in sysfs. + * @profile_set: Callback that will be called when storing a new platform + * profile in sysfs. + */ struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); -- cgit v1.2.3 From 42d7c87b4e1251f36eceac987e74623e7cda8577 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 15 Jan 2025 15:41:57 +0100 Subject: regulator: Add support for power budget Introduce power budget management for the regulator device. Enable tracking of available power capacity by providing helpers to request and release power budget allocations. 
Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250115-feature_regulator_pw_budget-v2-1-0a44b949e6bc@bootlin.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 21 +++++++++++++++++++++ include/linux/regulator/driver.h | 2 ++ include/linux/regulator/machine.h | 2 ++ 3 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 8c3c372ad735..8d1a6eca7eb9 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -258,6 +258,11 @@ int regulator_sync_voltage(struct regulator *regulator); int regulator_set_current_limit(struct regulator *regulator, int min_uA, int max_uA); int regulator_get_current_limit(struct regulator *regulator); +int regulator_get_unclaimed_power_budget(struct regulator *regulator); +int regulator_request_power_budget(struct regulator *regulator, + unsigned int pw_req); +void regulator_free_power_budget(struct regulator *regulator, + unsigned int pw); int regulator_set_mode(struct regulator *regulator, unsigned int mode); unsigned int regulator_get_mode(struct regulator *regulator); @@ -571,6 +576,22 @@ static inline int regulator_get_current_limit(struct regulator *regulator) return 0; } +static inline int regulator_get_unclaimed_power_budget(struct regulator *regulator) +{ + return INT_MAX; +} + +static inline int regulator_request_power_budget(struct regulator *regulator, + unsigned int pw_req) +{ + return -EOPNOTSUPP; +} + +static inline void regulator_free_power_budget(struct regulator *regulator, + unsigned int pw) +{ +} + static inline int regulator_set_mode(struct regulator *regulator, unsigned int mode) { diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 5b66caf1695d..4a216fdba354 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -656,6 +656,8 @@ struct regulator_dev { int cached_err; bool use_cached_err; 
spinlock_t err_lock; + + int pw_requested_mW; }; /* diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index b3db09a7429b..1fc440c5c4c7 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -113,6 +113,7 @@ struct notification_limit { * @min_uA: Smallest current consumers may set. * @max_uA: Largest current consumers may set. * @ilim_uA: Maximum input current. + * @pw_budget_mW: Power budget for the regulator in mW. * @system_load: Load that isn't captured by any consumer requests. * * @over_curr_limits: Limits for acting on over current. @@ -185,6 +186,7 @@ struct regulation_constraints { int max_uA; int ilim_uA; + int pw_budget_mW; int system_load; /* used for coupled regulators */ -- cgit v1.2.3 From fd8318a32573d73eb20637a0c80689de0dc98169 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 3 Jan 2025 16:41:13 +0800 Subject: PM: sleep: wakeirq: Introduce device-managed variant of dev_pm_set_wake_irq() Add device-managed variant of dev_pm_set_wake_irq which automatically clears the wake irq on device destruction to simplify error handling and resource management in drivers. Signed-off-by: Peng Fan Link: https://patch.msgid.link/20250103-wake_irq-v2-1-e3aeff5e9966@nxp.com Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_wakeirq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h index d9642c6cf852..25b63ed51b76 100644 --- a/include/linux/pm_wakeirq.h +++ b/include/linux/pm_wakeirq.h @@ -10,6 +10,7 @@ extern int dev_pm_set_wake_irq(struct device *dev, int irq); extern int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq); extern int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq); extern void dev_pm_clear_wake_irq(struct device *dev); +extern int devm_pm_set_wake_irq(struct device *dev, int irq); #else /* !CONFIG_PM */ @@ -32,5 +33,10 @@ static inline void dev_pm_clear_wake_irq(struct device *dev) { } +static inline int devm_pm_set_wake_irq(struct device *dev, int irq) +{ + return 0; +} + #endif /* CONFIG_PM */ #endif /* _LINUX_PM_WAKEIRQ_H */ -- cgit v1.2.3 From 6a7e17b22062c84a111d7073c67cc677c4190f32 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 16 Jan 2025 17:02:54 +0000 Subject: block: Add common atomic writes enable flag Currently only stacked devices need to explicitly enable atomic writes by setting BLK_FEAT_ATOMIC_WRITES_STACKED flag. This does not work well for device mapper stacking devices, as many sets of limits are stacked there and which device is the 'bottom' and which the 'top' can be swapped. This means that BLK_FEAT_ATOMIC_WRITES_STACKED needs to be set for many queue limits, which is messy. Generalize enabling atomic writes by ensuring that all devices must explicitly set a flag - that includes NVMe, SCSI sd, and md raid.
Signed-off-by: John Garry Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20250116170301.474130-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ac153e4423a..76f0a4e7c2e5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -331,8 +331,8 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) -/* stacked device can/does support atomic writes */ -#define BLK_FEAT_ATOMIC_WRITES_STACKED \ +/* atomic writes enabled */ +#define BLK_FEAT_ATOMIC_WRITES \ ((__force blk_features_t)(1u << 16)) /* -- cgit v1.2.3 From 3194e36488e2dae05f9a822c73eaa367e47b2e3d Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 16 Jan 2025 17:02:56 +0000 Subject: dm-table: atomic writes support Support stacking atomic write limits for DM devices. All the pre-existing code in blk_stack_atomic_writes_limits() already takes care of finding the aggregate limits from the bottom devices. Feature flag DM_TARGET_ATOMIC_WRITES is introduced so that atomic writes can be enabled on personalities selectively. This is to ensure that atomic writes are only enabled when verified to be working properly (for a specific personality). In addition, it just may not make sense to enable atomic writes on some personalities (so this flag also helps there).
Signed-off-by: John Garry Reviewed-by: Mike Snitzer Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8321f65897f3..bcc6d7b69470 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -299,6 +299,9 @@ struct target_type { #define dm_target_supports_mixed_zoned_model(type) (false) #endif +#define DM_TARGET_ATOMIC_WRITES 0x00000400 +#define dm_target_supports_atomic_writes(type) ((type)->features & DM_TARGET_ATOMIC_WRITES) + struct dm_target { struct dm_table *table; struct target_type *type; -- cgit v1.2.3 From 61bc24ac974a4873e3040765e640f62fe99d6226 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 19:15:58 -0500 Subject: make sure that DNAME_INLINE_LEN is a multiple of word size ... calling the number of words DNAME_INLINE_WORDS. The next step will be to have a structure to hold inline name arrays (both in dentry and in name_snapshot) and use that to alias the existing arrays of unsigned char there. That will allow both full-structure copies and convenient word-by-word accesses. Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/dcache.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..42dd89beaf4e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,15 +68,17 @@ extern const struct qstr dotdot_name; * large memory footprint increase). 
*/ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 40 /* 192 bytes */ +# define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ +# define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else -# define DNAME_INLINE_LEN 44 /* 128 bytes */ +# define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif +#define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) + #define d_lock d_lockref.lock struct dentry { -- cgit v1.2.3 From 58cf9c383c5c686668082f83f7e0f3e0bd5cc2e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 19:35:36 -0500 Subject: dcache: back inline names with a struct-wrapped array of unsigned long ... so that they can be copied with struct assignment (which generates better code) and accessed word-by-word. The type is union shortname_store; it's a union of arrays of unsigned char and unsigned long. struct name_snapshot.inline_name turned into union shortname_store; users (all in fs/dcache.c) adjusted. struct dentry.d_iname has some users outside of fs/dcache.c; to reduce the amount of noise in commit, it is replaced with union shortname_store d_shortname and d_iname is turned into a macro that expands to d_shortname.string (similar to d_lock handling). That compat macro is temporary - most of the remaining instances will be taken out by debugfs series, and once that is merged and few others are taken care of this will go away.
Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/dcache.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 42dd89beaf4e..8bc567a35718 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -79,7 +79,13 @@ extern const struct qstr dotdot_name; #define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) +union shortname_store { + unsigned char string[DNAME_INLINE_LEN]; + unsigned long words[DNAME_INLINE_WORDS]; +}; + #define d_lock d_lockref.lock +#define d_iname d_shortname.string struct dentry { /* RCU lookup touched fields */ @@ -90,7 +96,7 @@ struct dentry { struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ @@ -591,7 +597,7 @@ static inline struct inode *d_real_inode(const struct dentry *dentry) struct name_snapshot { struct qstr name; - unsigned char inline_name[DNAME_INLINE_LEN]; + union shortname_store inline_name; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); -- cgit v1.2.3 From 6a128cdf1926b20a94d6af7d7d03b76ba19a4f8b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 16 Jan 2025 12:46:25 +0200 Subject: net: ethtool: ts: add separate counter for unconfirmed one-step TX timestamps For packets with two-step timestamp requests, the hardware timestamp comes back to the driver through a confirmation mechanism of sorts, which allows the driver to confidently bump the successful "pkts" counter. For one-step PTP, the NIC is supposed to autonomously insert its hardware TX timestamp in the packet headers while simultaneously transmitting it. 
There may be a confirmation that this was done successfully, or there may not. None of the current drivers which implement ethtool_ops :: get_ts_stats() also support HWTSTAMP_TX_ONESTEP_SYNC or HWTSTAMP_TX_ONESTEP_P2P, so it is a bit unclear which model to follow. But there are NICs, such as DSA, where there is no transmit confirmation at all. Here, it would be wrong / misleading to increment the successful "pkts" counter, because one-step PTP packets can be dropped on TX just like any other packets. So introduce a special counter which signifies "yes, an attempt was made, but we don't know whether it also exited the port or not". I expect that for one-step PTP packets where a confirmation is available, the "pkts" counter would be bumped. Signed-off-by: Vladimir Oltean Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250116104628.123555-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e4136b0df892..64301ddf2f59 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -559,6 +559,12 @@ struct ethtool_rmon_stats { /** * struct ethtool_ts_stats - HW timestamping statistics * @pkts: Number of packets successfully timestamped by the hardware. + * @onestep_pkts_unconfirmed: Number of PTP packets with one-step TX + * timestamping that were sent, but for which the + * device offers no confirmation whether they made + * it onto the wire and the timestamp was inserted + * in the originTimestamp or correctionField, or + * not. * @lost: Number of hardware timestamping requests where the timestamping * information from the hardware never arrived for submission with * the skb.
@@ -571,6 +577,7 @@ struct ethtool_rmon_stats { struct ethtool_ts_stats { struct_group(tx_stats, u64 pkts; + u64 onestep_pkts_unconfirmed; u64 lost; u64 err; ); -- cgit v1.2.3 From 4bf97069239bcfca9840936313c7ac35a6e04488 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 13 Nov 2024 18:21:20 -0800 Subject: riscv: Add ghostwrite vulnerability Follow the patterns of the other architectures that use GENERIC_CPU_VULNERABILITIES for riscv to introduce the ghostwrite vulnerability and mitigation. The mitigation is to disable all vector which is accomplished by clearing the bit from the cpufeature field. Ghostwrite only affects thead c9xx CPUs that implement xtheadvector, so the vulnerability will only be mitigated on these CPUs. Signed-off-by: Charlie Jenkins Tested-by: Yangyu Chen Link: https://lore.kernel.org/r/20241113-xtheadvector-v11-14-236c22791ef9@rivosinc.com Signed-off-by: Palmer Dabbelt --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index bdcec1732445..6a0a8f1c7c90 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -77,6 +77,7 @@ extern ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From f546e8033d8f3e45d49622f04ca2fde650b80f6d Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 9 Dec 2024 14:06:23 +0100 Subject: PCI: Export pci_intx_unmanaged() and pcim_intx() pci_intx() is a hybrid function which sometimes performs devres operations, depending on whether pcim_enable_device() has been used to enable the pci_dev. This sometimes-managed nature of the function is problematic.
Notably, it causes the function to allocate under some circumstances which makes it unusable from interrupt context. Export pcim_intx() (which is always managed) and rename __pcim_intx() (which is never managed) to pci_intx_unmanaged() and export it as well. Then all callers of pci_intx() can be ported to the version they need, depending whether they use pci_enable_device() or pcim_enable_device(). Link: https://lore.kernel.org/r/20241209130632.132074-3-pstanner@redhat.com Signed-off-by: Philipp Stanner [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Damien Le Moal --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..b5eb8bda655d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1350,6 +1350,7 @@ int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); void pci_disable_parity(struct pci_dev *dev); +void pci_intx_unmanaged(struct pci_dev *pdev, int enable); void pci_intx(struct pci_dev *dev, int enable); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); @@ -2297,6 +2298,7 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) { } #endif +int pcim_intx(struct pci_dev *pdev, int enabled); int pcim_request_all_regions(struct pci_dev *pdev, const char *name); void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar, -- cgit v1.2.3 From dfa2f4d5f9e5d757700cefa8ee480099889f1c69 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 9 Dec 2024 14:06:33 +0100 Subject: PCI: Remove devres from pci_intx() pci_intx() is a hybrid function which can sometimes be managed through devres. This hybrid nature is undesirable. 
Since all users of pci_intx() have by now been ported either to always-managed pcim_intx() or never-managed pci_intx_unmanaged(), the devres functionality can be removed from pci_intx(). Consequently, pci_intx_unmanaged() is now redundant, because pci_intx() itself is now unmanaged. Remove the devres functionality from pci_intx(). Have all users of pci_intx_unmanaged() call pci_intx(). Remove pci_intx_unmanaged(). Link: https://lore.kernel.org/r/20241209130632.132074-13-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Acked-by: Paolo Abeni --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index b5eb8bda655d..f05903dd7695 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1350,7 +1350,6 @@ int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); void pci_disable_parity(struct pci_dev *dev); -void pci_intx_unmanaged(struct pci_dev *pdev, int enable); void pci_intx(struct pci_dev *dev, int enable); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); -- cgit v1.2.3 From 4dbf0155dfcfa65440b5f70d3e905261208b387e Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 19 Nov 2024 14:44:19 -0500 Subject: of: address: Add parent_bus_addr to struct of_pci_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new field called 'parent_bus_addr' to struct of_pci_range to use when retrieving parent bus address information. Refer to the diagram below to better understand that the bus fabric in some systems (like i.MX8QXP) does not always use a 1:1 address map between input and output. 
Currently, many controller drivers use the cpu_addr_fixup() callback that would often hardcode address translation directly in the code, e.g., "cpu_addr & CDNS_PLAT_CPU_TO_BUS_ADDR" or "cpu_addr + BUS_IATU_OFFSET", etc., even though those translations *should* be described via DT. However, the cpu_addr_fixup() can be eliminated if DT correctly reflects hardware behavior and drivers use 'parent_bus_addr' in struct of_pci_range. ┌─────────┐ ┌────────────┐ ┌─────┐ │ │ IA: 0x8ff8_0000 │ │ │ CPU ├───►│ ┌────►├─────────────────┐ │ PCI │ └─────┘ │ │ │ IA: 0x8ff0_0000 │ │ │ CPU Addr │ │ ┌─►├─────────────┐ │ │ Controller │ 0x7ff8_0000─┼───┘ │ │ │ │ │ │ │ │ │ │ │ │ │ PCI Addr 0x7ff0_0000─┼──────┘ │ │ └──► IOSpace ─┼────────────► │ │ │ │ │ 0 0x7000_0000─┼────────►├─────────┐ │ │ │ └─────────┘ │ └──────► CfgSpace ─┼────────────► BUS Fabric │ │ │ 0 │ │ │ └──────────► MemSpace ─┼────────────► IA: 0x8000_0000 │ │ 0x8000_0000 └────────────┘ bus@5f000000 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; ranges = <0x80000000 0x0 0x70000000 0x10000000>; pcie@5f010000 { compatible = "fsl,imx8q-pcie"; reg = <0x5f010000 0x10000>, <0x8ff00000 0x80000>; reg-names = "dbi", "config"; #address-cells = <3>; #size-cells = <2>; device_type = "pci"; bus-range = <0x00 0xff>; ranges = <0x81000000 0 0x00000000 0x8ff80000 0 0x00010000>, <0x82000000 0 0x80000000 0x80000000 0 0x0ff00000>; ... }; }; In the diagram above, the 'parent_bus_addr' field in struct of_pci_range can indicate internal address (IA) address information. 
Link: https://lore.kernel.org/r/20241119-pci_fixup_addr-v8-1-c4bfa5193288@nxp.com Signed-off-by: Frank Li [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring (Arm) Acked-by: Manivannan Sadhasivam --- include/linux/of_address.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 9e034363788a..0cff90365391 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -26,6 +26,7 @@ struct of_pci_range { u64 bus_addr; }; u64 cpu_addr; + u64 parent_bus_addr; u64 size; u32 flags; }; -- cgit v1.2.3 From e4b1d67e71419c4af581890ecea84b04920d4116 Mon Sep 17 00:00:00 2001 From: Valentina Fernandez Date: Tue, 17 Dec 2024 11:31:34 +0000 Subject: mailbox: add Microchip IPC support Add a mailbox controller driver for the Microchip Inter-processor Communication (IPC), which is used to send and receive data between processors. The driver uses the RISC-V Supervisor Binary Interface (SBI) to communicate with software running in machine mode (M-mode) to access the IPC hardware block. Additional details on the Microchip vendor extension and the IPC function IDs described in the driver can be found in the following documentation: https://github.com/linux4microchip/microchip-sbi-ecall-extension This SBI interface in this driver is compatible with the Mi-V Inter-hart Communication (IHC) IP. Transmitting and receiving data through the mailbox framework is done through struct mchp_ipc_msg. 
Signed-off-by: Valentina Fernandez Signed-off-by: Jassi Brar --- include/linux/mailbox/mchp-ipc.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/linux/mailbox/mchp-ipc.h (limited to 'include/linux') diff --git a/include/linux/mailbox/mchp-ipc.h b/include/linux/mailbox/mchp-ipc.h new file mode 100644 index 000000000000..f084ac9e291b --- /dev/null +++ b/include/linux/mailbox/mchp-ipc.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + *Copyright (c) 2024 Microchip Technology Inc. All rights reserved. + */ + +#ifndef _LINUX_MCHP_IPC_H_ +#define _LINUX_MCHP_IPC_H_ + +#include +#include + +struct mchp_ipc_msg { + u32 *buf; + u16 size; +}; + +struct mchp_ipc_sbi_chan { + void *buf_base_tx; + void *buf_base_rx; + void *msg_buf_tx; + void *msg_buf_rx; + phys_addr_t buf_base_tx_addr; + phys_addr_t buf_base_rx_addr; + phys_addr_t msg_buf_tx_addr; + phys_addr_t msg_buf_rx_addr; + int chan_aggregated_irq; + int mp_irq; + int mc_irq; + u32 id; + u32 max_msg_size; +}; + +#endif /* _LINUX_MCHP_IPC_H_ */ -- cgit v1.2.3 From fbf7e5ce408e0619072e84e93e875de52f2b5fa5 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 15 Jan 2025 14:18:15 +0000 Subject: mailbox: add Samsung Exynos driver The Samsung Exynos mailbox controller, used on Google GS101 SoC, has 16 flag bits for hardware interrupt generation and a shared register for passing mailbox messages. When the controller is used by the ACPM interface the shared register is ignored and the mailbox controller acts as a doorbell. The controller just raises the interrupt to APM after the ACPM interface has written the message to SRAM. Add support for the Samsung Exynos mailbox controller. 
Signed-off-by: Tudor Ambarus Signed-off-by: Jassi Brar --- include/linux/mailbox/exynos-message.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/mailbox/exynos-message.h (limited to 'include/linux') diff --git a/include/linux/mailbox/exynos-message.h b/include/linux/mailbox/exynos-message.h new file mode 100644 index 000000000000..5a9ed5ce2046 --- /dev/null +++ b/include/linux/mailbox/exynos-message.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Exynos mailbox message. + * + * Copyright 2024 Linaro Ltd. + */ + +#ifndef _LINUX_EXYNOS_MESSAGE_H_ +#define _LINUX_EXYNOS_MESSAGE_H_ + +#define EXYNOS_MBOX_CHAN_TYPE_DOORBELL 0 +#define EXYNOS_MBOX_CHAN_TYPE_DATA 1 + +struct exynos_mbox_msg { + unsigned int chan_id; + unsigned int chan_type; +}; + +#endif /* _LINUX_EXYNOS_MESSAGE_H_ */ -- cgit v1.2.3 From 12d5151be01017401b8d2681f3c975265a233eaf Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 16 Jan 2025 22:38:40 +0100 Subject: net: phy: remove leftovers from switch to linkmode bitmaps We have some leftovers from the switch to linkmode bitmaps which - have never been used - are not used any longer - have no user outside phy_device.c So remove them. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/5493b96e-88bb-4230-a911-322659ec5167@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 244f747b3cd9..19f076a71f94 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -32,19 +32,6 @@ #include #include -#define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ - SUPPORTED_TP | \ - SUPPORTED_MII) - -#define PHY_10BT_FEATURES (SUPPORTED_10baseT_Half | \ - SUPPORTED_10baseT_Full) - -#define PHY_100BT_FEATURES (SUPPORTED_100baseT_Half | \ - SUPPORTED_100baseT_Full) - -#define PHY_1000BT_FEATURES (SUPPORTED_1000baseT_Half | \ - SUPPORTED_1000baseT_Full) - extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1s_p2mp_features) __ro_after_init; @@ -62,16 +49,11 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init; #define PHY_BASIC_T1S_P2MP_FEATURES ((unsigned long *)&phy_basic_t1s_p2mp_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) -#define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) -#define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) -#define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features) #define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features) extern const int phy_basic_ports_array[3]; -extern const int phy_fibre_port_array[1]; -extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; extern const 
int phy_basic_t1_features_array[3]; extern const int phy_basic_t1s_p2mp_features_array[2]; -- cgit v1.2.3 From f50fcd23c9b9d99bf03d0ab9f30cba4665e6326e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 16:30:59 +0000 Subject: crypto: asymmetric_keys - Remove unused key_being_used_for[] key_being_used_for[] is an unused array of textual names for the elements of the enum key_being_used_for. It was added in 2015 by commit 99db44350672 ("PKCS#7: Appropriately restrict authenticated attributes and content type") Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Herbert Xu --- include/linux/verification.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/verification.h b/include/linux/verification.h index cb2d47f28091..4f3022d081c3 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -38,8 +38,6 @@ enum key_being_used_for { VERIFYING_UNSPECIFIED_SIGNATURE, NR__KEY_BEING_USED_FOR }; -extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; - #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct key; -- cgit v1.2.3 From e876695aab1e3d4743e11633219cea456820660b Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Fri, 17 Jan 2025 22:27:24 +0800 Subject: cpumask: Rephrase comments for cpumask_any*() APIs The cpumask_any*() APIs comment states that it returns a "random" cpu within the given cpumask. However, it's not actually random, as "random" itself implies a uniform distribution. cpumask_any*() APIs are a naming convention for the caller to state that it doesn't care which CPU it gets, so changing "random" to "arbitrary" would be more appropriate.
CC: Mark Rutland Signed-off-by: I Hsin Cheng Reviewed-by: Kuan-Wei Chiu Signed-off-by: Yury Norov --- include/linux/cpumask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 590d8438514c..36a890d0dd57 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -391,7 +391,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** - * cpumask_any_but - return a "random" in a cpumask, but not this one. + * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * @@ -411,7 +411,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) } /** - * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, but not this one. + * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore @@ -840,7 +840,7 @@ void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) } /** - * cpumask_any - pick a "random" cpu from *srcp + * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. 
@@ -848,7 +848,7 @@ void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) #define cpumask_any(srcp) cpumask_first(srcp) /** - * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 + * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * -- cgit v1.2.3 From b489e7946656ed67fea1a30f5103eb62a8686e04 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Jan 2024 11:57:48 +0530 Subject: PM / OPP: Add reference counting helpers for Rust implementation To ensure that resources such as OPP tables or OPP nodes are not freed while in use by the Rust implementation, it is necessary to increment their reference count from Rust code. This commit introduces a new helper function, dev_pm_opp_get_opp_table_ref(), to increment the reference count of an OPP table and declares the existing helper dev_pm_opp_get() in pm_opp.h. Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 414146abfe81..c247317aae38 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -100,6 +100,7 @@ struct dev_pm_opp_data { #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); +void dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index); @@ -160,6 +161,7 @@ struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, unsigned int *bw, int index); +void dev_pm_opp_get(struct dev_pm_opp *opp); void dev_pm_opp_put(struct dev_pm_opp *opp); int dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp); @@ -205,6 +207,8 @@ static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device * 
return ERR_PTR(-EOPNOTSUPP); } +static inline void dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table) {} + static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} static inline unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index) @@ -341,6 +345,8 @@ static inline struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline void dev_pm_opp_get(struct dev_pm_opp *opp) {} + static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {} static inline int -- cgit v1.2.3 From 3c836451ca9041cfb32a7d8f59ea15b3b991bbb3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:11 -0800 Subject: net: move HDS config from ethtool state Separate the HDS config from the ethtool state struct. The HDS config contains just simple parameters, not state. Having it as a separate struct will make it easier to clone / copy and also long term potentially make it per-queue. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ---- include/linux/netdevice.h | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 64301ddf2f59..870994cc3ef7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1171,16 +1171,12 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. - * @hds_thresh: HDS Threshold value. - * @hds_config: HDS value from userspace. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. 
*/ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; - u32 hds_thresh; - u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8308d9c75918..173a8b3a9eb2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -63,6 +63,7 @@ struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; +struct netdev_config; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; @@ -2410,6 +2411,8 @@ struct net_device { const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; + /** @cfg: net_device queue-related configuration */ + struct netdev_config *cfg; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ -- cgit v1.2.3 From 32ad1f7a050d0c17e1e52e1dfdd9f6221ae20ef9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:13 -0800 Subject: net: provide pending ring configuration in net_device Record the pending configuration in net_device struct. ethtool core duplicates the current config and the specific handlers (for now just ringparam) can modify it. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 173a8b3a9eb2..8da4c61f97b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2413,6 +2413,12 @@ struct net_device { /** @cfg: net_device queue-related configuration */ struct netdev_config *cfg; + /** + * @cfg_pending: same as @cfg but when device is being actively + * reconfigured includes any changes to the configuration + * requested by the user, but which may or may not be rejected. 
+ */ + struct netdev_config *cfg_pending; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ -- cgit v1.2.3 From ead11ac50ad4b8ef1b64806e962ea984862d96ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 13 Jan 2025 17:29:03 -0500 Subject: nfs: fix incorrect error handling in LOCALIO nfs4_stat_to_errno() expects a NFSv4 error code as an argument and returns a POSIX errno. The problem is LOCALIO is passing nfs4_stat_to_errno() the POSIX errno return values from filp->f_op->read_iter(), filp->f_op->write_iter() and vfs_fsync_range(). So the POSIX errno that nfs_local_pgio_done() and nfs_local_commit_done() are passing to nfs4_stat_to_errno() are failing to match any NFSv4 error code, which results in nfs4_stat_to_errno() defaulting to returning -EREMOTEIO. This causes assertions in upper layers due to -EREMOTEIO not being a valid NFSv4 error code. Fix this by updating nfs_local_pgio_done() and nfs_local_commit_done() to use the new nfs_localio_errno_to_nfs4_stat() to map a POSIX errno to an NFSv4 error code. Care was taken to factor out nfs4_errtbl_common[] to avoid duplicating the same NFS error to errno table. nfs4_errtbl_common[] is checked first by both nfs4_stat_to_errno and nfs_localio_errno_to_nfs4_stat before they check their own more specialized tables (nfs4_errtbl[] and nfs4_errtbl_localio[] respectively). While auditing the associated error mapping tables, the (ab)use of -1 for the last table entry was removed in favor of using ARRAY_SIZE to iterate the nfs_errtbl[] and nfs4_errtbl[]. And 'errno_NFSERR_IO' was removed because it caused needless obfuscation. 
Fixes: 70ba381e1a431 ("nfs: add LOCALIO support") Reported-by: Trond Myklebust Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- include/linux/nfs_common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h index 5fc02df88252..a541c3a02887 100644 --- a/include/linux/nfs_common.h +++ b/include/linux/nfs_common.h @@ -9,9 +9,10 @@ #include /* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO int nfs_stat_to_errno(enum nfs_stat status); int nfs4_stat_to_errno(int stat); +__u32 nfs_localio_errno_to_nfs4_stat(int errno); + #endif /* _LINUX_NFS_COMMON_H */ -- cgit v1.2.3 From 8e1d32273ab7d06b6f78771e05824bfab01141f4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:54 -0500 Subject: nfs_common: make include/linux/nfs4.h include generated nfs4_1.h In the long run, the NFS development community intends to autogenerate a lot of the XDR handling code. Both the NFS client and server include "include/linux/nfs4.h". That file was hand-rolled, and some of the symbols in it conflict with the autogenerated symbols. Add a small nfs4_1.x to Documentation that currently just has the necessary definitions for the delstid draft, and generate the relevant header and source files. Make include/linux/nfs4.h include the generated include/linux/sunrpc/xdrgen/nfs4_1.h and remove the conflicting definitions from it and nfs_xdr.h.
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 7 +- include/linux/nfs_xdr.h | 5 -- include/linux/sunrpc/xdrgen/nfs4_1.h | 124 +++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 11 deletions(-) create mode 100644 include/linux/sunrpc/xdrgen/nfs4_1.h (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8d7430d9f218..b90719244775 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -17,6 +17,7 @@ #include #include #include +#include enum nfs4_acl_whotype { NFS4_ACL_WHO_NAMED = 0, @@ -512,12 +513,6 @@ enum { FATTR4_XATTR_SUPPORT = 82, }; -enum { - FATTR4_TIME_DELEG_ACCESS = 84, - FATTR4_TIME_DELEG_MODIFY = 85, - FATTR4_OPEN_ARGUMENTS = 86, -}; - /* * The following internal definitions enable processing the above * attribute bits within 32-bit word boundaries. diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..e74a87bb18a4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1315,11 +1315,6 @@ struct nfs4_fsid_present_res { #endif /* CONFIG_NFS_V4 */ -struct nfstime4 { - u64 seconds; - u32 nseconds; -}; - #ifdef CONFIG_NFS_V4_1 struct pnfs_commit_bucket { diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h new file mode 100644 index 000000000000..6025ab6b7398 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. 
*/ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ +/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ + +#ifndef _LINUX_XDRGEN_NFS4_1_DEF_H +#define _LINUX_XDRGEN_NFS4_1_DEF_H + +#include +#include + +typedef s64 int64_t; + +typedef u32 uint32_t; + +typedef struct { + u32 count; + uint32_t *element; +} bitmap4; + +struct nfstime4 { + int64_t seconds; + uint32_t nseconds; +}; + +typedef bool fattr4_offline; + +enum { FATTR4_OFFLINE = 83 }; + +struct open_arguments4 { + bitmap4 oa_share_access; + bitmap4 oa_share_deny; + bitmap4 oa_share_access_want; + bitmap4 oa_open_claim; + bitmap4 oa_create_mode; +}; + +enum open_args_share_access4 { + OPEN_ARGS_SHARE_ACCESS_READ = 1, + OPEN_ARGS_SHARE_ACCESS_WRITE = 2, + OPEN_ARGS_SHARE_ACCESS_BOTH = 3, +}; +typedef enum open_args_share_access4 open_args_share_access4; + +enum open_args_share_deny4 { + OPEN_ARGS_SHARE_DENY_NONE = 0, + OPEN_ARGS_SHARE_DENY_READ = 1, + OPEN_ARGS_SHARE_DENY_WRITE = 2, + OPEN_ARGS_SHARE_DENY_BOTH = 3, +}; +typedef enum open_args_share_deny4 open_args_share_deny4; + +enum open_args_share_access_want4 { + OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG = 3, + OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG = 4, + OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL = 5, + OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 17, + OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 18, + OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20, + OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21, +}; +typedef enum open_args_share_access_want4 open_args_share_access_want4; + +enum open_args_open_claim4 { + OPEN_ARGS_OPEN_CLAIM_NULL = 0, + OPEN_ARGS_OPEN_CLAIM_PREVIOUS = 1, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR = 2, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV = 3, + OPEN_ARGS_OPEN_CLAIM_FH = 4, + OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5, + OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6, +}; +typedef enum open_args_open_claim4 open_args_open_claim4; + +enum open_args_createmode4 { + OPEN_ARGS_CREATEMODE_UNCHECKED4 
= 0, + OPEN_ARGS_CREATE_MODE_GUARDED = 1, + OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2, + OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3, +}; +typedef enum open_args_createmode4 open_args_createmode4; + +typedef struct open_arguments4 fattr4_open_arguments; + +enum { FATTR4_OPEN_ARGUMENTS = 86 }; + +enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; + +enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; + +typedef struct nfstime4 fattr4_time_deleg_access; + +typedef struct nfstime4 fattr4_time_deleg_modify; + +enum { FATTR4_TIME_DELEG_ACCESS = 84 }; + +enum { FATTR4_TIME_DELEG_MODIFY = 85 }; + +enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; + +#define NFS4_int64_t_sz \ + (XDR_hyper) +#define NFS4_uint32_t_sz \ + (XDR_unsigned_int) +#define NFS4_bitmap4_sz (XDR_unsigned_int) +#define NFS4_nfstime4_sz \ + (NFS4_int64_t_sz + NFS4_uint32_t_sz) +#define NFS4_fattr4_offline_sz \ + (XDR_bool) +#define NFS4_open_arguments4_sz \ + (NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz) +#define NFS4_open_args_share_access4_sz (XDR_int) +#define NFS4_open_args_share_deny4_sz (XDR_int) +#define NFS4_open_args_share_access_want4_sz (XDR_int) +#define NFS4_open_args_open_claim4_sz (XDR_int) +#define NFS4_open_args_createmode4_sz (XDR_int) +#define NFS4_fattr4_open_arguments_sz \ + (NFS4_open_arguments4_sz) +#define NFS4_fattr4_time_deleg_access_sz \ + (NFS4_nfstime4_sz) +#define NFS4_fattr4_time_deleg_modify_sz \ + (NFS4_nfstime4_sz) + +#endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */ -- cgit v1.2.3 From 8dfbea8bde6e976136948421325b24b5bdb76ad3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:55 -0500 Subject: nfsd: switch to autogenerated definitions for open_delegation_type4 Rename the enum with the same name in include/linux/nfs4.h, add the proper enum to nfs4_1.x and regenerate the headers and source files. Do a mass rename of all NFS4_OPEN_DELEGATE_* to OPEN_DELEGATE_* in the nfsd directory. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 2 +- include/linux/sunrpc/xdrgen/nfs4_1.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b90719244775..71fbebfa43c7 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -366,7 +366,7 @@ enum limit_by4 { NFS4_LIMIT_BLOCKS = 2 }; -enum open_delegation_type4 { +enum nfs4_open_delegation_type4 { NFS4_OPEN_DELEGATE_NONE = 0, NFS4_OPEN_DELEGATE_READ = 1, NFS4_OPEN_DELEGATE_WRITE = 2, diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 6025ab6b7398..9ca83a4a04cf 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ +/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -98,6 +98,16 @@ enum { FATTR4_TIME_DELEG_MODIFY = 85 }; enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; +enum open_delegation_type4 { + OPEN_DELEGATE_NONE = 0, + OPEN_DELEGATE_READ = 1, + OPEN_DELEGATE_WRITE = 2, + OPEN_DELEGATE_NONE_EXT = 3, + OPEN_DELEGATE_READ_ATTRS_DELEG = 4, + OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5, +}; +typedef enum open_delegation_type4 open_delegation_type4; + #define NFS4_int64_t_sz \ (XDR_hyper) #define NFS4_uint32_t_sz \ @@ -120,5 +130,6 @@ enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; (NFS4_nfstime4_sz) #define NFS4_fattr4_time_deleg_modify_sz \ (NFS4_nfstime4_sz) +#define NFS4_open_delegation_type4_sz (XDR_int) #endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */ -- cgit v1.2.3 From c9c99a33e2b0083c83a2c29eebfad92c78e16791 Mon Sep 17 00:00:00 2001 From: 
Jeff Layton Date: Mon, 9 Dec 2024 16:13:56 -0500 Subject: nfsd: rename NFS4_SHARE_WANT_* constants to OPEN4_SHARE_ACCESS_WANT_* Add the OPEN4_SHARE_ACCESS_WANT constants from the nfs4.1 and delstid draft into the nfs4_1.x file, and regenerate the headers and source files. Do a mass renaming of NFS4_SHARE_WANT_* to OPEN4_SHARE_ACCESS_WANT_* in the nfsd directory. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/nfs4_1.h | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 9ca83a4a04cf..cf21a14aa885 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. */ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ +/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -84,8 +84,6 @@ typedef struct open_arguments4 fattr4_open_arguments; enum { FATTR4_OPEN_ARGUMENTS = 86 }; -enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; - enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; typedef struct nfstime4 fattr4_time_deleg_access; @@ -96,8 +94,28 @@ enum { FATTR4_TIME_DELEG_ACCESS = 84 }; enum { FATTR4_TIME_DELEG_MODIFY = 85 }; +enum { OPEN4_SHARE_ACCESS_WANT_DELEG_MASK = 0xFF00 }; + +enum { OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE = 0x0000 }; + +enum { OPEN4_SHARE_ACCESS_WANT_READ_DELEG = 0x0100 }; + +enum { OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG = 0x0200 }; + +enum { OPEN4_SHARE_ACCESS_WANT_ANY_DELEG = 0x0300 }; + +enum { OPEN4_SHARE_ACCESS_WANT_NO_DELEG = 0x0400 }; + +enum { OPEN4_SHARE_ACCESS_WANT_CANCEL = 0x0500 }; + +enum { 
OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 0x10000 }; + +enum { OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 0x20000 }; + enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; +enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; + enum open_delegation_type4 { OPEN_DELEGATE_NONE = 0, OPEN_DELEGATE_READ = 1, -- cgit v1.2.3 From 6ae30d6eb26bce02c48c60074b4306270e2434c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:14:00 -0500 Subject: nfsd: add support for delegated timestamps Add support for the delegated timestamps on write delegations. This allows the server to proxy timestamps from the delegation holder to other clients that are doing GETATTRs vs. the same inode. When OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS bit is set in the OPEN call, set the dl_type to the *_ATTRS_DELEG flavor of delegation. Add timespec64 fields to nfs4_cb_fattr and decode the timestamps into those. Vet those timestamps according to the delstid spec and update the inode attrs if necessary. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/time64.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index f1bcea8c124a..9934331c7b86 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -49,6 +49,11 @@ static inline int timespec64_equal(const struct timespec64 *a, return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } +static inline bool timespec64_is_epoch(const struct timespec64 *ts) +{ + return ts->tv_sec == 0 && ts->tv_nsec == 0; +} + /* * lhs < rhs: return <0 * lhs == rhs: return 0 -- cgit v1.2.3 From ee0d90d4b97a9787ed55b22c85c72376329d86ac Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Tue, 10 Dec 2024 01:02:23 +0000 Subject: sunrpc: Remove unused xprt_iter_get_xprt xprt_iter_get_xprt() was added by commit 80b14d5e61ca ("SUNRPC: Add a structure to track multiple transports") but is unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xprtmultipath.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index c0514c684b2c..e411368cdacf 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -75,7 +75,6 @@ extern struct rpc_xprt_switch *xprt_iter_xchg_switch( struct rpc_xprt_switch *newswitch); extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi); -extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi); extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi); extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, -- cgit v1.2.3 From afc52b1eeb36f20eea321f50e338e38d00a8a61f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 10 Dec 2024 01:02:24 +0000 Subject: sunrpc: Remove gss_generic_token deadcode Commit ec596aaf9b48 ("SUNRPC: Remove code behind CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED") was the last user of the routines in gss_generic_token.c. Remove the routines and associated header. Signed-off-by: Dr. 
David Alan Gilbert Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/gss_asn1.h | 81 ----------------------------------------- include/linux/sunrpc/gss_krb5.h | 1 - 2 files changed, 82 deletions(-) delete mode 100644 include/linux/sunrpc/gss_asn1.h (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_asn1.h b/include/linux/sunrpc/gss_asn1.h deleted file mode 100644 index 3ccecd0ad229..000000000000 --- a/include/linux/sunrpc/gss_asn1.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * linux/include/linux/sunrpc/gss_asn1.h - * - * minimal asn1 for generic encoding/decoding of gss tokens - * - * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, - * lib/gssapi/krb5/gssapiP_krb5.h, and others - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson - */ - -/* - * Copyright 1995 by the Massachusetts Institute of Technology. - * All Rights Reserved. - * - * Export of this software from the United States of America may - * require a specific license from the United States Government. - * It is the responsibility of any person or organization contemplating - * export to obtain such a license before exporting. - * - * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and - * distribute this software and its documentation for any purpose and - * without fee is hereby granted, provided that the above copyright - * notice appear in all copies and that both that copyright notice and - * this permission notice appear in supporting documentation, and that - * the name of M.I.T. not be used in advertising or publicity pertaining - * to distribution of the software without specific, written prior - * permission. Furthermore if you modify this software you must label - * your software as modified software and not distribute it in such a - * fashion that it might be confused with the original M.I.T. software. - * M.I.T. 
makes no representations about the suitability of - * this software for any purpose. It is provided "as is" without express - * or implied warranty. - * - */ - - -#include - -#define SIZEOF_INT 4 - -/* from gssapi_err_generic.h */ -#define G_BAD_SERVICE_NAME (-2045022976L) -#define G_BAD_STRING_UID (-2045022975L) -#define G_NOUSER (-2045022974L) -#define G_VALIDATE_FAILED (-2045022973L) -#define G_BUFFER_ALLOC (-2045022972L) -#define G_BAD_MSG_CTX (-2045022971L) -#define G_WRONG_SIZE (-2045022970L) -#define G_BAD_USAGE (-2045022969L) -#define G_UNKNOWN_QOP (-2045022968L) -#define G_NO_HOSTNAME (-2045022967L) -#define G_BAD_HOSTNAME (-2045022966L) -#define G_WRONG_MECH (-2045022965L) -#define G_BAD_TOK_HEADER (-2045022964L) -#define G_BAD_DIRECTION (-2045022963L) -#define G_TOK_TRUNC (-2045022962L) -#define G_REFLECT (-2045022961L) -#define G_WRONG_TOKID (-2045022960L) - -#define g_OID_equal(o1,o2) \ - (((o1)->len == (o2)->len) && \ - (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) - -u32 g_verify_token_header( - struct xdr_netobj *mech, - int *body_size, - unsigned char **buf_in, - int toksize); - -int g_token_size( - struct xdr_netobj *mech, - unsigned int body_size); - -void g_make_token_header( - struct xdr_netobj *mech, - int body_size, - unsigned char **buf); diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index 78a80bf3fdcb..43950b5237c8 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -40,7 +40,6 @@ #include #include #include -#include /* Length of constant used in key derivation */ #define GSS_KRB5_K5CLENGTH (5) -- cgit v1.2.3 From 66611c0475709607f398e2a5d691b1fc72fe9dfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Jan 2025 19:44:36 -0500 Subject: fgraph: Remove calltime and rettime from generic operations The function graph infrastructure is now generic so that kretprobes, fprobes and BPF can use it. 
But there is still some leftover logic that only the function graph tracer itself uses. This is the calculation of the calltime and return time of the functions. The calculation of the calltime has been moved into the function graph tracer and those users that need it so that it doesn't cause overhead to the other users. But the return function timestamp was still called. Instead of just moving the taking of the timestamp into the function graph trace remove the calltime and rettime completely from the ftrace_graph_ret structure. Instead, move it into the function graph return entry event structure and this also moves all the calltime and rettime logic out of the generic fgraph.c code and into the tracing code that uses it. This has been reported to decrease the overhead by ~27%. Link: https://lore.kernel.org/all/Z3aSuql3fnXMVMoM@krava/ Link: https://lore.kernel.org/all/173665959558.1629214.16724136597211810729.stgit@devnote2/ Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250121194436.15bdf71a@gandalf.local.home Reported-by: Jiri Olsa Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07092dfb21a4..fbabc3d848b3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1151,8 +1151,6 @@ struct ftrace_graph_ret { int depth; /* Number of functions that overran the depth limit for current task */ unsigned int overrun; - unsigned long long calltime; - unsigned long long rettime; } __packed; struct fgraph_ops; -- cgit v1.2.3 From f79b163c42314a1f46f4bcc40a19c8a75cf1e7a3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 22 Jan 2025 10:35:56 +0100 Subject: Revert "serial: 8250: Switch to nbcon console" This reverts commit b63e6f60eab45b16a1bf734fef9035a4c4187cd5. kernel test robot has found problems with this commit so revert it for now. 
Link: https://lore.kernel.org/r/202501221029.fb0d574d-lkp@intel.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202501221029.fb0d574d-lkp@intel.com Cc: John Ogness Cc: Petr Mladek Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 57875c37023a..144de7a7948d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -150,17 +150,8 @@ struct uart_8250_port { #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS u16 lsr_saved_flags; u16 lsr_save_mask; - - /* - * Track when a console line has been fully written to the - * hardware, i.e. true when the most recent byte written to - * UART_TX by the console was '\n'. - */ - bool console_line_ended; - #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; - struct irq_work modem_status_work; struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -211,8 +202,8 @@ void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); -void serial8250_console_write(struct uart_8250_port *up, - struct nbcon_write_context *wctxt, bool in_atomic); +void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -- cgit v1.2.3 From fa3595523d72d13508befd28cf2ca642cafc69f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Jan 2025 20:00:57 -0700 Subject: io_uring: get rid of alloc cache init_once handling init_once is called when an object doesn't come from the cache, and hence needs initial clearing of certain members. 
While the whole struct could get cleared by memset() in that case, a few of the cache members are large enough that this may cause unnecessary overhead if the caches used aren't large enough to satisfy the workload. For those cases, some churn of kmalloc+kfree is to be expected. Ensure that the 3 users that need clearing put the members they need cleared at the start of the struct, and wrap the rest of the struct in a struct group so the offset is known. While at it, improve the interaction with KASAN such that when/if KASAN writes to members inside the struct that should be retained over caching, it won't trip over itself. For rw and net, the retaining of the iovec over caching is disabled if KASAN is enabled. A helper will free and clear those members in that case. Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- include/linux/io_uring_types.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index a3ce553413de..abd0c8bd950b 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -19,8 +19,8 @@ struct io_uring_cmd { }; struct io_uring_cmd_data { - struct io_uring_sqe sqes[2]; void *op_data; + struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 623d8e798a11..3def525a1da3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -222,7 +222,8 @@ struct io_alloc_cache { void **entries; unsigned int nr_cached; unsigned int max_cached; - size_t elem_size; + unsigned int elem_size; + unsigned int init_clear; }; struct io_ring_ctx { -- cgit v1.2.3 From 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 18 Jan 2025 00:24:33 +0100 Subject: hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING 
hrtimers are migrated away from the dying CPU to any online target at the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers handling tasks involved in the CPU hotplug forward progress. However wakeups can still be performed by the outgoing CPU after CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being armed. Depending on several considerations (crystal ball power management based election, earliest timer already enqueued, timer migration enabled or not), the target may eventually be the current CPU even if offline. If that happens, the timer is eventually ignored. The most notable example is RCU which had to deal with each and every of those wake-ups by deferring them to an online CPU, along with related workarounds: _ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying) _ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU) _ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq) The problem isn't confined to RCU though as the stop machine kthread (which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end of its work through cpu_stop_signal_done() and performs a wake up that eventually arms the deadline server timer: WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0 CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0 RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0 Call Trace: start_dl_timer enqueue_dl_entity dl_server_start enqueue_task_fair enqueue_task ttwu_do_activate try_to_wake_up complete cpu_stopper_thread Instead of providing yet another bandaid to work around the situation, fix it in the hrtimers infrastructure instead: always migrate away a timer to an online target whenever it is enqueued from an offline CPU. This will also allow to revert all the above RCU disgraceful hacks. 
Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Reported-by: Vlad Poenaru Reported-by: Usama Arif Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Tested-by: Paul E. McKenney Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org Closes: 20241213203739.1519801-1-usamaarif642@gmail.com --- include/linux/hrtimer_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index c3b4b7ed7c16..84a5045f80f3 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -125,6 +125,7 @@ struct hrtimer_cpu_base { ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + call_single_data_t csd; } ____cacheline_aligned; -- cgit v1.2.3 From da6b353786997c0ffa67127355ad1d54ed3324c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 23 Jan 2025 18:27:07 +0100 Subject: pwm: Ensure callbacks exist before calling them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If one of the waveform functions is called for a chip that only supports .apply(), we want that an error code is returned and not a NULL pointer exception. 
Fixes: 6c5126c6406d ("pwm: Provide new consumer API functions for waveforms") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König Tested-by: Trevor Gamblin Link: https://lore.kernel.org/r/20250123172709.391349-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 78827f312407..b8d78009e779 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -347,6 +347,23 @@ struct pwm_chip { struct pwm_device pwms[] __counted_by(npwm); }; +/** + * pwmchip_supports_waveform() - checks if the given chip supports waveform callbacks + * @chip: The pwm_chip to test + * + * Returns true iff the pwm chip support the waveform functions like + * pwm_set_waveform_might_sleep() and pwm_round_waveform_might_sleep() + */ +static inline bool pwmchip_supports_waveform(struct pwm_chip *chip) +{ + /* + * only check for .write_waveform(). If that is available, + * .round_waveform_tohw() and .round_waveform_fromhw() asserted to be + * available, too, in pwmchip_add(). + */ + return chip->ops->write_waveform != NULL; +} + static inline struct device *pwmchip_parent(const struct pwm_chip *chip) { return chip->dev.parent; -- cgit v1.2.3 From 4891cd3eba62ac611a7929948cf5588a1abed909 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 16 Jan 2025 17:43:29 +0200 Subject: PM: Revert "Add EXPORT macros for exporting PM functions" Revert commit 41a337b40e98 ("Add EXPORT macros for exporting PM functions") because the macros added by it are still unused almost two years after they had been introduced. Reported-by: Adrian Hunter Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250116154354.149297-1-andriy.shevchenko@linux.intel.com [ rjw: New changelog ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index e7f0260f15ad..0627a795892b 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -384,12 +384,8 @@ const struct dev_pm_ops name = { \ #ifdef CONFIG_PM #define _EXPORT_DEV_PM_OPS(name, license, ns) _EXPORT_PM_OPS(name, license, ns) -#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, "ns") #else #define _EXPORT_DEV_PM_OPS(name, license, ns) _DISCARD_PM_OPS(name, license, ns) -#define EXPORT_PM_FN_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) #endif #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 931656b9e2ff7029aee0b36e17780621948a6ac1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 23 Jan 2025 07:35:43 -0800 Subject: kvm: defer huge page recovery vhost task to later Some libraries want to ensure they are single threaded before forking, so making the kernel's kvm huge page recovery process a vhost task of the user process breaks those. The minijail library used by crosvm is one such affected application. Defer the task to after the first VM_RUN call, which occurs after the parent process has forked all its jailed processes. This needs to happen only once for the kvm instance, so introduce some general-purpose infrastructure for that, too. It's similar in concept to pthread_once; except it is actually usable, because the callback takes a parameter. Cc: Sean Christopherson Cc: Paolo Bonzini Tested-by: Alyssa Ross Signed-off-by: Keith Busch Message-ID: <20250123153543.2769928-1-kbusch@meta.com> [Move call_once API to include/linux. 
- Paolo] Cc: stable@vger.kernel.org Fixes: d96c77bd4eeb ("KVM: x86: switch hugepage recovery thread to vhost_task") Signed-off-by: Paolo Bonzini --- include/linux/call_once.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 include/linux/call_once.h (limited to 'include/linux') diff --git a/include/linux/call_once.h b/include/linux/call_once.h new file mode 100644 index 000000000000..6261aa0b3fb0 --- /dev/null +++ b/include/linux/call_once.h @@ -0,0 +1,45 @@ +#ifndef _LINUX_CALL_ONCE_H +#define _LINUX_CALL_ONCE_H + +#include +#include + +#define ONCE_NOT_STARTED 0 +#define ONCE_RUNNING 1 +#define ONCE_COMPLETED 2 + +struct once { + atomic_t state; + struct mutex lock; +}; + +static inline void __once_init(struct once *once, const char *name, + struct lock_class_key *key) +{ + atomic_set(&once->state, ONCE_NOT_STARTED); + __mutex_init(&once->lock, name, key); +} + +#define once_init(once) \ +do { \ + static struct lock_class_key __key; \ + __once_init((once), #once, &__key); \ +} while (0) + +static inline void call_once(struct once *once, void (*cb)(struct once *)) +{ + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return; + + guard(mutex)(&once->lock); + WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); + if (atomic_read(&once->state) != ONCE_NOT_STARTED) + return; + + atomic_set(&once->state, ONCE_RUNNING); + cb(once); + atomic_set_release(&once->state, ONCE_COMPLETED); +} + +#endif /* _LINUX_CALL_ONCE_H */ -- cgit v1.2.3 From 71ee9b16251ea4bf7c1fe222517c82bdb3220acc Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:11:24 +0000 Subject: minmax.h: add whitespace around operators and after commas Patch series "minmax.h: Cleanups and minor optimisations". Some tidyups and minor changes to minmax.h. 
This patch (of 7): Link: https://lkml.kernel.org/r/c50365d214e04f9ba256d417c8bebbc0@AcuMS.aculab.com Link: https://lkml.kernel.org/r/f04b2e1310244f62826267346fde0553@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 98008dd92153..51b0d988e322 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -51,10 +51,10 @@ * only need to be careful to not cause warnings for * pointer use. */ -#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) -#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) -#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ - __signed_type_use(x,ux):__unsigned_type_use(x,ux)) +#define __signed_type_use(x, ux) (2 + __is_nonneg(x, ux)) +#define __unsigned_type_use(x, ux) (1 + 2 * (sizeof(ux) < 4)) +#define __sign_use(x, ux) (is_signed_type(typeof(ux)) ? 
\ + __signed_type_use(x, ux) : __unsigned_type_use(x, ux)) /* * To avoid warnings about casting pointers to integers @@ -74,15 +74,15 @@ #ifdef CONFIG_64BIT #define __signed_type(ux) long #else - #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) + #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)) #endif -#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) +#define __is_nonneg(x, ux) statically_true((__signed_type(ux))(x) >= 0) -#define __types_ok(x,y,ux,uy) \ - (__sign_use(x,ux) & __sign_use(y,uy)) +#define __types_ok(x, y, ux, uy) \ + (__sign_use(x, ux) & __sign_use(y, uy)) -#define __types_ok3(x,y,z,ux,uy,uz) \ - (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) +#define __types_ok3(x, y, z, ux, uy, uz) \ + (__sign_use(x, ux) & __sign_use(y, uy) & __sign_use(z, uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -97,7 +97,7 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + BUILD_BUG_ON_MSG(!__types_ok(x, y, ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) @@ -114,7 +114,7 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ + BUILD_BUG_ON_MSG(!__types_ok3(val, lo, hi, uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) @@ -154,7 +154,7 @@ #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ - BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + BUILD_BUG_ON_MSG(!__types_ok3(x, y, z, ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) @@ -326,9 +326,9 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and 
uses the arguments * multiple times. Use for obvious constants only. */ -#define MIN(a,b) __cmp(min,a,b) -#define MAX(a,b) __cmp(max,a,b) -#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) -#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) +#define MIN(a, b) __cmp(min, a, b) +#define MAX(a, b) __cmp(max, a, b) +#define MIN_T(type, a, b) __cmp(min, (type)(a), (type)(b)) +#define MAX_T(type, a, b) __cmp(max, (type)(a), (type)(b)) #endif /* _LINUX_MINMAX_H */ -- cgit v1.2.3 From 10666e99204818ef45c702469488353b5bb09ec7 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:12:07 +0000 Subject: minmax.h: update some comments - Change three to several. - Remove the comment about retaining constant expressions, no longer true. - Realign to nearer 80 columns and break on major punctuation. - Add a leading comment to the block before __signed_type() and __is_nonneg() Otherwise the block explaining the cast is a bit 'floating'. Reword the rest of that comment to improve readability. Link: https://lkml.kernel.org/r/85b050c81c1d4076aeb91a6cded45fee@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 53 +++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 51b0d988e322..24e4b372649a 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -8,13 +8,10 @@ #include /* - * min()/max()/clamp() macros must accomplish three things: + * min()/max()/clamp() macros must accomplish several things: * * - Avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant.
- * - Retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). * - Perform signed v unsigned type-checking (to generate compile * errors instead of nasty runtime surprises). * - Unsigned char/short are always promoted to signed int and can be @@ -31,25 +28,23 @@ * bit #0 set if ok for unsigned comparisons * bit #1 set if ok for signed comparisons * - * In particular, statically non-negative signed integer - * expressions are ok for both. + * In particular, statically non-negative signed integer expressions + * are ok for both. * - * NOTE! Unsigned types smaller than 'int' are implicitly - * converted to 'int' in expressions, and are accepted for - * signed conversions for now. This is debatable. + * NOTE! Unsigned types smaller than 'int' are implicitly converted to 'int' + * in expressions, and are accepted for signed conversions for now. + * This is debatable. * - * Note that 'x' is the original expression, and 'ux' is - * the unique variable that contains the value. + * Note that 'x' is the original expression, and 'ux' is the unique variable + * that contains the value. * - * We use 'ux' for pure type checking, and 'x' for when - * we need to look at the value (but without evaluating - * it for side effects! Careful to only ever evaluate it - * with sizeof() or __builtin_constant_p() etc). + * We use 'ux' for pure type checking, and 'x' for when we need to look at the + * value (but without evaluating it for side effects! + * Careful to only ever evaluate it with sizeof() or __builtin_constant_p() etc). * - * Pointers end up being checked by the normal C type - * rules at the actual comparison, and these expressions - * only need to be careful to not cause warnings for - * pointer use. + * Pointers end up being checked by the normal C type rules at the actual + * comparison, and these expressions only need to be careful to not cause + * warnings for pointer use. 
*/ #define __signed_type_use(x, ux) (2 + __is_nonneg(x, ux)) #define __unsigned_type_use(x, ux) (1 + 2 * (sizeof(ux) < 4)) @@ -57,19 +52,19 @@ __signed_type_use(x, ux) : __unsigned_type_use(x, ux)) /* - * To avoid warnings about casting pointers to integers - * of different sizes, we need that special sign type. + * Check whether a signed value is always non-negative. * - * On 64-bit we can just always use 'long', since any - * integer or pointer type can just be cast to that. + * A cast is needed to avoid any warnings from values that aren't signed + * integer types (in which case the result doesn't matter). * - * This does not work for 128-bit signed integers since - * the cast would truncate them, but we do not use s128 - * types in the kernel (we do use 'u128', but they will - * be handled by the !is_signed_type() case). + * On 64-bit any integer or pointer type can safely be cast to 'long'. + * But on 32-bit we need to avoid warnings about casting pointers to integers + * of different sizes without truncating 64-bit values so 'long' or 'long long' + * must be used depending on the size of the value. * - * NOTE! The cast is there only to avoid any warnings - * from when values that aren't signed integer types. + * This does not work for 128-bit signed integers since the cast would truncate + * them, but we do not use s128 types in the kernel (we do use 'u128', + * but they are handled by the !is_signed_type() case). */ #ifdef CONFIG_64BIT #define __signed_type(ux) long -- cgit v1.2.3 From b280bb27a9f7c91ddab730e1ad91a9c18a051f41 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:12:50 +0000 Subject: minmax.h: reduce the #define expansion of min(), max() and clamp() Since the test for signed values being non-negative only relies on __builtin_constant_p() (not is_constexpr()) it can use the 'ux' variable instead of the caller supplied expression. This means that the #define parameters are only expanded twice.
Once in the code and once quoted in the error message. Link: https://lkml.kernel.org/r/051afc171806425da991908ed8688a98@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 24e4b372649a..6f7ea669d305 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -46,10 +46,10 @@ * comparison, and these expressions only need to be careful to not cause * warnings for pointer use. */ -#define __signed_type_use(x, ux) (2 + __is_nonneg(x, ux)) -#define __unsigned_type_use(x, ux) (1 + 2 * (sizeof(ux) < 4)) -#define __sign_use(x, ux) (is_signed_type(typeof(ux)) ? \ - __signed_type_use(x, ux) : __unsigned_type_use(x, ux)) +#define __signed_type_use(ux) (2 + __is_nonneg(ux)) +#define __unsigned_type_use(ux) (1 + 2 * (sizeof(ux) < 4)) +#define __sign_use(ux) (is_signed_type(typeof(ux)) ? \ + __signed_type_use(ux) : __unsigned_type_use(ux)) /* * Check whether a signed value is always non-negative. 
@@ -71,13 +71,13 @@ #else #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)) #endif -#define __is_nonneg(x, ux) statically_true((__signed_type(ux))(x) >= 0) +#define __is_nonneg(ux) statically_true((__signed_type(ux))(ux) >= 0) -#define __types_ok(x, y, ux, uy) \ - (__sign_use(x, ux) & __sign_use(y, uy)) +#define __types_ok(ux, uy) \ + (__sign_use(ux) & __sign_use(uy)) -#define __types_ok3(x, y, z, ux, uy, uz) \ - (__sign_use(x, ux) & __sign_use(y, uy) & __sign_use(z, uz)) +#define __types_ok3(ux, uy, uz) \ + (__sign_use(ux) & __sign_use(uy) & __sign_use(uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -92,7 +92,7 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - BUILD_BUG_ON_MSG(!__types_ok(x, y, ux, uy), \ + BUILD_BUG_ON_MSG(!__types_ok(ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) @@ -109,7 +109,7 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(val, lo, hi, uval, ulo, uhi), \ + BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) @@ -149,7 +149,7 @@ #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ - BUILD_BUG_ON_MSG(!__types_ok3(x, y, z, ux, uy, uz), \ + BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) -- cgit v1.2.3 From a5743f32baec4728711bbc01d6ac2b33d4c67040 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:13:31 +0000 Subject: minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp() Use BUILD_BUG_ON_MSG(statically_true(ulo > uhi), ...) for the sanity check of the bounds in clamp(). Gives better error coverage and one less expansion of the arguments. 
Link: https://lkml.kernel.org/r/34d53778977747f19cce2abb287bb3e6@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 6f7ea669d305..91aa1b90c1bb 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -106,8 +106,7 @@ __auto_type uval = (val); \ __auto_type ulo = (lo); \ __auto_type uhi = (hi); \ - static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ - (lo) <= (hi), true), \ + BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ "clamp() low limit " #lo " greater than high limit " #hi); \ BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ -- cgit v1.2.3 From c3939872ee4a6b8bdcd0e813c66823b31e6e26f7 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:14:19 +0000 Subject: minmax.h: move all the clamp() definitions after the min/max() ones At some point the definitions for clamp() got added in the middle of the ones for min() and max(). Re-order the definitions so they are more sensibly grouped. Link: https://lkml.kernel.org/r/8bb285818e4846469121c8abc3dfb6e2@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. 
Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 109 +++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 91aa1b90c1bb..75fb7a6ad4c6 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -99,22 +99,6 @@ #define __careful_cmp(op, x, y) \ __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __clamp(val, lo, hi) \ - ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) - -#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ - __auto_type uval = (val); \ - __auto_type ulo = (lo); \ - __auto_type uhi = (hi); \ - BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ - "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ - "clamp("#val", "#lo", "#hi") signedness error"); \ - __clamp(uval, ulo, uhi); }) - -#define __careful_clamp(val, lo, hi) \ - __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) - /** * min - return minimum of two values of the same or compatible types * @x: first value @@ -170,6 +154,22 @@ #define max3(x, y, z) \ __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define min_t(type, x, y) __cmp_once(min, type, x, y) + +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define max_t(type, x, y) __cmp_once(max, type, x, y) + /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero * @x: value1 @@ -180,6 +180,22 @@ typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? 
__x : min(__x, __y)); }) +#define __clamp(val, lo, hi) \ + ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) + +#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ + __auto_type uval = (val); \ + __auto_type ulo = (lo); \ + __auto_type uhi = (hi); \ + BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ + "clamp() low limit " #lo " greater than high limit " #hi); \ + BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ + __clamp(uval, ulo, uhi); }) + +#define __careful_clamp(val, lo, hi) \ + __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) + /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value @@ -191,28 +207,30 @@ */ #define clamp(val, lo, hi) __careful_clamp(val, lo, hi) -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. - */ - /** - * min_t - return minimum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. */ -#define min_t(type, x, y) __cmp_once(min, type, x, y) +#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) /** - * max_t - return maximum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. 
This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. */ -#define max_t(type, x, y) __cmp_once(max, type, x, y) +#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) /* * Do not check the array parameter using __must_be_array(). @@ -257,31 +275,6 @@ */ #define max_array(array, len) __minmax_array(max, array, len) -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - static inline bool in_range64(u64 val, u64 start, u64 len) { return (val - start) < len; -- cgit v1.2.3 From 495bba17cdf95e9703af1b8ef773c55ef0dfe703 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:15:05 +0000 Subject: minmax.h: simplify the variants of clamp() Always pass a 'type' through to __clamp_once(), pass '__auto_type' from clamp() itself. The expansion of __types_ok3() is reasonable so it isn't worth the added complexity of avoiding it when a fixed type is used for all three values. 
Link: https://lkml.kernel.org/r/8f69f4deac014f558bab186444bac2e8@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 75fb7a6ad4c6..2bbdd5b5e07e 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -183,29 +183,29 @@ #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) -#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ - __auto_type uval = (val); \ - __auto_type ulo = (lo); \ - __auto_type uhi = (hi); \ +#define __clamp_once(type, val, lo, hi, uval, ulo, uhi) ({ \ + type uval = (val); \ + type ulo = (lo); \ + type uhi = (hi); \ BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ "clamp() low limit " #lo " greater than high limit " #hi); \ BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) -#define __careful_clamp(val, lo, hi) \ - __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) +#define __careful_clamp(type, val, lo, hi) \ + __clamp_once(type, val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) /** - * clamp - return a value clamped to a given range with strict typechecking + * clamp - return a value clamped to a given range with typechecking * @val: current value * @lo: lowest allowable value * @hi: highest allowable value * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. + * This macro checks @val/@lo/@hi to make sure they have compatible + * signedness. 
*/ -#define clamp(val, lo, hi) __careful_clamp(val, lo, hi) +#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi) /** * clamp_t - return a value clamped to a given range using a given type @@ -217,7 +217,7 @@ * This macro does no typechecking and uses temporary variables of type * @type to make all the comparisons. */ -#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) +#define clamp_t(type, val, lo, hi) __careful_clamp(type, val, lo, hi) /** * clamp_val - return a value clamped to a given range using val's type @@ -230,7 +230,7 @@ * type and @lo and @hi are literals that will otherwise be assigned a signed * integer type. */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) +#define clamp_val(val, lo, hi) __careful_clamp(typeof(val), val, lo, hi) /* * Do not check the array parameter using __must_be_array(). -- cgit v1.2.3 From 2b97aaf74ed534fb838d09867d09a3ca5d795208 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:15:51 +0000 Subject: minmax.h: remove some #defines that are only expanded once The bodies of __signed_type_use() and __unsigned_type_use() are much the same size as their names - so put the bodies in the only line that expands them. Similarly __signed_type() is defined separately for 64bit and then used exactly once just below. Change the test for __signed_type from CONFIG_64BIT to one based on gcc defined macros so that the code is valid if it gets used outside of a kernel build. Link: https://lkml.kernel.org/r/9386d1ebb8974fbabbed2635160c3975@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. 
Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 2bbdd5b5e07e..eaaf5c008e4d 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -46,10 +46,8 @@ * comparison, and these expressions only need to be careful to not cause * warnings for pointer use. */ -#define __signed_type_use(ux) (2 + __is_nonneg(ux)) -#define __unsigned_type_use(ux) (1 + 2 * (sizeof(ux) < 4)) #define __sign_use(ux) (is_signed_type(typeof(ux)) ? \ - __signed_type_use(ux) : __unsigned_type_use(ux)) + (2 + __is_nonneg(ux)) : (1 + 2 * (sizeof(ux) < 4))) /* * Check whether a signed value is always non-negative. @@ -57,7 +55,7 @@ * A cast is needed to avoid any warnings from values that aren't signed * integer types (in which case the result doesn't matter). * - * On 64-bit any integer or pointer type can safely be cast to 'long'. + * On 64-bit any integer or pointer type can safely be cast to 'long long'. * But on 32-bit we need to avoid warnings about casting pointers to integers * of different sizes without truncating 64-bit values so 'long' or 'long long' * must be used depending on the size of the value. @@ -66,12 +64,12 @@ * them, but we do not use s128 types in the kernel (we do use 'u128', * but they are handled by the !is_signed_type() case). 
*/ -#ifdef CONFIG_64BIT - #define __signed_type(ux) long +#if __SIZEOF_POINTER__ == __SIZEOF_LONG_LONG__ +#define __is_nonneg(ux) statically_true((long long)(ux) >= 0) #else - #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)) +#define __is_nonneg(ux) statically_true( \ + (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)))(ux) >= 0) #endif -#define __is_nonneg(ux) statically_true((__signed_type(ux))(ux) >= 0) #define __types_ok(ux, uy) \ (__sign_use(ux) & __sign_use(uy)) -- cgit v1.2.3 From f0ef073e213a3a1a494b6f55a76ce1242600a453 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 21:04:54 +0800 Subject: include/linux/lz4.h: add some missing macros Currently, LZ4_DISTANCE_MAX and LZ4_DECOMPRESS_INPLACE_MARGIN are defined in the erofs subsystem for LZ4 in-place decompression, which is somewhat unsuitable since they should belong to the LZ4 itself and may change with future LZ4 codebase updates. Move them to include/linux/lz4.h to match the upstream LZ4 library [1]. No logic changes. 
[1] https://github.com/lz4/lz4/blob/v1.10.0/lib/lz4.h#L670 Link: https://lkml.kernel.org/r/20250114130454.1191150-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Cc: Yann Collet Cc: Nick Terrell Cc: Chao Yu Cc: Yue Hu Cc: Jeffle Xu Cc: Sandeep Dhavale Signed-off-by: Andrew Morton --- include/linux/lz4.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index b16e15b9587a..ad6042a718b5 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -645,4 +645,10 @@ int LZ4_decompress_safe_usingDict(const char *source, char *dest, int LZ4_decompress_fast_usingDict(const char *source, char *dest, int originalSize, const char *dictStart, int dictSize); +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + #endif -- cgit v1.2.3 From 6beaa75cd24d660e7913c60aff702ec809ff9b28 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 01:20:49 +0000 Subject: kdb: Remove unused flags stack kdb_restore_flags() and kdb_save_flags() were added in 2010 by commit 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") but have remained unused. Remove them, and their associated storage. Signed-off-by: Dr.
David Alan Gilbert Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20250112012049.319515-1-linux@treblig.org Signed-off-by: Daniel Thompson (RISCstar) --- include/linux/kdb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index f6c2ddb16b95..905a2e2f45f6 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -140,9 +140,6 @@ extern const char *kdb_diemsg; extern unsigned int kdb_flags; /* Global flags, see kdb_state for per cpu state */ -extern void kdb_save_flags(void); -extern void kdb_restore_flags(void); - #define KDB_FLAG(flag) (kdb_flags & KDB_FLAG_##flag) #define KDB_FLAG_SET(flag) ((void)(kdb_flags |= KDB_FLAG_##flag)) #define KDB_FLAG_CLEAR(flag) ((void)(kdb_flags &= ~KDB_FLAG_##flag)) -- cgit v1.2.3 From 40733e7e0c260d540447d3646e451274bc5d3374 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Dec 2024 19:46:31 +0800 Subject: mm/swap_cgroup: remove swap_cgroup_cmpxchg This function is never used after commit 6b611388b626 ("memcg-v1: remove charge move code"). 
Link: https://lkml.kernel.org/r/20241218114633.85196-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Yosry Ahmed Reviewed-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: Chris Li Cc: Barry Song Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index ae73a87775b3..d521ad1c4164 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,8 +6,6 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -- cgit v1.2.3 From 6769183166b33b1a5de8f938d1ff4d5f4be0f428 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Dec 2024 19:46:33 +0800 Subject: mm/swap_cgroup: decouple swap cgroup recording and clearing The current implementation of swap cgroup tracking is a bit complex and fragile: On charging path, swap_cgroup_record always records an actual memcg id, and it depends on the caller to make sure all entries passed in must belong to one single folio. As folios are always charged or uncharged as a whole, and always charged and uncharged in order, swap_cgroup doesn't need an extra lock. On uncharging path, swap_cgroup_record always sets the record to zero. These entries won't be charged again until uncharging is done. So there is no extra lock needed either. Worth noting that swap cgroup clearing may happen without folio involved, eg. exiting processes will zap its page table without swapin. The xchg/cmpxchg provides atomic operations and barriers to ensure no tearing or synchronization issue of these swap cgroup records. It works but quite error-prone. 
Things can be much clearer and more robust by decoupling recording and clearing into two helpers. Recording takes the actual folio being charged as an argument, and clearing always sets the record to zero; the debug sanity checks are also refined to better reflect their usage. Benchmark even showed a very slight improvement as it saved some extra arguments and lookups: make -j96 with defconfig on tmpfs in 1.5G memory cgroup using 4k folios: Before: sys 9617.23 (stdev 37.764062) After : sys 9541.54 (stdev 42.973976) make -j96 with defconfig on tmpfs in 2G memory cgroup using 64k folios: Before: sys 7358.98 (stdev 54.927593) After : sys 7337.82 (stdev 39.398956) Link: https://lkml.kernel.org/r/20241218114633.85196-5-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Barry Song Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index d521ad1c4164..b5ec038069da 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,8 +6,8 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents); +extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -15,8 +15,12 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +{ +} + +static inline +unsigned short
swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { return 0; } -- cgit v1.2.3 From 04f13d241b8b146b23038bffd907cb8278391d07 Mon Sep 17 00:00:00 2001 From: yangge Date: Sat, 11 Jan 2025 15:58:20 +0800 Subject: mm: replace free hugepage folios after migration My machine has 4 NUMA nodes, each equipped with 32GB of memory. I have configured each NUMA node with 16GB of CMA and 16GB of in-use hugetlb pages. The allocation of contiguous memory via cma_alloc() can fail probabilistically. When there are free hugetlb folios in the hugetlb pool, during the migration of in-use hugetlb folios, new folios are allocated from the free hugetlb pool. After the migration is completed, the old folios are released back to the free hugetlb pool instead of being returned to the buddy system. This can cause test_pages_isolated() check to fail, ultimately leading to the failure of cma_alloc(). Call trace: cma_alloc() __alloc_contig_migrate_range() // migrate in-use hugepage test_pages_isolated() __test_page_isolated_in_pageblock() PageBuddy(page) // check if the page is in buddy To address this issue, we introduce a function named replace_free_hugepage_folios(). This function will replace the hugepage in the free hugepage pool with a new one and release the old one to the buddy system. After the migration of in-use hugetlb pages is completed, we will invoke replace_free_hugepage_folios() to ensure that these hugepages are properly released to the buddy system. Following this step, when test_pages_isolated() is executed for inspection, it will successfully pass. Additionally, when alloc_contig_range() is used to migrate multiple in-use hugetlb pages, it can result in some in-use hugetlb pages being released back to the free hugetlb pool and subsequently being reallocated and used again. For example: [huge 0] [huge 1] To migrate huge 0, we obtain huge x from the pool. After the migration is completed, we return the now-freed huge 0 back to the pool. 
When it's time to migrate huge 1, we can simply reuse the now-freed huge 0 from the pool. As a result, when replace_free_hugepage_folios() is executed, it cannot release huge 0 back to the buddy system. To address this issue, we should prevent the reuse of isolated free hugepages during the migration process. Link: https://lkml.kernel.org/r/1734503588-16254-1-git-send-email-yangge1116@126.com Link: https://lkml.kernel.org/r/1736582300-11364-1-git-send-email-yangge1116@126.com Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ae4fe8615bb6..10faf42ca96a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -681,6 +681,7 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, @@ -1059,6 +1060,12 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } +static inline int replace_free_hugepage_folios(unsigned long start_pfn, + unsigned long end_pfn) +{ + return 0; +} + static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -- cgit v1.2.3 From 44d46b76c3a4b514a0cc9dab147ed430e5c1d699 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 20 Dec 2024 16:07:09 -0500 Subject: mm: add build-time option for hotplug memory default online type Memory hotplug presently auto-onlines memory into a zone the kernel deems appropriate if CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y. 
The memhp_default_state boot param enables runtime config, but it's not possible to do this at build-time. Remove CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE, and replace it with CONFIG_MHP_DEFAULT_ONLINE_TYPE_* choices that sync with the boot param. Selections: CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE => mhp_default_online_type = "offline" Memory will not be onlined automatically. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO => mhp_default_online_type = "online" Memory will be onlined automatically in a zone deemed appropriate by the kernel. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL => mhp_default_online_type = "online_kernel" Memory will be onlined automatically. The zone may allow kernel data (e.g. ZONE_NORMAL). CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE => mhp_default_online_type = "online_movable" Memory will be onlined automatically. The zone will be ZONE_MOVABLE. Default to CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE to match the existing default CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=n behavior. Existing users of CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y should use CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO. [gourry@gourry.net: update KConfig comments] Link: https://lkml.kernel.org/r/20241226182918.648799-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20241220210709.300066-1-gourry@gourry.net Signed-off-by: Gregory Price Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huacai Chen Cc: Jonathan Corbet Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b27ddce5d324..eaac5ae8c05c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -144,8 +144,6 @@ extern u64 max_mem_size; extern int mhp_online_type_from_str(const char *str); -/* Default online_type (MMOP_*) when new memory blocks are added.
*/ -extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) @@ -303,6 +301,9 @@ static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(int online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); -- cgit v1.2.3 From 5fcf5fa61218176acf198d9e63fb5739dd147244 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:20 +0800 Subject: mm: pgtable: add statistics for P4D level page table Like other levels of page tables, add statistics for P4D level page table. Link: https://lkml.kernel.org/r/d55fe3c286305aae84457da9e1066df99b3de125.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index e7c54b9aac6d..2e56a9634a97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3175,6 +3175,22 @@ static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) 
lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + +static inline void pagetable_p4d_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From db6b435d731a8d82c38e558175db55466cb5832a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:23 +0800 Subject: mm: pgtable: introduce pagetable_dtor() The pagetable_p*_dtor() are exactly the same except for the handling of ptlock. If we make ptlock_free() handle the case where ptdesc->ptl is NULL and remove VM_BUG_ON_PAGE() from pmd_ptlock_free(), we can unify pagetable_p*_dtor() into one function. Let's introduce pagetable_dtor() to do this. Later, pagetable_dtor() will be moved to tlb_remove_ptdesc(), so that ptlock and page table pages can be freed together (regardless of whether RCU is used). This prevents the use-after-free problem where the ptlock is freed immediately but the page table pages is freed later via RCU. 
Link: https://lkml.kernel.org/r/47f44fff9dc68d9d9e9a0d6c036df275f820598a.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Acked-by: Alexander Gordeev [s390] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 52 +++++++++------------------------------------------- 1 file changed, 9 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e56a9634a97..a3b2263f1c1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2992,6 +2992,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline void pagetable_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3003,15 +3012,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) return true; } -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) @@ -3088,14 
+3088,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); -#endif - ptlock_free(ptdesc); -} - #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -3106,7 +3098,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3131,15 +3122,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) return true; } -static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - pmd_ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. 
The VM should not be @@ -3167,14 +3149,6 @@ static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3183,14 +3157,6 @@ static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline void pagetable_p4d_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 553e77529fb61e5520b839a0ce412a46cba996e0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:33 +0800 Subject: mm: pgtable: introduce generic pagetable_dtor_free() The pte_free(), pmd_free(), __pud_free() and __p4d_free() in asm-generic/pgalloc.h and the generic __tlb_remove_table() are basically the same, so let's introduce pagetable_dtor_free() to deduplicate them. In addition, the pagetable_dtor_free() in s390 does the same thing, so let's s390 also calls generic pagetable_dtor_free(). 
Link: https://lkml.kernel.org/r/1663a0565aca881d1338ceb7d1db4aa9c333abd6.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Acked-by: Alexander Gordeev [s390] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a3b2263f1c1a..15a903d59d09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3001,6 +3001,12 @@ static inline void pagetable_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); -- cgit v1.2.3 From 30cee1e4861b59200aa09c94a4d789c461e5f408 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Mon, 30 Dec 2024 15:40:43 +0530 Subject: lib/list_debug.c: add object information in case of invalid object As of now, during linked list corruption, it prints the culprit address and its wrong value, but sometimes that is not enough to catch the actual issue point. If it prints the allocation and free paths of the corrupted node, it will be a lot easier to find and fix the issues.
Adding the same information when data mismatch is found in link list debug data: [ 14.243055] slab kmalloc-32 start ffff0000cda19320 data offset 32 pointer offset 8 size 32 allocated at add_to_list+0x28/0xb0 [ 14.245259] __kmalloc_cache_noprof+0x1c4/0x358 [ 14.245572] add_to_list+0x28/0xb0 ... [ 14.248632] do_el0_svc_compat+0x1c/0x34 [ 14.249018] el0_svc_compat+0x2c/0x80 [ 14.249244] Free path: [ 14.249410] kfree+0x24c/0x2f0 [ 14.249724] do_force_corruption+0xbc/0x100 ... [ 14.252266] el0_svc_common.constprop.0+0x40/0xe0 [ 14.252540] do_el0_svc_compat+0x1c/0x34 [ 14.252763] el0_svc_compat+0x2c/0x80 [ 14.253071] ------------[ cut here ]------------ [ 14.253303] list_del corruption. next->prev should be ffff0000cda192a8, but was 6b6b6b6b6b6b6b6b. (next=ffff0000cda19348) [ 14.254255] WARNING: CPU: 3 PID: 84 at lib/list_debug.c:65 __list_del_entry_valid_or_report+0x158/0x164 Moved prototype of mem_dump_obj() to bug.h, as mm.h can not be included in bug.h. Link: https://lkml.kernel.org/r/20241230101043.53773-1-maninder1.s@samsung.com Signed-off-by: Maninder Singh Acked-by: Jan Kara Cc: Al Viro Cc: Christian Brauner Cc: Marco Elver Cc: Rohit Thapliyal Signed-off-by: Andrew Morton --- include/linux/bug.h | 10 +++++++++- include/linux/mm.h | 6 ------ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bug.h b/include/linux/bug.h index 348acf2558f3..a9948a9f1093 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + /* * Since detected data corruption should stop operation on the affected * structures. Return value must be checked and sanely acted on by caller. */ static inline __must_check bool check_data_corruption(bool v) { return v; } -#define CHECK_DATA_CORRUPTION(condition, fmt, ...) 
\ +#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) \ check_data_corruption(({ \ bool corruption = unlikely(condition); \ if (corruption) { \ + if (addr) \ + mem_dump_obj(addr); \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 15a903d59d09..c550912a5d6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4084,12 +4084,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From b0d66d82fce60161de6f3d57f87016c3a6f7a121 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 3 Jan 2025 19:35:35 +0000 Subject: mm/debug: introduce VM_WARN_ON_VMG() to dump VMA merge state Patch series "mm/debug: introduce and use VM_WARN_ON_VMG()". We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set, during VMA merge operations to ensure state is as expected. However, when syzkaller or the like encounters these asserts, often the information provided by the report is insufficient to narrow down what the problem is. We noticed this recently in [0], where a non-repro issue resisted debugging due to simply not having sufficient information to go on. This series improves the situation by providing VM_WARN_ON_VMG() which acts like VM_WARN_ON() (i.e. only actually being invoked if CONFIG_DEBUG_VM is set), while dumping significant information about the VMA merge state, the mm_struct describing the virtual address space, all associated VMAs and, if CONFIG_DEBUG_VM_MAPLE_TREE is set, the associated maple tree. 
[0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/ This patch (of 2): We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set, during VMA merge operations to ensure state is as expected. However, when syzkaller or the like encounters these asserts, often the information provided by the report is insufficient to narrow down what the problem is. This might not be so much of an issue if the reported problem is reproducible, but if it is a rarely encountered race or some other case which precludes a repro, it is a very big problem (see [0] for the motivating case). It is therefore sensible to provide a means by which we can easily and conveniently dump a lot more information in these circumstances. The aggregation of merge state into a single struct threaded through the operation makes this trivial - we can simply introduce a variant on VM_WARN_ON() which takes the VMA merge state object (vmg) and use that to dump information. This patch therefore introduces VM_WARN_ON_VMG() which provides this functionality. It additionally dumps full mm state, VMA state for each of the three VMAs the vmg contains (prev, next, vma) and if CONFIG_DEBUG_VM_MAPLE_TREE is enabled, dumps the maple tree from the provided VMA iterator if non-NULL. This patch has no functional impact if CONFIG_DEBUG_VM is not set. [0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/ Link: https://lkml.kernel.org/r/cover.1735932169.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/13b09b52d4d103ee86acaf0ae612539648ae29e0.1735932169.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7cb1e5ecbda..a0a3894900ed 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,10 +9,12 @@ struct page; struct vm_area_struct; struct mm_struct; struct vma_iterator; +struct vma_merge_struct; void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason); void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM @@ -87,6 +89,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_VMG(cond, vmg) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_vmg(vmg, "VM_WARN_ON_VMG(" __stringify(cond)")"); \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -104,9 +115,10 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_VMG(cond, vmg) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) 
BUILD_BUG_ON_INVALID(cond) -#endif +#endif /* CONFIG_DEBUG_VM */ #ifdef CONFIG_DEBUG_VM_IRQSOFF #define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) -- cgit v1.2.3 From 11e2400b21a3e2dfbc95e31a9a849a30191f7a92 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:10 +0000 Subject: mm: move common part of pagetable_*_ctor to helper Patch series "Account page tables at all levels". This series should be considered in conjunction with Qi's series [1]. Together, they ensure that page table ctor/dtor are called at all levels (PTE to PGD) and all architectures, where page tables are regular pages. Besides the improvement in accounting and general cleanup, this also create a single place where construction/destruction hooks can be called for all page tables, namely the now-generic pagetable_dtor() introduced by Qi, and __pagetable_ctor() introduced in this series. [1] https://lore.kernel.org/linux-mm/cover.1735549103.git.zhengqi.arch@bytedance.com/ This patch (of 6): pagetable_*_ctor all have the same basic implementation. Move the common part to a helper to reduce duplication. 
Link: https://lkml.kernel.org/r/20250103184415.2744423-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20250103184415.2744423-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c550912a5d6d..2949b58fd633 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2992,6 +2992,14 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline void __pagetable_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3009,12 +3017,9 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } @@ -3118,13 +3123,10 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!pmd_ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); ptdesc_pmd_pts_init(ptdesc); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } @@ -3149,18 
+3151,12 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); -- cgit v1.2.3 From d95936a2267c11a38917d5fc7bf3862a64fe13d8 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:15 +0000 Subject: mm: introduce ctor/dtor at PGD level Following on from the introduction of P4D-level ctor/dtor, let's finish the job and introduce ctor/dtor at PGD level. The incurred improvement in page accounting is minimal - the main motivation is to create a single, generic place where construction/destruction hooks can be added for all page table pages. This patch should cover all architectures and all configurations where PGDs are one or more regular pages. This excludes any configuration where PGDs are allocated from a kmem_cache object. 
Link: https://lkml.kernel.org/r/20250103184415.2744423-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Ingo Molnar Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2949b58fd633..3550cbeed488 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3159,6 +3159,11 @@ static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) __pagetable_ctor(ptdesc); } +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 42b7491af14cbba2393329ce43d508a957bd94fa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Jan 2025 09:43:53 -0800 Subject: mm/damon/core: introduce damon_call() Introduce a new DAMON core API function, damon_call(). It aims to replace some damon_callback usages that access damon_ctx of ongoing kdamond with additional synchronizations. It receives a function pointer, let the parallel kdamond invokes the function, and returns after the invocation is finished, or canceled due to some races. kdamond invokes the function inside the main loop after sampling is done. If it is deactivated by DAMOS watermarks or already out of the main loop, mark the request as canceled so that damon_call() can wakeup and return. 
Link: https://lkml.kernel.org/r/20250103174400.54890-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index a67f2c4940e9..ac2d42a50751 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -552,6 +552,27 @@ struct damon_callback { void (*before_terminate)(struct damon_ctx *context); }; +/* + * struct damon_call_control - Control damon_call(). + * + * @fn: Function to be called back. + * @data: Data that will be passed to @fn. + * @return_code: Return code from @fn invocation. + * + * Control damon_call(), which requests specific kdamond to invoke a given + * function. Refer to damon_call() for more details. + */ +struct damon_call_control { + int (*fn)(void *data); + void *data; + int return_code; +/* private: internal use only */ + /* informs if the kdamond finished handling of the request */ + struct completion completion; + /* informs if the kdamond canceled @fn infocation */ + bool canceled; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. 
* @@ -632,6 +653,9 @@ struct damon_ctx { /* for scheme quotas prioritization */ unsigned long *regions_score_histogram; + struct damon_call_control *call_control; + struct mutex call_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -779,6 +803,8 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); -- cgit v1.2.3 From bf0eaba0ff9c9c8e6fd58ddfa1a8b6df4b813f61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Jan 2025 09:43:57 -0800 Subject: mm/damon/core: implement damos_walk() Introduce a new core layer interface, damos_walk(). It aims to replace some damon_callback usages that access DAMOS schemes applied regions of ongoing kdamond with additional synchronizations. It receives a function pointer and asks kdamond to invoke it for any region that it tried to apply any DAMOS action within one scheme apply interval for every scheme of it. The function further waits until the kdamond finishes the invocations for every scheme, or cancels the request, and returns. The kdamond invokes the function as requested within the main loop. If it is deactivated by DAMOS watermarks or going out of the main loop, it marks the request as canceled, so that damos_walk() can wakeup and return. 
Link: https://lkml.kernel.org/r/20250103174400.54890-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ac2d42a50751..2889de3526c3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -352,6 +352,31 @@ struct damos_filter { struct list_head list; }; +struct damon_ctx; +struct damos; + +/** + * struct damos_walk_control - Control damos_walk(). + * + * @walk_fn: Function to be called back for each region. + * @data: Data that will be passed to walk functions. + * + * Control damos_walk(), which requests specific kdamond to invoke the given + * function to each region that eligible to apply actions of the kdamond's + * schemes. Refer to damos_walk() for more details. + */ +struct damos_walk_control { + void (*walk_fn)(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s); + void *data; +/* private: internal use only */ + /* informs if the kdamond finished handling of the walk request */ + struct completion completion; + /* informs if the walk is canceled. */ + bool canceled; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -415,6 +440,8 @@ struct damos { * @action */ unsigned long next_apply_sis; + /* informs if ongoing DAMOS walk for this scheme is finished */ + bool walk_completed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; @@ -442,8 +469,6 @@ enum damon_ops_id { NR_DAMON_OPS, }; -struct damon_ctx; - /** * struct damon_operations - Monitoring operations for given use cases. 
* @@ -656,6 +681,9 @@ struct damon_ctx { struct damon_call_control *call_control; struct mutex call_control_lock; + struct damos_walk_control *walk_control; + struct mutex walk_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -804,6 +832,7 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); -- cgit v1.2.3 From 626ffabe67c2359f3a88bb61fdc83a6280ef16e9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:46 -0800 Subject: mm/damon: clarify trying vs applying on damos_stat kernel-doc comment Patch series "mm/damon: enable page level properties based monitoring". TL; DR ====== This patch series enables access monitoring based on page level properties including their anonymousness, belonging cgroups and young-ness, by extending DAMOS stats and regions walk features with region-internal DAMOS filters. Background ========== DAMOS was initially developed for only access-aware system operations. But, efficient access monitoring results querying is yet another major usage of today's DAMOS. DAMOS stats and regions walk, which exposes accumulated counts and per-region monitoring results that are filtered by DAMOS parameters including target access pattern, quotas and DAMOS filters, are the key features for that usage. For tunings and investigations, it can be more useful if only the information can be exposed without making real system operational change. Special DAMOS action, DAMOS_STAT, was introduced for the purpose. DAMOS fundamentally works with only access pattern information in region granularity.
For some use cases, fixed and fine granularity information based on non access pattern properties can be useful, though. For example, on systems having swap devices that are much faster than storage devices for files, DAMOS-based proactive reclaim needs to be applied differently for anonymous pages and file-backed pages. DAMOS filters is a feature that makes it possible. It supports non access pattern information including page level properties such as anonymousness, belonging cgroups, and young-ness (whether the page has been accessed since the last access check of it). The information can be useful for tuning and investigations. DAMOS stat exposes some of it via {nr,sz}_applied, but it is mixed with operation failures. Also, exposing the information without making system operation change is impossible, since DAMOS_STAT simply ignores the page level properties based DAMOS filters. Design ====== Expose the exact information for every DAMOS action including DAMOS_STAT by implementing below changes. Extend the interface for DAMON operations set layer, which contains the implementation of the page level filters, to report back the amount of memory that passed the region-internal DAMOS filters to the core layer. On the core layer, account the operations set layer reported stat with DAMOS stat for per-scheme monitoring. Also, pass the information to regions walk for per-region monitoring. In this way, DAMON API users can efficiently get the fine-grained information. For the user-space, make DAMON sysfs interface collect the information using the updated DAMON core API, and expose those to new per-scheme stats file and per-DAMOS-tried region properties file. Practical Usages ================ With this patch series, DAMON users can query how many bytes of regions of specific access temperature are backed by pages of specific type. The type can be any DAMOS filter-supported one, including anonymousness, belonging cgroups, and young-ness.
For example, users can visualize access hotness-based page granularity histogram for different cgroups, backing content type, or youngness. In future, it could be extended to more types such as whether it is THP, position on LRU lists, etc. This can be useful for estimating benefits of a new or an existing access-aware system optimization without really committing the changes. Patches Sequence ================ The patches are constructed in four sub-sequences. First three patches (patches 1-3) update documents to have missing background knowledge and better structures for easily introducing followup changes. Following three patches (patches 4-6) change the operations set layer interface to report back the region-internal filter passed memory size, and make the operations set implementations support the changed semantics. Following five patches (patches 7-11) implement per-scheme accumulated stat for region-internal filter-passed memory size on core API (damos_stat) and DAMON sysfs interface. First two patches of those are for code change, and following three patches are for documentation. Finally, five patches (patches 12-16) implementing per-region region-internal filter-passed memory size follow. Similar to that for per-scheme stat, first two patches implement core-API and sysfs interface change. Then three patches for documentation update follow. This patch (of 16): DAMOS stat kernel-doc documentation is using terms that are a bit ambiguous. Without reading the code, understanding it correctly is not that easy. Add the clarification on the kernel-doc comment.
Link: https://lkml.kernel.org/r/20250106193401.109161-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250106193401.109161-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2889de3526c3..b85eae388f5b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -287,6 +287,23 @@ struct damos_watermarks { * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * + * "Tried an action to a region" in this context means the DAMOS core logic + * determined the region as eligible to apply the action. The access pattern + * (&struct damos_access_pattern), quotas (&struct damos_quota), watermarks + * (&struct damos_watermarks) and filters (&struct damos_filter) that handled + * on core logic can affect this. The core logic asks the operation set + * (&struct damon_operations) to apply the action to the region. + * + * "Applied an action to a region" in this context means the operation set + * (&struct damon_operations) successfully applied the action to the region, at + * least to a part of the region. The filters (&struct damos_filter) that + * handled on operation set layer and type of the action and pages of the + * region can affect this. For example, if a filter is set to exclude + * anonymous pages and the region has only anonymous pages, the region will be + * failed at applying the action. If the action is &DAMOS_PAGEOUT and all + * pages of the region are already paged out, the region will be failed at + * applying the action. 
*/ struct damos_stat { unsigned long nr_tried; -- cgit v1.2.3 From b5bbe9c08fd1519f96832b82256543a567ce2900 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:49 -0800 Subject: mm/damon: ask apply_scheme() to report filter-passed region-internal bytes Some DAMOS filter types including those for young page, anon page, and belonging memcg are handled by underlying DAMON operations set implementation, via damon_operations->apply_scheme() interface. How many bytes of the region have passed the filter can be useful for DAMOS scheme tuning and access pattern monitoring. Modify the interface to let the callback implementation reports back the number if possible. Link: https://lkml.kernel.org/r/20250106193401.109161-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index b85eae388f5b..da003173210f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -529,7 +529,8 @@ enum damon_ops_id { * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region and return bytes of the region that the action is successfully - * applied. + * applied. It should also report how many bytes of the region has passed + * filters (&struct damos_filter) that handled by itself. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. 
@@ -546,7 +547,7 @@ struct damon_operations { struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From 60fa9355a6c620f7b727d3fdb433fb6cf714a9b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:52 -0800 Subject: mm/damon/core: implement per-scheme ops-handled filter-passed bytes stat Implement a new per-DAMOS scheme statistic field, namely sz_ops_filter_passed, using the changed damon_operations->apply_scheme() interface. It counts total bytes of memory that given DAMOS action tried to be applied, and passed the operations layer handled region-internal filters of the scheme. DAMON API users can access it using DAMON-internal safe access features such as damon_call() and/or damos_walk(). Link: https://lkml.kernel.org/r/20250106193401.109161-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index da003173210f..2a93dbe06ecc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -286,6 +286,8 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @sz_ops_filter_passed: + * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. 
* * "Tried an action to a region" in this context means the DAMOS core logic @@ -310,6 +312,7 @@ struct damos_stat { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; -- cgit v1.2.3 From cfc33a7d2daca4455ef3ebae63a2e89bd9bb0ebe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:57 -0800 Subject: mm/damon/core: pass per-region filter-passed bytes to damos_walk_control->walk_fn() Total size of memory that passed DAMON operations set layer-handled DAMOS filters per scheme is provided to DAMON core API and ABI (sysfs interface) users. Having it per-region in non-accumulated way can provide it in finer granularity. Provide it to damos_walk() core API users, by passing the data to damos_walk_control->walk_fn(). Link: https://lkml.kernel.org/r/20250106193401.109161-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2a93dbe06ecc..298b1a831e62 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -388,7 +388,7 @@ struct damos; struct damos_walk_control { void (*walk_fn)(void *data, struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *s); + struct damos *s, unsigned long sz_filter_passed); void *data; /* private: internal use only */ /* informs if the kdamond finished handling of the walk request */ -- cgit v1.2.3 From 63db8170bf34ce9e0763f87d993cf9b4c9002b09 Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Mon, 6 Jan 2025 04:06:59 -0800 Subject: mm/fake-numa: allow later numa node hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current fake-numa implementation prevents new Numa nodes to be later hot-plugged by drivers. 
A common symptom of this limitation is the "node was absent from the node_possible_map" message by associated warning in mm/memory_hotplug.c: add_memory_resource(). This comes from the lack of remapping in both pxm_to_node_map[] and node_to_pxm_map[] tables to take fake-numa nodes into account and thus triggers collisions with original and physical nodes only-mapping that had been determined from BIOS tables. This patch fixes this by doing the necessary node-ids translation in both pxm_to_node_map[]/node_to_pxm_map[] tables. node_distance[] table has also been fixed accordingly. Details: When trying to use fake-numa feature on our system where new Numa nodes are being "hot-plugged" upon driver load, this fails with the following type of message and warning with stack : node 8 was absent from the node_possible_map WARNING: CPU: 61 PID: 4259 at mm/memory_hotplug.c:1506 add_memory_resource+0x3dc/0x418 This issue prevents the use of the fake-NUMA debug feature with the system's full configuration, when it has proven to be sometimes extremely useful for performance testing of multi-tasked, memory-bound applications, as it enables better isolation of processes/ranks compared to fat NUMA nodes. 
Usual numactl output after driver has “hot-plugged”/unveiled some new Numa nodes with and without memory : $ numactl --hardware available: 9 nodes (0-8) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 490037 MB node 0 free: 484432 MB node 1 cpus: node 1 size: 97280 MB node 1 free: 97279 MB node 2 cpus: node 2 size: 0 MB node 2 free: 0 MB node 3 cpus: node 3 size: 0 MB node 3 free: 0 MB node 4 cpus: node 4 size: 0 MB node 4 free: 0 MB node 5 cpus: node 5 size: 0 MB node 5 free: 0 MB node 6 cpus: node 6 size: 0 MB node 6 free: 0 MB node 7 cpus: node 7 size: 0 MB node 7 free: 0 MB node 8 cpus: node 8 size: 0 MB node 8 free: 0 MB node distances: node 0 1 2 3 4 5 6 7 8 0: 10 80 80 80 80 80 80 80 80 1: 80 10 255 255 255 255 255 255 255 2: 80 255 10 255 255 255 255 255 255 3: 80 255 255 10 255 255 255 255 255 4: 80 255 255 255 10 255 255 255 255 5: 80 255 255 255 255 10 255 255 255 6: 80 255 255 255 255 255 10 255 255 7: 80 255 255 255 255 255 255 10 255 8: 80 255 255 255 255 255 255 255 10 With recent M.Rapoport set of fake-numa patches in mm-everything and using numa=fake=4 boot parameter : $ numactl --hardware available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 122518 MB node 0 free: 117141 MB node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 1 size: 219911 MB node 1 free: 219751 MB node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 2 size: 122599 MB node 2 free: 122541 MB node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 3 size: 122479 MB node 3 free: 122408 MB node distances: node 0 1 2 3 0: 10 10 10 10 1: 10 10 10 10 2: 10 10 10 10 3: 10 10 10 10 With recent M.Rapoport set of fake-numa patches in mm-everything, this patch on top, using numa=fake=4 boot parameter : # numactl —hardware available: 12 nodes (0-11) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 122518 MB node 0 free: 116429 MB node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 1 size: 122631 MB node 1 free: 122576 MB node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 2 size: 122599 MB node 2 free: 122544 MB node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 3 size: 122479 MB node 3 free: 122419 MB node 4 cpus: node 4 size: 97280 MB node 4 free: 97279 MB node 5 cpus: node 5 size: 0 MB node 5 free: 0 MB node 6 cpus: node 6 size: 0 MB node 6 free: 0 MB node 7 cpus: node 7 size: 0 MB node 7 free: 0 MB node 8 cpus: node 8 size: 0 MB node 8 free: 0 MB node 9 cpus: node 9 size: 0 MB node 
9 free: 0 MB node 10 cpus: node 10 size: 0 MB node 10 free: 0 MB node 11 cpus: node 11 size: 0 MB node 11 free: 0 MB node distances: node 0 1 2 3 4 5 6 7 8 9 10 11 0: 10 10 10 10 80 80 80 80 80 80 80 80 1: 10 10 10 10 80 80 80 80 80 80 80 80 2: 10 10 10 10 80 80 80 80 80 80 80 80 3: 10 10 10 10 80 80 80 80 80 80 80 80 4: 80 80 80 80 10 255 255 255 255 255 255 255 5: 80 80 80 80 255 10 255 255 255 255 255 255 6: 80 80 80 80 255 255 10 255 255 255 255 255 7: 80 80 80 80 255 255 255 10 255 255 255 255 8: 80 80 80 80 255 255 255 255 10 255 255 255 9: 80 80 80 80 255 255 255 255 255 10 255 255 10: 80 80 80 80 255 255 255 255 255 255 10 255 11: 80 80 80 80 255 255 255 255 255 255 255 10 Link: https://lkml.kernel.org/r/20250106120659.359610-2-bfaccini@nvidia.com Signed-off-by: Bruno Faccini Cc: David Hildenbrand Cc: John Hubbard Cc: Mike Rapoport (Microsoft) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index cfad6ce7e1bd..dd85613cdd86 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi); int __init numa_memblks_init(int (*init_func)(void), bool memblock_force_top_down); +extern int numa_distance_cnt; + #ifdef CONFIG_NUMA_EMU +extern int emu_nid_to_phys[MAX_NUMNODES]; int numa_emu_cmdline(char *str); void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids); -- cgit v1.2.3 From b2aad24b53333f1904a55d97e3fde2246ef05bb6 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Mon, 6 Jan 2025 10:11:25 +0800 Subject: mm/memmap: prevent double scanning of memmap by kmemleak kmemleak explicitly scans the mem_map through the valid struct page objects. However, memmap_alloc() was also adding this memory to the gray object list, causing it to be scanned twice. 
Remove memmap_alloc() from the scan list and add a comment to clarify the behavior. Link: https://lore.kernel.org/lkml/CAOm6qn=FVeTpH54wGDFMHuCOeYtvoTx30ktnv9-w3Nh8RMofEA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250106021126.1678334-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Reviewed-by: Catalin Marinas Cc: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 673d5cae7c81..d48b56c1e558 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -378,6 +378,10 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 +/* + * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies + * MEMBLOCK_ALLOC_ACCESSIBLE + */ #define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ -- cgit v1.2.3 From 30cef82bc6e8975a360ec05b707f7fb194c875ed Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Jan 2025 15:39:58 -0500 Subject: mm/hugetlb: rename avoid_reserve to cow_from_owner The old name "avoid_reserve" can be too generic and can be used wrongly in the new call sites that want to allocate a hugetlb folio. It's confusing on two things: (1) whether one can opt-in to avoid global reservation, and (2) whether it should take more than one count. In reality, this flag is only used in an extremely hacky path, in an extremely hacky way in hugetlb CoW path only, and always use with 1 saying "skip global reservation". Rename the flag to avoid future abuse of this flag, making it a boolean so as to reflect its true representation that it's not a counter. To make it even harder to abuse, add a comment above the function to explain it. 
Link: https://lkml.kernel.org/r/20250107204002.2683356-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Oscar Salvador Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 10faf42ca96a..49ec2362ce92 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -683,7 +683,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); @@ -1068,7 +1068,7 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn, static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } -- cgit v1.2.3 From c8b979530f27f90c0353a189b2faa6e50a0ea94a Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:37 -0500 Subject: mm: alloc_pages_bulk_noprof: drop page_list argument Patch series "mm: alloc_pages_bulk: small API refactor", v2. Today, alloc_pages_bulk_noprof() supports two arguments to return allocated pages: a linked list and an array. There are also higher level APIs for both. However, the linked list API has apparently never been used. So, this series removes it along with the list API and also refactors the remaining API naming for consistency. 
This patch (of 2): commit 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") added __alloc_pages_bulk() along with the page_list argument. The next commit 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the bulk page allocator") added the array-based argument. As it turns out, the page_list argument has no users in the current tree (if it ever had any). Dropping it allows for a slight simplification and eliminates some unnecessary checks, now that page_array is required. Also, note that the removal of the page_list argument was proposed before in the thread below, where Matthew Wilcox mentions that: """ Iterating a linked list is _expensive_. It is about 10x quicker to iterate an array than a linked list. """ (https://lore.kernel.org/linux-mm/20231025093254.xvomlctwhcuerzky@techsingularity.net) Link: https://lkml.kernel.org/r/cover.1734991165.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/f1c75db91d08cafd211eca6a3b199b629d4ffe16.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c96d5d7f7b89..f8b33c5e7a14 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -212,7 +212,6 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array); #define __alloc_pages_bulk(...) 
alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) @@ -223,11 +222,8 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - #define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, @@ -236,7 +232,7 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_array_node(...) \ -- cgit v1.2.3 From 6bf9b5b40af373690313f64a3935b2bf2e5d46d9 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:38 -0500 Subject: mm: alloc_pages_bulk: rename API The previous commit removed the page_list argument from alloc_pages_bulk_noprof() along with the alloc_pages_bulk_list() function. 
Now that only the *_array() flavour of the API remains, we can do the following renaming (along with the _noprof() ones): alloc_pages_bulk_array -> alloc_pages_bulk alloc_pages_bulk_array_mempolicy -> alloc_pages_bulk_mempolicy alloc_pages_bulk_array_node -> alloc_pages_bulk_node Link: https://lkml.kernel.org/r/275a3bbc0be20fbe9002297d60045e67ab3d4ada.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- include/linux/gfp.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8b33c5e7a14..6bb1a5a7a4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -215,18 +215,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -#define alloc_pages_bulk_array_mempolicy(...) \ - alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_mempolicy(...) 
\ + alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ +#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long -alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, +alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) @@ -235,8 +235,8 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } -#define alloc_pages_bulk_array_node(...) \ - alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_node(...) \ + alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { -- cgit v1.2.3 From e20f52e8e3b7947e40bd40c6cdc69884c6df716c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:17 -0800 Subject: mm/damon: fixup damos_filter kernel-doc Patch series "mm/damon: extend DAMOS filters for inclusion", v2. DAMOS filters are exclusive filters. They only exclude memory of given criteria from the DAMOS action targets. This has below limitations. First, the name is not explicitly explaining the behavior. This actually resulted in users' confusion[1]. Secondly, combined uses of multiple filters provide only restricted coverage. For example, building a DAMOS scheme that applies the action to memory that belongs to cgroup A "or" cgroup B is impossible. A workaround would be using two schemes that filter out memory that does not belong to cgroup A and cgroup B, respectively. It is cumbersome, and difficult to control quota-like per-scheme features in an orchestration. Monitoring of filters-passed memory statistic will also be complicated.
Extend DAMOS filters to support not only exclusion (rejecting), but also inclusion (allowing) behavior. For this, add a new damos_filter struct field called 'allow' for DAMON kernel API users. The filter works as an inclusion or exclusion filter when it is set or unset, respectively. For DAMON user-space ABI users, add a DAMON sysfs file of same name under DAMOS filter sysfs directory. To prevent exposing a behavioral change to old users, set rejecting as the default behavior. Note that allow-filters work for only inclusion, not exclusion of memory that does not satisfy the criteria. And the default behavior of DAMOS for memory that no filter has involved is that the action can be applied to that memory. Also, filters-passed memory statistics are for any memory that passed through the DAMOS filters check stage. This implies installing allow-filters at the end of the filter list is useless. Refer to the design doc change of this series for more details. [1] https://lore.kernel.org/20240320165619.71478-1-sj@kernel.org This patch (of 10): The comment is slightly wrong. DAMOS filters are not only for pages, but general bytes of memory. Also the description of 'matching' is a bit confusing, since DAMOS filters do only filtering out. Update the comments to be less confusing. Link: https://lkml.kernel.org/r/20250109175126.57878-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250109175126.57878-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 298b1a831e62..72afba74ac6d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -347,8 +347,8 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. - * @type: Type of the page. - * @matching: If the matching page should filtered out or in.
+ * @type: Type of the target memory. + * @matching: If the @type-matching memory should be filtered out. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -357,9 +357,10 @@ enum damos_filter_type { * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each - * page of the region matches to this and avoid applying the action if so. - * Support of each filter type depends on the running &struct damon_operations - * and the type. Refer to &enum damos_filter_type for more detai. + * byte of the region matches to this given condition and avoid applying the + * action if so. Support of each filter type depends on the running &struct + * damon_operations and the type. Refer to &enum damos_filter_type for more + * details. */ struct damos_filter { enum damos_filter_type type; -- cgit v1.2.3 From fe6d7fdd62491524d11433b9ff8d3db5dde32700 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:18 -0800 Subject: mm/damon/core: add damos_filter->allow field DAMOS filters work as only exclusive (reject) filters. This makes it easy to be confused, and restrictive at combining multiple filters for covering various types of memory. Add a field named 'allow' to damos_filter. The field will be used to indicate whether the filter should work for inclusion or exclusion. To keep the old behavior, set it as 'false' (work as exclusive filter) by default, from damos_new_filter(). Following two commits will make the core and operations set layers, which handles damos_filter objects, respect the field, respectively. 
Link: https://lkml.kernel.org/r/20250109175126.57878-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 72afba74ac6d..8a2d104df5a3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -348,7 +348,8 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. * @type: Type of the target memory. - * @matching: If the @type-matching memory should be filtered out. + * @matching: Whether this is for @type-matching memory. + * @allow: Whether to include or exclude the @matching memory. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -365,6 +366,7 @@ enum damos_filter_type { struct damos_filter { enum damos_filter_type type; bool matching; + bool allow; union { unsigned short memcg_id; struct damon_addr_range addr_range; -- cgit v1.2.3 From e2fbfedad03401a38b8c3b7fd52d8fdcd039d0bc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:21 -0800 Subject: mm/damon: add 'allow' argument to damos_new_filter() DAMON API users should set damos_filter->allow manually to use a DAMOS allow-filter, since damos_new_filter() unsets the field always. It is cumbersome and easy to mistake. Add an argument for setting the field to damos_new_filter().
Link: https://lkml.kernel.org/r/20250109175126.57878-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 8a2d104df5a3..0834d7ffcb84 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -801,7 +801,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching); + bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); -- cgit v1.2.3 From 07438779313caafe52ac1a1a6958d735a5938988 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Dec 2024 13:16:38 -0800 Subject: alloc_tag: avoid current->alloc_tag manipulations when profiling is disabled When memory allocation profiling is disabled there is no need to update current->alloc_tag and these manipulations add unnecessary overhead. Fix the overhead by skipping these extra updates. I ran comprehensive testing on Pixel 6 on Big, Medium and Little cores: Overhead before fixes Overhead after fixes slab alloc page alloc slab alloc page alloc Big 6.21% 5.32% 3.31% 4.93% Medium 4.51% 5.05% 3.79% 4.39% Little 7.62% 1.82% 6.68% 1.02% This is an allocation microbenchmark doing allocations in a tight loop. Not a really realistic scenario and useful only to make performance comparisons. 
Link: https://lkml.kernel.org/r/20241226211639.1357704-1-surenb@google.com Fixes: b951aaff5035 ("mm: enable page allocation tagging") Signed-off-by: Suren Baghdasaryan Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 0bbbe537c5f9..a946e0203e6d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} #define alloc_hooks_tag(_tag, _do_alloc) \ ({ \ - struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \ - typeof(_do_alloc) _res = _do_alloc; \ - alloc_tag_restore(_tag, _old); \ + typeof(_do_alloc) _res; \ + if (mem_alloc_profiling_enabled()) { \ + struct alloc_tag * __maybe_unused _old; \ + _old = alloc_tag_save(_tag); \ + _res = _do_alloc; \ + alloc_tag_restore(_tag, _old); \ + } else \ + _res = _do_alloc; \ _res; \ }) -- cgit v1.2.3 From 7277433096f6ce4a84a1620529ac4ba3e1041ee1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:22 +0800 Subject: mm, swap: remove old allocation path for HDD We are currently using different swap allocation algorithm for HDD and non-HDD. This leads to the existence of a different set of locks, and the code path is heavily bloated, causing difficulties for further optimization and maintenance. This commit removes all HDD swap allocation and related dead code, and uses the cluster allocation algorithm instead. The performance may drop temporarily, but this should be negligible: The main advantage of the legacy HDD allocation algorithm is that it tends to use continuous slots, but swap device gets fragmented quickly anyway, and the attempt to use continuous slots will fail easily. 
This commit also enables mTHP swap on HDD, which is expected to be beneficial, and following commits will adapt and optimize the cluster allocator for HDD. Link: https://lkml.kernel.org/r/20250113175732.48099-4-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Suggested-by: "Huang, Ying" Reviewed-by: Baoquan He Cc: Barry Song Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f3e0ac20c2e8..3a71198a6957 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -309,9 +309,6 @@ struct swap_info_struct { unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 27701521beb5897d6b97e2f8c20de41e74cbcb7b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:24 +0800 Subject: mm, swap: clean up device availability check Remove highest_bit and lowest_bit. After the HDD allocation path has been removed, the only purpose of these two fields is to determine whether the device is full or not, which can instead be determined by checking the inuse_pages. 
Link: https://lkml.kernel.org/r/20250113175732.48099-6-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3a71198a6957..c0d49dad7a4b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -305,8 +305,6 @@ struct swap_info_struct { struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ -- cgit v1.2.3 From b228386cf237e659cdf5d8037a19db0b0a06f6b5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:25 +0800 Subject: mm, swap: clean up plist removal and adding When the swap device is full (inuse_pages == pages), it should be removed from the allocation available plist. If any slot is freed, the swap device should be added back to the plist. Additionally, during swapon or swapoff, the swap device is forcefully added or removed. Currently, the condition (inuse_pages == pages) is checked after every counter update, then remove or add the device accordingly. This is serialized by si->lock. This commit decouples it from the protection of si->lock and reworked plist removal and adding, making it possible to get rid of the hard dependency on si->lock in allocation path in later commits. 
To achieve this, simply using another lock is not an optimal approach, as the overhead is observable for a hot counter, and may cause complex locking issues. Thus, this commit manages to make it a lock-free atomic operation, by embedding the plist state into the second highest bit of the atomic counter. Simply making the counter an atomic will not work, if the update and plist status check are not performed atomically, we may miss an addition or removal. With the embedded info we can update the counter and check the plist status with single atomic operations, and avoid any extra overheads: If the counter is full (inuse_pages == pages) and the off-list bit is unset, we attempt to remove it from the plist. If the counter is not full (inuse_pages != pages) and the off-list bit is set, we attempt to add it to the plist. Removing, adding and bit update is serialized with a lock, which is a cold path. Ordinary counter updates will be lock-free. Link: https://lkml.kernel.org/r/20250113175732.48099-7-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index c0d49dad7a4b..16dcf8bd1a4e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -306,7 +306,7 @@ struct swap_info_struct { /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ - unsigned int inuse_pages; /* number of those currently in use */ + atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct 
block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 9a0ddeb7988095a5c21994c37005a45b240039ef Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:26 +0800 Subject: mm, swap: hold a reference during scan and cleanup flag usage The flag SWP_SCANNING was used as an indicator of whether a device is being scanned for allocation, and prevents swapoff. Combined with SWP_WRITEOK, they work as a set of barriers for a clean swapoff: 1. Swapoff clears SWP_WRITEOK, allocation requests will see ~SWP_WRITEOK and abort as it's serialized by si->lock. 2. Swapoff unuses all allocated entries. 3. Swapoff waits for SWP_SCANNING flag to be cleared, so ongoing allocations will stop, preventing UAF. 4. Now swapoff can free everything safely. This will make the allocation path have a hard dependency on si->lock. Allocation always have to acquire si->lock first for setting SWP_SCANNING and checking SWP_WRITEOK. This commit removes this flag, and just uses the existing per-CPU refcount instead to prevent UAF in step 3, which serves well for such usage without dependency on si->lock, and scales very well too. Just hold a reference during the whole scan and allocation process. Swapoff will kill and wait for the counter. And for preventing any allocation from happening after step 1 so the unuse in step 2 can ensure all slots are free, swapoff will acquire the ci->lock of each cluster one by one to ensure all allocations see ~SWP_WRITEOK and abort. This way these dependences on si->lock are gone. And worth noting we can't kill the refcount as the first step for swapoff as the unuse process have to acquire the refcount. 
Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 16dcf8bd1a4e..1651174959c8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -219,7 +219,6 @@ enum { SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL -- cgit v1.2.3 From 3494d184706ff5e7d28481de0c841b039caa38b1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:27 +0800 Subject: mm, swap: use an enum to define all cluster flags and wrap flags changes Currently, we are only using flags to indicate which list the cluster is on. Using one bit for each list type might be a waste, as the list type grows, we will consume too many bits. Additionally, the current mixed usage of '&' and '==' is a bit confusing. Make it clean by using an enum to define all possible cluster statuses. Only an off-list cluster will have the NONE (0) flag. And use a wrapper to annotate and sanitize all flag settings and list movements. 
Link: https://lkml.kernel.org/r/20250113175732.48099-9-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1651174959c8..0e59cb158b15 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,10 +256,19 @@ struct swap_cluster_info { u8 order; struct list_head list; }; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; /* * The first page in the swap file is the swap header, which is always marked -- cgit v1.2.3 From 3b644773eefda88112d3ee5d57620f6e58fccfc6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:28 +0800 Subject: mm, swap: reduce contention on device lock Currently, swap locking is mainly composed of two locks: the cluster lock (ci->lock) and the device lock (si->lock). The cluster lock is much more fine-grained, so it is best to use ci->lock instead of si->lock as much as possible. We have cleaned up other hard dependencies on si->lock. Following the new cluster allocator design, most operations don't need to touch si->lock at all. 
In practice, we only need to take si->lock when moving clusters between lists. To achieve this, this commit reworks the locking pattern of all si->lock and ci->lock users, eliminates all usage of ci->lock inside si->lock, and introduces a new design to avoid touching si->lock unless needed. For minimal contention and easier understanding of the system, two ideas are introduced with the corresponding helpers: isolation and relocation. - Clusters will be `isolated` from the list when iterating the list to search for an allocatable cluster. This ensures other CPUs won't walk into the same cluster easily, and it releases si->lock after acquiring ci->lock, providing the only place that handles the inversion of two locks, and avoids contention. Iterating the cluster list almost always moves the cluster (free -> nonfull, nonfull -> frag, frag -> frag tail), but it doesn't know where the cluster should be moved to until scanning is done. So keeping the cluster off-list is a good option with low overhead. The off-list time window of a cluster is also minimal. In the worst case, one CPU will return the cluster after scanning the 512 entries on it, which we used to busy wait with a spin lock. This is done with the new helper `isolate_lock_cluster`. - Clusters will be `relocated` after allocation or freeing, according to their usage count and status. Allocations no longer hold si->lock now, and may drop ci->lock for reclaim, so the cluster could be moved to any location while no lock is held. Besides, isolation clears all flags when it takes the cluster off the list (the flags must be in sync with the list status, so cluster users don't need to touch si->lock for checking its list status). So the cluster has to be relocated to the right list according to its usage after allocation or freeing. Relocation is optional, if the cluster flags indicate it's already on the right list, it will skip touching the list or si->lock. 
This is done with `relocate_cluster` after allocation or with `[partial_]free_cluster` after freeing. This handles usage of all kinds of clusters in a clean way. Scanning and allocation by iterating the cluster list is handled by "isolate - scan/allocate - relocate". Scanning and allocation of per-CPU clusters will only involve "scan/allocate - relocate", as it knows which cluster to lock and use. Freeing will only involve "relocate". Each CPU will keep using its per-CPU cluster until the 512 entries are all consumed. Freeing also has to free 512 entries to trigger cluster movement in the best case, so si->lock is rarely touched. Testing with building the Linux kernel with defconfig showed huge improvement: time make -j96 / 768M memcg, 4K pages, 10G ZRAM, on Intel 8255C: Before: Sys time: 73578.30, Real time: 864.05 After: (-50.7% sys time, -44.8% real time) Sys time: 36227.49, Real time: 476.66 time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, on Intel 8255C: (avg of 4 test runs) Before: Sys time: 74044.85, Real time: 846.51 hugepages-64kB/stats/swpout: 1735216 hugepages-64kB/stats/swpout_fallback: 430333 After: (-40.4% sys time, -37.1% real time) Sys time: 44160.56, Real time: 532.07 hugepages-64kB/stats/swpout: 1786288 hugepages-64kB/stats/swpout_fallback: 243384 time make -j32 / 512M memcg, 4K pages, 5G ZRAM, on AMD 7K62: Before: Sys time: 8098.21, Real time: 401.3 After: (-22.6% sys time, -12.8% real time ) Sys time: 6265.02, Real time: 349.83 The allocation success rate also slightly improved as we sanitized the usage of clusters with new defined helpers, previously dropping si->lock or ci->lock during scan will cause cluster order shuffle.
Link: https://lkml.kernel.org/r/20250113175732.48099-10-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0e59cb158b15..5fe650beb77d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -290,6 +290,7 @@ enum swap_cluster_flags { * throughput. */ struct percpu_cluster { + local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -312,7 +313,7 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ -- cgit v1.2.3 From e3ae2dec849ba8bc5649c2d0507e02bd4379da71 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:29 +0800 Subject: mm, swap: simplify percpu cluster updating Instead of using a returning argument, we can simply store the next cluster offset to the fixed percpu location, which reduce the stack usage and simplify the function: Object size: ./scripts/bloat-o-meter mm/swapfile.o mm/swapfile.o.new add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-271 (-271) Function old new delta get_swap_pages 2847 2733 -114 alloc_swap_scan_cluster 894 737 -157 Total: Before=30833, After=30562, chg -0.88% Stack usage: Before: swapfile.c:1190:5:get_swap_pages 240 static After: 
swapfile.c:1185:5:get_swap_pages 216 static Link: https://lkml.kernel.org/r/20250113175732.48099-11-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5fe650beb77d..75b2b0166cb1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -274,9 +274,9 @@ enum swap_cluster_flags { * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as - * a sentinel to indicate next is not valid in percpu_cluster. + * a sentinel to indicate an entry is not valid. */ -#define SWAP_NEXT_INVALID 0 +#define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) -- cgit v1.2.3 From bae8a4ef3efb56bb7e83bafd3c0856845aeaf605 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:31 +0800 Subject: mm, swap: use a global swap cluster for non-rotation devices Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the goal of the SWAP allocator is to avoid contention for clusters. It uses a per-CPU cluster design, and each CPU will use a different cluster as much as possible. However, HDDs are very sensitive to fragmentation, contention is trivial in comparison. Therefore, we use one global cluster instead. This ensures that each order will be written to the same cluster as much as possible, which helps make the I/O more continuous. This ensures that the performance of the cluster allocator is as good as that of the old allocator. 
Tests after this commit compared to those before this series: Tested using 'make -j32' with tinyconfig, a 1G memcg limit, and HDD swap: make -j32 with tinyconfig, using 1G memcg limit and HDD swap: Before this series: 114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k 2901232inputs+0outputs (238877major+4227640minor)pagefaults After this commit: 113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k 2548728inputs+0outputs (235471major+4238110minor)pagefaults [ryncsn@gmail.com: check kmalloc() return in setup_clusters] Link: https://lkml.kernel.org/r/CAMgjq7Au+o04ckHyT=iU-wVx9az=t0B-ZiC5E0bDqNrAtNOP-g@mail.gmail.com Link: https://lkml.kernel.org/r/20250113175732.48099-13-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 75b2b0166cb1..a5f475335aea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -317,6 +317,8 @@ struct swap_info_struct { unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ + struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ -- cgit v1.2.3 From 4f79384a25d57a59e142009e52f40ae1f25102fe Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:32 +0800 Subject: mm, swap_slots: remove 
slot cache for freeing path The slot cache for freeing path is mostly for reducing the overhead of si->lock. As we have basically eliminated the si->lock usage for freeing path, it can be removed. This helps simplify the code, and avoids swap entries from being held in cache upon freeing. The delayed freeing of entries has been causing trouble for further optimizations for zswap [1] and in theory will also cause more fragmentation, and extra overhead. Test with build linux kernel showed both performance and fragmentation are better without the cache: time make -j96 / 768M memcg, 4K pages, 10G ZRAM, avg of 4 test runs: Before: Sys time: 36047.78, Real time: 472.43 After: (-7.6% sys time, -7.3% real time) Sys time: 33314.76, Real time: 437.67 time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, avg of 4 test runs: Before: Sys time: 46859.04, Real time: 562.63 hugepages-64kB/stats/swpout: 1783392 hugepages-64kB/stats/swpout_fallback: 240875 After: (-23.3% sys time, -21.3% real time) Sys time: 35958.87, Real time: 442.69 hugepages-64kB/stats/swpout: 1866267 hugepages-64kB/stats/swpout_fallback: 158330 Sequential SWAP should be also slightly faster, tests didn't show a measurable difference though, at least no regression: Swapin 4G zero page on ZRAM (time in us): Before (avg. 1923756) 1912391 1927023 1927957 1916527 1918263 1914284 1934753 1940813 1921791 After (avg.
1922290): 1919101 1925743 1916810 1917007 1923930 1935152 1917403 1923549 1921913 Link: https://lore.kernel.org/all/CAMgjq7ACohT_uerSz8E_994ZZCv709Zor+43hdmesW_59W1BWw@mail.gmail.com/[1] Link: https://lkml.kernel.org/r/20250113175732.48099-14-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 15adfb8c813a..840aec3523b2 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -16,15 +16,12 @@ struct swap_slots_cache { swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; /* protects slots_ret, n_ret */ - swp_entry_t *slots_ret; int n_ret; }; void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; -- cgit v1.2.3 From f8d4a6cabb74f82c37ccb7c5e9dc3fdad50393d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 2 Jan 2025 12:10:52 +0000 Subject: mm: make mmap_region() internal Now that we have removed the one user of mmap_region() outside of mm, make it internal and add it to vma.c so it can be userland tested. This ensures that all external memory mappings are performed using the appropriate interfaces and allows us to modify memory mapping logic as we see fit. Additionally expand test stubs to allow for the mmap_region() code to compile and be userland testable. Link: https://lkml.kernel.org/r/de5a3c574d35c26237edf20a1d8652d7305709c9.1735819274.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Cc: Jann Horn Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3550cbeed488..8483e09aeb2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3363,9 +3363,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, -- cgit v1.2.3 From c6f239796b55dbc4225a6fca9f96232092b9df83 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 2 Jan 2025 15:25:28 +0800 Subject: mm/memblock: add memblock_alloc_or_panic interface Before SLUB initialization, various subsystems used memblock_alloc to allocate memory. In most cases, when memory allocation fails, an immediate panic is required. To simplify this behavior and reduce repetitive checks, introduce `memblock_alloc_or_panic`. This function ensures that memory allocation failures result in a panic automatically, improving code readability and consistency across subsystems that require this behavior. 
[guoweikang.kernel@gmail.com: arch/s390: save_area_alloc default failure behavior changed to panic] Link: https://lkml.kernel.org/r/20250109033136.2845676-1-guoweikang.kernel@gmail.com Link: https://lore.kernel.org/lkml/Z2fknmnNtiZbCc7x@kernel.org/ Link: https://lkml.kernel.org/r/20250102072528.650926-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Acked-by: Geert Uytterhoeven [m68k] Reviewed-by: Alexander Gordeev [s390] Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- include/linux/memblock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d48b56c1e558..e79eb6ac516f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -421,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func); + +#define memblock_alloc_or_panic(size, align) \ + __memblock_alloc_or_panic(size, align, __func__) + static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { -- cgit v1.2.3 From 798c0330c2ca078cc3e155e567c77c4d61345a38 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:34 -0700 Subject: mm/mglru: rework aging feedback The aging feedback is based on both the number of generations and the distribution of folios in each generation. The number of generations is currently the distance between max_seq and anon min_seq. This is because anon min_seq is not allowed to move past file min_seq. The rationale for that is that file is always evictable whereas anon is not. However, for use cases where anon is a lot cheaper than file: 1. Anon in the second oldest generation can be a better choice than file in the oldest generation. 2. 
A large amount of file in the oldest generation can skew the distribution, making should_run_aging() return a false negative. Allow anon and file min_seq to move independently, and use solely the number of generations as the feedback for aging. Specifically, when both anon and file are evictable, anon min_seq can now be greater than file min_seq, and therefore the number of generations becomes the distance between max_seq and min(min_seq[0],min_seq[1]). And should_run_aging() returns true if and only if the number of generations is less than MAX_NR_GENS. As the first step to the final optimization, this change by itself should not have userspace-visible effects beyond performance. The next two patches will take advantage of this change; the last patch in this series will better distribute folios across MAX_NR_GENS. [yuzhao@google.com: restore behaviour for systems with swappiness == 200] Link: https://lkml.kernel.org/r/Z4S3-aJy5dj9tBTk@google.com Link: https://lkml.kernel.org/r/20241231043538.4075764-4-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: David Stevens Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..8245ecb0400b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -421,12 +421,11 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are - * stored in min_seq[] separately for anon and file types as clean file pages - * can be evicted regardless of swap constraints. - * - * Normally anon and file min_seq are in sync. But if swapping is constrained, - * e.g., out of swap space, file min_seq is allowed to advance and leave anon - * min_seq behind.
+ * stored in min_seq[] separately for anon and file types so that they can be + * incremented independently. Ideally min_seq[] are kept in sync when both anon + * and file types are evictable. However, to adapt to situations like extreme + * swappiness, they are allowed to be out of sync by at most + * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. @@ -446,8 +445,8 @@ struct lru_gen_folio { unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; - /* the first tier doesn't need protection, hence the minus one */ - unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can only be modified under the LRU lock */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; @@ -498,7 +497,7 @@ struct lru_gen_mm_walk { int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; - bool can_swap; + int swappiness; bool force_scan; }; -- cgit v1.2.3 From 4d5d14a01e2c9091b128fb46e1d07475e9a7bb72 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:37 -0700 Subject: mm/mglru: rework workingset protection With the aging feedback no longer considering the distribution of folios in each generation, rework workingset protection to better distribute folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and PG_referenced/LRU_REFS_FLAGS in a slightly different way. For folios accessed multiple times through file descriptors, make lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. 
After all its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily promoted into the second oldest generation in the eviction path. And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only valid when PG_referenced is set. For folios accessed multiple times through page tables, folio_update_gen() from a page table walk or lru_gen_set_refs() from a rmap walk sets PG_referenced after the accessed bit is cleared for the first time. Thereafter, those two paths set PG_workingset and promote folios to the youngest generation. Like folio_inc_gen(), when folio_update_gen() does that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not used. For both of the cases, after PG_workingset is set on a folio, it remains until this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It can be set again if lru_gen_test_recent() returns true upon a refault. When adding folios to the LRU lists, lru_gen_folio_seq() distributes them as follows: +---------------------------------+---------------------------------+ | Accessed thru page tables | Accessed thru file descriptors | +---------------------------------+---------------------------------+ | PG_active (set while isolated) | | +----------------+----------------+----------------+----------------+ | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | +---------------------------------+---------------------------------+ |<--------- MIN_NR_GENS --------->| | |<-------------------------- MAX_NR_GENS -------------------------->| After this patch, some typical client and server workloads showed improvements under heavy memory pressure. 
For example, Python TPC-C, which was used to benchmark a different approach [1] to better detect refault distances, showed a significant decrease in total refaults: Before After Change Time (seconds) 10801 10801 0% Executed (transactions) 41472 43663 +5% workingset_nodes 109070 120244 +10% workingset_refault_anon 5019627 7281831 +45% workingset_refault_file 1294678786 554855564 -57% workingset_refault_total 1299698413 562137395 -57% [1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/ Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Kairui Song Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: David Stevens Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 88 +++++++++++++++++++++++------------------------ include/linux/mmzone.h | 82 ++++++++++++++++++++++++++----------------- 2 files changed, 94 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 34e5097182a0..f9157a0c42a5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq) return seq % NR_HIST_GENS; } -static inline int lru_tier_from_refs(int refs) +static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); - /* see the comment in folio_lru_refs() */ - return order_base_2(refs + 1); + /* see the comment on MAX_NR_TIERS */ + return workingset ? 
MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); - bool workingset = flags & BIT(PG_workingset); + if (!(flags & BIT(PG_referenced))) + return 0; /* - * Return the number of accesses beyond PG_referenced, i.e., N-1 if the - * total number of accesses is N>1, since N=0,1 both map to the first - * tier. lru_tier_from_refs() will account for this off-by-one. Also see - * the comment on MAX_NR_TIERS. + * Return the total number of accesses including PG_referenced. Also see + * the comment on LRU_REFS_FLAGS. */ - return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -} - -static inline void folio_clear_lru_refs(struct folio *folio) -{ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) @@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, + bool reclaiming) +{ + int gen; + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + /* + * +-----------------------------------+-----------------------------------+ + * | Accessed through page tables and | Accessed through file descriptors | + * | promoted by folio_update_gen() | and protected by folio_inc_gen() | + * +-----------------------------------+-----------------------------------+ + * | PG_active (set while isolated) | | + * +-----------------+-----------------+-----------------+-----------------+ + * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | + * +-----------------------------------+-----------------------------------+ + * |<---------- MIN_NR_GENS ---------->| | + * |<---------------------------- 
MAX_NR_GENS ---------------------------->| + */ + if (folio_test_active(folio)) + gen = MIN_NR_GENS - folio_test_workingset(folio); + else if (reclaiming) + gen = MAX_NR_GENS; + else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = MIN_NR_GENS; + else + gen = MAX_NR_GENS - folio_test_workingset(folio); + + return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; - unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; - /* - * There are four common cases for this page: - * 1. If it's hot, i.e., freshly faulted in, add it to the youngest - * generation, and it's protected over the rest below. - * 2. If it can't be evicted immediately, i.e., a dirty page pending - * writeback, add it to the second youngest generation. - * 3. If it should be evicted first, e.g., cold and clean from - * folio_rotate_reclaimable(), add it to the oldest generation. - * 4. Everything else falls between 2 & 3 above and is added to the - * second oldest generation if it's considered inactive, or the - * oldest generation otherwise. See lru_gen_is_active(). 
- */ - if (folio_test_active(folio)) - seq = lrugen->max_seq; - else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || - (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->max_seq - 1; - else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) - seq = lrugen->min_seq[type]; - else - seq = lrugen->min_seq[type] + 1; + seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - mask = LRU_GEN_MASK; - /* - * Don't clear PG_workingset here because it can affect PSI accounting - * if the activation is due to workingset refault. - */ - if (folio_test_active(folio)) - mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); - set_mask_bits(&folio->flags, mask, flags); + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8245ecb0400b..9540b41894da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -332,66 +332,88 @@ enum lruvec_flags { #endif /* !__GENERATING_BOUNDS_H */ /* - * Evictable pages are divided into multiple generations. The youngest and the + * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * - * A page is added to the youngest generation on faulting. 
The aging needs to - * check the accessed bit at least twice before handing this page over to the - * eviction. The first check takes care of the accessed bit set on the initial - * fault; the second check makes sure this page hasn't been used since then. - * This process, AKA second chance, requires a minimum of two generations, - * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive - * LRU, e.g., /proc/vmstat, these two generations are considered active; the - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). + * After a folio is faulted in, the aging needs to check the accessed bit at + * least twice before handing this folio over to the eviction. The first check + * clears the accessed bit from the initial fault; the second check makes sure + * this folio hasn't been used since then. This process, AKA second chance, + * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI + * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two + * generations are considered active; the rest of generations, if they exist, + * are considered inactive. See lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->folios[] so - * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). + * PG_active is always cleared while a folio is on one of lrugen->folios[] so + * that the sliding window needs not to worry about it. And it's set again when + * a folio considered active is isolated for non-reclaiming purposes, e.g., + * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. 
This requires order_base_2(MAX_NR_GENS+1) bits - in folio->flags. + * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* - * Each generation is divided into multiple tiers. A page accessed N times - * through file descriptors is in tier order_base_2(N). A page in the first tier - * (N=0,1) is marked by PG_referenced unless it was faulted in through page - * tables or read ahead. A page in any other tier (N>1) is marked by - * PG_referenced and PG_workingset. This implies a minimum of two tiers is - * supported without using additional bits in folio->flags. + * Each generation is divided into multiple tiers. A folio accessed N times + * through file descriptors is in tier order_base_2(N). A folio in the first + * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by + * PG_workingset. A folio in any other tier (1<N<5) between the first and last + * is marked by additional bits of LRU_REFS_WIDTH in folio->flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, - * comparisons of refaulted/(evicted+protected) from the first tier and the - * rest infer whether pages accessed multiple times through file descriptors - * are statistically hot and thus worth protecting. + * comparisons of refaulted/(evicted+protected) from the first tier and the rest + * infer whether folios accessed multiple times through file descriptors are + * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in - * folio->flags. + * folio->flags, masked by LRU_REFS_MASK.
*/ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H -struct lruvec; -struct page_vma_mapped_walk; - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* + * For folios accessed multiple times through file descriptors, + * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags + * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its + * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily + * promoted into the second oldest generation in the eviction path. And when + * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that + * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is + * only valid when PG_referenced is set. + * + * For folios accessed multiple times through page tables, folio_update_gen() + * from a page table walk or lru_gen_set_refs() from a rmap walk sets + * PG_referenced after the accessed bit is cleared for the first time. + * Thereafter, those two paths set PG_workingset and promote folios to the + * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears + * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. + * + * For both cases above, after PG_workingset is set on a folio, it remains until + * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It + * can be set again if lru_gen_test_recent() returns true upon a refault. 
+ */ +#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) + +struct lruvec; +struct page_vma_mapped_walk; + #ifdef CONFIG_LRU_GEN enum { @@ -406,8 +428,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) -- cgit v1.2.3 From d670c8e5302af8ccdf5f12242e58816420738bb5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Jan 2025 15:22:21 +0000 Subject: mm: remove PageTransTail() The last caller was removed in October. Also remove the FALSE definition of PageTransCompoundMap(); the normal definition was removed a few years ago. Link: https://lkml.kernel.org/r/20250109152245.1591914-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Acked-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 691506bdf2c5..330929b6e062 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -894,21 +894,9 @@ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } - -/* - * PageTransTail returns true for both transparent huge pages - * and hugetlbfs pages, so it should only be called when it's known - * that hugetlbfs pages aren't involved. 
- */ -static inline int PageTransTail(const struct page *page) -{ - return PageTail(page); -} #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) -TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) -TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -- cgit v1.2.3 From d783cc5913f17b2b5d9c51cb0904860ec97ed44d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Jan 2025 10:52:32 -0800 Subject: mm/damon: explain "effective quota" on kernel-doc comment The kernel-doc comment for 'struct damos_quota' describes how "effective quota" is calculated, but does not explain what it is. Actually there was an input[1] about it. Add the explanation on the comment. Also, fix a trivial typo on the comment block: s/empt/empty/ [1] https://github.com/damonitor/damo/issues/17#issuecomment-2497525043 Link: https://lkml.kernel.org/r/20250110185232.54907-6-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Honggyu Kim Cc: Yunjeong Mun Cc: Honggyu Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0834d7ffcb84..af525252b853 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -193,11 +193,16 @@ struct damos_quota_goal { * size quota is set, DAMON tries to apply the action only up to &sz bytes * within &reset_interval. * - * Internally, the time quota is transformed to a size quota using estimated - * throughput of the scheme's action. DAMON then compares it against &sz and - * uses smaller one as the effective quota. + * To convince the different types of quotas and goals, DAMON internally + * converts those into one single size quota called "effective quota". DAMON + * internally uses it as the only one real quota. The conversion is made as + * follows. 
* - * If @goals is not empt, DAMON calculates yet another size quota based on the + * The time quota is transformed to a size quota using estimated throughput of + * the scheme's action. DAMON then compares it against &sz and uses smaller + * one as the effective quota. + * + * If @goals is not empty, DAMON calculates yet another size quota based on the * goals using its internal feedback loop algorithm, for every @reset_interval. * Then, if the new size quota is smaller than the effective quota, it uses the * new size quota as the effective quota. -- cgit v1.2.3 From 3ab76c767bc783c122a8dfe105fbc10a0b029b42 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 10 Jan 2025 17:40:34 +0800 Subject: ksm: add ksm involvement information for each process In /proc/<pid>/ksm_stat, add two extra ksm involvement items including KSM_mergeable and KSM_merge_any. It helps administrators to better know the system's KSM behavior at process level. ksm_merge_any: yes/no whether the process's mm is added by prctl() into the candidate list of KSM or not, and fully enabled at process level. ksm_mergeable: yes/no whether any VMAs of the process's mm are currently applicable to KSM. Purpose ======= These two items are just to improve the observability of KSM at process level, so that users can know if a certain process has enabled KSM. For example, without these two items, when we look at /proc/<pid>/ksm_stat and there's no merging pages found, we are not sure whether it is because KSM was not enabled or because KSM did not successfully merge any pages. Although "mg" in /proc/<pid>/smaps indicates VM_MERGEABLE, it's opaque and not very obvious for non-professionals.
[akpm@linux-foundation.org: wording tweaks, per David and akpm] Link: https://lkml.kernel.org/r/20250110174034304QOb8eDoqtFkp3_t8mqnqc@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Tested-by: Mario Casquero Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 6a53ac4885bb..d73095b5cd96 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); +bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -- cgit v1.2.3 From 8d91fed83cc12306cbb63efa6c473ffee117977a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:06 +0100 Subject: mm/huge_memory: convert has_hwpoisoned into a pure folio flag Patch series "mm: hugetlb+THP folio and migration cleanups", v2. Some cleanups around more folio conversion and migration handling that I collected working on random stuff. This patch (of 6): Let's stop setting it on pages, there is no need to anymore. Link: https://lkml.kernel.org/r/20250113131611.2554758-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Baolin Wang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 330929b6e062..616b57ddc3fe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -906,11 +906,9 @@ TESTPAGEFLAG_FALSE(TransCompound, transcompound) * * This flag is set by hwpoison handler. Cleared by THP split or free page. 
*/ -PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) - TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else -PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) +FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* -- cgit v1.2.3 From 4c640f128074e0d4459ecf072595a44df5c2ae18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:07 +0100 Subject: mm/hugetlb: rename isolate_hugetlb() to folio_isolate_hugetlb() Let's make the function name match "folio_isolate_lru()", and add some kernel doc. Link: https://lkml.kernel.org/r/20250113131611.2554758-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Baolin Wang Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 49ec2362ce92..c95ad5cd7894 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,7 +153,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_hugetlb(struct folio *folio, struct list_head *list); +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } -- cgit v1.2.3 From b235448e8cab7eea17d164efc7bf55505985ba65 Mon Sep 17 
00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:09 +0100 Subject: mm/hugetlb: rename folio_putback_active_hugetlb() to folio_putback_hugetlb() Now that folio_putback_hugetlb() is only called on folios that were previously isolated through folio_isolate_hugetlb(), let's rename it to match folio_putback_lru(). Add some kernel doc to clarify how this function is supposed to be used. Link: https://lkml.kernel.org/r/20250113131611.2554758-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c95ad5cd7894..ec8c0ccc8f95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -157,7 +157,7 @@ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void folio_putback_active_hugetlb(struct folio *folio); +void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void folio_putback_active_hugetlb(struct folio *folio) +static inline void folio_putback_hugetlb(struct folio *folio) { } -- cgit v1.2.3 From cceba6f7e46c48deca433030d80fc34599fb9fd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:42 -0700 Subject: mm: add PG_dropbehind folio flag Add a folio flag that file IO can use to indicate that the cached IO being done should be dropped from the 
page cache upon completion. Link: https://lkml.kernel.org/r/20241220154831.1086649-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 616b57ddc3fe..36d283552f80 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -110,6 +110,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -562,6 +563,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) +FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not -- cgit v1.2.3 From 77d075221ae777296e2b18a0a4f5fea6f75daf2c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:43 -0700 Subject: mm/readahead: add readahead_control->dropbehind member If ractl->dropbehind is set to true, then folios created are marked as dropbehind as well. Link: https://lkml.kernel.org/r/20241220154831.1086649-6-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. 
Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fc2e1319c7bb..d53c49abead6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1358,6 +1358,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; bool _workingset; unsigned long _pflags; }; -- cgit v1.2.3 From b9f958d4f146bd11be33a5f2bc3ced50f86d6b23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. Link: https://lkml.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; -- cgit v1.2.3 From dddc559f2e7cff9c6525150cd29ef3a4f6692b26 Mon Sep 17 00:00:00 
2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:48 -0700 Subject: mm/filemap: add filemap_fdatawrite_range_kick() helper Works like filemap_fdatawrite_range(), except it's a non-integrity data writeback and hence only starts writeback on the specified range. Will help facilitate generically starting uncached writeback from generic_write_sync(), as header dependencies preclude doing this inline from fs.h. Link: https://lkml.kernel.org/r/20241220154831.1086649-11-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a838b5479a6..653b5efa3d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,6 +2878,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end); static inline int file_write_and_wait(struct file *file) { -- cgit v1.2.3 From 1d4457576570627e1702614bc060b55d95b85e39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:49 -0700 Subject: mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue When a buffered write submitted with IOCB_DONTCACHE has been successfully submitted, call filemap_fdatawrite_range_kick() to kick off the IO. File systems call generic_write_sync() for any successful buffered write submission, hence add the logic here rather than needing to modify the file system. 
Link: https://lkml.kernel.org/r/20241220154831.1086649-12-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 653b5efa3d3f..58a618853574 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2912,6 +2912,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; + } else if (iocb->ki_flags & IOCB_DONTCACHE) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, + iocb->ki_pos + count); } return count; -- cgit v1.2.3 From d94d23fdd7529f1f3218235d1e0a69e9856907b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:50 -0700 Subject: mm: add FGP_DONTCACHE folio creation flag Callers can pass this in for uncached folio creation, in which case if a folio is newly created it gets marked as uncached. If a folio exists for this index and lookup succeeds, then it will not get marked as uncached. If an !uncached lookup finds a cached folio, clear the flag. For that case, there are competing uncached and cached users of the folio, and it should not get pruned. Link: https://lkml.kernel.org/r/20241220154831.1086649-13-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A.
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d53c49abead6..47bfc6b1b632 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -710,6 +710,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) + * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. */ @@ -723,6 +724,7 @@ typedef unsigned int __bitwise fgf_t; #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) +#define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) -- cgit v1.2.3 From 3c7fd94205f86ad89f1d1d01dbfbc4b139860d8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 16 Jan 2025 10:27:30 -0800 Subject: seqlock: add missing parameter documentation for raw_seqcount_try_begin() Add missing documentation for raw_seqcount_try_begin() start parameter. 
Link: https://lkml.kernel.org/r/20250116182730.801497-1-surenb@google.com Fixes: dba4761a3e40 ("seqlock: add raw_seqcount_try_begin") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250116170522.23e884d5@canb.auug.org.au/ Signed-off-by: Suren Baghdasaryan Acked-by: Waiman Long Cc: Boqun Feng Cc: David Hildenbrand Cc: Ingo Molnar Cc: Liam Howlett Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 22c2c48b4265..b783a3a7ed62 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -322,6 +322,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) * raw_seqcount_try_begin() - begin a seqcount_t read critical section * w/o lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * @start: count to be passed to read_seqcount_retry() * * Similar to raw_seqcount_begin(), except it enables eliding the critical * section entirely if odd, instead of doing the speculation knowing it will -- cgit v1.2.3 From a145c848d69f9c6f32008d8319edaa133360dd74 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 8 Jan 2025 10:04:30 +0100 Subject: module: Extend the preempt disabled section in dereference_symbol_descriptor(). dereference_symbol_descriptor() needs to obtain the module pointer belonging to pointer in order to resolve that pointer. The returned mod pointer is obtained under RCU-sched/ preempt_disable() guarantees and needs to be used within this section to ensure that the module is not removed in the meantime. Extend the preempt_disable() section to also cover dereference_module_function_descriptor(). Fixes: 04b8eb7a4ccd9 ("symbol lookup: introduce dereference_symbol_descriptor()") Cc: James E.J. 
Bottomley Cc: Christophe Leroy Cc: Helge Deller Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Sergey Senozhatsky Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Reviewed-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250108090457.512198-2-bigeasy@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/kallsyms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index c3f075e8f60c..1c6a6c1704d8 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -57,10 +57,10 @@ static inline void *dereference_symbol_descriptor(void *ptr) preempt_disable(); mod = __module_address((unsigned long)ptr); - preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); + preempt_enable(); #endif return ptr; } -- cgit v1.2.3 From 38e3fe6595e1fa806c0450b2db666bc46325025e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 18:25:09 +0100 Subject: module: Handle 'struct module_version_attribute' as const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The structure is always read-only due to its placement in the read-only section __modver. Reflect this at its usage sites. Also prepare for the const handling of 'struct module_attribute' itself. 
Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241216-sysfs-const-attr-module-v1-2-3790b53e0abf@weissschuh.net Signed-off-by: Petr Pavlu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index b3a643435357..5001c166c74f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -275,7 +275,7 @@ extern typeof(name) __mod_device_table__##type##__##name \ #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ - static struct module_version_attribute __modver_attr \ + static const struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ -- cgit v1.2.3 From f3227ffda07470848abe3cfa2039b21816ce3090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 18:25:10 +0100 Subject: module: Constify 'struct module_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These structs are never modified, move them to read-only memory. This makes the API clearer and also prepares for the constification of 'struct attribute' itself. While at it, also constify 'modinfo_attrs_count'. 
Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241216-sysfs-const-attr-module-v1-3-3790b53e0abf@weissschuh.net Signed-off-by: Petr Pavlu --- include/linux/module.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 5001c166c74f..37eb5d88f6eb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -52,9 +52,9 @@ struct module_kobject { struct module_attribute { struct attribute attr; - ssize_t (*show)(struct module_attribute *, struct module_kobject *, + ssize_t (*show)(const struct module_attribute *, struct module_kobject *, char *); - ssize_t (*store)(struct module_attribute *, struct module_kobject *, + ssize_t (*store)(const struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); @@ -67,10 +67,10 @@ struct module_version_attribute { const char *version; }; -extern ssize_t __modver_version_show(struct module_attribute *, +extern ssize_t __modver_version_show(const struct module_attribute *, struct module_kobject *, char *); -extern struct module_attribute module_uevent; +extern const struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); -- cgit v1.2.3 From 819403c893551c5e93bf9087d334e01bcab5c6b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:36 +0100 Subject: fs/proc/vmcore: move vmcore definitions out of kcore.h These vmcore defines are not related to /proc/kcore, move them out. We'll move "struct vmcoredd_node" to vmcore.c, because it is only used internally. While "struct vmcore" is only used internally for now, we're planning on using it from inline functions in crash_dump.h next, so move it to crash_dump.h. 
While at it, rename "struct vmcore" to "struct vmcore_range", which is a more suitable name and will make the usage of it outside of vmcore.c clearer. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-6-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- include/linux/crash_dump.h | 7 +++++++ include/linux/kcore.h | 13 ------------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index acc55626afdc..788a45061f35 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -114,6 +114,13 @@ struct vmcore_cb { extern void register_vmcore_cb(struct vmcore_cb *cb); extern void unregister_vmcore_cb(struct vmcore_cb *cb); +struct vmcore_range { + struct list_head list; + unsigned long long paddr; + unsigned long long size; + loff_t offset; +}; + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 86c0f1d18998..9a2fa013c91d 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -20,19 +20,6 @@ struct kcore_list { int type; }; -struct vmcore { - struct list_head list; - unsigned long long paddr; - unsigned long long size; - loff_t offset; -}; - -struct vmcoredd_node { - struct list_head list; /* List of dumps */ - void *buf; /* Buffer containing device's dump */ - unsigned int size; /* Size of the buffer */ -}; - #ifdef CONFIG_PROC_KCORE void __init kclist_add(struct kcore_list *, void *, size_t, int type); -- cgit v1.2.3 From e017b1f4aa4eb887ee85fe13862206c0d31344b4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:37 +0100 Subject: fs/proc/vmcore: factor out allocating a vmcore range and adding it to a list Let's factor it out into include/linux/crash_dump.h, from where we can use it also outside of vmcore.c later. 
Acked-by: Baoquan He Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-7-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- include/linux/crash_dump.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 788a45061f35..9717912ce4d1 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -121,6 +121,20 @@ struct vmcore_range { loff_t offset; }; +/* Allocate a vmcore range and add it to the list. */ +static inline int vmcore_alloc_add_range(struct list_head *list, + unsigned long long paddr, unsigned long long size) +{ + struct vmcore_range *m = kzalloc(sizeof(*m), GFP_KERNEL); + + if (!m) + return -ENOMEM; + m->paddr = paddr; + m->size = size; + list_add_tail(&m->list, list); + return 0; +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ -- cgit v1.2.3 From e29e9acae06dc28ca8dbf3db976e09787e610dc8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:38 +0100 Subject: fs/proc/vmcore: factor out freeing a list of vmcore ranges Let's factor it out into include/linux/crash_dump.h, from where we can use it also outside of vmcore.c later. Acked-by: Baoquan He Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-8-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- include/linux/crash_dump.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 9717912ce4d1..5d61c7454fd6 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -135,6 +135,17 @@ static inline int vmcore_alloc_add_range(struct list_head *list, return 0; } +/* Free a list of vmcore ranges. 
*/ +static inline void vmcore_free_ranges(struct list_head *list) +{ + struct vmcore_range *m, *tmp; + + list_for_each_entry_safe(m, tmp, list, list) { + list_del(&m->list); + kfree(m); + } +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ -- cgit v1.2.3 From 7ad4d1f6e6ef967cd24c6275d8d4056045c019c1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:39 +0100 Subject: fs/proc/vmcore: introduce PROC_VMCORE_DEVICE_RAM to detect device RAM ranges in 2nd kernel s390 allocates+prepares the elfcore hdr in the dump (2nd) kernel, not in the crashed kernel. RAM provided by memory devices such as virtio-mem can only be detected using the device driver; when vmcore_init() is called, these device drivers are usually not loaded yet, or the devices did not get probed yet. Consequently, on s390 these RAM ranges will not be included in the crash dump, which makes the dump partially corrupt and is unfortunate. Instead of deferring the vmcore_init() call, to an (unclear?) later point, let's reuse the vmcore_cb infrastructure to obtain device RAM ranges as the device drivers probe the device and get access to this information. Then, we'll add these ranges to the vmcore, adding more PT_LOAD entries and updating the offsets+vmcore size. Use a separate Kconfig option to be set by an architecture to include this code only if the arch really needs it. Further, we'll make the config depend on the relevant drivers (i.e., virtio_mem) once they implement support (next). The alternative of having a PROVIDE_PROC_VMCORE_DEVICE_RAM config option was dropped for now for simplicity. The current target use case is s390, which only creates an elf64 elfcore, so focusing on elf64 is sufficient. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-9-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. 
Tsirkin --- include/linux/crash_dump.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5d61c7454fd6..2f2555e6407c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -20,6 +20,8 @@ extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size); extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); @@ -99,6 +101,12 @@ static inline void vmcore_unusable(void) * indicated in the vmcore instead. For example, a ballooned page * contains no data and reading from such a page will cause high * load in the hypervisor. + * @get_device_ram: query RAM ranges that can only be detected by device + * drivers, such as the virtio-mem driver, so they can be included in + * the crash dump on architectures that allocate the elfcore hdr in the dump + * ("2nd") kernel. Indicated RAM ranges may contain holes to reduce the + * total number of ranges; such holes can be detected using the pfn_is_ram + * callback just like for other RAM. * @next: List head to manage registered callbacks internally; initialized by * register_vmcore_cb(). 
* @@ -109,6 +117,7 @@ static inline void vmcore_unusable(void) */ struct vmcore_cb { bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); + int (*get_device_ram)(struct vmcore_cb *cb, struct list_head *list); struct list_head next; }; extern void register_vmcore_cb(struct vmcore_cb *cb); -- cgit v1.2.3 From a0ec4fb63f5ce15732f8dadc63c931bdf9ff98b5 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 27 Nov 2024 08:57:31 +0200 Subject: virtio_pci: Add support for PCIe Function Level Reset Implement support for Function Level Reset (FLR) in virtio_pci devices. This change adds reset_prepare and reset_done callbacks, allowing drivers to properly handle FLR operations. Without this patch, performing and recovering from an FLR is not possible for virtio_pci devices. This implementation ensures proper FLR handling and recovery for both physical and virtual functions. The device reset can be triggered in case of error or manually via sysfs: echo 1 > /sys/bus/pci/devices/$PCI_ADDR/reset Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Message-Id: <1732690652-3065-2-git-send-email-israelr@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index dd88682e27e3..4d16c13d0df5 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -190,6 +190,8 @@ int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); #endif void virtio_reset_device(struct virtio_device *dev); +int virtio_device_reset_prepare(struct virtio_device *dev); +int virtio_device_reset_done(struct virtio_device *dev); size_t virtio_max_dma_size(const struct virtio_device *vdev); @@ -214,6 +216,10 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); * changes; may be called in interrupt context. * @freeze: optional function to call during suspend/hibernation. 
* @restore: optional function to call on resume. + * @reset_prepare: optional function to call when a transport specific reset + * occurs. + * @reset_done: optional function to call after transport specific reset + * operation has finished. */ struct virtio_driver { struct device_driver driver; @@ -229,6 +235,8 @@ struct virtio_driver { void (*config_changed)(struct virtio_device *dev); int (*freeze)(struct virtio_device *dev); int (*restore)(struct virtio_device *dev); + int (*reset_prepare)(struct virtio_device *dev); + int (*reset_done)(struct virtio_device *dev); }; #define drv_to_virtio(__drv) container_of_const(__drv, struct virtio_driver, driver) -- cgit v1.2.3 From 2f0805d7c08bea71c95561bfb3e45d93b05196b9 Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Fri, 10 Jan 2025 18:05:24 +0800 Subject: ceph: streamline request head structures in MDS client The existence of the ceph_mds_request_head_old structure in the MDS client code is no longer required due to improvements in handling different MDS request header versions. This patch removes the now redundant ceph_mds_request_head_old structure and replaces its usage with the flexible and extensible ceph_mds_request_head structure. Changes include: - Modification of find_legacy_request_head to directly cast the pointer to ceph_mds_request_head_legacy without going through the old structure. - Update sizeof calculations in create_request_message to use offsetofend for consistency and future-proofing, rather than referencing the old structure. - Use of the structured ceph_mds_request_head directly instead of the old one. Additionally, this consolidation normalizes the handling of request_head_version v1 to align with versions v2 and v3, leading to a more consistent and maintainable codebase. These changes simplify the codebase and reduce potential confusion stemming from the existence of an obsolete structure. 
Signed-off-by: Liang Jie Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2d7d86f0290d..c7f2c63b3bc3 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -504,20 +504,6 @@ struct ceph_mds_request_head_legacy { #define CEPH_MDS_REQUEST_HEAD_VERSION 3 -struct ceph_mds_request_head_old { - __le16 version; /* struct version */ - __le64 oldest_client_tid; - __le32 mdsmap_epoch; /* on client */ - __le32 flags; /* CEPH_MDS_FLAG_* */ - __u8 num_retry, num_fwd; /* count retry, fwd attempts */ - __le16 num_releases; /* # include cap/lease release records */ - __le32 op; /* mds op code */ - __le32 caller_uid, caller_gid; - __le64 ino; /* use this ino for openc, mkdir, mknod, - etc. (if replaying) */ - union ceph_mds_request_args_ext args; -} __attribute__ ((packed)); - struct ceph_mds_request_head { __le16 version; /* struct version */ __le64 oldest_client_tid; -- cgit v1.2.3 From 09ebd028d6d70c7dc4b1c69212a18134ad2e0020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E8=87=B4=E9=82=A6=20=28XIE=20Zhibang=29?= Date: Thu, 23 Jan 2025 11:57:03 +0000 Subject: net: the appletalk subsystem no longer uses ndo_do_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ndo_do_ioctl is no longer used by the appletalk subsystem after commit 45bd1c5ba758 ("net: appletalk: Drop aarp_send_probe_phase1()"). 
Signed-off-by: 谢致邦 (XIE Zhibang) Reviewed-by: Simon Horman Link: https://patch.msgid.link/tencent_4AC6ED413FEA8116B4253D3ED6947FDBCF08@qq.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8da4c61f97b9..2a59034a5fa2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1085,8 +1085,8 @@ struct netdev_net_notifier { * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the - * appletalk and ieee802154 subsystems but is no longer called by - * the device ioctl handler. + * ieee802154 subsystem but is no longer called by the device + * ioctl handler. * * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: -- cgit v1.2.3 From 5be1fa8abd7b049f51e6e98e75a37eef5ae2c296 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 00:28:51 -0500 Subject: Pass parent directory inode and expected name to ->d_revalidate() ->d_revalidate() often needs to access dentry parent and name; that has to be done carefully, since the locking environment varies from caller to caller. We are not guaranteed that dentry in question will not be moved right under us - not unless the filesystem is such that nothing on it ever gets renamed. It can be dealt with, but that results in boilerplate code that isn't even needed - the callers normally have just found the dentry via dcache lookup and want to verify that it's in the right place; they already have the values of ->d_parent and ->d_name stable. There is a couple of exceptions (overlayfs and, to less extent, ecryptfs), but for the majority of calls that song and dance is not needed at all. 
It's easier to make ecryptfs and overlayfs find and pass those values if there's a ->d_revalidate() instance to be called, rather than doing that in the instances. This commit only changes the calling conventions; making use of supplied values is left to followups. NOTE: some instances need more than just the parent - things like CIFS may need to build an entire path from filesystem root, so they need more precautions than the usual boilerplate. This series doesn't do anything to that need - these filesystems have to keep their locking mechanisms (rename_lock loops, use of dentry_path_raw(), private rwsem a-la v9fs). One thing to keep in mind when using name is that name->name will normally point into the pathname being resolved; the filename in question occupies name->len bytes starting at name->name, and there is NUL somewhere after it, but the next byte might very well be '/' rather than '\0'. Do not ignore name->len. Reviewed-by: Jeff Layton Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Al Viro --- include/linux/dcache.h | 3 ++- include/linux/fscrypt.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8bc567a35718..4a6bdadf2f29 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -144,7 +144,8 @@ enum d_real_type { }; struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 772f822dc6b8..18855cb44b1c 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,7 +192,8 @@ struct fscrypt_operations { unsigned int *num_devs); }; -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int
flags); +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) @@ -711,8 +712,8 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir, return 0; } -static inline int fscrypt_d_revalidate(struct dentry *dentry, - unsigned int flags) +static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 1; } -- cgit v1.2.3 From ffeeaada2bddb88078f16ba24f24ce8651c22d5b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 01:27:11 -0500 Subject: nfs: fix ->d_revalidate() UAF on ->d_name accesses Pass the stable name all the way down to ->rpc_ops->lookup() instances. Note that passing &dentry->d_name is safe in e.g. nfs_lookup() - it *is* stable there, as it is in ->create() et.al. dget_parent() in nfs_instantiate() should be redundant - it'd better be stable there; if it's not, we have more trouble, since ->d_name would also be unsafe in such case. nfs_submount() and nfs4_submount() may or may not require fixes - if they ever get moved on server with fhandle preserved, we are in trouble there... UAF window is fairly narrow here and exfiltration requires the ability to watch the traffic. 
Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..08b62bbf59f0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1785,7 +1785,7 @@ struct nfs_rpc_ops { struct nfs_fattr *, struct inode *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); - int (*lookup) (struct inode *, struct dentry *, + int (*lookup) (struct inode *, struct dentry *, const struct qstr *, struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); -- cgit v1.2.3 From 30d61efe118cad1a73ad2ad66a3298e4abdf9f41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jan 2025 21:33:17 -0500 Subject: 9p: fix ->rename_sem exclusion 9p wants to be able to build a path from given dentry to fs root and keep it valid over a blocking operation. ->s_vfs_rename_mutex would be a natural candidate, but there are places where we need that and where we have no way to tell if ->s_vfs_rename_mutex is already held deeper in callchain. Moreover, it's only held for cross-directory renames; name changes within the same directory happen without it. Solution: * have d_move() done in ->rename() rather than in its caller * maintain a 9p-private rwsem (per-filesystem) * hold it exclusive over the relevant part of ->rename() * hold it shared over the places where we want the path. That almost works. FS_RENAME_DOES_D_MOVE is enough to put all d_move() and d_exchange() calls under filesystem's control. However, there's also __d_unalias(), which isn't covered by any of that. If ->lookup() hits a directory inode with preexisting dentry elsewhere (due to e.g. rename done on server behind our back), d_splice_alias() called by ->lookup() will move/rename that alias. 
Add a couple of optional methods, so that __d_unalias() would do if alias->d_op->d_unalias_trylock != NULL if (!alias->d_op->d_unalias_trylock(alias)) fail (resulting in -ESTALE from lookup) __d_move(...) if alias->d_op->d_unalias_unlock != NULL alias->d_op->d_unalias_unlock(alias) where it currently does __d_move(). 9p instances do down_write_trylock() and up_write() of ->rename_sem. Signed-off-by: Al Viro --- include/linux/dcache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4a6bdadf2f29..9a1a30857763 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -159,6 +159,8 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* -- cgit v1.2.3 From c1feab95e0b2e9fce7e4f4b2739baf40d84543af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Jan 2025 22:51:04 -0500 Subject: add a string-to-qstr constructor Quite a few places want to build a struct qstr by given string; it would be convenient to have a primitive doing that, rather than open-coding it via QSTR_INIT(). The closest approximation was in bcachefs, but that expands to initializer list - {.len = strlen(string), .name = string}. It would be more useful to have it as compound literal - (struct qstr){.len = strlen(string), .name = string}. Unlike initializer list it's a valid expression. What's more, it's a valid lvalue - it's an equivalent of anonymous local variable with such initializer, so the things like path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); are valid.
It can also be used as initializer, with identical effect - struct qstr x = (struct qstr){.name = s, .len = strlen(s)}; is equivalent to struct qstr anon_variable = {.name = s, .len = strlen(s)}; struct qstr x = anon_variable; // anon_variable is never used after that point and any even remotely sane compiler will manage to collapse that into struct qstr x = {.name = s, .len = strlen(s)}; What compound literals can't be used for is initialization of global variables, but those are covered by QSTR_INIT(). This commit lifts definition(s) of QSTR() into linux/dcache.h, converts it to compound literal (all bcachefs users are fine with that) and converts assorted open-coded instances to using that. Signed-off-by: Al Viro --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..3d53a6014591 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -57,6 +57,7 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } +#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; -- cgit v1.2.3 From 3775fc538f535a7c5adaf11990c7932a0bd1f9eb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 28 Jan 2025 20:24:41 +0100 Subject: PM: sleep: core: Synchronize runtime PM status of parents and children Commit 6e176bf8d461 ("PM: sleep: core: Do not skip callbacks in the resume phase") overlooked the case in which the parent of a device with DPM_FLAG_SMART_SUSPEND set did not use that flag and could be runtime-suspended before a transition into a system-wide sleep state.
In that case, if the child is resumed during the subsequent transition from that state into the working state, its runtime PM status will be set to RPM_ACTIVE, but the runtime PM status of the parent will not be updated accordingly, even though the parent will be resumed too, because of the dev_pm_skip_suspend() check in device_resume_noirq(). Address this problem by tracking the need to set the runtime PM status to RPM_ACTIVE during system-wide resume transitions for devices with DPM_FLAG_SMART_SUSPEND set and all of the devices depended on by them. Fixes: 6e176bf8d461 ("PM: sleep: core: Do not skip callbacks in the resume phase") Closes: https://lore.kernel.org/linux-pm/Z30p2Etwf3F2AUvD@hovoldconsulting.com/ Reported-by: Johan Hovold Tested-by: Manivannan Sadhasivam Signed-off-by: Rafael J. Wysocki Reviewed-by: Johan Hovold Tested-by: Johan Hovold Link: https://patch.msgid.link/12619233.O9o76ZdvQC@rjwysocki.net --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 0627a795892b..0d2597a76dfc 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -679,6 +679,7 @@ struct dev_pm_info { bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ + bool set_active:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ #else bool should_wakeup:1; -- cgit v1.2.3 From fe6628608627424fb4a6d4c8d2235822457c5d9c Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 28 Jan 2025 20:04:13 +0530 Subject: block: get rid of request queue ->sysfs_dir_lock The request queue uses ->sysfs_dir_lock for protecting the addition/ deletion of kobject entries under sysfs while we register/unregister blk-mq. However kobject addition/deletion is already protected with kernfs/sysfs internal synchronization primitives. So use of q->sysfs_ dir_lock seems redundant. 
Moreover, q->sysfs_dir_lock is also used at a few other callsites along with q->sysfs_lock for protecting the addition/deletion of kobjects. One such example is when we register with sysfs a set of independent access ranges for a disk. Here as well we could get rid of q->sysfs_dir_lock and only use q->sysfs_lock. The only variable which q->sysfs_dir_lock appears to protect is q->mq_sysfs_init_done which is set/unset while registering/unregistering blk-mq with sysfs. But use of q->mq_sysfs_init_done could be easily replaced using queue registered bit QUEUE_FLAG_REGISTERED. So with this patch we remove q->sysfs_dir_lock from each callsite and replace q->mq_sysfs_init_done using QUEUE_FLAG_REGISTERED. Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20250128143436.874357-2-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76f0a4e7c2e5..248416ecd01c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,7 +561,6 @@ struct request_queue { struct list_head flush_list; struct mutex sysfs_lock; - struct mutex sysfs_dir_lock; struct mutex limits_lock; /* @@ -605,8 +604,6 @@ struct request_queue { * Serializes all debugfs metadata operations using the above dentries. */ struct mutex debugfs_mutex; - - bool mq_sysfs_init_done; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ -- cgit v1.2.3 From 27560b371ab82c1894d048aef0d113acb093f67f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 07:37:57 +0100 Subject: fs: pack struct kstat better Move the change_cookie and subvol up to avoid two 4 byte holes.
Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/linux/stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stat.h b/include/linux/stat.h index 9d8382e23a9c..be7496a6a0dd 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,11 +50,11 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u64 change_cookie; + u64 subvol; u32 dio_mem_align; u32 dio_offset_align; u32 dio_read_offset_align; - u64 change_cookie; - u64 subvol; u32 atomic_write_unit_min; u32 atomic_write_unit_max; u32 atomic_write_segments_max; -- cgit v1.2.3 From 36b62df5683c315ba58c950f1a9c771c796c30ec Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:14 +0800 Subject: bpf: Fix wrong copied_seq calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'sk->copied_seq' was updated in the tcp_eat_skb() function when the action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, the update logic for 'sk->copied_seq' was moved to tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature. It works for a single stream_verdict scenario, as it also modified sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb to remove updating 'sk->copied_seq'. However, for programs where both stream_parser and stream_verdict are active (strparser purpose), tcp_read_sock() was used instead of tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock). tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicate updates. In summary, for strparser + SK_PASS, copied_seq is redundantly calculated in both tcp_read_sock() and tcp_bpf_recvmsg_parser(). The issue causes incorrect copied_seq calculations, which prevent correct data reads from the recv() interface in user-land. 
We do not want to add new proto_ops to implement a new version of tcp_read_sock, as this would introduce code complexity [1]. We could have added noack and copied_seq to desc, and then called ops->read_sock. However, unfortunately, other modules didn’t fully initialize desc to zero. So, for now, we are directly calling tcp_read_sock_noack() in tcp_bpf.c. [1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Suggested-by: Jakub Sitnicki Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-3-mrpre@163.com --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2cbe0c22a32f..0b9095a281b8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -91,6 +91,8 @@ struct sk_psock { struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; + u32 copied_seq; + u32 ingress_bytes; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; -- cgit v1.2.3 From bb2784d9ab49587ba4fbff37a319fff2924db289 Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Thu, 30 Jan 2025 19:26:58 +0000 Subject: jiffies: Cast to unsigned long in secs_to_jiffies() conversion While converting users of msecs_to_jiffies(), lkp reported that some range checks would always be true because of the mismatch between the implied int value of secs_to_jiffies() vs the unsigned long return value of the msecs_to_jiffies() calls it was replacing. Fix this by casting the secs_to_jiffies() input value to unsigned long. 
Fixes: b35108a51cf7ba ("jiffies: Define secs_to_jiffies()") Reported-by: kernel test robot Signed-off-by: Easwar Hariharan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250130192701.99626-1-eahariha@linux.microsoft.com Closes: https://lore.kernel.org/oe-kbuild-all/202501301334.NB6NszQR-lkp@intel.com/ --- include/linux/jiffies.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index ed945f42e064..0ea8c9887429 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -537,7 +537,7 @@ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) * * Return: jiffies value */ -#define secs_to_jiffies(_secs) ((_secs) * HZ) +#define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) -- cgit v1.2.3 From 1e1a9cecfab3f22ebef0a976f849c87be8d03c1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 Jan 2025 13:03:47 +0100 Subject: block: force noio scope in blk_mq_freeze_queue When block drivers or the core block code perform allocations with a frozen queue, this could try to recurse into the block device to reclaim memory and deadlock. Thus all allocations done by a process that froze a queue need to be done without __GFP_IO and __GFP_FS. Instead of trying to track all of them down, force a noio scope as part of freezing the queue. Note that nvme is a bit of a mess here due to the non-owner freezes, and they will be addressed separately.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250131120352.1315351-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a0a9007cc1e3..9ebb53f031cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -900,8 +900,22 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); -void blk_mq_freeze_queue(struct request_queue *q); -void blk_mq_unfreeze_queue(struct request_queue *q); +void blk_mq_freeze_queue_nomemsave(struct request_queue *q); +void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); +static inline unsigned int __must_check +blk_mq_freeze_queue(struct request_queue *q) +{ + unsigned int memflags = memalloc_noio_save(); + + blk_mq_freeze_queue_nomemsave(q); + return memflags; +} +static inline void +blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags) +{ + blk_mq_unfreeze_queue_nomemrestore(q); + memalloc_noio_restore(memflags); +} void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, -- cgit v1.2.3 From 1c7b17cf0594f33c898004ac1b5576c032f266e2 Mon Sep 17 00:00:00 2001 From: liuye Date: Tue, 19 Nov 2024 14:08:42 +0800 Subject: mm/vmscan: fix hard LOCKUP in function isolate_lru_folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following hard lockup in isolate_lru_folios() during memory reclaim. If the LRU mostly contains ineligible folios this may trigger watchdog. 
watchdog: Watchdog detected hard LOCKUP on cpu 173 RIP: 0010:native_queued_spin_lock_slowpath+0x255/0x2a0 Call Trace: _raw_spin_lock_irqsave+0x31/0x40 folio_lruvec_lock_irqsave+0x5f/0x90 folio_batch_move_lru+0x91/0x150 lru_add_drain_per_cpu+0x1c/0x40 process_one_work+0x17d/0x350 worker_thread+0x27b/0x3a0 kthread+0xe8/0x120 ret_from_fork+0x34/0x50 ret_from_fork_asm+0x1b/0x30 lruvec->lru_lock owner: PID: 2865 TASK: ffff888139214d40 CPU: 40 COMMAND: "kswapd0" #0 [fffffe0000945e60] crash_nmi_callback at ffffffffa567a555 #1 [fffffe0000945e68] nmi_handle at ffffffffa563b171 #2 [fffffe0000945eb0] default_do_nmi at ffffffffa6575920 #3 [fffffe0000945ed0] exc_nmi at ffffffffa6575af4 #4 [fffffe0000945ef0] end_repeat_nmi at ffffffffa6601dde [exception RIP: isolate_lru_folios+403] RIP: ffffffffa597df53 RSP: ffffc90006fb7c28 RFLAGS: 00000002 RAX: 0000000000000001 RBX: ffffc90006fb7c60 RCX: ffffea04a2196f88 RDX: ffffc90006fb7c60 RSI: ffffc90006fb7c60 RDI: ffffea04a2197048 RBP: ffff88812cbd3010 R8: ffffea04a2197008 R9: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: ffffea04a2197008 R13: ffffea04a2197048 R14: ffffc90006fb7de8 R15: 0000000003e3e937 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffffc90006fb7c28] isolate_lru_folios at ffffffffa597df53 #6 [ffffc90006fb7cf8] shrink_active_list at ffffffffa597f788 #7 [ffffc90006fb7da8] balance_pgdat at ffffffffa5986db0 #8 [ffffc90006fb7ec0] kswapd at ffffffffa5987354 #9 [ffffc90006fb7ef8] kthread at ffffffffa5748238 crash> Scenario: User processe are requesting a large amount of memory and keep page active. Then a module continuously requests memory from ZONE_DMA32 area. Memory reclaim will be triggered due to ZONE_DMA32 watermark alarm reached. However pages in the LRU(active_anon) list are mostly from the ZONE_NORMAL area. Reproduce: Terminal 1: Construct to continuously increase pages active(anon). 
mkdir /tmp/memory mount -t tmpfs -o size=1024000M tmpfs /tmp/memory dd if=/dev/zero of=/tmp/memory/block bs=4M tail /tmp/memory/block Terminal 2: vmstat -a 1 active will increase. procs ---memory--- ---swap-- ---io---- -system-- ---cpu--- ... r b swpd free inact active si so bi bo 1 0 0 1445623076 45898836 83646008 0 0 0 1 0 0 1445623076 43450228 86094616 0 0 0 1 0 0 1445623076 41003480 88541364 0 0 0 1 0 0 1445623076 38557088 90987756 0 0 0 1 0 0 1445623076 36109688 93435156 0 0 0 1 0 0 1445619552 33663256 95881632 0 0 0 1 0 0 1445619804 31217140 98327792 0 0 0 1 0 0 1445619804 28769988 100774944 0 0 0 1 0 0 1445619804 26322348 103222584 0 0 0 1 0 0 1445619804 23875592 105669340 0 0 0 cat /proc/meminfo | head Active(anon) increase. MemTotal: 1579941036 kB MemFree: 1445618500 kB MemAvailable: 1453013224 kB Buffers: 6516 kB Cached: 128653956 kB SwapCached: 0 kB Active: 118110812 kB Inactive: 11436620 kB Active(anon): 115345744 kB Inactive(anon): 945292 kB When the Active(anon) is 115345744 kB, insmod module triggers the ZONE_DMA32 watermark. perf record -e vmscan:mm_vmscan_lru_isolate -aR perf script isolate_mode=0 classzone=1 order=1 nr_requested=32 nr_scanned=2 nr_skipped=2 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=1 nr_requested=32 nr_scanned=0 nr_skipped=0 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=28835844 nr_skipped=28835844 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=1 nr_requested=32 nr_scanned=28835844 nr_skipped=28835844 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=29 nr_skipped=29 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=0 nr_skipped=0 nr_taken=0 lru=active_anon See nr_scanned=28835844. 28835844 * 4k = 115343376KB approximately equal to 115345744 kB. If increase Active(anon) to 1000G then insmod module triggers the ZONE_DMA32 watermark. hard lockup will occur. 
In my device nr_scanned = 0000000003e3e937 when hard lockup. Convert to memory size 0x0000000003e3e937 * 4KB = 261072092 KB. [ffffc90006fb7c28] isolate_lru_folios at ffffffffa597df53 ffffc90006fb7c30: 0000000000000020 0000000000000000 ffffc90006fb7c40: ffffc90006fb7d40 ffff88812cbd3000 ffffc90006fb7c50: ffffc90006fb7d30 0000000106fb7de8 ffffc90006fb7c60: ffffea04a2197008 ffffea0006ed4a48 ffffc90006fb7c70: 0000000000000000 0000000000000000 ffffc90006fb7c80: 0000000000000000 0000000000000000 ffffc90006fb7c90: 0000000000000000 0000000000000000 ffffc90006fb7ca0: 0000000000000000 0000000003e3e937 ffffc90006fb7cb0: 0000000000000000 0000000000000000 ffffc90006fb7cc0: 8d7c0b56b7874b00 ffff88812cbd3000 About the Fixes: Why did it take eight years to be discovered? The problem requires the following conditions to occur: 1. The device memory should be large enough. 2. Pages in the LRU(active_anon) list are mostly from the ZONE_NORMAL area. 3. The memory in ZONE_DMA32 needs to reach the watermark. If the memory is not large enough, or if the usage design of ZONE_DMA32 area memory is reasonable, this problem is difficult to detect. notes: The problem is most likely to occur in ZONE_DMA32 and ZONE_NORMAL, but other suitable scenarios may also trigger the problem. 
Link: https://lkml.kernel.org/r/20241119060842.274072-1-liuye@kylinos.cn Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis") Signed-off-by: liuye Cc: Hugh Dickins Cc: Mel Gorman Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a5f475335aea..b13b72645db3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -222,6 +222,7 @@ enum { }; #define SWAP_CLUSTER_MAX 32UL +#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ -- cgit v1.2.3 From b69bb476dee99d564d65d418e9a20acca6f32c3f Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 30 Jan 2025 16:05:42 -0800 Subject: cgroup: fix race between fork and cgroup.kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tejun reported the following race between fork() and cgroup.kill at [1]. Tejun: I was looking at cgroup.kill implementation and wondering whether there could be a race window. So, __cgroup_kill() does the following: k1. Set CGRP_KILL. k2. Iterate tasks and deliver SIGKILL. k3. Clear CGRP_KILL. The copy_process() does the following: c1. Copy a bunch of stuff. c2. Grab siglock. c3. Check fatal_signal_pending(). c4. Commit to forking. c5. Release siglock. c6. Call cgroup_post_fork() which puts the task on the css_set and tests CGRP_KILL. The intention seems to be that either a forking task gets SIGKILL and terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I don't see what guarantees that k3 can't happen before c6. ie. After a forking task passes c5, k2 can take place and then before the forking task reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child. What am I missing? This is indeed a race. 
One way to fix this race is by taking cgroup_threadgroup_rwsem in write mode in __cgroup_kill() as the fork() side takes cgroup_threadgroup_rwsem in read mode from cgroup_can_fork() to cgroup_post_fork(). However that would be heavy handed as this adds one more potential stall scenario for cgroup.kill which is usually called under extreme situations like memory pressure. To fix this race, let's maintain a sequence number per cgroup which gets incremented on __cgroup_kill() call. On the fork() side, the cgroup_can_fork() will cache the sequence number locally and recheck it against the cgroup's sequence number at cgroup_post_fork() site. If the sequence numbers mismatch, it means __cgroup_kill() has been called and we should send SIGKILL to the newly created task. Reported-by: Tejun Heo Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1] Fixes: 661ee6280931 ("cgroup: introduce cgroup.kill") Cc: stable@vger.kernel.org # v5.14+ Signed-off-by: Shakeel Butt Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 6 +++--- include/linux/sched/task.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7c..17960a1e858d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -71,9 +71,6 @@ enum { /* Cgroup is frozen. */ CGRP_FROZEN, - - /* Control group has to be killed. */ - CGRP_KILL, }; /* cgroup_root->flags */ @@ -461,6 +458,9 @@ struct cgroup { int nr_threaded_children; /* # of live threaded child cgroups */ + /* sequence number for cgroup.kill, serialized by css_set_lock.
*/ + unsigned int kill_seq; + struct kernfs_node *kn; /* cgroup kernfs entry */ struct cgroup_file procs_file; /* handle for "cgroup.procs" */ struct cgroup_file events_file; /* handle for "cgroup.events" */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0f2aeb37bbb0..ca1db4b92c32 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -43,6 +43,7 @@ struct kernel_clone_args { void *fn_arg; struct cgroup *cgrp; struct css_set *cset; + unsigned int kill_seq; }; /* -- cgit v1.2.3 From d3ed6dee73c560fad0a8e152c8e233b3fb3a2e44 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 1 Feb 2025 19:02:51 +0100 Subject: net: harmonize tstats and dstats After the blamed commits below, some UDP tunnel use dstats for accounting. On the xmit path, all the UDP-base tunnels ends up using iptunnel_xmit_stats() for stats accounting, and the latter assumes the relevant (tunnel) network device uses tstats. The end result is some 'funny' stat report for the mentioned UDP tunnel, e.g. when no packet is actually dropped and a bunch of packets are transmitted: gnv2: mtu 1450 qdisc noqueue \ state UNKNOWN mode DEFAULT group default qlen 1000 link/ether ee:7d:09:87:90:ea brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped missed mcast 14916 23 0 15 0 0 TX: bytes packets errors dropped carrier collsns 0 1566 0 0 0 0 Address the issue ensuring the same binary layout for the overlapping fields of dstats and tstats. While this solution is a bit hackish, is smaller and with no performance pitfall compared to other alternatives i.e. supporting both dstat and tstat in iptunnel_xmit_stats() or reverting the blamed commit. With time we should possibly move all the IP-based tunnel (and virtual devices) to dstats. 
Fixes: c77200c07491 ("bareudp: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Fixes: 6fa6de302246 ("geneve: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Fixes: be226352e8dc ("vxlan: Handle stats using NETDEV_PCPU_STAT_DSTATS.") Signed-off-by: Paolo Abeni Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/2e1c444cf0f63ae472baff29862c4c869be17031.1738432804.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a59034a5fa2..03bb584c62cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2904,9 +2904,9 @@ struct pcpu_sw_netstats { struct pcpu_dstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; - u64_stats_t rx_drops; u64_stats_t tx_packets; u64_stats_t tx_bytes; + u64_stats_t rx_drops; u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); -- cgit v1.2.3 From ba69e0750b0362870294adab09339a0c39c3beaf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:35 +0100 Subject: efi: Avoid cold plugged memory for placing the kernel UEFI 2.11 introduced EFI_MEMORY_HOT_PLUGGABLE to annotate system memory regions that are 'cold plugged' at boot, i.e., hot pluggable memory that is available from early boot, and described as system RAM by the firmware. Existing loaders and EFI applications running in the boot context will happily use this memory for allocating data structures that cannot be freed or moved at runtime, and this prevents the memory from being unplugged. Going forward, the new EFI_MEMORY_HOT_PLUGGABLE attribute should be tested, and memory annotated as such should be avoided for such allocations. 
In the EFI stub, there are a couple of occurrences where, instead of the high-level AllocatePages() UEFI boot service, a low-level code sequence is used that traverses the EFI memory map and carves out the requested number of pages from a free region. This is needed, e.g., for allocating as low as possible, or for allocating pages at random. While AllocatePages() should presumably avoid special purpose memory and cold plugged regions, this manual approach needs to incorporate this logic itself, in order to prevent the kernel itself from ending up in a hot unpluggable region, preventing it from being unplugged. So add the EFI_MEMORY_HOT_PLUGGABLE macro definition, and check for it where appropriate. Cc: stable@vger.kernel.org Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 053c57e61869..db293d7de686 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -128,6 +128,7 @@ typedef struct { #define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ #define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ #define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ #define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ #define EFI_MEMORY_DESCRIPTOR_VERSION 1 -- cgit v1.2.3 From bbc4578537e350d5bf8a7a2c7d054d6b163b3c41 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 1 Feb 2025 18:21:36 +0100 Subject: efi: Use BIT_ULL() constants for memory attributes For legibility, use the existing BIT_ULL() to generate the u64 type EFI memory attribute macros.
Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index db293d7de686..7d63d1d75f22 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -114,22 +114,22 @@ typedef struct { #define EFI_MAX_MEMORY_TYPE 16 /* Attribute values: */ -#define EFI_MEMORY_UC ((u64)0x0000000000000001ULL) /* uncached */ -#define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ -#define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ -#define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ -#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ -#define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ -#define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ -#define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -#define EFI_MEMORY_NV ((u64)0x0000000000008000ULL) /* non-volatile */ -#define EFI_MEMORY_MORE_RELIABLE \ - ((u64)0x0000000000010000ULL) /* higher reliability */ -#define EFI_MEMORY_RO ((u64)0x0000000000020000ULL) /* read-only */ -#define EFI_MEMORY_SP ((u64)0x0000000000040000ULL) /* soft reserved */ -#define EFI_MEMORY_CPU_CRYPTO ((u64)0x0000000000080000ULL) /* supports encryption */ +#define EFI_MEMORY_UC BIT_ULL(0) /* uncached */ +#define EFI_MEMORY_WC BIT_ULL(1) /* write-coalescing */ +#define EFI_MEMORY_WT BIT_ULL(2) /* write-through */ +#define EFI_MEMORY_WB BIT_ULL(3) /* write-back */ +#define EFI_MEMORY_UCE BIT_ULL(4) /* uncached, exported */ +#define EFI_MEMORY_WP BIT_ULL(12) /* write-protect */ +#define EFI_MEMORY_RP BIT_ULL(13) /* read-protect */ +#define EFI_MEMORY_XP BIT_ULL(14) /* execute-protect */ +#define EFI_MEMORY_NV BIT_ULL(15) /* non-volatile */ +#define EFI_MEMORY_MORE_RELIABLE BIT_ULL(16) /* higher reliability */ +#define EFI_MEMORY_RO BIT_ULL(17) /* read-only */ +#define 
EFI_MEMORY_SP BIT_ULL(18) /* soft reserved */ +#define EFI_MEMORY_CPU_CRYPTO BIT_ULL(19) /* supports encryption */ #define EFI_MEMORY_HOT_PLUGGABLE BIT_ULL(20) /* supports unplugging at runtime */ -#define EFI_MEMORY_RUNTIME ((u64)0x8000000000000000ULL) /* range requires runtime mapping */ +#define EFI_MEMORY_RUNTIME BIT_ULL(63) /* range requires runtime mapping */ + #define EFI_MEMORY_DESCRIPTOR_VERSION 1 #define EFI_PAGE_SHIFT 12 -- cgit v1.2.3 From 6f61269495260531e15d84d090ee63618110c470 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 24 Jan 2025 10:26:22 -0500 Subject: KVM: remove kvm_arch_post_init_vm The only statement in a kvm_arch_post_init_vm implementation can be moved into the x86 kvm_arch_init_vm. Do so and remove all traces from architecture-independent code. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3cb9a32a6330..f34f4cfaa513 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1615,7 +1615,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu); -int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); void kvm_arch_create_vm_debugfs(struct kvm *kvm); -- cgit v1.2.3 From c4d3dfd8ccaef2cbd374860e307f1e056854a472 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 5 Feb 2025 14:21:36 +0100 Subject: Revert "i2c: Replace list-based mechanism for handling userspace-created clients" This reverts commit 3cfe39b3a845593a485ab1c716615979004ef9f6. Mux handling is not sufficiently implemented. It needs more time. 
Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index c31fd1dba3bd..4955d9e76c5f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -313,6 +313,8 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) + * @detected: member of an i2c_driver.clients list or i2c-core's + * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources @@ -333,7 +335,6 @@ struct i2c_client { #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ -#define I2C_CLIENT_USER 0x200 /* client was userspace-created */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ @@ -345,6 +346,7 @@ struct i2c_client { struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ + struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif @@ -751,6 +753,9 @@ struct i2c_adapter { char name[48]; struct completion dev_released; + struct mutex userspace_clients_lock; + struct list_head userspace_clients; + struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; -- cgit v1.2.3 From 3bfa08fe9ec8dd79e183c88e1275be74191e7bc8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 5 Feb 2025 14:22:12 +0100 Subject: Revert "i2c: Replace list-based mechanism for handling auto-detected clients" This reverts commit 
56a50667cbcfaf95eea9128d5676af94e54b51a8. Mux handling is not sufficiently implemented. It needs more time. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 4955d9e76c5f..2b2af24d2a43 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -244,6 +244,7 @@ enum i2c_driver_flags { * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) + * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. @@ -298,6 +299,7 @@ struct i2c_driver { /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; + struct list_head clients; u32 flags; }; @@ -334,7 +336,6 @@ struct i2c_client { #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ -#define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ -- cgit v1.2.3 From 4c56eb33e603c3b9eb4bd24efbfdd0283c1c37e4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 Feb 2025 03:51:41 +0900 Subject: kbuild: keep symbols for symbol_get() even with CONFIG_TRIM_UNUSED_KSYMS Linus observed that the symbol_request(utf8_data_table) call fails when CONFIG_UNICODE=y and CONFIG_TRIM_UNUSED_KSYMS=y. symbol_get() relies on the symbol data being present in the ksymtab for symbol lookups. 
However, EXPORT_SYMBOL_GPL(utf8_data_table) is dropped due to CONFIG_TRIM_UNUSED_KSYMS, as no module references it in this case. Probably, this has been broken since commit dbacb0ef670d ("kconfig option for TRIM_UNUSED_KSYMS"). This commit addresses the issue by leveraging modpost. Symbol names passed to symbol_get() are recorded in the special .no_trim_symbol section, which is then parsed by modpost to forcibly keep such symbols. The .no_trim_symbol section is discarded by the linker scripts, so there is no impact on the size of the final vmlinux or modules. This commit cannot resolve the issue for direct calls to __symbol_get() because the symbol name is not known at compile-time. Although symbol_get() may eventually be deprecated, this workaround should be good enough meanwhile. Reported-by: Linus Torvalds Suggested-by: Linus Torvalds Signed-off-by: Masahiro Yamada --- include/linux/module.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 23792d5d7b74..30e5b19bafa9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -306,7 +306,10 @@ extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ((typeof(&x))(__symbol_get(__stringify(x)))) +#define symbol_get(x) ({ \ + static const char __notrim[] \ + __used __section(".no_trim_symbol") = __stringify(x); \ + (typeof(&x))(__symbol_get(__stringify(x))); }) /* modules using other modules: kdb wants to see this. */ struct module_use { -- cgit v1.2.3 From 482ad2a4ace2740ca0ff1cbc8f3c7f862f3ab507 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Feb 2025 15:51:09 +0000 Subject: net: add dev_net_rcu() helper dev->nd_net can change, readers should either use rcu_read_lock() or RTNL. 
We currently use a generic helper, dev_net() with no debugging support. We probably have many hidden bugs. Add dev_net_rcu() helper for callers using rcu_read_lock() protection. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250205155120.1676781-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03bb584c62cf..c0a86afb85da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2663,6 +2663,12 @@ struct net *dev_net(const struct net_device *dev) return read_pnet(&dev->nd_net); } +static inline +struct net *dev_net_rcu(const struct net_device *dev) +{ + return read_pnet_rcu(&dev->nd_net); +} + static inline void dev_net_set(struct net_device *dev, struct net *net) { -- cgit v1.2.3 From cb7380de9e4cbc9a24216b722ec50e092ae83036 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 12:32:49 -0800 Subject: compiler.h: Move C string helpers into C-only kernel section The C kernel helpers for evaluating C Strings were positioned where they were visible to assembly inclusion, which was not intended. Move them into the kernel and C-only area of the header so future changes won't confuse the assembler. 
Fixes: d7a516c6eeae ("compiler.h: Fix undefined BUILD_BUG_ON_ZERO()") Fixes: 559048d156ff ("string: Check for "nonstring" attribute on strscpy() arguments") Reviewed-by: Miguel Ojeda Signed-off-by: Kees Cook --- include/linux/compiler.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 240c632c5b95..7af999a131cb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -214,6 +214,19 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) +#ifdef __CHECKER__ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) +#else /* __CHECKER__ */ +#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +#endif /* __CHECKER__ */ + +/* &a[0] degrades to a pointer: a different type from an array */ +#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") + +/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ +#define __must_be_cstr(p) \ + __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") + #endif /* __KERNEL__ */ /** @@ -254,19 +267,6 @@ static inline void *offset_to_ptr(const int *off) #define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) -#ifdef __CHECKER__ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) -#else /* __CHECKER__ */ -#define __BUILD_BUG_ON_ZERO_MSG(e, msg) ((int)sizeof(struct {_Static_assert(!(e), msg);})) -#endif /* __CHECKER__ */ - -/* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") - -/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. 
*/ -#define __must_be_cstr(p) \ - __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)") - /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. -- cgit v1.2.3 From 20e5cc26e56db09cc612721f90b4994cce5e5b7b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 12:48:07 -0800 Subject: compiler.h: Introduce __must_be_byte_array() In preparation for adding stricter type checking to the str/mem*() helpers, provide a way to check that a variable is a byte array via __must_be_byte_array(). Suggested-by: Kent Overstreet Signed-off-by: Kees Cook --- include/linux/compiler.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7af999a131cb..1c0688319435 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -221,7 +221,13 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __CHECKER__ */ /* &a[0] degrades to a pointer: a different type from an array */ -#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(__same_type((a), &(a)[0]), "must be array") +#define __is_array(a) (!__same_type((a), &(a)[0])) +#define __must_be_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_array(a), \ + "must be array") + +#define __is_byte_array(a) (__is_array(a) && sizeof((a)[0]) == 1) +#define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \ + "must be byte array") /* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. 
*/ #define __must_be_cstr(p) \ -- cgit v1.2.3 From 6270f4deba3fbd77d1717fb8634f1fc612ff69e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Feb 2025 13:45:26 -0800 Subject: string.h: Use ARRAY_SIZE() for memtostr*()/strtomem*() The destination argument of memtostr*() and strtomem*() must be a fixed-size char array at compile time, so there is no need to use __builtin_object_size() (which is useful for when an argument is either a pointer or unknown). Instead use ARRAY_SIZE(), which has the benefit of working around a bug in Clang (fixed[1] in 15+) that got __builtin_object_size() wrong sometimes. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501310832.kiAeOt2z-lkp@intel.com/ Suggested-by: Kent Overstreet Link: https://github.com/llvm/llvm-project/commit/d8e0a6d5e9dd2311641f9a8a5d2bf90829951ddc [1] Tested-by: Suren Baghdasaryan Signed-off-by: Kees Cook --- include/linux/string.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 493ac4862c77..fc5ae145bd78 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -411,7 +411,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * must be discoverable by the compiler. */ #define strtomem_pad(dest, src, pad) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ @@ -434,7 +435,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * must be discoverable by the compiler. 
*/ #define strtomem(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ \ BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \ @@ -453,7 +455,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ @@ -478,7 +481,8 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count, * Note that sizes of @dest and @src must be known at compile-time. */ #define memtostr_pad(dest, src) do { \ - const size_t _dest_len = __builtin_object_size(dest, 1); \ + const size_t _dest_len = __must_be_byte_array(dest) + \ + ARRAY_SIZE(dest); \ const size_t _src_len = __builtin_object_size(src, 1); \ const size_t _src_chars = strnlen(src, _src_len); \ const size_t _copy_len = min(_dest_len - 1, _src_chars); \ -- cgit v1.2.3 From bb504b4d64266fa0d7460c218c85afed371db03a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 30 Jan 2025 14:56:23 +0100 Subject: lockref: remove count argument of lockref_init All users of lockref_init() now initialize the count to 1, so hardcode that and remove the count argument. 
Reviewed-by: Christoph Hellwig Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20250130135624.1899988-4-agruenba@redhat.com Signed-off-by: Christian Brauner --- include/linux/lockref.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c39f119659ba..676721ee878d 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,12 +37,13 @@ struct lockref { /** * lockref_init - Initialize a lockref * @lockref: pointer to lockref structure - * @count: initial count + * + * Initializes @lockref->count to 1. */ -static inline void lockref_init(struct lockref *lockref, unsigned int count) +static inline void lockref_init(struct lockref *lockref) { spin_lock_init(&lockref->lock); - lockref->count = count; + lockref->count = 1; } void lockref_get(struct lockref *lockref); -- cgit v1.2.3 From 95101401bb50ae2cf9deee1bbf4d2b28d0dfdc26 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Feb 2025 23:32:03 +0100 Subject: fsnotify: use accessor to set FMODE_NONOTIFY_* The FMODE_NONOTIFY_* bits are a 2-bits mode. Open coding manipulation of those bits is risky. Use an accessor file_set_fsnotify_mode() to set the mode. Rename file_set_fsnotify_mode() => file_set_fsnotify_mode_from_watchers() to make way for the simple accessor name. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250203223205.861346-2-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 7 ++++++- include/linux/fsnotify.h | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index be3ad155ec9f..7620547432a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -222,7 +222,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_FSNOTIFY_HSM(mode) 0 #endif - /* * Attribute flags. 
These should be or-ed together to figure out what * has been changed! @@ -3140,6 +3139,12 @@ static inline void exe_file_allow_write_access(struct file *exe_file) allow_write_access(exe_file); } +static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) +{ + file->f_mode &= ~FMODE_FSNOTIFY_MASK; + file->f_mode |= mode; +} + static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1a9ef8f6784d..6a33288bd6a1 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -129,7 +129,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS -void file_set_fsnotify_mode(struct file *file); +void file_set_fsnotify_mode_from_watchers(struct file *file); /* * fsnotify_file_area_perm - permission hook before access to file range @@ -213,7 +213,7 @@ static inline int fsnotify_open_perm(struct file *file) } #else -static inline void file_set_fsnotify_mode(struct file *file) +static inline void file_set_fsnotify_mode_from_watchers(struct file *file) { } -- cgit v1.2.3 From 37d11cfc63604b3886308e2111d845d148ced8bc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Feb 2025 22:32:07 +0100 Subject: vfs: sanity check the length passed to inode_set_cached_link() This costs a strlen() call when instantiating a symlink. Preferably it would be hidden behind VFS_WARN_ON (or compatible), but there is no such facility at the moment. With the facility in place the call can be patched out in production kernels. In the meantime, since the cost is being paid unconditionally, use the result to fix up the bad caller. This is not expected to persist in the long run (tm).
Sample splat: bad length passed for symlink [/tmp/syz-imagegen43743633/file0/file0] (got 131109, expected 37) [rest of WARN blurp goes here] Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250204213207.337980-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7620547432a8..2c3b2f8a621f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -790,6 +790,19 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { + int testlen; + + /* + * TODO: patch it into a debug-only check if relevant macros show up. + * In the meantime, since we are suffering strlen even on production kernels + * to find the right length, do a fixup if the wrong value got passed. + */ + testlen = strlen(link); + if (testlen != linklen) { + WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", + link, linklen, testlen); + linklen = testlen; + } inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; -- cgit v1.2.3 From 011b0335903832facca86cd8ed05d7d8d94c9c76 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 6 Feb 2025 22:28:48 +0100 Subject: Revert "net: skb: introduce and use a single page frag cache" This reverts commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"). The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. 
The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to the introduction of the SKB_HEAD_ALIGN() helper and local lock infra in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Signed-off-by: Paolo Abeni Link: https://patch.msgid.link/e649212fde9f0fdee23909ca0d14158d32bb7425.1738877290.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..365f0e2098d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,7 +4115,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) -- cgit v1.2.3 From a1f7b7ff0e10ae574d388131596390157222f986 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 10 Feb 2025 10:17:27 +0200 Subject: PCI: pci_ids: add INTEL_HDA_PTL_H Add Intel PTL-H audio Device ID. 
Signed-off-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Acked-by: Bjorn Helgaas Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..1a2594a38199 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3134,6 +3134,7 @@ #define PCI_DEVICE_ID_INTEL_HDA_LNL_P 0xa828 #define PCI_DEVICE_ID_INTEL_S21152BB 0xb152 #define PCI_DEVICE_ID_INTEL_HDA_BMG 0xe2f7 +#define PCI_DEVICE_ID_INTEL_HDA_PTL_H 0xe328 #define PCI_DEVICE_ID_INTEL_HDA_PTL 0xe428 #define PCI_DEVICE_ID_INTEL_HDA_CML_R 0xf0c8 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S 0xf1c8 -- cgit v1.2.3 From 1d0013962d220b166d9f7c9fe2746f1542e459a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:23:59 +0000 Subject: netfs: Fix a number of read-retry hangs Fix a number of hangs in the netfslib read-retry code, including: (1) netfs_reissue_read() doubles up the getting of references on subrequests, thereby leaking the subrequest and causing inode eviction to wait indefinitely. This can lead to the kernel reporting a hang in the filesystem's evict_inode(). Fix this by removing the get from netfs_reissue_read() and adding one to netfs_retry_read_subrequests() to deal with the one place that didn't double up. (2) The loop in netfs_retry_read_subrequests() that retries a sequence of failed subrequests doesn't record whether or not it retried the one that the "subreq" pointer points to when it leaves the loop. It may not if renegotiation/repreparation of the subrequests means that fewer subrequests are needed to span the cumulative range of the sequence. 
Because it doesn't record this, the piece of code that discards now-superfluous subrequests doesn't know whether it should discard the one "subreq" points to - and so it doesn't. Fix this by noting whether the last subreq it examines is superfluous and if it is, then getting rid of it and all subsequent subrequests. If that one wasn't superfluous, then we would have tried to go round the previous loop again and so there can be no further unretried subrequests in the sequence. (3) netfs_retry_read_subrequests() gets yet an extra ref on any additional subrequests it has to get because it ran out of ones it could reuse due to renegotiation/repreparation shrinking the subrequests. Fix this by removing that extra ref. (4) In netfs_retry_reads(), it was using wait_on_bit() to wait for NETFS_SREQ_IN_PROGRESS to be cleared on all subrequests in the sequence - but netfs_read_subreq_terminated() is now using a wait queue on the request instead and so this wait will never finish. Fix this by waiting on the wait queue instead. To make this work, a new flag, NETFS_RREQ_RETRYING, is now set around the wait loop to tell the wake-up code to wake up the wait queue rather than requeuing the request's work item. Note that this flag replaces the NETFS_RREQ_NEED_RETRY flag which is no longer used. (5) Whilst not strictly anything to do with the hang, netfs_retry_read_subrequests() was also doubly incrementing the subreq_counter and re-setting the debug index, leaving a gap in the trace. This is also fixed. One of these hangs was observed with 9p and with cifs. Others were forced by manual code injection into fs/afs/file.c.
Firstly, afs_prepare_read() was created to provide a changing pattern of maximum subrequest sizes: static int afs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; if (!S_ISREG(subreq->rreq->inode->i_mode)) return 0; if (subreq->retry_count < 20) rreq->io_streams[0].sreq_max_len = umax(200, 2222 - subreq->retry_count * 40); else rreq->io_streams[0].sreq_max_len = 3333; return 0; } and pointed to by afs_req_ops. Then the following: struct netfs_io_subrequest *subreq = op->fetch.subreq; if (subreq->error == 0 && S_ISREG(subreq->rreq->inode->i_mode) && subreq->retry_count < 20) { subreq->transferred = subreq->already_done; __clear_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); afs_fetch_data_notify(op); return; } was inserted into afs_fetch_data_success() at the beginning and struct netfs_io_subrequest given an extra field, "already_done" that was set to the value in "subreq->transferred" by netfs_reissue_read(). When reading a 4K file, the subrequests would get gradually smaller, a new subrequest would be allocated around the 3rd retry and then eventually be rendered superfluous when the 20th retry was hit and the limit on the first subrequest was eased.
Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-2-dhowells@redhat.com Tested-by: Marc Dionne Tested-by: Steve French cc: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 071d05d81d38..c86a11cfc4a3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -278,7 +278,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ -#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ +#define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; -- cgit v1.2.3 From 1f47ed294a2bd577d5ae43e6e28e1c9a3be4a833 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Feb 2025 08:18:46 -0700 Subject: block: cleanup and fix batch completion adding conditions The conditions for whether or not a request is allowed adding to a completion batch are a bit hard to read, and they also have a few issues. One is that ioerror may indeed be a random value on passthrough, and it's being checked unconditionally of whether or not the given request is a passthrough request or not. Rewrite the conditions to be separate for easier reading, and only check ioerror for non-passthrough requests. 
This fixes an issue with bio unmapping on passthrough, where it fails getting added to a batch. This both leads to suboptimal performance, and may trigger a potential schedule-under-atomic condition for polled passthrough IO. Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Link: https://lore.kernel.org/r/20575f0a-656e-4bb3-9d82-dec6c7e3a35c@kernel.dk Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9ebb53f031cd..fa2a76cc2f73 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -861,12 +861,22 @@ static inline bool blk_mq_add_to_batch(struct request *req, void (*complete)(struct io_comp_batch *)) { /* - * blk_mq_end_request_batch() can't end request allocated from - * sched tags + * Check various conditions that exclude batch processing: + * 1) No batch container + * 2) Has scheduler data attached + * 3) Not a passthrough request and end_io set + * 4) Not a passthrough request and an ioerror */ - if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || - (req->end_io && !blk_rq_is_passthrough(req))) + if (!iob) return false; + if (req->rq_flags & RQF_SCHED_TAGS) + return false; + if (!blk_rq_is_passthrough(req)) { + if (req->end_io) + return false; + if (ioerror < 0) + return false; + } if (!iob->complete) iob->complete = complete; -- cgit v1.2.3 From 35fa2d88ca9481e5caf533d58b99ca259c63b2fe Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 10 Feb 2025 13:30:25 +0100 Subject: driver core: add a faux bus for use when a simple device/bus is needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many drivers abuse the platform driver/bus system as it provides a simple way to create and bind a device to a driver-specific set of probe/release functions. 
Instead of doing that, and wasting all of the memory associated with a platform device, here is a "faux" bus that can be used instead. Reviewed-by: Jonathan Cameron Reviewed-by: Danilo Krummrich Reviewed-by: Lyude Paul Reviewed-by: Thomas Weißschuh Reviewed-by: Zijun Hu Link: https://lore.kernel.org/r/2025021026-atlantic-gibberish-3f0c@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device/faux.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 include/linux/device/faux.h (limited to 'include/linux') diff --git a/include/linux/device/faux.h b/include/linux/device/faux.h new file mode 100644 index 000000000000..9f43c0e46aa4 --- /dev/null +++ b/include/linux/device/faux.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Greg Kroah-Hartman + * Copyright (c) 2025 The Linux Foundation + * + * A "simple" faux bus that allows devices to be created and added + * automatically to it. This is to be used whenever you need to create a + * device that is not associated with any "real" system resources, and do + * not want to have to deal with a bus/driver binding logic. It is + * intended to be very simple, with only a create and a destroy function + * available. + */ +#ifndef _FAUX_DEVICE_H_ +#define _FAUX_DEVICE_H_ + +#include +#include + +/** + * struct faux_device - a "faux" device + * @dev: internal struct device of the object + * + * A simple faux device that can be created/destroyed. To be used when a + * driver only needs to have a device to "hang" something off. This can be + * used for downloading firmware or other basic tasks. Use this instead of + * a struct platform_device if the device has no resources assigned to + * it at all. 
+ */ +struct faux_device { + struct device dev; +}; +#define to_faux_device(x) container_of_const((x), struct faux_device, dev) + +/** + * struct faux_device_ops - a set of callbacks for a struct faux_device + * @probe: called when a faux device is probed by the driver core + * before the device is fully bound to the internal faux bus + * code. If probe succeeds, return 0, otherwise return a + * negative error number to stop the probe sequence from + * succeeding. + * @remove: called when a faux device is removed from the system + * + * Both @probe and @remove are optional, if not needed, set to NULL. + */ +struct faux_device_ops { + int (*probe)(struct faux_device *faux_dev); + void (*remove)(struct faux_device *faux_dev); +}; + +struct faux_device *faux_device_create(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops); +struct faux_device *faux_device_create_with_groups(const char *name, + struct device *parent, + const struct faux_device_ops *faux_ops, + const struct attribute_group **groups); +void faux_device_destroy(struct faux_device *faux_dev); + +static inline void *faux_device_get_drvdata(const struct faux_device *faux_dev) +{ + return dev_get_drvdata(&faux_dev->dev); +} + +static inline void faux_device_set_drvdata(struct faux_device *faux_dev, void *data) +{ + dev_set_drvdata(&faux_dev->dev, data); +} + +#endif /* _FAUX_DEVICE_H_ */ -- cgit v1.2.3 From 0892b840318daa6ae739b7cdec5ecdfca4006689 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 13 Feb 2025 08:49:44 -0800 Subject: Reapply "net: skb: introduce and use a single page frag cache" This reverts commit 011b0335903832facca86cd8ed05d7d8d94c9c76. Sabrina reports that the revert may trigger warnings due to intervening changes, especially the ability to rise MAX_SKB_FRAGS. Let's drop it and revisit once that part is also ironed out. 
Fixes: 011b03359038 ("Revert "net: skb: introduce and use a single page frag cache"") Reported-by: Sabrina Dubroca Link: https://lore.kernel.org/6bf54579233038bc0e76056c5ea459872ce362ab.1739375933.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 365f0e2098d1..c0a86afb85da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4115,6 +4115,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) -- cgit v1.2.3 From 435b344a7042e91fb4719d589f18310e8919e39f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 10 Feb 2025 22:53:47 +0000 Subject: crypto: ccp: Add external API interface for PSP module initialization KVM is dependent on the PSP SEV driver and PSP SEV driver needs to be loaded before KVM module. In case of module loading any dependent modules are automatically loaded but in case of built-in modules there is no inherent mechanism available to specify dependencies between modules and ensure that any dependent modules are loaded implicitly. Add a new external API interface for PSP module initialization which allows PSP SEV driver to be loaded explicitly if KVM is built-in. 
Signed-off-by: Sean Christopherson Co-developed-by: Ashish Kalra Signed-off-by: Ashish Kalra Reviewed-by: Tom Lendacky Message-ID: <15279ca0cad56a07cf12834ec544310f85ff5edc.1739226950.git.ashish.kalra@amd.com> Signed-off-by: Paolo Bonzini --- include/linux/psp-sev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 903ddfea8585..f3cad182d4ef 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -814,6 +814,15 @@ struct sev_data_snp_commit { #ifdef CONFIG_CRYPTO_DEV_SP_PSP +/** + * sev_module_init - perform PSP SEV module initialization + * + * Returns: + * 0 if the PSP module is successfully initialized + * negative value if the PSP module initialization fails + */ +int sev_module_init(void); + /** * sev_platform_init - perform SEV INIT command * -- cgit v1.2.3 From 02d954c0fdf91845169cdacc7405b120f90afe01 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 10 Feb 2025 16:32:50 +0100 Subject: sched: Compact RSEQ concurrency IDs with reduced threads and affinity When a process reduces its number of threads or clears bits in its CPU affinity mask, the mm_cid allocation should eventually converge towards smaller values. However, the change introduced by: commit 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") adds a per-mm/CPU recent_cid which is never unset unless a thread migrates. This is a tradeoff between: A) Preserving cache locality after a transition from many threads to few threads, or after reducing the hamming weight of the allowed CPU mask. B) Making the mm_cid upper bounds wrt nr threads and allowed CPU mask easy to document and understand. C) Allowing applications to eventually react to mm_cid compaction after reduction of the nr threads or allowed CPU mask, making the tracking of mm_cid compaction easier by shrinking it back towards 0 or not. 
D) Making sure applications that periodically reduce and then increase again the nr threads or allowed CPU mask still benefit from good cache locality with mm_cid. Introduce the following changes: * After shrinking the number of threads or reducing the number of allowed CPUs, reduce the value of max_nr_cid so expansion of CID allocation will preserve cache locality if the number of threads or allowed CPUs increase again. * Only re-use a recent_cid if it is within the max_nr_cid upper bound, else find the first available CID. Fixes: 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") Signed-off-by: Mathieu Desnoyers Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Tested-by: Gabriele Monaco Link: https://lkml.kernel.org/r/20250210153253.460471-2-gmonaco@redhat.com --- include/linux/mm_types.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f9496..0234f14f2aa6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -875,10 +875,11 @@ struct mm_struct { */ unsigned int nr_cpus_allowed; /** - * @max_nr_cid: Maximum number of concurrency IDs allocated. + * @max_nr_cid: Maximum number of allowed concurrency + * IDs allocated. * - * Track the highest number of concurrency IDs allocated for the - * mm. + * Track the highest number of allowed concurrency IDs + * allocated for the mm. */ atomic_t max_nr_cid; /** -- cgit v1.2.3 From 84e009042d0f3dfe91bec60bcd208ee3f866cbcd Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 17 Feb 2025 17:08:27 +0100 Subject: nvme-tcp: add basic support for the C2HTermReq PDU Previously, the NVMe/TCP host driver did not handle the C2HTermReq PDU, instead printing "unsupported pdu type (3)" when received. This patch adds support for processing the C2HTermReq PDU, allowing the driver to print the Fatal Error Status field. 
Example of output: nvme nvme4: Received C2HTermReq (FES = Invalid PDU Header Field) Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme-tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index e07e8978d691..e435250fcb4d 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -13,6 +13,8 @@ #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 #define NVME_TCP_MIN_MAXH2CDATA 4096 +#define NVME_TCP_MIN_C2HTERM_PLEN 24 +#define NVME_TCP_MAX_C2HTERM_PLEN 152 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, -- cgit v1.2.3 From d422247d14a53fe825b1778edf104167d8fd8f3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:49:59 +0900 Subject: nvme: Cleanup the definition of the controller config register fields Reorganized the enum used to define the fields of the controller configuration (CC) register in include/linux/nvme.h to: 1) Group together all the values defined for each field. 2) Add the missing field mask definitions. 3) Add comments to describe the enum and each field.
Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdc..2dc05b1c3283 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,28 +199,54 @@ enum { #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 +/* + * Controller Configuration (CC) register (Offset 14h) + */ enum { + /* Enable (EN): bit 0 */ NVME_CC_ENABLE = 1 << 0, NVME_CC_EN_SHIFT = 0, + + /* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */ + + /* I/O Command Set Selected (CSS): bits 06:04 */ NVME_CC_CSS_SHIFT = 4, - NVME_CC_MPS_SHIFT = 7, - NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, - NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, + + /* Memory Page Size (MPS): bits 10:07 */ + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xf << NVME_CC_MPS_SHIFT, + + /* Arbitration Mechanism Selected (AMS): bits 13:11 */ + NVME_CC_AMS_SHIFT = 11, + NVME_CC_AMS_MASK = 7 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + + /* Shutdown Notification (SHN): bits 15:14 */ + NVME_CC_SHN_SHIFT = 14, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, - NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + /* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */ + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOSQES_MASK = 0xf << NVME_CC_IOSQES_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + + /* I/O Completion 
Queue Entry Size (IOCQES): bits 23:20 */ + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_IOCQES_MASK = 0xf << NVME_CC_IOCQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + + /* Controller Ready Independent of Media Enable (CRIME): bit 24 */ NVME_CC_CRIME = 1 << 24, + + /* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */ }; enum { -- cgit v1.2.3 From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:30 -0800 Subject: net: Add non-RCU dev_getbyhwaddr() helper Add dedicated helper for finding devices by hardware address when holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not. Extract common address comparison logic into dev_addr_cmp(). The context about this change could be found in the following discussion: Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/ Cc: kuniyu@amazon.com Cc: ushankar@purestorage.com Suggested-by: Eric Dumazet Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..94b7d4eca003 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); -- cgit v1.2.3 From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 
17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:40 +0100 Subject: Revert "net: skb: introduce and use a single page frag cache" After the previous commit, it is finally safe to revert commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"): do it here. The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under a flood of small UDP packets, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to recent updates in the affected code.
Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94b7d4eca003..ab550a89b9bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) -- cgit v1.2.3