From 200f091c95bbc4b8660636bd345805c45d6eced7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 28 Sep 2024 14:08:31 -0700 Subject: coredump: Do not lock during 'comm' reporting The 'comm' member will always be NUL terminated, and this is not fast-path, so we can just perform a direct memcpy during a coredump instead of potentially deadlocking while holding the task struct lock. Reported-by: Vegard Nossum Closes: https://lore.kernel.org/all/d122ece6-3606-49de-ae4d-8da88846bef2@oracle.com Fixes: c114e9948c2b ("coredump: Standartize and fix logging") Tested-by: Vegard Nossum Link: https://lore.kernel.org/r/20240928210830.work.307-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/coredump.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 45e598fe3476..77e6e195d1d6 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -52,8 +52,8 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); #define __COREDUMP_PRINTK(Level, Format, ...) \ do { \ char comm[TASK_COMM_LEN]; \ - \ - get_task_comm(comm, current); \ + /* This will always be NUL terminated. */ \ + memcpy(comm, current->comm, sizeof(comm)); \ printk_ratelimited(Level "coredump: %d(%*pE): " Format "\n", \ task_tgid_vnr(current), (int)strlen(comm), comm, ##__VA_ARGS__); \ } while (0) \ -- cgit v1.2.3 From b9c44b91476b67327a521568a854babecc4070ab Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:07 -0700 Subject: perf/core: Save raw sample data conditionally based on sample type Currently, space for raw sample data is always allocated within sample records for both BPF output and tracepoint events. This leads to unused space in sample records when raw sample data is not requested. This patch enforces checking sample type of an event in perf_sample_save_raw_data(). So raw sample data will only be saved if explicitly requested, reducing overhead when it is not needed. Fixes: 0a9081cf0a11 ("perf/core: Add perf_sample_save_raw_data() helper") Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-2-yabinc@google.com --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cb99ec8c9e96..f7c0a3f2f502 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1287,12 +1287,18 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, + struct perf_event *event, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; + if (!(event->attr.sample_type & PERF_SAMPLE_RAW)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW)) + return; + do { sum += frag->size; if (perf_raw_frag_last(frag)) -- cgit v1.2.3 From f226805bc5f60adf03783d8e4cbfe303ccecd64e Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:08 -0700 Subject: perf/core: Check sample_type in perf_sample_save_callchain Check sample_type in perf_sample_save_callchain() to prevent saving callchain data when it isn't required. 
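(Illustrative sketch only, not part of this patch: with the sample_type check now inside the helper, a hypothetical PMU overflow handler can call it unconditionally, and the callchain is captured only when the event actually requested PERF_SAMPLE_CALLCHAIN.)

    static void example_overflow_handler(struct perf_event *event,
                                         struct perf_sample_data *data,
                                         struct pt_regs *regs)
    {
            /* No sample_type check needed at the call site any more;
             * the helper returns early unless PERF_SAMPLE_CALLCHAIN
             * is set for this event. */
            perf_sample_save_callchain(data, event, regs);
    }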
Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-3-yabinc@google.com --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f7c0a3f2f502..3ac202d971fb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1279,6 +1279,11 @@ static inline void perf_sample_save_callchain(struct perf_sample_data *data, { int size = 1; + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) + return; + data->callchain = perf_callchain(event, regs); size += data->callchain->nr; -- cgit v1.2.3 From faac6f105ef169e2e5678c14e1ffebf2a7d780b6 Mon Sep 17 00:00:00 2001 From: Yabin Cui Date: Wed, 15 May 2024 12:36:09 -0700 Subject: perf/core: Check sample_type in perf_sample_save_brstack Check sample_type in perf_sample_save_brstack() to prevent saving branch stack data when it isn't required. Suggested-by: Namhyung Kim Signed-off-by: Yabin Cui Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Acked-by: Namhyung Kim Link: https://lore.kernel.org/r/20240515193610.2350456-4-yabinc@google.com --- include/linux/perf_event.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3ac202d971fb..bf831b1485ff 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1320,6 +1320,11 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, data->sample_flags |= PERF_SAMPLE_RAW; } +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, @@ -1327,6 +1332,11 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, { int size = sizeof(u64); /* nr */ + if (!has_branch_stack(event)) + return; + if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK)) + return; + if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); @@ -1716,11 +1726,6 @@ static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif -static inline bool has_branch_stack(struct perf_event *event) -{ - return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; -} - static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; -- cgit v1.2.3 From 2815a56e4b7252a836969f5674ee356ea1ce482c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 14 Nov 2024 10:26:17 -0500 Subject: x86/mm/tlb: Add tracepoint for TLB flush IPI to stale CPU Add a tracepoint when we send a TLB flush IPI to a CPU that used to be in the mm_cpumask, but isn't any more. 
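(Sketch only; the actual consumer of the new enum value lives in the x86 TLB flush code rather than in this include/ change. Roughly, a flush path could report the case of an IPI aimed at a CPU that already left the mm_cpumask via the existing tlb_flush tracepoint:)

    /* hypothetical call site in the remote flush path */
    if (!cpumask_test_cpu(cpu, mm_cpumask(mm)))
            trace_tlb_flush(TLB_REMOTE_WRONG_CPU, TLB_FLUSH_ALL);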
Suggested-by: Dave Hansen Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241114152723.1294686-3-riel@surriel.com --- include/linux/mm_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..6b6f05404304 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1335,6 +1335,7 @@ enum tlb_flush_reason { TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, TLB_REMOTE_SEND_IPI, + TLB_REMOTE_WRONG_CPU, NR_TLB_FLUSH_REASONS, }; -- cgit v1.2.3 From 0a499a7e9819e7a0980408f18df68160a0b55f2e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:26 -0800 Subject: lib/crc32: drop leading underscores from __crc32c_le_base Remove the leading underscores from __crc32c_le_base(). This is in preparation for adding crc32c_le_arch() and eventually renaming __crc32c_le() to crc32c_le(). Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 87f788c0d607..5b07fc9081c4 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -39,7 +39,7 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) } u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); -u32 __pure __crc32c_le_base(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len); /** * __crc32c_le_combine - Combine two crc32c check values into one. For two -- cgit v1.2.3 From d36cebe03c3ae4ea1fde20cfc797fab8729c3ab5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:27 -0800 Subject: lib/crc32: improve support for arch-specific overrides Currently the CRC32 library functions are defined as weak symbols, and the arm64 and riscv architectures override them. This method of arch-specific overrides has the limitation that it only works when both the base and arch code is built-in. Also, it makes the arch-specific code be silently not used if it is accidentally built with lib-y instead of obj-y; unfortunately the RISC-V code does this. This commit reorganizes the code to have explicit *_arch() functions that are called when they are enabled, similar to how some of the crypto library code works (e.g. chacha_crypt() calls chacha_crypt_arch()). Make the existing kconfig choice for the CRC32 implementation also control whether the arch-optimized implementation (if one is available) is enabled or not. Make it enabled by default if CRC32 is also enabled. The result is that arch-optimized CRC32 library functions will be included automatically when appropriate, but it is now possible to disable them. They can also now be built as a loadable module if the CRC32 library functions happen to be used only by loadable modules, in which case the arch and base CRC32 modules will be automatically loaded via direct symbol dependency when appropriate. 
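(Not from this patch; a minimal sketch of how an architecture is expected to plug in under the new scheme, assuming a hypothetical crc32_le_hw() assembly helper and a CPU-feature static key. The arch selects CONFIG_CRC32_ARCH, exports crc32_le_arch() and friends, and falls back to the generic *_base() code when the needed instructions are absent:)

    static DEFINE_STATIC_KEY_FALSE(have_crc_insns);

    u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len)
    {
            if (static_branch_likely(&have_crc_insns))
                    return crc32_le_hw(crc, p, len); /* hypothetical asm helper */
            return crc32_le_base(crc, p, len);       /* generic fallback */
    }
    EXPORT_SYMBOL(crc32_le_arch);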
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 5b07fc9081c4..58c632533b08 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -8,10 +8,34 @@ #include #include -u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32_le_base(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32_be_base(u32 crc, unsigned char const *p, size_t len); +u32 __pure crc32_le_arch(u32 crc, const u8 *p, size_t len); +u32 __pure crc32_le_base(u32 crc, const u8 *p, size_t len); +u32 __pure crc32_be_arch(u32 crc, const u8 *p, size_t len); +u32 __pure crc32_be_base(u32 crc, const u8 *p, size_t len); +u32 __pure crc32c_le_arch(u32 crc, const u8 *p, size_t len); +u32 __pure crc32c_le_base(u32 crc, const u8 *p, size_t len); + +static inline u32 __pure crc32_le(u32 crc, const u8 *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC32_ARCH)) + return crc32_le_arch(crc, p, len); + return crc32_le_base(crc, p, len); +} + +static inline u32 __pure crc32_be(u32 crc, const u8 *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC32_ARCH)) + return crc32_be_arch(crc, p, len); + return crc32_be_base(crc, p, len); +} + +/* TODO: leading underscores should be dropped once callers have been updated */ +static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len) +{ + if (IS_ENABLED(CONFIG_CRC32_ARCH)) + return crc32c_le_arch(crc, p, len); + return crc32c_le_base(crc, p, len); +} /** * crc32_le_combine - Combine two crc32 check values into one. For two @@ -38,9 +62,6 @@ static inline u32 crc32_le_combine(u32 crc1, u32 crc2, size_t len2) return crc32_le_shift(crc1, len2) ^ crc2; } -u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len); -u32 __pure crc32c_le_base(u32 crc, unsigned char const *p, size_t len); - /** * __crc32c_le_combine - Combine two crc32c check values into one. For two * sequences of bytes, seq1 and seq2 with lengths len1 -- cgit v1.2.3 From b5ae12e0ee099e4c458f7814f0317f4e2cbf105e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:28 -0800 Subject: lib/crc32: expose whether the lib is really optimized at runtime Make the CRC32 library export a function crc32_optimizations() which returns flags that indicate which CRC32 functions are actually executing optimized code at runtime. This will be used to determine whether the crc32[c]-$arch shash algorithms should be registered in the crypto API. btrfs could also start using these flags instead of the hack that it currently uses where it parses the crypto_shash_driver_name. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/crc32.h b/include/linux/crc32.h index 58c632533b08..e9bd40056687 100644 --- a/include/linux/crc32.h +++ b/include/linux/crc32.h @@ -37,6 +37,21 @@ static inline u32 __pure __crc32c_le(u32 crc, const u8 *p, size_t len) return crc32c_le_base(crc, p, len); } +/* + * crc32_optimizations() returns flags that indicate which CRC32 library + * functions are using architecture-specific optimizations. 
Unlike + * IS_ENABLED(CONFIG_CRC32_ARCH) it takes into account the different CRC32 + * variants and also whether any needed CPU features are available at runtime. + */ +#define CRC32_LE_OPTIMIZATION BIT(0) /* crc32_le() is optimized */ +#define CRC32_BE_OPTIMIZATION BIT(1) /* crc32_be() is optimized */ +#define CRC32C_OPTIMIZATION BIT(2) /* __crc32c_le() is optimized */ +#if IS_ENABLED(CONFIG_CRC32_ARCH) +u32 crc32_optimizations(void); +#else +static inline u32 crc32_optimizations(void) { return 0; } +#endif + /** * crc32_le_combine - Combine two crc32 check values into one. For two * sequences of bytes, seq1 and seq2 with lengths len1 -- cgit v1.2.3 From 38a9a5121c3bcf2ed857430a92e493568b247c35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:40 -0800 Subject: lib/crc32: make crc32c() go directly to lib Now that the lower level __crc32c_le() library function is optimized for each architecture, make crc32c() just call that instead of taking an inefficient and error-prone detour through the shash API. Note: a future cleanup should make crc32c_le() be the actual library function instead of __crc32c_le(). That will require updating callers of __crc32c_le() to use crc32c_le() instead, and updating callers of crc32c_le() that expect a 'const void *' arg to expect 'const u8 *' instead. Similarly, a future cleanup should remove LIBCRC32C by making everyone who is selecting it just select CRC32 directly instead. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20241202010844.144356-16-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc32c.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h index 357ae4611a45..47eb78003c26 100644 --- a/include/linux/crc32c.h +++ b/include/linux/crc32c.h @@ -2,9 +2,12 @@ #ifndef _LINUX_CRC32C_H #define _LINUX_CRC32C_H -#include +#include -extern u32 crc32c(u32 crc, const void *address, unsigned int length); +static inline u32 crc32c(u32 crc, const void *address, unsigned int length) +{ + return __crc32c_le(crc, address, length); +} /* This macro exists for backwards-compatibility. */ #define crc32c_le crc32c -- cgit v1.2.3 From dd348f054b24a3f57cbcdc2c8e7ebc22c62eb72f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:42 -0800 Subject: jbd2: switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Reviewed-by: Ard Biesheuvel Reviewed-by: Darrick J. Wong Acked-by: Theodore Ts'o Link: https://lore.kernel.org/r/20241202010844.144356-18-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/jbd2.h | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50f7ea8714bf..561025b4f3d9 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #endif #define journal_oom_retry 1 @@ -1241,13 +1241,6 @@ struct journal_s */ void *j_private; - /** - * @j_chksum_driver: - * - * Reference to checksum algorithm driver via cryptoapi. 
- */ - struct crypto_shash *j_chksum_driver; - /** * @j_csum_seed: * @@ -1750,10 +1743,7 @@ static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) { - WARN_ON_ONCE(jbd2_journal_has_csum_v2or3_feature(journal) && - journal->j_chksum_driver == NULL); - - return journal->j_chksum_driver != NULL; + return jbd2_journal_has_csum_v2or3_feature(journal); } static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) @@ -1790,27 +1780,10 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) #define BJ_Reserved 4 /* Buffer is reserved for access by journal */ #define BJ_Types 5 -/* JBD uses a CRC32 checksum */ -#define JBD_MAX_CHECKSUM_SIZE 4 - static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { - DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx, - DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE, - sizeof(*((struct shash_desc *)0)->__ctx))); - int err; - - BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > - JBD_MAX_CHECKSUM_SIZE); - - desc->tfm = journal->j_chksum_driver; - *(u32 *)desc->__ctx = crc; - - err = crypto_shash_update(desc, address, length); - BUG_ON(err); - - return *(u32 *)desc->__ctx; + return crc32c(crc, address, length); } /* Return most recent uncommitted transaction */ -- cgit v1.2.3 From 31e4cdde4d8b4bc358f3e6b44647ead3cba13aba Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:08:44 -0800 Subject: scsi: target: iscsi: switch to using the crc32c library Now that the crc32c() library function directly takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32c(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241202010844.144356-20-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/target/iscsi/iscsi_target_core.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h index 60af7c63b34e..51ca80abacf7 100644 --- a/include/target/iscsi/iscsi_target_core.h +++ b/include/target/iscsi/iscsi_target_core.h @@ -576,9 +576,6 @@ struct iscsit_conn { spinlock_t state_lock; spinlock_t login_timer_lock; spinlock_t login_worker_lock; - /* libcrypto RX and TX contexts for crc32c */ - struct ahash_request *conn_rx_hash; - struct ahash_request *conn_tx_hash; /* Used for scheduling TX and RX connection kthreads */ cpumask_var_t conn_cpumask; cpumask_var_t allowed_cpumask; -- cgit v1.2.3 From be3c45b070cba3be4dd248b38d4798e3e2859451 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:20:45 -0800 Subject: lib/crc-t10dif: stop wrapping the crypto API In preparation for making the CRC-T10DIF library directly optimized for each architecture, like what has been done for CRC32, get rid of the weird layering where crc_t10dif_update() calls into the crypto API. Instead, move crc_t10dif_generic() into the crc-t10dif library module, and make crc_t10dif_update() just call crc_t10dif_generic(). Acceleration will be reintroduced via crc_t10dif_arch() in the following patches. Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20241202012056.209768-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc-t10dif.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index 6bb0c0bf357b..206ba2305483 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -6,11 +6,17 @@ #define CRC_T10DIF_DIGEST_SIZE 2 #define CRC_T10DIF_BLOCK_SIZE 1 -#define CRC_T10DIF_STRING "crct10dif" -extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, - size_t len); -extern __u16 crc_t10dif(unsigned char const *, size_t); -extern __u16 crc_t10dif_update(__u16 crc, unsigned char const *, size_t); +u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len); + +static inline u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len) +{ + return crc_t10dif_generic(crc, p, len); +} + +static inline u16 crc_t10dif(const u8 *p, size_t len) +{ + return crc_t10dif_update(0, p, len); +} #endif -- cgit v1.2.3 From 0961c3bcefa64d5f0999e2b703391862c733bb52 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Dec 2024 17:20:46 -0800 Subject: lib/crc-t10dif: add support for arch overrides Following what was done for CRC32, add support for architecture-specific override of the CRC-T10DIF library. This will allow the CRC-T10DIF library functions to access architecture-optimized code directly. Reviewed-by: Ard Biesheuvel Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241202012056.209768-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/crc-t10dif.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h index 206ba2305483..16787c1cee21 100644 --- a/include/linux/crc-t10dif.h +++ b/include/linux/crc-t10dif.h @@ -7,10 +7,13 @@ #define CRC_T10DIF_DIGEST_SIZE 2 #define CRC_T10DIF_BLOCK_SIZE 1 +u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len); u16 crc_t10dif_generic(u16 crc, const u8 *p, size_t len); static inline u16 crc_t10dif_update(u16 crc, const u8 *p, size_t len) { + if (IS_ENABLED(CONFIG_CRC_T10DIF_ARCH)) + return crc_t10dif_arch(crc, p, len); return crc_t10dif_generic(crc, p, len); } @@ -19,4 +22,13 @@ static inline u16 crc_t10dif(const u8 *p, size_t len) return crc_t10dif_update(0, p, len); } +#if IS_ENABLED(CONFIG_CRC_T10DIF_ARCH) +bool crc_t10dif_is_optimized(void); +#else +static inline bool crc_t10dif_is_optimized(void) +{ + return false; +} +#endif + #endif -- cgit v1.2.3 From f4d3d7340e719dd3d2c23ce8d6c360e2f93ba7e4 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Tue, 22 Oct 2024 17:22:52 +0530 Subject: dt-bindings: clock: qcom: Add QCS615 GCC clocks Add device tree bindings for global clock controller on QCS615 SoCs. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20241022-qcs615-clock-driver-v4-3-3d716ad0d987@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,qcs615-gcc.h | 211 ++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,qcs615-gcc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,qcs615-gcc.h b/include/dt-bindings/clock/qcom,qcs615-gcc.h new file mode 100644 index 000000000000..9704091636b8 --- /dev/null +++ b/include/dt-bindings/clock/qcom,qcs615-gcc.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GCC_QCS615_H +#define _DT_BINDINGS_CLK_QCOM_GCC_QCS615_H + +/* GCC clocks */ +#define GPLL0_OUT_AUX2_DIV 0 +#define GPLL3_OUT_AUX2_DIV 1 +#define GPLL0 2 +#define GPLL3 3 +#define GPLL4 4 +#define GPLL6 5 +#define GPLL6_OUT_MAIN 6 +#define GPLL7 7 +#define GPLL8 8 +#define GPLL8_OUT_MAIN 9 +#define GCC_AGGRE_UFS_PHY_AXI_CLK 10 +#define GCC_AGGRE_USB2_SEC_AXI_CLK 11 +#define GCC_AGGRE_USB3_PRIM_AXI_CLK 12 +#define GCC_AHB2PHY_EAST_CLK 13 +#define GCC_AHB2PHY_WEST_CLK 14 +#define GCC_BOOT_ROM_AHB_CLK 15 +#define GCC_CAMERA_AHB_CLK 16 +#define GCC_CAMERA_HF_AXI_CLK 17 +#define GCC_CAMERA_XO_CLK 18 +#define GCC_CE1_AHB_CLK 19 +#define GCC_CE1_AXI_CLK 20 +#define GCC_CE1_CLK 21 +#define GCC_CFG_NOC_USB2_SEC_AXI_CLK 22 +#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK 23 +#define GCC_CPUSS_AHB_CLK 24 +#define GCC_CPUSS_AHB_CLK_SRC 25 +#define GCC_CPUSS_GNOC_CLK 26 +#define GCC_DDRSS_GPU_AXI_CLK 27 +#define GCC_DISP_AHB_CLK 28 +#define GCC_DISP_GPLL0_DIV_CLK_SRC 29 +#define GCC_DISP_HF_AXI_CLK 30 +#define GCC_DISP_XO_CLK 31 +#define GCC_EMAC_AXI_CLK 32 +#define GCC_EMAC_PTP_CLK 33 +#define GCC_EMAC_PTP_CLK_SRC 34 +#define GCC_EMAC_RGMII_CLK 35 +#define GCC_EMAC_RGMII_CLK_SRC 36 +#define GCC_EMAC_SLV_AHB_CLK 37 +#define GCC_GP1_CLK 38 +#define GCC_GP1_CLK_SRC 39 +#define GCC_GP2_CLK 40 +#define GCC_GP2_CLK_SRC 41 +#define GCC_GP3_CLK 42 +#define GCC_GP3_CLK_SRC 43 +#define GCC_GPU_CFG_AHB_CLK 44 +#define GCC_GPU_GPLL0_CLK_SRC 45 +#define GCC_GPU_GPLL0_DIV_CLK_SRC 46 +#define GCC_GPU_IREF_CLK 47 +#define GCC_GPU_MEMNOC_GFX_CLK 48 +#define GCC_GPU_SNOC_DVM_GFX_CLK 49 +#define GCC_PCIE0_PHY_REFGEN_CLK 50 +#define GCC_PCIE_0_AUX_CLK 51 +#define GCC_PCIE_0_AUX_CLK_SRC 52 +#define GCC_PCIE_0_CFG_AHB_CLK 53 +#define GCC_PCIE_0_CLKREF_CLK 54 +#define GCC_PCIE_0_MSTR_AXI_CLK 55 +#define GCC_PCIE_0_PIPE_CLK 56 +#define GCC_PCIE_0_SLV_AXI_CLK 57 +#define GCC_PCIE_0_SLV_Q2A_AXI_CLK 58 +#define GCC_PCIE_PHY_AUX_CLK 59 +#define GCC_PCIE_PHY_REFGEN_CLK_SRC 60 +#define GCC_PDM2_CLK 61 +#define GCC_PDM2_CLK_SRC 62 +#define GCC_PDM_AHB_CLK 63 +#define GCC_PDM_XO4_CLK 64 +#define GCC_PRNG_AHB_CLK 65 +#define GCC_QMIP_CAMERA_NRT_AHB_CLK 66 +#define GCC_QMIP_DISP_AHB_CLK 67 +#define GCC_QMIP_PCIE_AHB_CLK 68 +#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK 69 +#define GCC_QSPI_CNOC_PERIPH_AHB_CLK 70 +#define GCC_QSPI_CORE_CLK 71 +#define GCC_QSPI_CORE_CLK_SRC 72 +#define GCC_QUPV3_WRAP0_CORE_2X_CLK 73 +#define GCC_QUPV3_WRAP0_CORE_CLK 74 +#define GCC_QUPV3_WRAP0_S0_CLK 75 +#define GCC_QUPV3_WRAP0_S0_CLK_SRC 76 +#define GCC_QUPV3_WRAP0_S1_CLK 77 +#define GCC_QUPV3_WRAP0_S1_CLK_SRC 78 +#define GCC_QUPV3_WRAP0_S2_CLK 79 +#define GCC_QUPV3_WRAP0_S2_CLK_SRC 80 +#define GCC_QUPV3_WRAP0_S3_CLK 81 +#define GCC_QUPV3_WRAP0_S3_CLK_SRC 82 +#define 
GCC_QUPV3_WRAP0_S4_CLK 83 +#define GCC_QUPV3_WRAP0_S4_CLK_SRC 84 +#define GCC_QUPV3_WRAP0_S5_CLK 85 +#define GCC_QUPV3_WRAP0_S5_CLK_SRC 86 +#define GCC_QUPV3_WRAP1_CORE_2X_CLK 87 +#define GCC_QUPV3_WRAP1_CORE_CLK 88 +#define GCC_QUPV3_WRAP1_S0_CLK 89 +#define GCC_QUPV3_WRAP1_S0_CLK_SRC 90 +#define GCC_QUPV3_WRAP1_S1_CLK 91 +#define GCC_QUPV3_WRAP1_S1_CLK_SRC 92 +#define GCC_QUPV3_WRAP1_S2_CLK 93 +#define GCC_QUPV3_WRAP1_S2_CLK_SRC 94 +#define GCC_QUPV3_WRAP1_S3_CLK 95 +#define GCC_QUPV3_WRAP1_S3_CLK_SRC 96 +#define GCC_QUPV3_WRAP1_S4_CLK 97 +#define GCC_QUPV3_WRAP1_S4_CLK_SRC 98 +#define GCC_QUPV3_WRAP1_S5_CLK 99 +#define GCC_QUPV3_WRAP1_S5_CLK_SRC 100 +#define GCC_QUPV3_WRAP_0_M_AHB_CLK 101 +#define GCC_QUPV3_WRAP_0_S_AHB_CLK 102 +#define GCC_QUPV3_WRAP_1_M_AHB_CLK 103 +#define GCC_QUPV3_WRAP_1_S_AHB_CLK 104 +#define GCC_RX1_USB2_CLKREF_CLK 105 +#define GCC_RX3_USB2_CLKREF_CLK 106 +#define GCC_SDCC1_AHB_CLK 107 +#define GCC_SDCC1_APPS_CLK 108 +#define GCC_SDCC1_APPS_CLK_SRC 109 +#define GCC_SDCC1_ICE_CORE_CLK 110 +#define GCC_SDCC1_ICE_CORE_CLK_SRC 111 +#define GCC_SDCC2_AHB_CLK 112 +#define GCC_SDCC2_APPS_CLK 113 +#define GCC_SDCC2_APPS_CLK_SRC 114 +#define GCC_SDR_CORE_CLK 115 +#define GCC_SDR_CSR_HCLK 116 +#define GCC_SDR_PRI_MI2S_CLK 117 +#define GCC_SDR_SEC_MI2S_CLK 118 +#define GCC_SDR_WR0_MEM_CLK 119 +#define GCC_SDR_WR1_MEM_CLK 120 +#define GCC_SDR_WR2_MEM_CLK 121 +#define GCC_SYS_NOC_CPUSS_AHB_CLK 122 +#define GCC_UFS_CARD_CLKREF_CLK 123 +#define GCC_UFS_MEM_CLKREF_CLK 124 +#define GCC_UFS_PHY_AHB_CLK 125 +#define GCC_UFS_PHY_AXI_CLK 126 +#define GCC_UFS_PHY_AXI_CLK_SRC 127 +#define GCC_UFS_PHY_ICE_CORE_CLK 128 +#define GCC_UFS_PHY_ICE_CORE_CLK_SRC 129 +#define GCC_UFS_PHY_PHY_AUX_CLK 130 +#define GCC_UFS_PHY_PHY_AUX_CLK_SRC 131 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK 132 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK 133 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK 134 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC 135 +#define GCC_USB20_SEC_MASTER_CLK 136 +#define GCC_USB20_SEC_MASTER_CLK_SRC 137 +#define GCC_USB20_SEC_MOCK_UTMI_CLK 138 +#define GCC_USB20_SEC_MOCK_UTMI_CLK_SRC 139 +#define GCC_USB20_SEC_SLEEP_CLK 140 +#define GCC_USB2_PRIM_CLKREF_CLK 141 +#define GCC_USB2_SEC_CLKREF_CLK 142 +#define GCC_USB2_SEC_PHY_AUX_CLK 143 +#define GCC_USB2_SEC_PHY_AUX_CLK_SRC 144 +#define GCC_USB2_SEC_PHY_COM_AUX_CLK 145 +#define GCC_USB2_SEC_PHY_PIPE_CLK 146 +#define GCC_USB30_PRIM_MASTER_CLK 147 +#define GCC_USB30_PRIM_MASTER_CLK_SRC 148 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK 149 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC 150 +#define GCC_USB30_PRIM_SLEEP_CLK 151 +#define GCC_USB3_PRIM_CLKREF_CLK 152 +#define GCC_USB3_PRIM_PHY_AUX_CLK 153 +#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC 154 +#define GCC_USB3_PRIM_PHY_COM_AUX_CLK 155 +#define GCC_USB3_PRIM_PHY_PIPE_CLK 156 +#define GCC_USB3_SEC_CLKREF_CLK 157 +#define GCC_VIDEO_AHB_CLK 158 +#define GCC_VIDEO_AXI0_CLK 159 +#define GCC_VIDEO_XO_CLK 160 +#define GCC_VSENSOR_CLK_SRC 161 +#define GCC_AGGRE_UFS_PHY_AXI_HW_CTL_CLK 162 +#define GCC_UFS_PHY_AXI_HW_CTL_CLK 163 +#define GCC_UFS_PHY_ICE_CORE_HW_CTL_CLK 164 +#define GCC_UFS_PHY_PHY_AUX_HW_CTL_CLK 165 +#define GCC_UFS_PHY_UNIPRO_CORE_HW_CTL_CLK 166 + +/* GCC Resets */ +#define GCC_EMAC_BCR 0 +#define GCC_QUSB2PHY_PRIM_BCR 1 +#define GCC_QUSB2PHY_SEC_BCR 2 +#define GCC_USB30_PRIM_BCR 3 +#define GCC_USB2_PHY_SEC_BCR 4 +#define GCC_USB3_DP_PHY_SEC_BCR 5 +#define GCC_USB3PHY_PHY_SEC_BCR 6 +#define GCC_PCIE_0_BCR 7 +#define GCC_PCIE_0_PHY_BCR 8 +#define GCC_PCIE_PHY_BCR 9 +#define GCC_PCIE_PHY_COM_BCR 10 +#define 
GCC_UFS_PHY_BCR 11 +#define GCC_USB20_SEC_BCR 12 +#define GCC_USB3_PHY_PRIM_SP0_BCR 13 +#define GCC_USB3PHY_PHY_PRIM_SP0_BCR 14 +#define GCC_SDCC1_BCR 15 +#define GCC_SDCC2_BCR 16 + +/* GCC power domains */ +#define EMAC_GDSC 0 +#define PCIE_0_GDSC 1 +#define UFS_PHY_GDSC 2 +#define USB20_SEC_GDSC 3 +#define USB30_PRIM_GDSC 4 +#define HLOS1_VOTE_AGGRE_NOC_MMU_AUDIO_TBU_GDSC 5 +#define HLOS1_VOTE_AGGRE_NOC_MMU_TBU1_GDSC 6 +#define HLOS1_VOTE_AGGRE_NOC_MMU_TBU2_GDSC 7 +#define HLOS1_VOTE_AGGRE_NOC_MMU_PCIE_TBU_GDSC 8 +#define HLOS1_VOTE_MMNOC_MMU_TBU_HF0_GDSC 9 +#define HLOS1_VOTE_MMNOC_MMU_TBU_SF_GDSC 10 +#define HLOS1_VOTE_MMNOC_MMU_TBU_HF1_GDSC 11 + +#endif -- cgit v1.2.3 From 0a670e151a71434765de69590944e18c08ee08cf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:09:57 +0100 Subject: tree-wide: s/override_creds()/override_creds_light(get_new_cred())/g Convert all callers from override_creds() to override_creds_light(get_new_cred()) in preparation of making override_creds() not take a separate reference at all. Link: https://lore.kernel.org/r/20241125-work-cred-v2-1-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index e4a3155fe409..b0bc1fea9ca0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -211,9 +211,10 @@ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) * Get a reference on the specified set of new credentials. The caller must * release the reference. */ -static inline struct cred *get_new_cred(struct cred *cred) +static inline struct cred *get_new_cred(const struct cred *cred) { - return get_new_cred_many(cred, 1); + struct cred *nonconst_cred = (struct cred *) cred; + return get_new_cred_many(nonconst_cred, 1); } /** -- cgit v1.2.3 From 95c54bc81791c210b131f2b1013942487e74896f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:09:58 +0100 Subject: cred: return old creds from revert_creds_light() So we can easily convert revert_creds() callers over to drop the reference count explicitly. Link: https://lore.kernel.org/r/20241125-work-cred-v2-2-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index b0bc1fea9ca0..57cf0256ea29 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -185,9 +185,12 @@ static inline const struct cred *override_creds_light(const struct cred *overrid return old; } -static inline void revert_creds_light(const struct cred *revert_cred) +static inline const struct cred *revert_creds_light(const struct cred *revert_cred) { + const struct cred *override_cred = current->cred; + rcu_assign_pointer(current->cred, revert_cred); + return override_cred; } /** -- cgit v1.2.3 From a51a1d6bcaa345cc88e738cad468083c4e13aa3b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:00 +0100 Subject: cred: remove old {override,revert}_creds() helpers They are now unused. 
Link: https://lore.kernel.org/r/20241125-work-cred-v2-4-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 57cf0256ea29..80dcc18ef6e4 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -155,8 +155,6 @@ extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); -extern const struct cred *override_creds(const struct cred *); -extern void revert_creds(const struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); @@ -172,11 +170,6 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) cred->cap_inheritable)); } -/* - * Override creds without bumping reference count. Caller must ensure - * reference remains valid or has taken reference. Almost always not the - * interface you want. Use override_creds()/revert_creds() instead. - */ static inline const struct cred *override_creds_light(const struct cred *override_cred) { const struct cred *old = current->cred; -- cgit v1.2.3 From 6771e004b40962402d0e973fc7d2e0e61364fdfb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:01 +0100 Subject: tree-wide: s/override_creds_light()/override_creds()/g Rename all calls to override_creds_light() back to override_creds(). Link: https://lore.kernel.org/r/20241125-work-cred-v2-5-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 80dcc18ef6e4..a073e6163c4e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -170,7 +170,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) cred->cap_inheritable)); } -static inline const struct cred *override_creds_light(const struct cred *override_cred) +static inline const struct cred *override_creds(const struct cred *override_cred) { const struct cred *old = current->cred; -- cgit v1.2.3 From 51c0bcf0973a3836adfc46f30f876f412478e376 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:02 +0100 Subject: tree-wide: s/revert_creds_light()/revert_creds()/g Rename all calls to revert_creds_light() back to revert_creds(). 
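(Illustrative only, not part of this patch: the calling convention the series converges on. override_creds()/revert_creds() no longer touch the reference count, so a caller manages its own reference explicitly:)

    const struct cred *old_cred;
    struct cred *new_cred;

    new_cred = prepare_creds();            /* caller holds the only reference */
    if (!new_cred)
            return -ENOMEM;
    old_cred = override_creds(new_cred);   /* no implicit get_cred() */
    /* ... operate with new_cred as current->cred ... */
    revert_creds(old_cred);                /* no implicit put_cred() */
    put_cred(new_cred);                    /* caller drops its reference */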
Link: https://lore.kernel.org/r/20241125-work-cred-v2-6-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index a073e6163c4e..a7df1c759ef0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -178,7 +178,7 @@ static inline const struct cred *override_creds(const struct cred *override_cred return old; } -static inline const struct cred *revert_creds_light(const struct cred *revert_cred) +static inline const struct cred *revert_creds(const struct cred *revert_cred) { const struct cred *override_cred = current->cred; -- cgit v1.2.3 From 6efbb80490a545cfd9f87ebd9225879d8cdbed93 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 25 Nov 2024 15:10:25 +0100 Subject: cred: remove unused get_new_cred() This helper is not used anymore so remove it. Link: https://lore.kernel.org/r/20241125-work-cred-v2-29-68b9d38bb5b2@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index a7df1c759ef0..360f5fd3854b 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -200,19 +200,6 @@ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) return cred; } -/** - * get_new_cred - Get a reference on a new set of credentials - * @cred: The new credentials to reference - * - * Get a reference on the specified set of new credentials. The caller must - * release the reference. - */ -static inline struct cred *get_new_cred(const struct cred *cred) -{ - struct cred *nonconst_cred = (struct cred *) cred; - return get_new_cred_many(nonconst_cred, 1); -} - /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference -- cgit v1.2.3 From a6babf4cbeaaa1c97a205382cdc958571f668ea8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 26 Nov 2024 14:22:16 +0100 Subject: cred: fold get_new_cred_many() into get_cred_many() There's no need for this to be a separate helper. Link: https://lore.kernel.org/r/20241126-zaunpfahl-wovon-c3979b990a63@brauner Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 360f5fd3854b..0c3c4b16b469 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -186,20 +186,6 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred) return override_cred; } -/** - * get_new_cred_many - Get references on a new set of credentials - * @cred: The new credentials to reference - * @nr: Number of references to acquire - * - * Get references on the specified set of new credentials. The caller must - * release all acquired references. 
- */ -static inline struct cred *get_new_cred_many(struct cred *cred, int nr) -{ - atomic_long_add(nr, &cred->usage); - return cred; -} - /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference @@ -220,7 +206,8 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) if (!cred) return cred; nonconst_cred->non_rcu = 0; - return get_new_cred_many(nonconst_cred, nr); + atomic_long_add(nr, &nonconst_cred->usage); + return cred; } /* -- cgit v1.2.3 From 7863dcc72d0f4b13a641065670426435448b3d80 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 22 Nov 2024 14:24:58 +0100 Subject: pid: allow pid_max to be set per pid namespace The pid_max sysctl is a global value. For a long time the default value has been 65535 and during the pidfd discussions Linus proposed to bump pid_max by default (cf. [1]). Based on this discussion systemd started bumping pid_max to 2^22. So all new systems now run with a very high pid_max limit with some distros having also backported that change. The decision to bump pid_max is obviously correct. It just doesn't make a lot of sense nowadays to enforce such a low pid number. There's sufficient tooling available for selecting specific processes without typing really large pid numbers. In any case, there are workloads that have expectations about how large pid numbers they accept, either for historical or architectural reasons. One concrete example is the 32-bit version of Android's bionic libc which requires pid numbers less than 65536. There are workloads where it is run in a 32-bit container on a 64-bit kernel. If the host has a pid_max value greater than 65535 the libc will abort thread creation because of size assumptions of pthread_mutex_t. That's a fairly specific use-case; however, in general, workloads that are moved into containers running on a host with a new kernel and a new systemd can run into issues with large pid_max values. Obviously making assumptions about the size of the allocated pid is suboptimal but we have userspace that does it. Of course, giving containers the ability to restrict the number of processes in their respective pid namespace independent of the global limit through pid_max is something desirable in itself and comes in handy in general. Independent of motivating use-cases the existence of pid namespaces makes this also a good semantic extension and there have been prior proposals pushing in a similar direction. The trick here is to minimize the risk of regressions, which I think is doable. The fact that pid namespaces are hierarchical will help us here. What we mostly care about is that when the host sets a low pid_max limit, say (crazy number) 100, no descendant pid namespace can allocate a higher pid number in its namespace. Since pid allocation is hierarchical this can be ensured by checking each pid allocation against the pid namespace's pid_max limit. This means if the allocation in the descendant pid namespace succeeds, the ancestor pid namespace can reject it. If the ancestor pid namespace has a higher limit than the descendant pid namespace, the descendant pid namespace will reject the pid allocation. The ancestor pid namespace will obviously not care about this. 
All in all this means pid_max continues to enforce a system wide limit on the number of processes but allows pid namespaces sufficient leeway in handling workloads with assumptions about pid values and allows containers to restrict the number of processes in a pid namespace through the pid_max interface. [1]: https://lore.kernel.org/linux-api/CAHk-=wiZ40LVjnXSi9iHLE_-ZBsWFGCgdmNiYZUXn1-V5YBg2g@mail.gmail.com - rebased from 5.14-rc1 - a few fixes (missing ns_free_inum on error path, missing initialization, etc) - permission check changes in pid_table_root_permissions - unsigned int pid_max -> int pid_max (keep pid_max type as it was) - add READ_ONCE in alloc_pid() as suggested by Christian - rebased from 6.7 and take into account: * sysctl: treewide: drop unused argument ctl_table_root::set_ownership(table) * sysctl: treewide: constify ctl_table_header::ctl_table_arg * pidfd: add pidfs * tracing: Move saved_cmdline code into trace_sched_switch.c Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20241122132459.135120-2-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 --- include/linux/pid_namespace.h | 10 +++++++++- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..c800cbee584b 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -106,9 +106,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); -extern int pid_max; -extern int pid_max_min, pid_max_max; - /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f9f9931e02d6..7c67a5811199 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -30,6 +30,7 @@ struct pid_namespace { struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; + int pid_max; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; @@ -38,9 +39,14 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + struct work_struct work; +#ifdef CONFIG_SYSCTL + struct ctl_table_set set; + struct ctl_table_header *sysctls; +#if defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; @@ -117,6 +123,8 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); +int register_pidns_sysctls(struct pid_namespace *pidns); +void unregister_pidns_sysctls(struct pid_namespace *pidns); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { -- cgit v1.2.3 From 96450ead16527cbef559b5bd046182e731228f95 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin() to opens a read critical section of the given seqcount_t if the counter is even. This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. 
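(Usage sketch only; the mmap_lock speculation helpers added later in this series are the intended user. Assuming a hypothetical structure 'data' containing a seqcount_t and a value, the pattern is: enter the read section only if the counter is even, do the speculative read, then retry-check, falling back to taking the lock otherwise:)

    unsigned int seq;

    if (raw_seqcount_try_begin(&data->seqcount, seq)) {
            /* speculative, lock-free read of the protected state */
            val = READ_ONCE(data->value);
            if (!read_seqcount_retry(&data->seqcount, seq))
                    return val;     /* speculation succeeded */
    }
    /* counter was odd or a writer raced: fall back to the locked slowpath */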
Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization -- cgit v1.2.3 From eb449bd96954b1c1e491d19066cfd2a010f0aa47 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in-line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also change to be unsigned to match the type of mm_lock_seq.sequence. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R. Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com --- include/linux/mm.h | 12 +++++------ include/linux/mm_types.h | 7 ++++-- include/linux/mmap_lock.h | 55 +++++++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..ca59d165f1f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -710,7 +710,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -727,7 +727,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). 
*/ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -742,7 +742,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -750,7 +750,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -761,7 +761,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -779,7 +779,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..97e2f4fe1d6c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -697,7 +697,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -891,6 +891,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -899,7 +902,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,39 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) +static inline void mm_lock_seqcount_init(struct mm_struct *mm) { - mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. 
- * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). - */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); } + #else -static inline void vma_end_write_all(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +111,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +121,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); -- cgit v1.2.3 From 03a001b156d2da186a5618de242750d06bf81e2d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Liam R. 
Howlett Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. + */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { -- cgit v1.2.3 From 2116b349e29a2e9ba17ea2e45b31234e4b350793 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:52 +0100 Subject: objtool: Generic annotation infrastructure Avoid endless .discard.foo sections for each annotation, create a single .discard.annotate_insn section that takes an annotation type along with the instruction. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094310.932794537@infradead.org --- include/linux/objtool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b3b8d3dab52d..d98531ecc687 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -57,6 +57,13 @@ ".long 998b\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ + ".long 911b - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + #else /* __ASSEMBLY__ */ /* @@ -146,6 +153,14 @@ .popsection .endm +.macro ANNOTATE type:req +.Lhere_\@: + .pushsection .discard.annotate_insn,"M",@progbits,8 + .long .Lhere_\@ - . 
+ .long \type + .popsection +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -155,6 +170,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) +#define ASM_ANNOTATE(type) #define ANNOTATE_NOENDBR #define ASM_REACHABLE #else @@ -167,6 +183,8 @@ .endm .macro REACHABLE .endm +.macro ANNOTATE type:req +.endm #endif #endif /* CONFIG_OBJTOOL */ -- cgit v1.2.3 From 22c3d58079688b697f36d670616e463cbb14d058 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:53 +0100 Subject: objtool: Convert ANNOTATE_NOENDBR to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.042140333@infradead.org --- include/linux/objtool.h | 17 ++++------------- include/linux/objtool_types.h | 5 +++++ 2 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index d98531ecc687..b5e9c0ab4048 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -45,12 +45,6 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #endif -#define ANNOTATE_NOENDBR \ - "986: \n\t" \ - ".pushsection .discard.noendbr\n\t" \ - ".long 986b\n\t" \ - ".popsection\n\t" - #define ASM_REACHABLE \ "998:\n\t" \ ".pushsection .discard.reachable\n\t" \ @@ -64,6 +58,8 @@ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) + #else /* __ASSEMBLY__ */ /* @@ -122,13 +118,6 @@ #endif .endm -.macro ANNOTATE_NOENDBR -.Lhere_\@: - .pushsection .discard.noendbr - .long .Lhere_\@ - .popsection -.endm - /* * Use objtool to validate the entry requirement that all code paths do * VALIDATE_UNRET_END before RET. 
@@ -161,6 +150,8 @@ .popsection .endm +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 453a4f4ef39d..4884f8cf8429 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -54,4 +54,9 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_SAVE 6 #define UNWIND_HINT_TYPE_RESTORE 7 +/* + * Annotate types + */ +#define ANNOTYPE_NOENDBR 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From bf5febebd99fddfc6226a94e937d38a8d470b24e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:54 +0100 Subject: objtool: Convert ANNOTATE_RETPOLINE_SAFE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.145275669@infradead.org --- include/linux/objtool_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 4884f8cf8429..1b348361ad1d 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -58,5 +58,6 @@ struct unwind_hint { * Annotate types */ #define ANNOTYPE_NOENDBR 1 +#define ANNOTYPE_RETPOLINE_SAFE 2 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 317f2a64618c528539d17fe6957a64106087fbd2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:55 +0100 Subject: objtool: Convert instrumentation_{begin,end}() to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.245980207@infradead.org --- include/linux/instrumentation.h | 11 +++++------ include/linux/objtool.h | 12 +++++++++--- include/linux/objtool_types.h | 2 ++ 3 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index bc7babe91b2e..c8f866cf02d8 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -4,14 +4,14 @@ #ifdef CONFIG_NOINSTR_VALIDATION +#include #include /* Begin/end of an instrumentation safe region */ #define __instrumentation_begin(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - ".pushsection .discard.instr_begin\n\t" \ - ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ + __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_BEGIN)\ + : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -48,9 +48,8 @@ */ #define __instrumentation_end(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - ".pushsection .discard.instr_end\n\t" \ - ".long " __stringify(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ + __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_END) \ + : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) #else /* !CONFIG_NOINSTR_VALIDATION */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b5e9c0ab4048..89c67cd7eebe 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -51,13 +51,18 @@ ".long 998b\n\t" \ ".popsection\n\t" -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ +#define __ASM_BREF(label) label ## b + +#define __ASM_ANNOTATE(label, type) \ ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long 911b - .\n\t" \ + ".long " __stringify(label) " - .\n\t" \ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + __ASM_ANNOTATE(911b, type) + #define 
ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) #else /* __ASSEMBLY__ */ @@ -161,6 +166,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) +#define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) #define ANNOTATE_NOENDBR #define ASM_REACHABLE diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 1b348361ad1d..d4d68dd36f7a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -59,5 +59,7 @@ struct unwind_hint { */ #define ANNOTYPE_NOENDBR 1 #define ANNOTYPE_RETPOLINE_SAFE 2 +#define ANNOTYPE_INSTR_BEGIN 3 +#define ANNOTYPE_INSTR_END 4 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 18aa6118a1689b4d73c5ebbd917ae3f20c9c0db1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:56 +0100 Subject: objtool: Convert VALIDATE_UNRET_BEGIN to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.358508242@infradead.org --- include/linux/objtool.h | 9 +++------ include/linux/objtool_types.h | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 89c67cd7eebe..5f0bf8052dc7 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -130,15 +130,12 @@ * NOTE: The macro must be used at the beginning of a global symbol, otherwise * it will be ignored. */ -.macro VALIDATE_UNRET_BEGIN #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) -.Lhere_\@: - .pushsection .discard.validate_unret - .long .Lhere_\@ - . - .popsection +#define VALIDATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#else +#define VALIDATE_UNRET_BEGIN #endif -.endm .macro REACHABLE .Lhere_\@: diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index d4d68dd36f7a..16236a56364b 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -61,5 +61,6 @@ struct unwind_hint { #define ANNOTYPE_RETPOLINE_SAFE 2 #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 +#define ANNOTYPE_UNRET_BEGIN 5 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From f0cd57c35a75f152d3b31b9be3f7f413b96a6d3f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:57 +0100 Subject: objtool: Convert ANNOTATE_IGNORE_ALTERNATIVE to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.465691316@infradead.org --- include/linux/objtool_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 16236a56364b..eab15dbe1cb7 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -62,5 +62,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_BEGIN 3 #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 +#define ANNOTYPE_IGNORE_ALTS 6 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 112765ca1cb9353e71b4f5af4e6e6c4a69c28d99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:38:58 +0100 Subject: objtool: Convert ANNOTATE_INTRA_FUNCTION_CALL to ANNOTATE Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.584892071@infradead.org --- include/linux/objtool.h | 16 ++++++---------- include/linux/objtool_types.h | 1 + 2 files changed, 7 
insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 5f0bf8052dc7..42287c1e32ce 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -67,16 +67,6 @@ #else /* __ASSEMBLY__ */ -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL \ - 999: \ - .pushsection .discard.intra_function_calls; \ - .long 999b; \ - .popsection; - /* * In asm, there are two kinds of code: normal C-type callable functions and * the rest. The normal callable functions can be called by other code, and @@ -154,6 +144,12 @@ #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index eab15dbe1cb7..23d6fb6d04c7 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -63,5 +63,6 @@ struct unwind_hint { #define ANNOTYPE_INSTR_END 4 #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 +#define ANNOTYPE_INTRA_FUNCTION_CALL 7 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From bb8170067470cc7af28e4386e600b1e0a6a8956a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:00 +0100 Subject: objtool: Collect more annotations in objtool.h Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.786598147@infradead.org --- include/linux/instrumentation.h | 4 +-- include/linux/objtool.h | 80 +++++++++++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index c8f866cf02d8..bf675a8aef8a 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -10,7 +10,7 @@ /* Begin/end of an instrumentation safe region */ #define __instrumentation_begin(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_BEGIN)\ + ANNOTATE_INSTR_BEGIN(__ASM_BREF(c)) \ : : "i" (c)); \ }) #define instrumentation_begin() __instrumentation_begin(__COUNTER__) @@ -48,7 +48,7 @@ */ #define __instrumentation_end(c) ({ \ asm volatile(__stringify(c) ": nop\n\t" \ - __ASM_ANNOTATE(__ASM_BREF(c), ANNOTYPE_INSTR_END) \ + ANNOTATE_INSTR_END(__ASM_BREF(c)) \ : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 42287c1e32ce..fd487d466bb2 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -63,8 +63,6 @@ "911:\n\t" \ __ASM_ANNOTATE(911b, type) -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) - #else /* __ASSEMBLY__ */ /* @@ -113,19 +111,6 @@ #endif .endm -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. 
- */ -#if defined(CONFIG_NOINSTR_VALIDATION) && \ - (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) -#define VALIDATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#else -#define VALIDATE_UNRET_BEGIN -#endif .macro REACHABLE .Lhere_\@: @@ -142,14 +127,6 @@ .popsection .endm -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR - -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -161,16 +138,12 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) -#define ANNOTATE_NOENDBR #define ASM_REACHABLE #else -#define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE_NOENDBR -.endm .macro REACHABLE .endm .macro ANNOTATE type:req @@ -179,4 +152,57 @@ #endif /* CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. + */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. + */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) + +#else +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#endif + +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) +#define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN +#else +#define VALIDATE_UNRET_BEGIN +#endif + #endif /* _LINUX_OBJTOOL_H */ -- cgit v1.2.3 From c837de3810982cd41cd70e5170da1931439f025c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:01 +0100 Subject: unreachable: Unify Since barrier_before_unreachable() is empty for !GCC it is trivial to unify the two definitions. Less is more. 
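A minimal sketch of the pattern the unified macro serves (the function and the x86 ud2 below are illustrative, not taken from the patch):

static void __noreturn example_trap(void)
{
	asm volatile("ud2");	/* trapping instruction; control never falls through */
	unreachable();		/* suppresses fall-through warnings; after this change the
				 * same definition is used whether or not the compiler is GCC */
}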
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094311.924381359@infradead.org --- include/linux/compiler-gcc.h | 12 ------------ include/linux/compiler.h | 10 +++++++--- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d0ed9583743f..c9b58188ec61 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -52,18 +52,6 @@ */ #define barrier_before_unreachable() asm volatile("") -/* - * Mark a position in code as unreachable. This can be used to - * suppress control flow warnings after asm blocks that transfer - * control elsewhere. - */ -#define unreachable() \ - do { \ - annotate_unreachable(); \ - barrier_before_unreachable(); \ - __builtin_unreachable(); \ - } while (0) - #if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP) #define __HAVE_BUILTIN_BSWAP32__ #define __HAVE_BUILTIN_BSWAP64__ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..7be80897a62f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -141,12 +141,16 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ -#ifndef unreachable -# define unreachable() do { \ +/* + * Mark a position in code as unreachable. This can be used to + * suppress control flow warnings after asm blocks that transfer + * control elsewhere. + */ +#define unreachable() do { \ annotate_unreachable(); \ + barrier_before_unreachable(); \ __builtin_unreachable(); \ } while (0) -#endif /* * KENTRY - kernel entry point -- cgit v1.2.3 From 06e24745985c8dd0da18337503afcf2f2fdbdff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:04 +0100 Subject: objtool: Remove annotate_{,un}reachable() There are no users of annotate_reachable() left. And the annotate_unreachable() usage in unreachable() is plain wrong; it will hide dangerous fall-through code-gen. Remove both. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.235637588@infradead.org --- include/linux/compiler.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 7be80897a62f..3d9a0e483e51 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -109,35 +109,9 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL -/* - * These macros help objtool understand GCC code flow for unreachable code. - * The __COUNTER__ based labels are a hack to make each instance of the macros - * unique, to convince GCC not to merge duplicate inline asm statements. 
- */ -#define __stringify_label(n) #n - -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t"); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - -#define __annotate_unreachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_unreachable() __annotate_unreachable(__COUNTER__) - /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") - #else /* !CONFIG_OBJTOOL */ -#define annotate_reachable() -#define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ @@ -147,7 +121,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, * control elsewhere. */ #define unreachable() do { \ - annotate_unreachable(); \ barrier_before_unreachable(); \ __builtin_unreachable(); \ } while (0) -- cgit v1.2.3 From e7a174fb43d24adca066e82d1cb9fdee092d48d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:05 +0100 Subject: objtool: Convert {.UN}REACHABLE to ANNOTATE Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.353431347@infradead.org --- include/linux/objtool.h | 18 +++++++----------- include/linux/objtool_types.h | 1 + 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index fd487d466bb2..e3cb13583fba 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -111,14 +111,6 @@ #endif .endm - -.macro REACHABLE -.Lhere_\@: - .pushsection .discard.reachable - .long .Lhere_\@ - .popsection -.endm - .macro ANNOTATE type:req .Lhere_\@: .pushsection .discard.annotate_insn,"M",@progbits,8 @@ -138,14 +130,11 @@ #define STACK_FRAME_NON_STANDARD_FP(func) #define __ASM_ANNOTATE(label, type) #define ASM_ANNOTATE(type) -#define ASM_REACHABLE #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro REACHABLE -.endm .macro ANNOTATE type:req .endm #endif @@ -187,6 +176,12 @@ * it will be ignored. */ #define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used directly after an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. 
+ */ +#define ANNOTATE_REACHABLE ASM_ANNOTATE(ANNOTYPE_REACHABLE) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -196,6 +191,7 @@ #define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS #define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE #endif #if defined(CONFIG_NOINSTR_VALIDATION) && \ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index 23d6fb6d04c7..df5d9fa84dba 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -64,5 +64,6 @@ struct unwind_hint { #define ANNOTYPE_UNRET_BEGIN 5 #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 +#define ANNOTYPE_REACHABLE 8 #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From 87116ae6da034242baf06e799f9f0e2a8ee6a796 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Nov 2024 10:39:06 +0100 Subject: objtool: Fix ANNOTATE_REACHABLE to be a normal annotation Currently REACHABLE is weird for being on the instruction after the instruction it modifies. Since all REACHABLE annotations have an explicit instruction, flip them around. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20241128094312.494176035@infradead.org --- include/linux/objtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index e3cb13583fba..c722a921165b 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -177,11 +177,11 @@ */ #define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) /* - * This should be used directly after an instruction that is considered + * This should be used to refer to an instruction that is considered * terminating, like a noreturn CALL or UD2 when we know they are not -- eg * WARN using UD2. */ -#define ANNOTATE_REACHABLE ASM_ANNOTATE(ANNOTYPE_REACHABLE) +#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR -- cgit v1.2.3 From ae5c677729e99b8cb3e6252aaa9b72a92985d203 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:50 -0400 Subject: sched/core: Remove HK_TYPE_SCHED The HK_TYPE_SCHED housekeeping type is defined but not set anywhere. So any code that try to use HK_TYPE_SCHED are essentially dead code. So remove HK_TYPE_SCHED and any code that use it. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-2-longman@redhat.com --- include/linux/sched/isolation.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 2b461129d1fa..499d5e480882 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -10,7 +10,6 @@ enum hk_type { HK_TYPE_TIMER, HK_TYPE_RCU, HK_TYPE_MISC, - HK_TYPE_SCHED, HK_TYPE_TICK, HK_TYPE_DOMAIN, HK_TYPE_WQ, -- cgit v1.2.3 From 6010d245ddc9f463bbf0311ac49073a78f444755 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Oct 2024 13:52:52 -0400 Subject: sched/isolation: Consolidate housekeeping cpumasks that are always identical The housekeeping cpumasks are only set by two boot commandline parameters: "nohz_full" and "isolcpus". 
When there is more than one of "nohz_full" or "isolcpus", the extra ones must have the same CPU list or the setup will fail partially. The HK_TYPE_DOMAIN and HK_TYPE_MANAGED_IRQ types are settable by "isolcpus" only and their settings can be independent of the other types. The other housekeeping types are all set by "nohz_full" or "isolcpus=nohz" without a way to set them individually. So they all have identical cpumasks. There is actually no point in having different cpumasks for these "nohz_full" only housekeeping types. Consolidate these types to use the same cpumask by aliasing them to the same value. If there is a need to set any of them independently in the future, we can break them out to their own cpumasks again. With this change, the number of cpumasks in the housekeeping structure drops from 9 to 3. Other than that, there should be no other functional change. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20241030175253.125248-4-longman@redhat.com --- include/linux/sched/isolation.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 499d5e480882..d8501f4709b5 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -7,15 +7,21 @@ #include enum hk_type { - HK_TYPE_TIMER, - HK_TYPE_RCU, - HK_TYPE_MISC, - HK_TYPE_TICK, HK_TYPE_DOMAIN, - HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, - HK_TYPE_KTHREAD, - HK_TYPE_MAX + HK_TYPE_KERNEL_NOISE, + HK_TYPE_MAX, + + /* + * The following housekeeping types are only set by the nohz_full + * boot commandline option. So they can share the same value. + */ + HK_TYPE_TICK = HK_TYPE_KERNEL_NOISE, + HK_TYPE_TIMER = HK_TYPE_KERNEL_NOISE, + HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, + HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, + HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, + HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION -- cgit v1.2.3 From 484c997e03cec04da6f69c2c17e854b22aa0f98f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 3 Dec 2024 18:45:34 +0800 Subject: ASoC: sdw_utils: cs_amp: Assign non-overlapping TDM masks for each codec on a bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use snd_soc_dai_set_tdm_slot() on capture DAIs to prevent multiple aggregated amps from trying to send data at the same time. When the capture DAIs of multiple amps on a bus are aggregated they will all be sharing the same bit slots for transmitted audio. This would lead to bus errors if all channels on all amps were enabled, because multiple amps would be trying to send data at the same time. To prevent this, the available channels are divided between the amps on a bus so that only one amp will be sending data for each channel position. A CS35L56 has 4 TX channels, which must be split between all the amps on a bus so that no two amps are using the same channel. This is done simply by dividing by the number of amps on the bus, so that 1 amp can use all 4 channels, 2 amps can use 2 channels each, and 3 or 4 amps can only use 1 channel each. The amps are usually aggregated across multiple SoundWire buses. In this case there will be multiple cpu DAIs in the dailink. The channel mapping is used to determine which amps are on each bus. The allocation of the 4 channels is done separately for each bus (only amps on the same bus can interfere with each other). 
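A minimal sketch of the slot split described above (illustrative only; the function name and GENMASK-based layout are not taken from the driver):

#include <linux/bits.h>

/* 4 TX slots divided evenly between the amps found on one SoundWire bus:
 * 1 amp -> 0xF, 2 amps -> 0x3/0xC, 3 or 4 amps -> one slot each. */
static unsigned int example_cs_amp_tx_mask(int amp_index, int amps_on_bus)
{
	int slots_per_amp = 4 / amps_on_bus;

	return GENMASK(slots_per_amp - 1, 0) << (amp_index * slots_per_amp);
}

Each amp's mask would then be applied to its capture DAI with snd_soc_dai_set_tdm_slot(), as the patch describes.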
Signed-off-by: Richard Fitzgerald Reviewed-by: Péter Ujfalusi Reviewed-by: Liam Girdwood Reviewed-by: Ranjani Sridharan Signed-off-by: Bard Liao Link: https://patch.msgid.link/20241203104534.56719-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 0e82598e10af..36a4a1e1d8ca 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -224,6 +224,8 @@ int asoc_sdw_cs_amp_init(struct snd_soc_card *card, struct snd_soc_dai_link *dai_links, struct asoc_sdw_codec_info *info, bool playback); +int asoc_sdw_cs_spk_feedback_rtd_init(struct snd_soc_pcm_runtime *rtd, + struct snd_soc_dai *dai); /* MAXIM codec support */ int asoc_sdw_maxim_init(struct snd_soc_card *card, -- cgit v1.2.3 From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call return in ffa_setup_partitions(). This could potentially result in a race where the partition's properties is accessed while probing struct ffa_device before it is set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before the calling device_register() in ffa_device_register(). Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d0..74169dd0f659 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } -- cgit v1.2.3 From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject: linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. 
Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093..346251bf1026 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. @@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. */ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. 
* @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will -- cgit v1.2.3 From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- include/linux/platform_data/amd_qdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h index 576d952f97ed..967a6ef31cf9 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map *device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ -- cgit v1.2.3 From e500d497c16c29304907f5de8bc79a8d4904c3e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Nov 2024 19:37:56 -0800 Subject: usb: gadget: functionfs: fix spellos Fix typos in documentation as reported by codespell. 
Fixes: f0175ab51993 ("usb: gadget: f_fs: OS descriptors support") Fixes: ddf8abd25994 ("USB: f_fs: the FunctionFS driver") Signed-off-by: Randy Dunlap Cc: Michal Nazarewicz Cc: Andrzej Pietrasiewicz Cc: Greg Kroah-Hartman Cc: linux-usb@vger.kernel.org Link: https://lore.kernel.org/r/20241128033756.373517-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/functionfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index 2ebdba111a8f..beef1752e36e 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -206,7 +206,7 @@ struct usb_ffs_dmabuf_transfer_req { * +-----+-----------------+------+--------------------------+ * | off | name | type | description | * +-----+-----------------+------+--------------------------+ - * | 0 | inteface | U8 | related interface number | + * | 0 | interface | U8 | related interface number | * +-----+-----------------+------+--------------------------+ * | 1 | dwLength | U32 | length of the descriptor | * +-----+-----------------+------+--------------------------+ @@ -224,7 +224,7 @@ struct usb_ffs_dmabuf_transfer_req { * +-----+-----------------+------+--------------------------+ * | off | name | type | description | * +-----+-----------------+------+--------------------------+ - * | 0 | inteface | U8 | related interface number | + * | 0 | interface | U8 | related interface number | * +-----+-----------------+------+--------------------------+ * | 1 | dwLength | U32 | length of the descriptor | * +-----+-----------------+------+--------------------------+ @@ -237,7 +237,7 @@ struct usb_ffs_dmabuf_transfer_req { * | 11 | ExtProp[] | | list of ext. prop. d. | * +-----+-----------------+------+--------------------------+ * - * ExtCompat[] is an array of valid Extended Compatiblity descriptors + * ExtCompat[] is an array of valid Extended Compatibility descriptors * which have the following format: * * +-----+-----------------------+------+-------------------------------------+ @@ -295,7 +295,7 @@ struct usb_functionfs_strings_head { * | 16 | stringtab | StringTab[lang_count] | table of strings per lang | * * For each language there is one stringtab entry (ie. there are lang_count - * stringtab entires). Each StringTab has following format: + * stringtab entries). Each StringTab has following format: * * | off | name | type | description | * |-----+---------+-------------------+------------------------------------| -- cgit v1.2.3 From f42d22d3f79639c1b4e41daf28dad2505d6a5a8b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 22 Nov 2024 09:42:25 +0100 Subject: wifi: cfg80211: define and use wiphy guard Define a guard for the wiphy mutex, and use it in most code in cfg80211, though not all due to some interaction with RTNL and/or indentation. 
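A minimal sketch of the resulting pattern (example_locked_work() is an assumed helper, not cfg80211 code): code that used to pair wiphy_lock()/wiphy_unlock() can rely on scope-based cleanup instead:

static int example_wiphy_op(struct wiphy *wiphy)
{
	guard(wiphy)(wiphy);	/* takes wiphy->mtx; dropped automatically on every return path */

	return example_locked_work(wiphy);
}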
Suggested-by: Jeff Johnson Reviewed-by: Jeff Johnson Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20241122094225.88765cbaab65.I610c9b14f36902e75e1d13f0db29f8bef2298804@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 27acf1292a5c..63e79a22a214 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6031,6 +6031,10 @@ static inline void wiphy_unlock(struct wiphy *wiphy) mutex_unlock(&wiphy->mtx); } +DEFINE_GUARD(wiphy, struct wiphy *, + mutex_lock(&_T->mtx), + mutex_unlock(&_T->mtx)) + struct wiphy_work; typedef void (*wiphy_work_func_t)(struct wiphy *, struct wiphy_work *); -- cgit v1.2.3 From d9b4067aef50aa7a13d1a9fbb4e35bdea6804ff3 Mon Sep 17 00:00:00 2001 From: Duan Chenghao Date: Thu, 24 Oct 2024 10:40:38 +0800 Subject: USB: Fix the issue of task recovery failure caused by USB status when S4 wakes up When a device is inserted into the USB port and an S4 wakeup is initiated, after the USB-hub initialization is completed, it will automatically enter suspend mode. Upon detecting a device on the USB port, it will proceed with resume and set the hcd to the HCD_FLAG_WAKEUP_PENDING state. During the S4 wakeup process, peripherals are put into suspend mode, followed by task recovery. However, upon detecting that the hcd is in the HCD_FLAG_WAKEUP_PENDING state, it will return an EBUSY status, causing the S4 suspend to fail and subsequent task recovery to not proceed. - [ 27.594598][ 1] PM: pci_pm_freeze(): hcd_pci_suspend+0x0/0x28 returns -16 [ 27.594601][ 1] PM: dpm_run_callback(): pci_pm_freeze+0x0/0x100 returns -16 [ 27.603420][ 1] ehci-pci 0000:00:04.1: pci_pm_freeze+0x0/0x100 returned 0 after 3 usecs [ 27.612233][ 1] ehci-pci 0000:00:05.1: pci_pm_freeze+0x0/0x100 returned -16 after 17223 usecs [ 27.810067][ 1] PM: Device 0000:00:05.1 failed to quiesce async: error -16 [ 27.816988][ 1] PM: quiesce of devices aborted after 1833.282 msecs [ 27.823302][ 1] PM: start quiesce of devices aborted after 1839.975 msecs ...... [ 31.303172][ 1] PM: recover of devices complete after 3473.039 msecs [ 31.309818][ 1] PM: Failed to load hibernation image, recovering. [ 31.348188][ 1] PM: Basic memory bitmaps freed [ 31.352686][ 1] OOM killer enabled. [ 31.356232][ 1] Restarting tasks ... done. [ 31.360609][ 1] PM: resume from hibernation failed (0) [ 31.365800][ 1] PM: Hibernation image not present or could not be loaded. The "do_wakeup" is determined based on whether the controller's power/wakeup attribute is set. The current issue necessitates considering the type of suspend that is occurring. If the suspend type is either PM_EVENT_FREEZE or PM_EVENT_QUIESCE, then "do_wakeup" should be set to false. 
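A minimal sketch of that decision, using the PMSG_NO_WAKEUP() helper added below (the wrapper function itself is assumed, not part of the patch):

static bool example_do_wakeup(struct device *dev, pm_message_t msg)
{
	/* never arm remote wakeup while creating or loading a hibernation image */
	return device_may_wakeup(dev) && !PMSG_NO_WAKEUP(msg);
}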
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410151722.rfjtknRz-lkp@intel.com/ Signed-off-by: Alan Stern Signed-off-by: Duan Chenghao Link: https://lore.kernel.org/r/20241024024038.26157-1-duanchenghao@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/pm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index e7f0260f15ad..08c37b83fea8 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -570,7 +570,8 @@ const struct dev_pm_ops name = { \ { .event = PM_EVENT_AUTO_RESUME, }) #define PMSG_IS_AUTO(msg) (((msg).event & PM_EVENT_AUTO) != 0) - +#define PMSG_NO_WAKEUP(msg) (((msg).event & \ + (PM_EVENT_FREEZE | PM_EVENT_QUIESCE)) != 0) /* * Device run-time power management status. * -- cgit v1.2.3 From 7a53af85d3bbdbe06cd47b81a6d99a04dc0a3963 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Mon, 25 Nov 2024 14:02:16 +0530 Subject: wifi: cfg80211: send MLO links tx power info in GET_INTERFACE Currently, TX power is reported on interface/wdev level as part of NL80211_CMD_GET_INTERFACE. With MLO, Multiple links can be part of an interface/wdev and hence its necessary to report the TX power of each link. Add support to send tx power for all valid links of an MLD as part of NL80211_CMD_GET_INTERFACE request. As far as userspace is concerned, there is no behavioral change for Non-ML Interfaces. For ML interfaces, userspace should fetch TX power that is nested inside NL80211_ATTR_MLO_LINKS, similar to how channel info(NL80211_ATTR_WIPHY_FREQ) is fetched. Co-developed-by: Aaradhana Sahu Signed-off-by: Aaradhana Sahu Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20241125083217.216095-2-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 63e79a22a214..0a48f47a77dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4733,7 +4733,7 @@ struct cfg80211_ops { int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - int *dbm); + unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); -- cgit v1.2.3 From 24dab555ad5951824e3fb6b665aaca84ac69dd12 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Mon, 25 Nov 2024 14:02:17 +0530 Subject: wifi: mac80211: get tx power per link ML interfaces can have multiple affiliated links to it and hence there is a need to report tx power of specified link rather deflink. Add changes to report tx power of requested link from mac80211, also pass link id as an argument in get_tx_power op so that supported drivers can use it to report link's tx power. 
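A minimal sketch of a driver callback with the new signature (my_link_power() and the surrounding driver are assumptions):

static int example_get_txpower(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
			       unsigned int link_id, int *dbm)
{
	*dbm = my_link_power(vif, link_id);	/* driver-private per-link lookup */
	return 0;
}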
Co-developed-by: Aaradhana Sahu Signed-off-by: Aaradhana Sahu Signed-off-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20241125083217.216095-3-quic_ramess@quicinc.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a97c9f85ae9a..5ce4dfa3fba5 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4759,7 +4759,7 @@ struct ieee80211_ops { u32 (*get_expected_throughput)(struct ieee80211_hw *hw, struct ieee80211_sta *sta); int (*get_txpower)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - int *dbm); + unsigned int link_id, int *dbm); int (*tdls_channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, -- cgit v1.2.3 From d8d936c51388442f769a81e512b505dcf87c6a51 Mon Sep 17 00:00:00 2001 From: Dingyan Li <18500469033@163.com> Date: Wed, 30 Oct 2024 16:38:58 +0800 Subject: usb: storage: add a macro for the upper limit of max LUN The meaning of this value is already used in several places, but with constant values and comments to explain it separately. It's better to have a central place to do this then use the macro in those places for better readability. Signed-off-by: Dingyan Li <18500469033@163.com> Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20241030083858.46907-1-18500469033@163.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/storage.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/usb/storage.h b/include/linux/usb/storage.h index 8539956bc2be..51be3bb8fccb 100644 --- a/include/linux/usb/storage.h +++ b/include/linux/usb/storage.h @@ -82,4 +82,12 @@ struct bulk_cs_wrap { #define US_BULK_RESET_REQUEST 0xff #define US_BULK_GET_MAX_LUN 0xfe +/* + * If 4 LUNs are supported then the LUNs would be + * numbered from 0 to 3, and the return value for + * US_BULK_GET_MAX_LUN request would be 3. The valid + * LUN field is 4 bits wide, the upper limit is 0x0f. + */ +#define US_BULK_MAX_LUN_LIMIT 0x0f + #endif -- cgit v1.2.3 From 535a07698b8b3e6f305673102d297262cae2360a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Dec 2024 05:09:22 +0200 Subject: serial: 8250_pci: Share WCH IDs with parport_serial driver parport_serial driver uses subset of WCH IDs that are present in 8250_pci. Share them via pci_ids.h and switch parport_serial to use defined constants. 
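For illustration only (this table is not from either driver), a consumer can now match on the shared constants added below instead of bare hex IDs:

static const struct pci_device_id example_wch_ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_WCHCN, PCI_DEVICE_ID_WCHCN_CH353_2S1P) },
	{ PCI_DEVICE(PCI_VENDOR_ID_WCHIC, PCI_DEVICE_ID_WCHIC_CH382_2S1P) },
	{ }
};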
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20241204031114.1029882-3-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pci_ids.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d2402bf4aea2..de5deb1a0118 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2593,6 +2593,11 @@ #define PCI_VENDOR_ID_REDHAT 0x1b36 +#define PCI_VENDOR_ID_WCHIC 0x1c00 +#define PCI_DEVICE_ID_WCHIC_CH382_0S1P 0x3050 +#define PCI_DEVICE_ID_WCHIC_CH382_2S1P 0x3250 +#define PCI_DEVICE_ID_WCHIC_CH382_2S 0x3253 + #define PCI_VENDOR_ID_SILICOM_DENMARK 0x1c2c #define PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS 0x1c36 @@ -2647,6 +2652,12 @@ #define PCI_VENDOR_ID_AKS 0x416c #define PCI_DEVICE_ID_AKS_ALADDINCARD 0x0100 +#define PCI_VENDOR_ID_WCHCN 0x4348 +#define PCI_DEVICE_ID_WCHCN_CH353_4S 0x3453 +#define PCI_DEVICE_ID_WCHCN_CH353_2S1PF 0x5046 +#define PCI_DEVICE_ID_WCHCN_CH353_1S1P 0x5053 +#define PCI_DEVICE_ID_WCHCN_CH353_2S1P 0x7053 + #define PCI_VENDOR_ID_ACCESSIO 0x494f #define PCI_DEVICE_ID_ACCESSIO_WDG_CSM 0x22c0 -- cgit v1.2.3 From 6fba89813ccf333d2bc4d5caea04cd5f3c39eb50 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:54 -0700 Subject: lsm: ensure the correct LSM context releaser Add a new lsm_context data structure to hold all the information about a "security context", including the string, its size and which LSM allocated the string. The allocation information is necessary because LSMs have different policies regarding the lifecycle of these strings. SELinux allocates and destroys them on each use, whereas Smack provides a pointer to an entry in a list that never goes away. Update security_release_secctx() to use the lsm_context instead of a (char *, len) pair. Change its callers to do likewise. The LSMs supporting this hook have had comments added to remind the developer that there is more work to be done. The BPF security module provides all LSM hooks. While there has yet to be a known instance of a BPF configuration that uses security contexts, the possibility is real. In the existing implementation there is potential for multiple frees in that case. 
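A minimal sketch of the consumer pattern after this change (use_label() is an assumed consumer): the string, its length and the owning LSM travel together and are released as one object:

static void example_pass_label(u32 secid)
{
	struct lsm_context ctx;

	if (!security_secid_to_secctx(secid, &ctx.context, &ctx.len)) {
		use_label(ctx.context, ctx.len);
		security_release_secctx(&ctx);	/* the releaser knows which LSM allocated it */
	}
}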
Cc: linux-integrity@vger.kernel.org Cc: netdev@vger.kernel.org Cc: audit@vger.kernel.org Cc: netfilter-devel@vger.kernel.org To: Pablo Neira Ayuso Cc: linux-nfs@vger.kernel.org Cc: Todd Kjos Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 35 +++++++++++++++++++++++++++++++++-- include/net/scm.h | 11 ++++++----- 3 files changed, 40 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index eb2937599cb0..c13df23132eb 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -300,7 +300,7 @@ LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) -LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) +LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) diff --git a/include/linux/security.h b/include/linux/security.h index cbdba435b798..68e56935716b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -225,6 +225,37 @@ extern unsigned long dac_mmap_min_addr; #define dac_mmap_min_addr 0UL #endif +/* + * A "security context" is the text representation of + * the information used by LSMs. + * This structure contains the string, its length, and which LSM + * it is useful for. + */ +struct lsm_context { + char *context; /* Provided by the module */ + u32 len; + int id; /* Identifies the module */ +}; + +/** + * lsmcontext_init - initialize an lsmcontext structure. + * @cp: Pointer to the context to initialize + * @context: Initial context, or NULL + * @size: Size of context, or 0 + * @id: Which LSM provided the context + * + * Fill in the lsmcontext from the provided information. + * This is a scaffolding function that will be removed when + * lsm_context integration is complete. 
+ */ +static inline void lsmcontext_init(struct lsm_context *cp, char *context, + u32 size, int id) +{ + cp->id = id; + cp->context = context; + cp->len = size; +} + /* * Values used in the task_security_ops calls */ @@ -556,7 +587,7 @@ int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); -void security_release_secctx(char *secdata, u32 seclen); +void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); @@ -1545,7 +1576,7 @@ static inline int security_secctx_to_secid(const char *secdata, return -EOPNOTSUPP; } -static inline void security_release_secctx(char *secdata, u32 seclen) +static inline void security_release_secctx(struct lsm_context *cp) { } diff --git a/include/net/scm.h b/include/net/scm.h index 0d35c7c77a74..f75449e1d876 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -105,16 +105,17 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { - char *secdata; - u32 seclen; + struct lsm_context ctx; int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &secdata, &seclen); + err = security_secid_to_secctx(scm->secid, &ctx.context, + &ctx.len); if (!err) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + security_release_secctx(&ctx); } } } -- cgit v1.2.3 From 1995edc5f9089ecb8b77a34f21e4abd8f887b856 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:03:54 -0800 Subject: bpf: Consolidate locks and reference state in verifier state Currently, state for RCU read locks and preemption is in bpf_verifier_state, while locks and pointer reference state remains in bpf_func_state. There is no particular reason to keep the latter in bpf_func_state. Additionally, it is copied into a new frame's state and copied back to the caller frame's state everytime the verifier processes a pseudo call instruction. This is a bit wasteful, given this state is global for a given verification state / path. Move all resource and reference related state in bpf_verifier_state structure in this patch, in preparation for introducing new reference state types in the future. Since we switch print_verifier_state and friends to print using vstate, we now need to explicitly pass in the verifier state from the caller along with the bpf_func_state, so modify the prototype and callers to do so. To ensure func state matches the verifier state when we're printing data, take in frame number instead of bpf_func_state pointer instead and avoid inconsistencies induced by the caller. 
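Concretely, with the new prototype (see the diff below) a caller passes the whole verifier state plus a frame index, e.g. (sketch, not an actual call site):

/* before: print_verifier_state(env, state->frame[state->curframe], print_all);
 * after:  the helper selects the frame from the verifier state by index */
print_verifier_state(env, vstate, vstate->curframe, print_all);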
Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f4290c179bee..03e351c43fa8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -315,9 +315,6 @@ struct bpf_func_state { u32 callback_depth; /* The following fields should be last. See copy_func_state() */ - int acquired_refs; - int active_locks; - struct bpf_reference_state *refs; /* The state of the stack. Each element of the array describes BPF_REG_SIZE * (i.e. 8) bytes worth of stack memory. * stack[0] represents bytes [*(r10-8)..*(r10-1)] @@ -370,6 +367,8 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; struct bpf_verifier_state *parent; + /* Acquired reference states */ + struct bpf_reference_state *refs; /* * 'branches' field is the number of branches left to explore: * 0 - all possible paths from this state reached bpf_exit or @@ -419,9 +418,12 @@ struct bpf_verifier_state { u32 insn_idx; u32 curframe; - bool speculative; + u32 acquired_refs; + u32 active_locks; + u32 active_preempt_locks; bool active_rcu_lock; - u32 active_preempt_lock; + + bool speculative; /* If this state was ever pointed-to by other state's loop_entry field * this flag would be set to true. Used to avoid freeing such states * while they are still in use. @@ -979,8 +981,9 @@ const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); const char *iter_state_str(enum bpf_iter_state state); -void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state, bool print_all); -void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); +void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, + u32 frameno, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, + u32 frameno); #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From c8e2ee1f3df05dc4caa746c062c6b5791c745c79 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 3 Dec 2024 19:03:57 -0800 Subject: bpf: Introduce support for bpf_local_irq_{save,restore} Teach the verifier about IRQ-disabled sections through the introduction of two new kfuncs, bpf_local_irq_save, to save IRQ state and disable them, and bpf_local_irq_restore, to restore IRQ state and enable them back again. For the purposes of tracking the saved IRQ state, the verifier is taught about a new special object on the stack of type STACK_IRQ_FLAG. This is a 8 byte value which saves the IRQ flags which are to be passed back to the IRQ restore kfunc. Renumber the enums for REF_TYPE_* to simplify the check in find_lock_state, filtering out non-lock types as they grow will become cumbersome and is unecessary. To track a dynamic number of IRQ-disabled regions and their associated saved states, a new resource type RES_TYPE_IRQ is introduced, which its state management functions: acquire_irq_state and release_irq_state, taking advantage of the refactoring and clean ups made in earlier commits. One notable requirement of the kernel's IRQ save and restore API is that they cannot happen out of order. 
For this purpose, when releasing a reference we keep track of the prev_id we saw with REF_TYPE_IRQ. Since reference states are inserted in increasing order of the index, this is used to remember the ordering of acquisitions of IRQ saved states, so that we maintain a logical stack in acquisition order of resource identities, and can enforce LIFO ordering when restoring IRQ state. The top of the stack is maintained using bpf_verifier_state's active_irq_id. To maintain the stack property when releasing reference states, we need to modify release_reference_state to shift the remaining array left using memmove, instead of swapping the deleted element with the last one, which might break the ordering. A selftest to test this subtle behavior is added in later patches. The logic to detect initialized and uninitialized irq flag slots, and to mark and unmark them, is similar to how it's done for iterators. No additional checks are needed in refsafe for REF_TYPE_IRQ, apart from the usual check_id satisfiability check on the ref[i].id. We have to perform the same check_ids check on state->active_irq_id as well. To ensure we don't get REF_TYPE_PTR assigned by default after acquire_reference_state if someone forgets to assign the type, let's also renumber the enum ref_state_type. This way any unassigned types get caught by refsafe's default switch statement, instead of being assumed to be REF_TYPE_PTR by default. The kfuncs themselves are plain wrappers over the local_irq_save and local_irq_restore macros. Acked-by: Eduard Zingerman Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20241204030400.208005-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 03e351c43fa8..de09ac3067ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -233,6 +233,7 @@ enum bpf_stack_slot_type { */ STACK_DYNPTR, STACK_ITER, + STACK_IRQ_FLAG, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ @@ -254,8 +255,9 @@ struct bpf_reference_state { * default to pointer reference on zero initialization of a state. */ enum ref_state_type { - REF_TYPE_PTR = 0, - REF_TYPE_LOCK, + REF_TYPE_PTR = 1, + REF_TYPE_IRQ = 2, + REF_TYPE_LOCK = 3, } type; /* Track each reference created with a unique id, even if the same * instruction creates the reference multiple times (eg, via CALL). @@ -421,6 +423,7 @@ struct bpf_verifier_state { u32 acquired_refs; u32 active_locks; u32 active_preempt_locks; + u32 active_irq_id; bool active_rcu_lock; bool speculative; -- cgit v1.2.3 From cdb03e598750e7ebc222571aa96653e9b5a59dbe Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 10 Nov 2024 23:33:24 +0100 Subject: scsi: bsg: Replace zero-length array with flexible array member Replace the deprecated zero-length array with a modern flexible array member in the struct iscsi_bsg_host_vendor_reply. Link: https://github.com/KSPP/linux/issues/78 Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241110223323.42772-2-thorsten.blum@linux.dev Signed-off-by: Martin K.
Petersen --- include/scsi/scsi_bsg_iscsi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_bsg_iscsi.h b/include/scsi/scsi_bsg_iscsi.h index 9b1f0f424a79..a569c35b258d 100644 --- a/include/scsi/scsi_bsg_iscsi.h +++ b/include/scsi/scsi_bsg_iscsi.h @@ -59,7 +59,7 @@ struct iscsi_bsg_host_vendor { */ struct iscsi_bsg_host_vendor_reply { /* start of vendor response area */ - uint32_t vendor_rsp[0]; + DECLARE_FLEX_ARRAY(uint32_t, vendor_rsp); }; -- cgit v1.2.3 From 209f4e43b8068c24cde227f464111030430153fa Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Sun, 24 Nov 2024 09:08:07 +0200 Subject: scsi: ufs: core: Introduce a new clock_gating lock Introduce a new clock gating lock to serialize access to some of the clock gating members instead of the host_lock. While at it, simplify the code with the guard() macro and co for automatic cleanup of the new lock. There are some explicit spin_lock_irqsave()/spin_unlock_irqrestore() snaking instances I left behind because I couldn't make heads or tails of them. Additionally, move the trace_ufshcd_clk_gating() call out of the region protected by the lock, as it doesn't need protection. Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20241124070808.194860-4-avri.altman@wdc.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index d7aca9e61684..b6311069d901 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -403,6 +403,9 @@ enum clk_gating_state { * delay_ms * @ungate_work: worker to turn on clocks that will be used in case of * interrupt context + * @clk_gating_workq: workqueue for clock gating work. + * @lock: serialize access to some struct ufs_clk_gating members. An outer lock + * relative to the host lock * @state: the current clocks state * @delay_ms: gating delay in ms * @is_suspended: clk gating is suspended when set to 1 which can be used @@ -413,11 +416,14 @@ enum clk_gating_state { * @is_initialized: Indicates whether clock gating is initialized or not * @active_reqs: number of requests that are pending and should be waited for * completion before gating clocks. - * @clk_gating_workq: workqueue for clock gating work. */ struct ufs_clk_gating { struct delayed_work gate_work; struct work_struct ungate_work; + struct workqueue_struct *clk_gating_workq; + + spinlock_t lock; + enum clk_gating_state state; unsigned long delay_ms; bool is_suspended; @@ -426,7 +432,6 @@ struct ufs_clk_gating { bool is_enabled; bool is_initialized; int active_reqs; - struct workqueue_struct *clk_gating_workq; }; /** -- cgit v1.2.3 From be769e5cf53b8a45eedcc7354bacf939ae16f72c Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Sun, 24 Nov 2024 09:08:08 +0200 Subject: scsi: ufs: core: Introduce a new clock_scaling lock Introduce a new clock scaling lock to serialize access to some of the clock scaling members instead of the host_lock. Here also, simplify the code with the guard() macro and co. Reviewed-by: Bart Van Assche Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20241124070808.194860-5-avri.altman@wdc.com Signed-off-by: Martin K.
Petersen --- include/ufs/ufshcd.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index b6311069d901..ce7667b020e2 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -436,6 +436,10 @@ struct ufs_clk_gating { /** * struct ufs_clk_scaling - UFS clock scaling related data + * @workq: workqueue to schedule devfreq suspend/resume work + * @suspend_work: worker to suspend devfreq + * @resume_work: worker to resume devfreq + * @lock: serialize access to some struct ufs_clk_scaling members * @active_reqs: number of requests that are pending. If this is zero when * devfreq ->target() function is called then schedule "suspend_work" to * suspend devfreq. @@ -445,9 +449,6 @@ struct ufs_clk_gating { * @enable_attr: sysfs attribute to enable/disable clock scaling * @saved_pwr_info: UFS power mode may also be changed during scaling and this * one keeps track of previous power mode. - * @workq: workqueue to schedule devfreq suspend/resume work - * @suspend_work: worker to suspend devfreq - * @resume_work: worker to resume devfreq * @target_freq: frequency requested by devfreq framework * @min_gear: lowest HS gear to scale down to * @is_enabled: tracks if scaling is currently enabled or not, controlled by @@ -459,15 +460,18 @@ struct ufs_clk_gating { * @is_suspended: tracks if devfreq is suspended or not */ struct ufs_clk_scaling { + struct workqueue_struct *workq; + struct work_struct suspend_work; + struct work_struct resume_work; + + spinlock_t lock; + int active_reqs; unsigned long tot_busy_t; ktime_t window_start_t; ktime_t busy_start_t; struct device_attribute enable_attr; struct ufs_pa_layer_attr saved_pwr_info; - struct workqueue_struct *workq; - struct work_struct suspend_work; - struct work_struct resume_work; unsigned long target_freq; u32 min_gear; bool is_enabled; -- cgit v1.2.3 From 2d470c778120d3cdb8d8ab250329ca85f49f12b1 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:55 -0700 Subject: lsm: replace context+len with lsm_context Replace the (secctx,seclen) pointer pair with a single lsm_context pointer to allow return of the LSM identifier along with the context and context length. This allows security_release_secctx() to know how to release the context. Callers have been modified to use or save the returned data from the new structure. security_secid_to_secctx() and security_lsmproc_to_secctx() will now return the length value on success instead of 0. 
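For illustration, the new calling convention looks roughly like the sketch below (hypothetical caller; use_context() is a placeholder, not a kernel API), with a negative return indicating failure and a positive return carrying the context length:

    struct lsm_context ctx;
    int len;

    len = security_secid_to_secctx(secid, &ctx);
    if (len < 0)
        return len;
    use_context(ctx.context, ctx.len);  /* placeholder consumer of the label */
    security_release_secctx(&ctx);      /* ctx.id routes the free to the right LSM */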
Cc: netdev@vger.kernel.org Cc: audit@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: Todd Kjos Signed-off-by: Casey Schaufler [PM: subject tweak, kdoc fix, signedness fix from Dan Carpenter] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 5 ++--- include/linux/security.h | 9 ++++----- include/net/scm.h | 5 ++--- 3 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index c13df23132eb..01e5a8e09bba 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -295,10 +295,9 @@ LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) -LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, - u32 *seclen) +LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, struct lsm_context *cp) LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop, - char **secdata, u32 *seclen) + struct lsm_context *cp) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) diff --git a/include/linux/security.h b/include/linux/security.h index 68e56935716b..58518bbc00a6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -584,8 +584,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); -int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); -int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen); +int security_secid_to_secctx(u32 secid, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1557,14 +1557,13 @@ static inline int security_ismaclabel(const char *name) return 0; } -static inline int security_secid_to_secctx(u32 secid, char **secdata, - u32 *seclen) +static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) { return -EOPNOTSUPP; } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - char **secdata, u32 *seclen) + struct lsm_context *cp) { return -EOPNOTSUPP; } diff --git a/include/net/scm.h b/include/net/scm.h index f75449e1d876..22bb49589fde 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -109,10 +109,9 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &ctx.context, - &ctx.len); + err = security_secid_to_secctx(scm->secid, &ctx); - if (!err) { + if (err >= 0) { put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, ctx.context); security_release_secctx(&ctx); -- cgit v1.2.3 From 76ecf306ae5da84ef8f48c7a2608736e6866440c Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:56 -0700 Subject: lsm: use lsm_context in security_inode_getsecctx Change the security_inode_getsecctx() interface to fill a lsm_context structure 
instead of data and length pointers. This provides the information about which LSM created the context so that security_release_secctx() can use the correct hook. Cc: linux-nfs@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 4 ++-- include/linux/security.h | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 01e5a8e09bba..69e1076448c6 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -303,8 +303,8 @@ LSM_HOOK(void, LSM_RET_VOID, release_secctx, struct lsm_context *cp) LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode) LSM_HOOK(int, 0, inode_notifysecctx, struct inode *inode, void *ctx, u32 ctxlen) LSM_HOOK(int, 0, inode_setsecctx, struct dentry *dentry, void *ctx, u32 ctxlen) -LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, void **ctx, - u32 *ctxlen) +LSM_HOOK(int, -EOPNOTSUPP, inode_getsecctx, struct inode *inode, + struct lsm_context *cp) #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) LSM_HOOK(int, 0, post_notification, const struct cred *w_cred, diff --git a/include/linux/security.h b/include/linux/security.h index 58518bbc00a6..29f8100bc7c8 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -591,7 +591,7 @@ void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); -int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); +int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp); int security_locked_down(enum lockdown_reason what); int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags); @@ -1591,7 +1591,8 @@ static inline int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 { return -EOPNOTSUPP; } -static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) +static inline int security_inode_getsecctx(struct inode *inode, + struct lsm_context *cp) { return -EOPNOTSUPP; } -- cgit v1.2.3 From b530104f50e86db6f187d39fed5821b3cca755ee Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 23 Oct 2024 14:21:57 -0700 Subject: lsm: lsm_context in security_dentry_init_security Replace the (secctx,seclen) pointer pair with a single lsm_context pointer to allow return of the LSM identifier along with the context and context length. This allows security_release_secctx() to know how to release the context. Callers have been modified to use or save the returned data from the new structure. 
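A rough sketch of a converted caller (hypothetical filesystem-side code; emit_label() is a placeholder for whatever consumes the label):

    struct lsm_context ctx = { };
    int err;

    err = security_inode_getsecctx(inode, &ctx);
    if (err < 0)
        return err;
    emit_label(ctx.context, ctx.len);   /* placeholder, e.g. copy into a reply buffer */
    security_release_secctx(&ctx);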
Cc: ceph-devel@vger.kernel.org Cc: linux-nfs@vger.kernel.org Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 26 +++----------------------- 2 files changed, 4 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 69e1076448c6..e2f1ce37c41e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -83,7 +83,7 @@ LSM_HOOK(int, 0, move_mount, const struct path *from_path, const struct path *to_path) LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, - void **ctx, u32 *ctxlen) + struct lsm_context *cp) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) diff --git a/include/linux/security.h b/include/linux/security.h index 29f8100bc7c8..980b6c207cad 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -237,25 +237,6 @@ struct lsm_context { int id; /* Identifies the module */ }; -/** - * lsmcontext_init - initialize an lsmcontext structure. - * @cp: Pointer to the context to initialize - * @context: Initial context, or NULL - * @size: Size of context, or 0 - * @id: Which LSM provided the context - * - * Fill in the lsmcontext from the provided information. - * This is a scaffolding function that will be removed when - * lsm_context integration is complete. - */ -static inline void lsmcontext_init(struct lsm_context *cp, char *context, - u32 size, int id) -{ - cp->id = id; - cp->context = context; - cp->len = size; -} - /* * Values used in the task_security_ops calls */ @@ -409,8 +390,8 @@ int security_sb_clone_mnt_opts(const struct super_block *oldsb, int security_move_mount(const struct path *from_path, const struct path *to_path); int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, - const char **xattr_name, void **ctx, - u32 *ctxlen); + const char **xattr_name, + struct lsm_context *lsmcxt); int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, @@ -883,8 +864,7 @@ static inline int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, - void **ctx, - u32 *ctxlen) + struct lsm_context *lsmcxt) { return -EOPNOTSUPP; } -- cgit v1.2.3 From ed638918f4df39daa458435f0825b487c1f192c8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Oct 2024 11:07:53 -0700 Subject: scsi: Rename .slave_alloc() and .slave_destroy() Rename .slave_alloc() into .sdev_init() and .slave_destroy() into .sdev_destroy(). The new names make it clear that these are actions on SCSI devices. Make this change in the SCSI core, SCSI drivers and also in the ATA drivers. No functionality has been changed. This patch has been created as follows: * Change the text "slave_alloc" into "sdev_init" in all source files except those in drivers/net/ and Documentation/. * Change the text "slave_destroy" into "sdev_destroy" in all source files except those in drivers/net/ and Documentation/. * Rename lpfc_no_slave() into lpfc_no_sdev(). * Manually adjust whitespace where necessary to restore vertical alignment (dc395x driver and include/linux/libata.h). 
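For illustration (a hypothetical driver, not one touched by this patch), a host template after the rename looks like:

    static const struct scsi_host_template example_sht = {
        .name         = "example",
        .queuecommand = example_queuecommand,
        .sdev_init    = example_sdev_init,    /* was .slave_alloc */
        .sdev_destroy = example_sdev_destroy, /* was .slave_destroy */
        .this_id      = -1,
    };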
Acked-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241022180839.2712439-2-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/linux/libata.h | 8 ++++---- include/scsi/libfc.h | 2 +- include/scsi/libsas.h | 4 ++-- include/scsi/scsi_device.h | 4 ++-- include/scsi/scsi_host.h | 16 ++++++++-------- 5 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index c1a85d46eba6..fb5e13b8d89f 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1199,10 +1199,10 @@ extern int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); -extern int ata_scsi_slave_alloc(struct scsi_device *sdev); +extern int ata_scsi_sdev_init(struct scsi_device *sdev); int ata_scsi_device_configure(struct scsi_device *sdev, struct queue_limits *lim); -extern void ata_scsi_slave_destroy(struct scsi_device *sdev); +extern void ata_scsi_sdev_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); extern int ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev, @@ -1458,8 +1458,8 @@ extern const struct attribute_group *ata_common_sdev_groups[]; .this_id = ATA_SHT_THIS_ID, \ .emulated = ATA_SHT_EMULATED, \ .proc_name = drv_name, \ - .slave_alloc = ata_scsi_slave_alloc, \ - .slave_destroy = ata_scsi_slave_destroy, \ + .sdev_init = ata_scsi_sdev_init, \ + .sdev_destroy = ata_scsi_sdev_destroy, \ .bios_param = ata_std_bios_param, \ .unlock_native_capacity = ata_scsi_unlock_native_capacity,\ .max_sectors = ATA_MAX_SECTORS_LBA48 diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index 4a9b4169e081..183d9fd50d2d 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -963,7 +963,7 @@ int fc_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); int fc_eh_abort(struct scsi_cmnd *); int fc_eh_device_reset(struct scsi_cmnd *); int fc_eh_host_reset(struct scsi_cmnd *); -int fc_slave_alloc(struct scsi_device *); +int fc_sdev_init(struct scsi_device *); /* * ELS/CT interface diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 1324068dd950..f30f716ba045 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -703,7 +703,7 @@ int sas_eh_device_reset_handler(struct scsi_cmnd *cmd); int sas_eh_target_reset_handler(struct scsi_cmnd *cmd); extern void sas_target_destroy(struct scsi_target *); -extern int sas_slave_alloc(struct scsi_device *); +extern int sas_sdev_init(struct scsi_device *); extern int sas_ioctl(struct scsi_device *sdev, unsigned int cmd, void __user *arg); extern int sas_drain_work(struct sas_ha_struct *ha); @@ -751,7 +751,7 @@ void sas_notify_phy_event(struct asd_sas_phy *phy, enum phy_event event, #define LIBSAS_SHT_BASE _LIBSAS_SHT_BASE \ .device_configure = sas_device_configure, \ - .slave_alloc = sas_slave_alloc, \ + .sdev_init = sas_sdev_init, \ #define LIBSAS_SHT_BASE_NO_SLAVE_INIT _LIBSAS_SHT_BASE diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 9c540f5468eb..7acd0ec82bb0 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -155,7 +155,7 @@ struct scsi_device { blist_flags_t sdev_bflags; /* black/white flags as also found in * scsi_devinfo.[hc]. For now used only to - * pass settings from slave_alloc to scsi + * pass settings from sdev_init to scsi * core. 
*/ unsigned int eh_timeout; /* Error handling timeout */ @@ -357,7 +357,7 @@ struct scsi_target { atomic_t target_blocked; /* - * LLDs should set this in the slave_alloc host template callout. + * LLDs should set this in the sdev_init host template callout. * If set to zero then there is not limit. */ unsigned int can_queue; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 2b4ab0369ffb..58b29cc6de09 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -168,20 +168,20 @@ struct scsi_host_template { * Return values: 0 on success, non-0 on failure * * Deallocation: If we didn't find any devices at this ID, you will - * get an immediate call to slave_destroy(). If we find something + * get an immediate call to sdev_destroy(). If we find something * here then you will get a call to slave_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot - * time), you will then get a call to slave_destroy(). This is - * assuming you implement slave_configure and slave_destroy. + * time), you will then get a call to sdev_destroy(). This is + * assuming you implement slave_configure and sdev_destroy. * However, if you allocate memory and hang it off the device struct, - * then you must implement the slave_destroy() routine at a minimum + * then you must implement the sdev_destroy() routine at a minimum * in order to avoid leaking memory * each time a device is tore down. * * Status: OPTIONAL */ - int (* slave_alloc)(struct scsi_device *); + int (* sdev_init)(struct scsi_device *); /* * Once the device has responded to an INQUIRY and we know the @@ -206,7 +206,7 @@ struct scsi_host_template { * specific setup basis... * 6. Return 0 on success, non-0 on error. The device will be marked * as offline on error so that no access will occur. If you return - * non-0, your slave_destroy routine will never get called for this + * non-0, your sdev_destroy routine will never get called for this * device, so don't leave any loose memory hanging around, clean * up after yourself before returning non-0 * @@ -223,11 +223,11 @@ struct scsi_host_template { * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. The low level driver is responsible for freeing any memory - * it allocated in the slave_alloc or slave_configure calls. + * it allocated in the sdev_init or slave_configure calls. * * Status: OPTIONAL */ - void (* slave_destroy)(struct scsi_device *); + void (* sdev_destroy)(struct scsi_device *); /* * Before the mid layer attempts to scan for a new device attached -- cgit v1.2.3 From 47c2e30afcec52968e50db01f92dda7d373042cb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Oct 2024 11:07:54 -0700 Subject: scsi: Rename .device_configure() into .sdev_configure() Improve naming consistency with the .sdev_prep() and .sdev_destroy() methods by renaming .device_configure() into .sdev_configure(). Cc: Christoph Hellwig Acked-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241022180839.2712439-3-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- include/linux/libata.h | 11 +++++------ include/scsi/libsas.h | 5 ++--- include/scsi/scsi_host.h | 4 ++-- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index fb5e13b8d89f..7717f06a548d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1200,8 +1200,7 @@ extern int ata_std_bios_param(struct scsi_device *sdev, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_sdev_init(struct scsi_device *sdev); -int ata_scsi_device_configure(struct scsi_device *sdev, - struct queue_limits *lim); +int ata_scsi_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim); extern void ata_scsi_sdev_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); @@ -1301,8 +1300,8 @@ extern struct ata_port *ata_port_alloc(struct ata_host *host); extern void ata_port_free(struct ata_port *ap); extern int ata_tport_add(struct device *parent, struct ata_port *ap); extern void ata_tport_delete(struct ata_port *ap); -int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim, - struct ata_port *ap); +int ata_sas_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim, + struct ata_port *ap); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); extern void ata_tf_to_fis(const struct ata_taskfile *tf, u8 pmp, int is_cmd, u8 *fis); @@ -1468,13 +1467,13 @@ extern const struct attribute_group *ata_common_sdev_groups[]; __ATA_BASE_SHT(drv_name), \ .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ - .device_configure = ata_scsi_device_configure + .sdev_configure = ata_scsi_sdev_configure #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd) \ __ATA_BASE_SHT(drv_name), \ .can_queue = drv_qd, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ - .device_configure = ata_scsi_device_configure + .sdev_configure = ata_scsi_sdev_configure #define ATA_BASE_SHT(drv_name) \ ATA_SUBBASE_SHT(drv_name), \ diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index f30f716ba045..ba460b6c0374 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -683,8 +683,7 @@ int sas_phy_reset(struct sas_phy *phy, int hard_reset); int sas_phy_enable(struct sas_phy *phy, int enable); extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); extern int sas_target_alloc(struct scsi_target *); -int sas_device_configure(struct scsi_device *dev, - struct queue_limits *lim); +int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim); extern int sas_change_queue_depth(struct scsi_device *, int new_depth); extern int sas_bios_param(struct scsi_device *, struct block_device *, sector_t capacity, int *hsc); @@ -750,7 +749,7 @@ void sas_notify_phy_event(struct asd_sas_phy *phy, enum phy_event event, #endif #define LIBSAS_SHT_BASE _LIBSAS_SHT_BASE \ - .device_configure = sas_device_configure, \ + .sdev_configure = sas_sdev_configure, \ .sdev_init = sas_sdev_init, \ #define LIBSAS_SHT_BASE_NO_SLAVE_INIT _LIBSAS_SHT_BASE diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 58b29cc6de09..7e1b1a54e46a 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -212,10 +212,10 @@ struct scsi_host_template { * * Status: OPTIONAL * - * Note: slave_configure is the legacy version, use device_configure for + * Note: slave_configure is the legacy version, use sdev_configure for * all new code. 
A driver must never define both. */ - int (* device_configure)(struct scsi_device *, struct queue_limits *lim); + int (* sdev_configure)(struct scsi_device *, struct queue_limits *lim); int (* slave_configure)(struct scsi_device *); /* -- cgit v1.2.3 From 0f98212d96a2af52e4091a199ef1d35d478d0c60 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Oct 2024 11:07:56 -0700 Subject: scsi: core: Remove the .slave_configure() method Now that all SCSI drivers have been converted from .slave_configure() to .sdev_configure(), remove support for .slave_configure() from the SCSI core. Reviewed-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241022180839.2712439-5-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/scsi/scsi_host.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 7e1b1a54e46a..8fccfd27393e 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -169,11 +169,11 @@ struct scsi_host_template { * * Deallocation: If we didn't find any devices at this ID, you will * get an immediate call to sdev_destroy(). If we find something - * here then you will get a call to slave_configure(), then the + * here then you will get a call to sdev_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot * time), you will then get a call to sdev_destroy(). This is - * assuming you implement slave_configure and sdev_destroy. + * assuming you implement sdev_configure and sdev_destroy. * However, if you allocate memory and hang it off the device struct, * then you must implement the sdev_destroy() routine at a minimum * in order to avoid leaking memory @@ -211,19 +211,15 @@ struct scsi_host_template { * up after yourself before returning non-0 * * Status: OPTIONAL - * - * Note: slave_configure is the legacy version, use sdev_configure for - * all new code. A driver must never define both. */ int (* sdev_configure)(struct scsi_device *, struct queue_limits *lim); - int (* slave_configure)(struct scsi_device *); /* * Immediately prior to deallocating the device and after all activity * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. The low level driver is responsible for freeing any memory - * it allocated in the sdev_init or slave_configure calls. + * it allocated in the sdev_init or sdev_configure calls. * * Status: OPTIONAL */ -- cgit v1.2.3 From d48da4d5ed7b4a022a4e54f210575baac71f58af Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 4 Dec 2024 07:59:11 -0800 Subject: security: add trace event for cap_capable In cases where we want a stable way to observe/trace cap_capable (e.g. 
protection from inlining and API updates) add a tracepoint that passes: - The credentials used - The user namespace of the resource being accessed - The user namespace in which the credential provides the capability to access the targeted resource - The capability to check for - The return value of the check Signed-off-by: Jordan Rome Acked-by: Andrii Nakryiko Reviewed-by: Paul Moore Reviewed-by: Serge Hallyn Link: https://lore.kernel.org/r/20241204155911.1817092-1-linux@jordanrome.com Signed-off-by: Serge Hallyn --- include/trace/events/capability.h | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 include/trace/events/capability.h (limited to 'include') diff --git a/include/trace/events/capability.h b/include/trace/events/capability.h new file mode 100644 index 000000000000..17340257946c --- /dev/null +++ b/include/trace/events/capability.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM capability + +#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CAPABILITY_H + +#include +#include +#include + +/** + * cap_capable - called after it's determined if a task has a particular + * effective capability + * + * @cred: The credentials used + * @target_ns: The user namespace of the resource being accessed + * @capable_ns: The user namespace in which the credential provides the + * capability to access the targeted resource. + * This will be NULL if ret is not 0. + * @cap: The capability to check for + * @ret: The return value of the check: 0 if it does, -ve if it does not + * + * Allows to trace calls to cap_capable in commoncap.c + */ +TRACE_EVENT(cap_capable, + + TP_PROTO(const struct cred *cred, struct user_namespace *target_ns, + const struct user_namespace *capable_ns, int cap, int ret), + + TP_ARGS(cred, target_ns, capable_ns, cap, ret), + + TP_STRUCT__entry( + __field(const struct cred *, cred) + __field(struct user_namespace *, target_ns) + __field(const struct user_namespace *, capable_ns) + __field(int, cap) + __field(int, ret) + ), + + TP_fast_assign( + __entry->cred = cred; + __entry->target_ns = target_ns; + __entry->capable_ns = ret == 0 ? capable_ns : NULL; + __entry->cap = cap; + __entry->ret = ret; + ), + + TP_printk("cred %p, target_ns %p, capable_ns %p, cap %d, ret %d", + __entry->cred, __entry->target_ns, __entry->capable_ns, __entry->cap, + __entry->ret) +); + +#endif /* _TRACE_CAPABILITY_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From a61b19f4a6586590a9ae6baf2ac4a25a852e547f Mon Sep 17 00:00:00 2001 From: Maksym Kutsevol Date: Mon, 2 Dec 2024 11:55:07 -0800 Subject: netpoll: Make netpoll_send_udp return status instead of void netpoll_send_udp can return if send was successful. It will allow client code to be aware of the send status. Possible return values are the result of __netpoll_send_skb (cast to int) and -ENOMEM. This doesn't cover the case when TX was not successful instantaneously and was scheduled for later, __netpoll__send_skb returns success in that case. 
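For illustration (hypothetical caller; the counter below is a placeholder, e.g. for a netconsole-style statistic), the returned status can now be checked:

    int ret = netpoll_send_udp(np, msg, len);

    if (ret < 0)
        udp_send_failures++;    /* placeholder failure counter */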
Signed-off-by: Maksym Kutsevol Link: https://patch.msgid.link/20241202-netcons-add-udp-send-fail-statistics-to-netconsole-v5-1-70e82239f922@kutsevol.com Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b34301650c47..f91e50a76efd 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -57,7 +57,7 @@ static inline void netpoll_poll_disable(struct net_device *dev) { return; } static inline void netpoll_poll_enable(struct net_device *dev) { return; } #endif -void netpoll_send_udp(struct netpoll *np, const char *msg, int len); +int netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); int __netpoll_setup(struct netpoll *np, struct net_device *ndev); -- cgit v1.2.3 From b4c7698dd95f253c6958d8c6ac219098009bf28a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:02 +0000 Subject: net: phy: add phy_inband_caps() Add a method to query the PHY's in-band capabilities for a PHY interface mode. Where the interface mode does not have in-band capability, or the PHY driver has not been updated to return this information, then phy_inband_caps() should return zero. Otherwise, PHY drivers will return a value consisting of the following flags: LINK_INBAND_DISABLE indicates that the hardware does not support in-band signalling, or can have in-band signalling configured via software to be disabled. LINK_INBAND_ENABLE indicates that the hardware will use in-band signalling, or can have in-band signalling configured via software to be enabled. LINK_INBAND_BYPASS indicates that the hardware has the ability to bypass in-band signalling when enabled after a timeout if the link partner does not respond to its in-band signalling. This reports the PHY capabilities for the particular interface mode, not the current configuration. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUre-006ITz-KF@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 563c46205685..ccb93d892da9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -817,6 +817,24 @@ struct phy_tdr_config { }; #define PHY_PAIR_ALL -1 +/** + * enum link_inband_signalling - in-band signalling modes that are supported + * + * @LINK_INBAND_DISABLE: in-band signalling can be disabled + * @LINK_INBAND_ENABLE: in-band signalling can be enabled without bypass + * @LINK_INBAND_BYPASS: in-band signalling can be enabled with bypass + * + * The possible and required bits can only be used if the valid bit is set. + * If possible is clear, that means inband signalling can not be used. + * Required is only valid when possible is set, and means that inband + * signalling must be used. + */ +enum link_inband_signalling { + LINK_INBAND_DISABLE = BIT(0), + LINK_INBAND_ENABLE = BIT(1), + LINK_INBAND_BYPASS = BIT(2), +}; + /** * struct phy_plca_cfg - Configuration of the PLCA (Physical Layer Collision * Avoidance) Reconciliation Sublayer. @@ -956,6 +974,14 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @inband_caps: query whether in-band is supported for the given PHY + * interface mode. 
Returns a bitmask of bits defined by enum + * link_inband_signalling. + */ + unsigned int (*inband_caps)(struct phy_device *phydev, + phy_interface_t interface); + /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine @@ -1818,6 +1844,8 @@ int phy_config_aneg(struct phy_device *phydev); int _phy_start_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); +unsigned int phy_inband_caps(struct phy_device *phydev, + phy_interface_t interface); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); -- cgit v1.2.3 From 5d58a890c02770ba8d790b1f3c6e8c0e20514dc2 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:18 +0000 Subject: net: phy: add phy_config_inband() Add a method to configure the PHY's in-band mode. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUru-006IUI-08@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index ccb93d892da9..61a1bc81f597 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -982,6 +982,11 @@ struct phy_driver { unsigned int (*inband_caps)(struct phy_device *phydev, phy_interface_t interface); + /** + * @config_inband: configure in-band mode for the PHY + */ + int (*config_inband)(struct phy_device *phydev, unsigned int modes); + /** * @get_rate_matching: Get the supported type of rate matching for a * particular phy interface. This is used by phy consumers to determine @@ -1846,6 +1851,7 @@ int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); unsigned int phy_inband_caps(struct phy_device *phydev, phy_interface_t interface); +int phy_config_inband(struct phy_device *phydev, unsigned int modes); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); bool phy_check_valid(int speed, int duplex, unsigned long *features); -- cgit v1.2.3 From df874f9e52c340cc6f0a0014a97b778f67d46849 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 3 Dec 2024 15:31:28 +0000 Subject: net: phylink: add pcs_inband_caps() method Add a pcs_inband_caps() method to query the PCS for its inband link capabilities, and use this to determine whether link modes used with optical SFPs can be supported. When a PCS does not provide a method, we allow inband negotiation to be either on or off, making this a no-op until the pcs_inband_caps() method is implemented by a PCS driver. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tIUs4-006IUU-7K@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5c01048860c4..5462cc6a37dc 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -419,6 +419,7 @@ struct phylink_pcs { /** * struct phylink_pcs_ops - MAC PCS operations structure. * @pcs_validate: validate the link configuration. + * @pcs_inband_caps: query inband support for interface mode. * @pcs_enable: enable the PCS. * @pcs_disable: disable the PCS. 
* @pcs_pre_config: pre-mac_config method (for errata) @@ -434,6 +435,8 @@ struct phylink_pcs { struct phylink_pcs_ops { int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); + unsigned int (*pcs_inband_caps)(struct phylink_pcs *pcs, + phy_interface_t interface); int (*pcs_enable)(struct phylink_pcs *pcs); void (*pcs_disable)(struct phylink_pcs *pcs); void (*pcs_pre_config)(struct phylink_pcs *pcs, @@ -470,6 +473,20 @@ struct phylink_pcs_ops { int pcs_validate(struct phylink_pcs *pcs, unsigned long *supported, const struct phylink_link_state *state); +/** + * pcs_inband_caps - query PCS in-band capabilities for interface mode. + * @pcs: a pointer to a &struct phylink_pcs. + * @interface: interface mode to be queried + * + * Returns zero if it is unknown what in-band signalling is supported by the + * PHY (e.g. because the PHY driver doesn't implement the method.) Otherwise, + * returns a bit mask of the LINK_INBAND_* values from + * &enum link_inband_signalling to describe which inband modes are supported + * for this interface mode. + */ +unsigned int pcs_inband_caps(struct phylink_pcs *pcs, + phy_interface_t interface); + /** * pcs_enable() - enable the PCS. * @pcs: a pointer to a &struct phylink_pcs. -- cgit v1.2.3 From e05feab22fd7dabcd6d272c4e2401ec1acdfdb9b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 3 Dec 2024 15:45:37 +0200 Subject: RDMA/mlx5: Enforce same type port association for multiport RoCE Different core device types such as PFs and VFs shouldn't be affiliated together since they have different capabilities, fix that by enforcing type check before doing the affiliation. Fixes: 32f69e4be269 ("{net, IB}/mlx5: Manage port association for multiport RoCE") Reviewed-by: Mark Bloch Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/88699500f690dff1c1852c1ddb71f8a1cc8b956e.1733233480.git.leonro@nvidia.com Reviewed-by: Mateusz Polchlopek Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..4f9e6f6dbaab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1202,6 +1202,12 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } +static inline bool mlx5_core_same_coredev_type(const struct mlx5_core_dev *dev1, + const struct mlx5_core_dev *dev2) +{ + return dev1->coredev_type == dev2->coredev_type; +} + static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; -- cgit v1.2.3 From 64e844505bc08cde3f346f193cbbbab0096fef54 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:06:59 -0500 Subject: include: uapi: protocol number and packet structs for AGGFRAG in ESP Add the RFC assigned IP protocol number for AGGFRAG. Add the on-wire basic and congestion-control IP-TFS packet headers. 
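As a sketch only (a hypothetical kernel-side producer; the exact field semantics are defined by RFC 9347), a basic AGGFRAG payload header could be filled like this:

    struct ip_iptfs_hdr h = {
        .subtype      = 0,              /* 0: basic, 1: CC */
        .flags        = 0,
        .block_offset = cpu_to_be16(0), /* 0: payload starts with a fresh inner packet */
    };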
Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/in.h | 2 ++ include/uapi/linux/ip.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 5d32d53508d9..ced0fc3c3aa5 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -79,6 +79,8 @@ enum { #define IPPROTO_MPLS IPPROTO_MPLS IPPROTO_ETHERNET = 143, /* Ethernet-within-IPv6 Encapsulation */ #define IPPROTO_ETHERNET IPPROTO_ETHERNET + IPPROTO_AGGFRAG = 144, /* AGGFRAG in ESP (RFC 9347) */ +#define IPPROTO_AGGFRAG IPPROTO_AGGFRAG IPPROTO_RAW = 255, /* Raw IP packets */ #define IPPROTO_RAW IPPROTO_RAW IPPROTO_SMC = 256, /* Shared Memory Communications */ diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 283dec7e3645..5bd7ce934d74 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -137,6 +137,22 @@ struct ip_beet_phdr { __u8 reserved; }; +struct ip_iptfs_hdr { + __u8 subtype; /* 0*: basic, 1: CC */ + __u8 flags; + __be16 block_offset; +}; + +struct ip_iptfs_cc_hdr { + __u8 subtype; /* 0: basic, 1*: CC */ + __u8 flags; + __be16 block_offset; + __be32 loss_rate; + __be64 rtt_adelay_xdelay; + __be32 tval; + __be32 techo; +}; + /* index values for the variables in ipv4_devconf */ enum { -- cgit v1.2.3 From f69eb4f65c58f5a081dbafb76011dad73757420c Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:00 -0500 Subject: xfrm: netlink: add config (netlink) options Add netlink options for configuring IP-TFS SAs. Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index d73a97e3030a..a23495c0e0a1 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -158,7 +158,8 @@ enum { #define XFRM_MODE_ROUTEOPTIMIZATION 2 #define XFRM_MODE_IN_TRIGGER 3 #define XFRM_MODE_BEET 4 -#define XFRM_MODE_MAX 5 +#define XFRM_MODE_IPTFS 5 +#define XFRM_MODE_MAX 6 /* Netlink configuration messages. */ enum { @@ -323,6 +324,12 @@ enum xfrm_attr_type_t { XFRMA_SA_DIR, /* __u8 */ XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ XFRMA_SA_PCPU, /* __u32 */ + XFRMA_IPTFS_DROP_TIME, /* __u32 in: usec to wait for next seq */ + XFRMA_IPTFS_REORDER_WINDOW, /* __u16 in: reorder window size (pkts) */ + XFRMA_IPTFS_DONT_FRAG, /* out: don't use fragmentation */ + XFRMA_IPTFS_INIT_DELAY, /* __u32 out: initial packet wait delay (usec) */ + XFRMA_IPTFS_MAX_QSIZE, /* __u32 out: max ingress queue size (octets) */ + XFRMA_IPTFS_PKT_SIZE, /* __u32 out: size of outer packet, 0 for PMTU */ __XFRMA_MAX #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ -- cgit v1.2.3 From 7ac64f4598b4daa3f955f82759760666e047bdf8 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:01 -0500 Subject: xfrm: add mode_cbs module functionality Add a set of callbacks xfrm_mode_cbs to xfrm_state. These callbacks enable the addition of new xfrm modes, such as IP-TFS to be defined in modules. 
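For illustration (hypothetical module and callback names; IP-TFS itself is wired up in later patches), a mode implemented as a module registers its callbacks like this:

    static const struct xfrm_mode_cbs example_mode_cbs = {
        .owner          = THIS_MODULE,
        .init_state     = example_init_state,
        .destroy_state  = example_destroy_state,
        .get_inner_mtu  = example_get_inner_mtu,
        .input          = example_input,
        .output         = example_output,
        .prepare_output = example_prepare_output,
    };

    static int __init example_mode_init(void)
    {
        return xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &example_mode_cbs);
    }

    static void __exit example_mode_exit(void)
    {
        xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS);
    }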
Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32c09e85a64c..1ebc09cde627 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -213,6 +213,7 @@ struct xfrm_state { u16 family; xfrm_address_t saddr; int header_len; + int enc_hdr_len; int trailer_len; u32 extra_flags; struct xfrm_mark smark; @@ -303,6 +304,9 @@ struct xfrm_state { * interpreted by xfrm_type methods. */ void *data; u8 dir; + + const struct xfrm_mode_cbs *mode_cbs; + void *mode_data; }; static inline struct net *xs_net(struct xfrm_state *x) @@ -460,6 +464,45 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +/** + * struct xfrm_mode_cbs - XFRM mode callbacks + * @owner: module owner or NULL + * @init_state: Add/init mode specific state in `xfrm_state *x` + * @clone_state: Copy mode specific values from `orig` to new state `x` + * @destroy_state: Cleanup mode specific state from `xfrm_state *x` + * @user_init: Process mode specific netlink attributes from user + * @copy_to_user: Add netlink attributes to `attrs` based on state in `x` + * @sa_len: Return space required to store mode specific netlink attributes + * @get_inner_mtu: Return avail payload space after removing encap overhead + * @input: Process received packet from SA using mode + * @output: Output given packet using mode + * @prepare_output: Add mode specific encapsulation to packet in skb. On return + * `transport_header` should point at ESP header, `network_header` should + * point at outer IP header and `mac_header` should opint at the + * protocol/nexthdr field of the outer IP. + * + * One should examine and understand the specific uses of these callbacks in + * xfrm for further detail on how and when these functions are called. RTSL. + */ +struct xfrm_mode_cbs { + struct module *owner; + int (*init_state)(struct xfrm_state *x); + int (*clone_state)(struct xfrm_state *x, struct xfrm_state *orig); + void (*destroy_state)(struct xfrm_state *x); + int (*user_init)(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack); + int (*copy_to_user)(struct xfrm_state *x, struct sk_buff *skb); + unsigned int (*sa_len)(const struct xfrm_state *x); + u32 (*get_inner_mtu)(struct xfrm_state *x, int outer_mtu); + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); + int (*prepare_output)(struct xfrm_state *x, struct sk_buff *skb); +}; + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs); +void xfrm_unregister_mode_cbs(u8 mode); + static inline int xfrm_af2proto(unsigned int family) { switch(family) { -- cgit v1.2.3 From d1716d5a44c37e5743bf6ea4e5cdbdab37727f27 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 14 Nov 2024 02:07:02 -0500 Subject: xfrm: add generic iptfs defines and functionality Define `XFRM_MODE_IPTFS` and `IPSEC_MODE_IPTFS` constants, and add these to switch case and conditionals adjacent with the existing TUNNEL modes. 
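For illustration (a hypothetical helper, not part of this patch), the typical non-include change simply groups the new mode with the existing tunnel-like modes:

    static bool example_mode_is_tunnel_like(const struct xfrm_state *x)
    {
        switch (x->props.mode) {
        case XFRM_MODE_TUNNEL:
        case XFRM_MODE_BEET:
        case XFRM_MODE_IPTFS:
            return true;
        default:
            return false;
        }
    }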
Signed-off-by: Christian Hopps Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + include/uapi/linux/ipsec.h | 3 ++- include/uapi/linux/snmp.h | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1ebc09cde627..4b0677e48190 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -38,6 +38,7 @@ #define XFRM_PROTO_COMP 108 #define XFRM_PROTO_IPIP 4 #define XFRM_PROTO_IPV6 41 +#define XFRM_PROTO_IPTFS IPPROTO_AGGFRAG #define XFRM_PROTO_ROUTING IPPROTO_ROUTING #define XFRM_PROTO_DSTOPTS IPPROTO_DSTOPTS diff --git a/include/uapi/linux/ipsec.h b/include/uapi/linux/ipsec.h index 50d8ee1791e2..696b790f4346 100644 --- a/include/uapi/linux/ipsec.h +++ b/include/uapi/linux/ipsec.h @@ -14,7 +14,8 @@ enum { IPSEC_MODE_ANY = 0, /* We do not support this for SA */ IPSEC_MODE_TRANSPORT = 1, IPSEC_MODE_TUNNEL = 2, - IPSEC_MODE_BEET = 3 + IPSEC_MODE_BEET = 3, + IPSEC_MODE_IPTFS = 4 }; enum { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index adf5fd78dd50..5a2553511190 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -339,6 +339,8 @@ enum LINUX_MIB_XFRMACQUIREERROR, /* XfrmAcquireError */ LINUX_MIB_XFRMOUTSTATEDIRERROR, /* XfrmOutStateDirError */ LINUX_MIB_XFRMINSTATEDIRERROR, /* XfrmInStateDirError */ + LINUX_MIB_XFRMINIPTFSERROR, /* XfrmInIptfsError */ + LINUX_MIB_XFRMOUTNOQSPACE, /* XfrmOutNoQueueSpace */ __LINUX_MIB_XFRMMAX }; -- cgit v1.2.3 From f9a5b34f9251cf530fecf08ef039be64ead8c459 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Thu, 5 Dec 2024 00:09:21 +0200 Subject: net/mlx5: ifc: Reorganize mlx5_ifc_flow_table_context_bits The nested union at the end is not in the same style as the rest of the code, so un-nest it to make the style uniformly applied again. Signed-off-by: Cosmin Ratiu Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..f3650f989e68 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6324,6 +6324,20 @@ struct mlx5_ifc_modify_other_hca_cap_in_bits { struct mlx5_ifc_other_hca_cap_bits other_capability; }; +struct mlx5_ifc_sw_owner_icm_root_params_bits { + u8 sw_owner_icm_root_1[0x40]; + + u8 sw_owner_icm_root_0[0x40]; +}; + +struct mlx5_ifc_rtc_params_bits { + u8 rtc_id_0[0x20]; + + u8 rtc_id_1[0x20]; + + u8 reserved_at_40[0x40]; +}; + struct mlx5_ifc_flow_table_context_bits { u8 reformat_en[0x1]; u8 decap_en[0x1]; @@ -6342,20 +6356,10 @@ struct mlx5_ifc_flow_table_context_bits { u8 lag_master_next_table_id[0x18]; u8 reserved_at_60[0x60]; - union { - struct { - u8 sw_owner_icm_root_1[0x40]; - - u8 sw_owner_icm_root_0[0x40]; - } sws; - struct { - u8 rtc_id_0[0x20]; - u8 rtc_id_1[0x20]; - - u8 reserved_at_100[0x40]; - - } hws; + union { + struct mlx5_ifc_sw_owner_icm_root_params_bits sws; + struct mlx5_ifc_rtc_params_bits hws; }; }; -- cgit v1.2.3 From e799ac9dd3c485a7cda3586f2a12784b030b9df0 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 5 Dec 2024 00:09:22 +0200 Subject: net/mlx5: Add ConnectX-8 device to ifc In preparation for ConnectX-8 SWS support, add enum for the new device type. 
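A hypothetical capability check (illustration only; it assumes the existing steering_format_version HCA capability field) could then distinguish the new device generation:

    bool cx8_steering = MLX5_CAP_GEN(mdev, steering_format_version) >=
                        MLX5_STEERING_FORMAT_CONNECTX_8;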
Signed-off-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3650f989e68..bd9b1833408e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1590,6 +1590,7 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_5 = 0, MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, MLX5_STEERING_FORMAT_CONNECTX_7 = 2, + MLX5_STEERING_FORMAT_CONNECTX_8 = 3, }; struct mlx5_ifc_cmd_hca_cap_bits { -- cgit v1.2.3 From 03713108e0cccf325bb71941edd9ed6122142907 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 5 Dec 2024 00:09:23 +0200 Subject: net/mlx5: Add support for new scheduling elements Introduce new scheduling elements in the E-Switch QoS hierarchy to enhance traffic management capabilities. This patch adds support for: - Rate Limit scheduling elements: Enables bandwidth limitation across multiple nodes without a shared ancestor, providing a mechanism for more granular control of bandwidth allocation. - Traffic Class Transmit Scheduling Arbiter (TSAR): Introduces the infrastructure for creating Traffic Class TSARs, allowing hierarchical arbitration based on traffic classes. - Traffic Class Arbiter TSAR: Adds support for a TSAR capable of managing arbitration between multiple traffic classes, enabling improved bandwidth prioritization and traffic management. No functional changes are introduced in this patch. Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-4-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bd9b1833408e..8b202521b774 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1103,7 +1103,8 @@ struct mlx5_ifc_qos_cap_bits { u8 packet_pacing_min_rate[0x20]; - u8 reserved_at_80[0x10]; + u8 reserved_at_80[0xb]; + u8 log_esw_max_rate_limit[0x5]; u8 packet_pacing_rate_table_size[0x10]; u8 esw_element_type[0x10]; @@ -4104,6 +4105,7 @@ enum { SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2, SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3, SCHEDULING_CONTEXT_ELEMENT_TYPE_QUEUE_GROUP = 0x4, + SCHEDULING_CONTEXT_ELEMENT_TYPE_RATE_LIMIT = 0x5, }; enum { @@ -4112,22 +4114,26 @@ enum { ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, + ELEMENT_TYPE_CAP_MASK_RATE_LIMIT = 1 << 5, }; enum { TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0, TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1, TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, + TSAR_ELEMENT_TSAR_TYPE_TC_ARB = 0x3, }; enum { TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, TSAR_TYPE_CAP_MASK_ETS = 1 << 2, + TSAR_TYPE_CAP_MASK_TC_ARB = 1 << 3, }; struct mlx5_ifc_tsar_element_bits { - u8 reserved_at_0[0x8]; + u8 traffic_class[0x4]; + u8 reserved_at_4[0x4]; u8 tsar_type[0x8]; u8 reserved_at_10[0x10]; }; @@ -4164,7 +4170,9 @@ struct mlx5_ifc_scheduling_context_bits { u8 max_average_bw[0x20]; - u8 reserved_at_e0[0x120]; + u8 max_bw_obj_id[0x20]; + + u8 reserved_at_100[0x100]; }; struct mlx5_ifc_rqtc_bits { -- cgit v1.2.3 From f09ed834a946f9c77088d53af4d4806974728d7b Mon Sep 17 
00:00:00 2001 From: Cosmin Ratiu Date: Thu, 5 Dec 2024 00:09:24 +0200 Subject: net/mlx5: qos: Add ifc support for cross-esw scheduling This adds the capability bit and the vport element fields related to cross-esw scheduling. Signed-off-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241204220931.254964-5-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8b202521b774..5451ff1d4356 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1095,7 +1095,9 @@ struct mlx5_ifc_qos_cap_bits { u8 log_esw_max_sched_depth[0x4]; u8 reserved_at_10[0x10]; - u8 reserved_at_20[0xb]; + u8 reserved_at_20[0x9]; + u8 esw_cross_esw_sched[0x1]; + u8 reserved_at_2a[0x1]; u8 log_max_qos_nic_queue_group[0x5]; u8 reserved_at_30[0x10]; @@ -4139,13 +4141,16 @@ struct mlx5_ifc_tsar_element_bits { }; struct mlx5_ifc_vport_element_bits { - u8 reserved_at_0[0x10]; + u8 reserved_at_0[0x4]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; struct mlx5_ifc_vport_tc_element_bits { u8 traffic_class[0x4]; - u8 reserved_at_4[0xc]; + u8 eswitch_owner_vhca_id_valid[0x1]; + u8 eswitch_owner_vhca_id[0xb]; u8 vport_number[0x10]; }; -- cgit v1.2.3 From 6d9d6ab3a82af50e36e13e7bc8e2d1b970e39f79 Mon Sep 17 00:00:00 2001 From: Takahiro Kuwano Date: Tue, 3 Dec 2024 11:46:49 +0900 Subject: mtd: spinand: Introduce a way to avoid raw access SkyHigh SPI NAND devices have an ECC enable bit in the configuration register, but it must always be enabled. If ECC is disabled, read and write ops result in an undetermined state. For such devices, a way to avoid raw access is needed. Introduce the SPINAND_NO_RAW_ACCESS flag to advertise that the device does not support raw access. On such devices, the on-die ECC engine ops return an error for I/O requests in raw mode. Checking and marking bad block markers (BBM) needs to be handled as a special case, by adding a fallback mechanism that tries to read/write the OOB area with ECC enabled. Signed-off-by: Takahiro Kuwano Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 702e5fb13dae..5cf11005b41a 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -314,6 +314,7 @@ struct spinand_ecc_info { #define SPINAND_HAS_CR_FEAT_BIT BIT(1) #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) +#define SPINAND_NO_RAW_ACCESS BIT(4) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From 1a50e3612de9187857f55ee14a573f7f8e7d4ebc Mon Sep 17 00:00:00 2001 From: Takahiro Kuwano Date: Tue, 3 Dec 2024 11:46:50 +0900 Subject: mtd: spinand: Add support for SkyHigh S35ML-3 family SkyHigh S35ML01G300, S35ML01G301, S35ML02G300, and S35ML04G300 form a family of 1Gb, 2Gb, and 4Gb SLC SPI NAND flash devices. This family has on-die ECC whose parity bits are stored in a hidden area. Because the on-die ECC cannot be disabled in this family, raw access needs to be prevented.
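As an illustration of how the new flag is meant to be consumed, here is a minimal editorial sketch (not code from this series): the function name and the exact hook where the check lives are assumptions, while spinand->flags, MTD_OPS_RAW and SPINAND_NO_RAW_ACCESS are existing identifiers.

	#include <linux/mtd/spinand.h>

	/* Sketch: reject raw (ECC-off) I/O on devices that cannot support it. */
	static int spinand_check_raw_access(struct spinand_device *spinand,
					    const struct nand_page_io_req *req)
	{
		/* Devices flagged SPINAND_NO_RAW_ACCESS cannot turn ECC off. */
		if (req->mode == MTD_OPS_RAW &&
		    (spinand->flags & SPINAND_NO_RAW_ACCESS))
			return -EOPNOTSUPP;

		return 0;
	}

A caller that needs the bad block marker would then fall back to an ECC-on OOB read/write instead of failing outright, which is the fallback behaviour described above.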
Link: https://www.skyhighmemory.com/download/SPI_S35ML01_04G3_002_19205.pdf?v=P Co-developed-by: KR Kim Signed-off-by: KR Kim Signed-off-by: Takahiro Kuwano Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 5cf11005b41a..cbbcd44ac225 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -268,6 +268,7 @@ extern const struct spinand_manufacturer gigadevice_spinand_manufacturer; extern const struct spinand_manufacturer macronix_spinand_manufacturer; extern const struct spinand_manufacturer micron_spinand_manufacturer; extern const struct spinand_manufacturer paragon_spinand_manufacturer; +extern const struct spinand_manufacturer skyhigh_spinand_manufacturer; extern const struct spinand_manufacturer toshiba_spinand_manufacturer; extern const struct spinand_manufacturer winbond_spinand_manufacturer; extern const struct spinand_manufacturer xtx_spinand_manufacturer; -- cgit v1.2.3 From 0600cf40e9b36fe17f9c9f04d4f9cef249eaa5e7 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 3 Dec 2024 13:49:42 +0100 Subject: include: net: add static inline dst_dev_overhead() to dst.h Add static inline dst_dev_overhead() function to include/net/dst.h. This helper function is used by ioam6_iptunnel, rpl_iptunnel and seg6_iptunnel to get the dev's overhead based on a cache entry (dst_entry). If the cache is empty, the default and generic value skb->mac_len is returned. Otherwise, LL_RESERVED_SPACE() over dst's dev is returned. Signed-off-by: Justin Iurman Cc: Alexander Lobakin Cc: Vadim Fedorenko Signed-off-by: Paolo Abeni --- include/net/dst.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 0f303cc60252..08647c99d79c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -440,6 +440,15 @@ static inline void dst_set_expires(struct dst_entry *dst, int timeout) dst->expires = expires; } +static inline unsigned int dst_dev_overhead(struct dst_entry *dst, + struct sk_buff *skb) +{ + if (likely(dst)) + return LL_RESERVED_SPACE(dst->dev); + + return skb->mac_len; +} + INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, -- cgit v1.2.3 From 49922401c2190713c5cc03902dc68c3ecd3f13e8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:47 -0800 Subject: ethtool: separate definitions that are gonna be generated Reshuffle definitions that are gonna be generated into ethtool_netlink_generated.h and match ynl spec order. This should make it easier to compare the output of the ynl-gen-c to the existing uapi header. No functional changes. Things that are still remaining to be manually defined: - ETHTOOL_FLAG_ALL - probably no good way to add to spec? 
- some of the cable test bits (not sure whether it's possible to move to spec) - some of the stats definitions (no way currently to move to spec) Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-7-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink.h | 893 +----------------------- include/uapi/linux/ethtool_netlink_generated.h | 899 +++++++++++++++++++++++++ 2 files changed, 900 insertions(+), 892 deletions(-) create mode 100644 include/uapi/linux/ethtool_netlink_generated.h (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 283305f6b063..9c909ce733a5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -10,545 +10,12 @@ #define _UAPI_LINUX_ETHTOOL_NETLINK_H_ #include - -/* message types - userspace to kernel */ -enum { - ETHTOOL_MSG_USER_NONE, - ETHTOOL_MSG_STRSET_GET, - ETHTOOL_MSG_LINKINFO_GET, - ETHTOOL_MSG_LINKINFO_SET, - ETHTOOL_MSG_LINKMODES_GET, - ETHTOOL_MSG_LINKMODES_SET, - ETHTOOL_MSG_LINKSTATE_GET, - ETHTOOL_MSG_DEBUG_GET, - ETHTOOL_MSG_DEBUG_SET, - ETHTOOL_MSG_WOL_GET, - ETHTOOL_MSG_WOL_SET, - ETHTOOL_MSG_FEATURES_GET, - ETHTOOL_MSG_FEATURES_SET, - ETHTOOL_MSG_PRIVFLAGS_GET, - ETHTOOL_MSG_PRIVFLAGS_SET, - ETHTOOL_MSG_RINGS_GET, - ETHTOOL_MSG_RINGS_SET, - ETHTOOL_MSG_CHANNELS_GET, - ETHTOOL_MSG_CHANNELS_SET, - ETHTOOL_MSG_COALESCE_GET, - ETHTOOL_MSG_COALESCE_SET, - ETHTOOL_MSG_PAUSE_GET, - ETHTOOL_MSG_PAUSE_SET, - ETHTOOL_MSG_EEE_GET, - ETHTOOL_MSG_EEE_SET, - ETHTOOL_MSG_TSINFO_GET, - ETHTOOL_MSG_CABLE_TEST_ACT, - ETHTOOL_MSG_CABLE_TEST_TDR_ACT, - ETHTOOL_MSG_TUNNEL_INFO_GET, - ETHTOOL_MSG_FEC_GET, - ETHTOOL_MSG_FEC_SET, - ETHTOOL_MSG_MODULE_EEPROM_GET, - ETHTOOL_MSG_STATS_GET, - ETHTOOL_MSG_PHC_VCLOCKS_GET, - ETHTOOL_MSG_MODULE_GET, - ETHTOOL_MSG_MODULE_SET, - ETHTOOL_MSG_PSE_GET, - ETHTOOL_MSG_PSE_SET, - ETHTOOL_MSG_RSS_GET, - ETHTOOL_MSG_PLCA_GET_CFG, - ETHTOOL_MSG_PLCA_SET_CFG, - ETHTOOL_MSG_PLCA_GET_STATUS, - ETHTOOL_MSG_MM_GET, - ETHTOOL_MSG_MM_SET, - ETHTOOL_MSG_MODULE_FW_FLASH_ACT, - ETHTOOL_MSG_PHY_GET, - - /* add new constants above here */ - __ETHTOOL_MSG_USER_CNT, - ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 -}; - -/* message types - kernel to userspace */ -enum { - ETHTOOL_MSG_KERNEL_NONE, - ETHTOOL_MSG_STRSET_GET_REPLY, - ETHTOOL_MSG_LINKINFO_GET_REPLY, - ETHTOOL_MSG_LINKINFO_NTF, - ETHTOOL_MSG_LINKMODES_GET_REPLY, - ETHTOOL_MSG_LINKMODES_NTF, - ETHTOOL_MSG_LINKSTATE_GET_REPLY, - ETHTOOL_MSG_DEBUG_GET_REPLY, - ETHTOOL_MSG_DEBUG_NTF, - ETHTOOL_MSG_WOL_GET_REPLY, - ETHTOOL_MSG_WOL_NTF, - ETHTOOL_MSG_FEATURES_GET_REPLY, - ETHTOOL_MSG_FEATURES_SET_REPLY, - ETHTOOL_MSG_FEATURES_NTF, - ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, - ETHTOOL_MSG_PRIVFLAGS_NTF, - ETHTOOL_MSG_RINGS_GET_REPLY, - ETHTOOL_MSG_RINGS_NTF, - ETHTOOL_MSG_CHANNELS_GET_REPLY, - ETHTOOL_MSG_CHANNELS_NTF, - ETHTOOL_MSG_COALESCE_GET_REPLY, - ETHTOOL_MSG_COALESCE_NTF, - ETHTOOL_MSG_PAUSE_GET_REPLY, - ETHTOOL_MSG_PAUSE_NTF, - ETHTOOL_MSG_EEE_GET_REPLY, - ETHTOOL_MSG_EEE_NTF, - ETHTOOL_MSG_TSINFO_GET_REPLY, - ETHTOOL_MSG_CABLE_TEST_NTF, - ETHTOOL_MSG_CABLE_TEST_TDR_NTF, - ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, - ETHTOOL_MSG_FEC_GET_REPLY, - ETHTOOL_MSG_FEC_NTF, - ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, - ETHTOOL_MSG_STATS_GET_REPLY, - ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, - ETHTOOL_MSG_MODULE_GET_REPLY, - ETHTOOL_MSG_MODULE_NTF, - ETHTOOL_MSG_PSE_GET_REPLY, - ETHTOOL_MSG_RSS_GET_REPLY, - ETHTOOL_MSG_PLCA_GET_CFG_REPLY, - 
ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, - ETHTOOL_MSG_PLCA_NTF, - ETHTOOL_MSG_MM_GET_REPLY, - ETHTOOL_MSG_MM_NTF, - ETHTOOL_MSG_MODULE_FW_FLASH_NTF, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_MSG_PHY_NTF, - - /* add new constants above here */ - __ETHTOOL_MSG_KERNEL_CNT, - ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 -}; - -/* request header */ - -enum ethtool_header_flags { - ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ - ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ - ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ -}; +#include #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY | \ ETHTOOL_FLAG_STATS) -enum { - ETHTOOL_A_HEADER_UNSPEC, - ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ - ETHTOOL_A_HEADER_DEV_NAME, /* string */ - ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_HEADER_CNT, - ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 -}; - -/* bit sets */ - -enum { - ETHTOOL_A_BITSET_BIT_UNSPEC, - ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ - ETHTOOL_A_BITSET_BIT_NAME, /* string */ - ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_BIT_CNT, - ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 -}; - -enum { - ETHTOOL_A_BITSET_BITS_UNSPEC, - ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_BITS_CNT, - ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 -}; - -enum { - ETHTOOL_A_BITSET_UNSPEC, - ETHTOOL_A_BITSET_NOMASK, /* flag */ - ETHTOOL_A_BITSET_SIZE, /* u32 */ - ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ - ETHTOOL_A_BITSET_VALUE, /* binary */ - ETHTOOL_A_BITSET_MASK, /* binary */ - - /* add new constants above here */ - __ETHTOOL_A_BITSET_CNT, - ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 -}; - -/* string sets */ - -enum { - ETHTOOL_A_STRING_UNSPEC, - ETHTOOL_A_STRING_INDEX, /* u32 */ - ETHTOOL_A_STRING_VALUE, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_STRING_CNT, - ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGS_UNSPEC, - ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGS_CNT, - ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGSET_UNSPEC, - ETHTOOL_A_STRINGSET_ID, /* u32 */ - ETHTOOL_A_STRINGSET_COUNT, /* u32 */ - ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGSET_CNT, - ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 -}; - -enum { - ETHTOOL_A_STRINGSETS_UNSPEC, - ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ - - /* add new constants above here */ - __ETHTOOL_A_STRINGSETS_CNT, - ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 -}; - -/* STRSET */ - -enum { - ETHTOOL_A_STRSET_UNSPEC, - ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ - ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ - - /* add new constants above here */ - __ETHTOOL_A_STRSET_CNT, - ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 -}; - -/* LINKINFO */ - -enum { - ETHTOOL_A_LINKINFO_UNSPEC, - ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKINFO_PORT, /* u8 */ - ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ 
- ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ - ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKINFO_CNT, - ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 -}; - -/* LINKMODES */ - -enum { - ETHTOOL_A_LINKMODES_UNSPEC, - ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ - ETHTOOL_A_LINKMODES_OURS, /* bitset */ - ETHTOOL_A_LINKMODES_PEER, /* bitset */ - ETHTOOL_A_LINKMODES_SPEED, /* u32 */ - ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ - ETHTOOL_A_LINKMODES_LANES, /* u32 */ - ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKMODES_CNT, - ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 -}; - -/* LINKSTATE */ - -enum { - ETHTOOL_A_LINKSTATE_UNSPEC, - ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKSTATE_LINK, /* u8 */ - ETHTOOL_A_LINKSTATE_SQI, /* u32 */ - ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ - ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_LINKSTATE_CNT, - ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 -}; - -/* DEBUG */ - -enum { - ETHTOOL_A_DEBUG_UNSPEC, - ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_DEBUG_CNT, - ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 -}; - -/* WOL */ - -enum { - ETHTOOL_A_WOL_UNSPEC, - ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_WOL_MODES, /* bitset */ - ETHTOOL_A_WOL_SOPASS, /* binary */ - - /* add new constants above here */ - __ETHTOOL_A_WOL_CNT, - ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 -}; - -/* FEATURES */ - -enum { - ETHTOOL_A_FEATURES_UNSPEC, - ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEATURES_HW, /* bitset */ - ETHTOOL_A_FEATURES_WANTED, /* bitset */ - ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ - ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_FEATURES_CNT, - ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 -}; - -/* PRIVFLAGS */ - -enum { - ETHTOOL_A_PRIVFLAGS_UNSPEC, - ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ - - /* add new constants above here */ - __ETHTOOL_A_PRIVFLAGS_CNT, - ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 -}; - -/* RINGS */ - -enum { - ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, - ETHTOOL_TCP_DATA_SPLIT_DISABLED, - ETHTOOL_TCP_DATA_SPLIT_ENABLED, -}; - -enum { - ETHTOOL_A_RINGS_UNSPEC, - ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_RINGS_RX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ - ETHTOOL_A_RINGS_TX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ - ETHTOOL_A_RINGS_TX, /* u32 */ - ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ - ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_RINGS_CNT, - ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) -}; - -/* CHANNELS */ - -enum { - ETHTOOL_A_CHANNELS_UNSPEC, - 
ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_CHANNELS_CNT, - ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) -}; - -/* COALESCE */ - -enum { - ETHTOOL_A_COALESCE_UNSPEC, - ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ - ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ - /* nest - _A_PROFILE_IRQ_MODERATION */ - ETHTOOL_A_COALESCE_RX_PROFILE, - /* nest - _A_PROFILE_IRQ_MODERATION */ - ETHTOOL_A_COALESCE_TX_PROFILE, - - /* add new constants above here */ - __ETHTOOL_A_COALESCE_CNT, - ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) -}; - -enum { - ETHTOOL_A_PROFILE_UNSPEC, - /* nest, _A_IRQ_MODERATION_* */ - ETHTOOL_A_PROFILE_IRQ_MODERATION, - __ETHTOOL_A_PROFILE_CNT, - ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) -}; - -enum { - ETHTOOL_A_IRQ_MODERATION_UNSPEC, - ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ - - __ETHTOOL_A_IRQ_MODERATION_CNT, - ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) -}; - -/* PAUSE */ - -enum { - ETHTOOL_A_PAUSE_UNSPEC, - ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ - ETHTOOL_A_PAUSE_RX, /* u8 */ - ETHTOOL_A_PAUSE_TX, /* u8 */ - ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ - ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PAUSE_CNT, - ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) -}; - -enum { - ETHTOOL_A_PAUSE_STAT_UNSPEC, - ETHTOOL_A_PAUSE_STAT_PAD, - - ETHTOOL_A_PAUSE_STAT_TX_FRAMES, - ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - - /* add new constants above here - * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! 
- */ - __ETHTOOL_A_PAUSE_STAT_CNT, - ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) -}; - -/* EEE */ - -enum { - ETHTOOL_A_EEE_UNSPEC, - ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_EEE_MODES_OURS, /* bitset */ - ETHTOOL_A_EEE_MODES_PEER, /* bitset */ - ETHTOOL_A_EEE_ACTIVE, /* u8 */ - ETHTOOL_A_EEE_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_EEE_CNT, - ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) -}; - -/* TSINFO */ - -enum { - ETHTOOL_A_TSINFO_UNSPEC, - ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ - ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ - ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ - ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ - ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ - - /* add new constants above here */ - __ETHTOOL_A_TSINFO_CNT, - ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) -}; - -enum { - ETHTOOL_A_TS_STAT_UNSPEC, - - ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ - ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ - ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ - - /* add new constants above here */ - __ETHTOOL_A_TS_STAT_CNT, - ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) - -}; - -/* PHC VCLOCKS */ - -enum { - ETHTOOL_A_PHC_VCLOCKS_UNSPEC, - ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ - ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ - - /* add new constants above here */ - __ETHTOOL_A_PHC_VCLOCKS_CNT, - ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) -}; - -/* CABLE TEST */ - -enum { - ETHTOOL_A_CABLE_TEST_UNSPEC, - ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_CNT, - ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 -}; - /* CABLE TEST NOTIFY */ enum { ETHTOOL_A_CABLE_RESULT_CODE_UNSPEC, @@ -582,74 +49,12 @@ enum { ETHTOOL_A_CABLE_INF_SRC_ALCD, }; -enum { - ETHTOOL_A_CABLE_RESULT_UNSPEC, - ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ - ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ - - __ETHTOOL_A_CABLE_RESULT_CNT, - ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) -}; - -enum { - ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, - ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ - ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ - - __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, - ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) -}; - enum { ETHTOOL_A_CABLE_TEST_NTF_STATUS_UNSPEC, ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED, ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED }; -enum { - ETHTOOL_A_CABLE_NEST_UNSPEC, - ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ - ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ - __ETHTOOL_A_CABLE_NEST_CNT, - ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) -}; - -enum { - ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ - - __ETHTOOL_A_CABLE_TEST_NTF_CNT, - ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) -}; - -/* CABLE TEST TDR */ - -enum { - ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, - 
ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, - ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 -}; - -enum { - ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_CNT, - ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 -}; - /* CABLE TEST TDR NOTIFY */ enum { @@ -689,132 +94,6 @@ enum { ETHTOOL_A_CABLE_TDR_NEST_MAX = (__ETHTOOL_A_CABLE_TDR_NEST_CNT - 1) }; -enum { - ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ - - /* add new constants above here */ - __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, - ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 -}; - -/* TUNNEL INFO */ - -enum { - ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, - ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, - ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, - - __ETHTOOL_UDP_TUNNEL_TYPE_CNT -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ - ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, - ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ - ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ - ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, - ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_UDP_UNSPEC, - - ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_UDP_CNT, - ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) -}; - -enum { - ETHTOOL_A_TUNNEL_INFO_UNSPEC, - ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ - - /* add new constants above here */ - __ETHTOOL_A_TUNNEL_INFO_CNT, - ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) -}; - -/* FEC */ - -enum { - ETHTOOL_A_FEC_UNSPEC, - ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEC_MODES, /* bitset */ - ETHTOOL_A_FEC_AUTO, /* u8 */ - ETHTOOL_A_FEC_ACTIVE, /* u32 */ - ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ - - __ETHTOOL_A_FEC_CNT, - ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) -}; - -enum { - ETHTOOL_A_FEC_STAT_UNSPEC, - ETHTOOL_A_FEC_STAT_PAD, - - ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ - ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ - ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ - - /* add new constants above here */ - __ETHTOOL_A_FEC_STAT_CNT, - ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) -}; - -/* MODULE EEPROM */ - -enum { - ETHTOOL_A_MODULE_EEPROM_UNSPEC, - ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 
*/ - ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ - - __ETHTOOL_A_MODULE_EEPROM_CNT, - ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) -}; - -/* STATS */ - -enum { - ETHTOOL_A_STATS_UNSPEC, - ETHTOOL_A_STATS_PAD, - ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STATS_GROUPS, /* bitset */ - - ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ - - ETHTOOL_A_STATS_SRC, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_STATS_CNT, - ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) -}; - enum { ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, @@ -825,27 +104,6 @@ enum { __ETHTOOL_STATS_CNT }; -enum { - ETHTOOL_A_STATS_GRP_UNSPEC, - ETHTOOL_A_STATS_GRP_PAD, - - ETHTOOL_A_STATS_GRP_ID, /* u32 */ - ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ - - ETHTOOL_A_STATS_GRP_STAT, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ - ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ - - /* add new constants above here */ - __ETHTOOL_A_STATS_GRP_CNT, - ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) -}; - enum { /* 30.3.2.1.5 aSymbolErrorDuringCarrier */ ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, @@ -935,155 +193,6 @@ enum { ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) }; -/* MODULE */ - -enum { - ETHTOOL_A_MODULE_UNSPEC, - ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ - ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ - - /* add new constants above here */ - __ETHTOOL_A_MODULE_CNT, - ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) -}; - -/* Power Sourcing Equipment */ -enum { - ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, - ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ -}; - -enum { - ETHTOOL_A_PSE_UNSPEC, - ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ - ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ - ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ - - /* add new constants above here */ - __ETHTOOL_A_PSE_CNT, - ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) -}; - -enum { - ETHTOOL_A_RSS_UNSPEC, - ETHTOOL_A_RSS_HEADER, - ETHTOOL_A_RSS_CONTEXT, /* u32 */ - ETHTOOL_A_RSS_HFUNC, /* u32 */ - ETHTOOL_A_RSS_INDIR, /* binary */ - ETHTOOL_A_RSS_HKEY, /* binary */ - ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ - ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ - - __ETHTOOL_A_RSS_CNT, - ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), -}; - -/* PLCA */ - -enum { - ETHTOOL_A_PLCA_UNSPEC, - ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PLCA_VERSION, /* u16 */ - ETHTOOL_A_PLCA_ENABLED, /* u8 */ - ETHTOOL_A_PLCA_STATUS, /* u8 */ - ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ - ETHTOOL_A_PLCA_NODE_ID, /* u32 */ - ETHTOOL_A_PLCA_TO_TMR, /* u32 */ - ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ - ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PLCA_CNT, - ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) -}; - -/* MAC Merge (802.3) */ - -enum { - ETHTOOL_A_MM_STAT_UNSPEC, - ETHTOOL_A_MM_STAT_PAD, - - /* 
aMACMergeFrameAssErrorCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ - /* aMACMergeFrameSmdErrorCount */ - ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ - /* aMACMergeFrameAssOkCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ - /* aMACMergeFragCountRx */ - ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ - /* aMACMergeFragCountTx */ - ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ - /* aMACMergeHoldCount */ - ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ - - /* add new constants above here */ - __ETHTOOL_A_MM_STAT_CNT, - ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) -}; - -enum { - ETHTOOL_A_MM_UNSPEC, - ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ - ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ - ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ - ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ - - /* add new constants above here */ - __ETHTOOL_A_MM_CNT, - ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) -}; - -/* MODULE_FW_FLASH */ - -enum { - ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, - ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ - ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ - - /* add new constants above here */ - __ETHTOOL_A_MODULE_FW_FLASH_CNT, - ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) -}; - -enum { - ETHTOOL_A_PHY_UNSPEC, - ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHY_INDEX, /* u32 */ - ETHTOOL_A_PHY_DRVNAME, /* string */ - ETHTOOL_A_PHY_NAME, /* string */ - ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ - ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_PHY_CNT, - ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) -}; /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h new file mode 100644 index 000000000000..4b4bf17d1a88 --- /dev/null +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -0,0 +1,899 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +#ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H +#define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H + +/* TUNNEL INFO */ + +enum { + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, + ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, + ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, + + __ETHTOOL_UDP_TUNNEL_TYPE_CNT +}; + +/* request header */ + +enum ethtool_header_flags { + ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ + ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ + ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ +}; + +enum { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_TCP_DATA_SPLIT_DISABLED, + ETHTOOL_TCP_DATA_SPLIT_ENABLED, +}; + +enum { + ETHTOOL_A_HEADER_UNSPEC, + ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ + ETHTOOL_A_HEADER_DEV_NAME, /* string */ + ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ + ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ + + /* add new 
constants above here */ + __ETHTOOL_A_HEADER_CNT, + ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 +}; + +/* bit sets */ + +enum { + ETHTOOL_A_BITSET_BIT_UNSPEC, + ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ + ETHTOOL_A_BITSET_BIT_NAME, /* string */ + ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_BIT_CNT, + ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 +}; + +enum { + ETHTOOL_A_BITSET_BITS_UNSPEC, + ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_BITS_CNT, + ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 +}; + +enum { + ETHTOOL_A_BITSET_UNSPEC, + ETHTOOL_A_BITSET_NOMASK, /* flag */ + ETHTOOL_A_BITSET_SIZE, /* u32 */ + ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ + ETHTOOL_A_BITSET_VALUE, /* binary */ + ETHTOOL_A_BITSET_MASK, /* binary */ + + /* add new constants above here */ + __ETHTOOL_A_BITSET_CNT, + ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 +}; + +/* string sets */ + +enum { + ETHTOOL_A_STRING_UNSPEC, + ETHTOOL_A_STRING_INDEX, /* u32 */ + ETHTOOL_A_STRING_VALUE, /* string */ + + /* add new constants above here */ + __ETHTOOL_A_STRING_CNT, + ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGS_UNSPEC, + ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGS_CNT, + ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGSET_UNSPEC, + ETHTOOL_A_STRINGSET_ID, /* u32 */ + ETHTOOL_A_STRINGSET_COUNT, /* u32 */ + ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGSET_CNT, + ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 +}; + +enum { + ETHTOOL_A_STRINGSETS_UNSPEC, + ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ + + /* add new constants above here */ + __ETHTOOL_A_STRINGSETS_CNT, + ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 +}; + +/* STRSET */ + +enum { + ETHTOOL_A_STRSET_UNSPEC, + ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ + ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ + + /* add new constants above here */ + __ETHTOOL_A_STRSET_CNT, + ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 +}; + +/* PRIVFLAGS */ + +enum { + ETHTOOL_A_PRIVFLAGS_UNSPEC, + ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_PRIVFLAGS_CNT, + ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 +}; + +/* RINGS */ + +enum { + ETHTOOL_A_RINGS_UNSPEC, + ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_RINGS_RX_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ + ETHTOOL_A_RINGS_TX_MAX, /* u32 */ + ETHTOOL_A_RINGS_RX, /* u32 */ + ETHTOOL_A_RINGS_RX_MINI, /* u32 */ + ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ + ETHTOOL_A_RINGS_TX, /* u32 */ + ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ + ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ + ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ + ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_RINGS_CNT, + ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) +}; + +/* MAC Merge (802.3) */ + +enum { + ETHTOOL_A_MM_STAT_UNSPEC, + ETHTOOL_A_MM_STAT_PAD, + + /* 
aMACMergeFrameAssErrorCount */ + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ + /* aMACMergeFrameSmdErrorCount */ + ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ + /* aMACMergeFrameAssOkCount */ + ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ + /* aMACMergeFragCountRx */ + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ + /* aMACMergeFragCountTx */ + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ + /* aMACMergeHoldCount */ + ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ + + /* add new constants above here */ + __ETHTOOL_A_MM_STAT_CNT, + ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_MM_UNSPEC, + ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ + ETHTOOL_A_MM_TX_ENABLED, /* u8 */ + ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ + ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ + ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ + ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ + ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ + ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ + ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ + ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ + + /* add new constants above here */ + __ETHTOOL_A_MM_CNT, + ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) +}; + +/* LINKINFO */ + +enum { + ETHTOOL_A_LINKINFO_UNSPEC, + ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKINFO_PORT, /* u8 */ + ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ + ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ + ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ + ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKINFO_CNT, + ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 +}; + +/* LINKMODES */ + +enum { + ETHTOOL_A_LINKMODES_UNSPEC, + ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ + ETHTOOL_A_LINKMODES_OURS, /* bitset */ + ETHTOOL_A_LINKMODES_PEER, /* bitset */ + ETHTOOL_A_LINKMODES_SPEED, /* u32 */ + ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ + ETHTOOL_A_LINKMODES_LANES, /* u32 */ + ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKMODES_CNT, + ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 +}; + +/* LINKSTATE */ + +enum { + ETHTOOL_A_LINKSTATE_UNSPEC, + ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_LINKSTATE_LINK, /* u8 */ + ETHTOOL_A_LINKSTATE_SQI, /* u32 */ + ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ + ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_LINKSTATE_CNT, + ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 +}; + +/* DEBUG */ + +enum { + ETHTOOL_A_DEBUG_UNSPEC, + ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ + + /* add new constants above here */ + __ETHTOOL_A_DEBUG_CNT, + ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 +}; + +/* WOL */ + +enum { + ETHTOOL_A_WOL_UNSPEC, + ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_WOL_MODES, /* bitset */ + ETHTOOL_A_WOL_SOPASS, /* binary */ + + /* add new constants above here */ + __ETHTOOL_A_WOL_CNT, + ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 +}; + +/* FEATURES */ + +enum { + ETHTOOL_A_FEATURES_UNSPEC, + ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEATURES_HW, /* bitset */ + ETHTOOL_A_FEATURES_WANTED, /* bitset */ + ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ + ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ + + 
/* add new constants above here */ + __ETHTOOL_A_FEATURES_CNT, + ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 +}; + +/* CHANNELS */ + +enum { + ETHTOOL_A_CHANNELS_UNSPEC, + ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ + ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ + ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_CHANNELS_CNT, + ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) +}; + +enum { + ETHTOOL_A_IRQ_MODERATION_UNSPEC, + ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ + + __ETHTOOL_A_IRQ_MODERATION_CNT, + ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) +}; + +enum { + ETHTOOL_A_PROFILE_UNSPEC, + /* nest, _A_IRQ_MODERATION_* */ + ETHTOOL_A_PROFILE_IRQ_MODERATION, + __ETHTOOL_A_PROFILE_CNT, + ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) +}; + +/* COALESCE */ + +enum { + ETHTOOL_A_COALESCE_UNSPEC, + ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ + ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ + ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ + ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ + ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ + ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ + ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ + ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ + ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ + ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ + ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ + ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ + /* nest - _A_PROFILE_IRQ_MODERATION */ + ETHTOOL_A_COALESCE_RX_PROFILE, + /* nest - _A_PROFILE_IRQ_MODERATION */ + ETHTOOL_A_COALESCE_TX_PROFILE, + + /* add new constants above here */ + __ETHTOOL_A_COALESCE_CNT, + ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) +}; + +/* PAUSE */ + +enum { + ETHTOOL_A_PAUSE_STAT_UNSPEC, + ETHTOOL_A_PAUSE_STAT_PAD, + + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + + /* add new constants above here + * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! 
+ */ + __ETHTOOL_A_PAUSE_STAT_CNT, + ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_PAUSE_UNSPEC, + ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ + ETHTOOL_A_PAUSE_RX, /* u8 */ + ETHTOOL_A_PAUSE_TX, /* u8 */ + ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ + ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PAUSE_CNT, + ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) +}; + +/* EEE */ + +enum { + ETHTOOL_A_EEE_UNSPEC, + ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_EEE_MODES_OURS, /* bitset */ + ETHTOOL_A_EEE_MODES_PEER, /* bitset */ + ETHTOOL_A_EEE_ACTIVE, /* u8 */ + ETHTOOL_A_EEE_ENABLED, /* u8 */ + ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ + ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_EEE_CNT, + ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) +}; + +/* TSINFO */ + +enum { + ETHTOOL_A_TS_STAT_UNSPEC, + + ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ + ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ + ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ + + /* add new constants above here */ + __ETHTOOL_A_TS_STAT_CNT, + ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) + +}; + +enum { + ETHTOOL_A_TSINFO_UNSPEC, + ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ + ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ + ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ + ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ + ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ + + /* add new constants above here */ + __ETHTOOL_A_TSINFO_CNT, + ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_RESULT_UNSPEC, + ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ + ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ + ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + + __ETHTOOL_A_CABLE_RESULT_CNT, + ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, + ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ + ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ + ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + + __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, + ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) +}; + +enum { + ETHTOOL_A_CABLE_NEST_UNSPEC, + ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ + __ETHTOOL_A_CABLE_NEST_CNT, + ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) +}; + +/* CABLE TEST */ + +enum { + ETHTOOL_A_CABLE_TEST_UNSPEC, + ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_CNT, + ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 +}; + +enum { + ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ + + __ETHTOOL_A_CABLE_TEST_NTF_CNT, + ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) +}; + +/* CABLE TEST TDR */ + +enum { + ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + + /* add new constants above here 
*/ + __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 +}; + +enum { + ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ + + /* add new constants above here */ + __ETHTOOL_A_CABLE_TEST_TDR_CNT, + ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, + ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_UDP_UNSPEC, + + ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_UDP_CNT, + ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) +}; + +enum { + ETHTOOL_A_TUNNEL_INFO_UNSPEC, + ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ + + /* add new constants above here */ + __ETHTOOL_A_TUNNEL_INFO_CNT, + ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) +}; + +/* FEC */ + +enum { + ETHTOOL_A_FEC_STAT_UNSPEC, + ETHTOOL_A_FEC_STAT_PAD, + + ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ + ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ + ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ + + /* add new constants above here */ + __ETHTOOL_A_FEC_STAT_CNT, + ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) +}; + +enum { + ETHTOOL_A_FEC_UNSPEC, + ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEC_MODES, /* bitset */ + ETHTOOL_A_FEC_AUTO, /* u8 */ + ETHTOOL_A_FEC_ACTIVE, /* u32 */ + ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ + + __ETHTOOL_A_FEC_CNT, + ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) +}; + +/* MODULE EEPROM */ + +enum { + ETHTOOL_A_MODULE_EEPROM_UNSPEC, + ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ + + __ETHTOOL_A_MODULE_EEPROM_CNT, + ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) +}; + + +enum { + ETHTOOL_A_STATS_GRP_UNSPEC, + ETHTOOL_A_STATS_GRP_PAD, + + ETHTOOL_A_STATS_GRP_ID, /* u32 */ + ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ + + ETHTOOL_A_STATS_GRP_STAT, /* nest */ + + ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ + ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ + + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ + 
ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_GRP_CNT, + ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) +}; + +/* STATS */ + +enum { + ETHTOOL_A_STATS_UNSPEC, + ETHTOOL_A_STATS_PAD, + ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STATS_GROUPS, /* bitset */ + + ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ + + ETHTOOL_A_STATS_SRC, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_CNT, + ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) +}; + +/* PHC VCLOCKS */ + +enum { + ETHTOOL_A_PHC_VCLOCKS_UNSPEC, + ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ + ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ + + /* add new constants above here */ + __ETHTOOL_A_PHC_VCLOCKS_CNT, + ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) +}; + +/* MODULE */ + +enum { + ETHTOOL_A_MODULE_UNSPEC, + ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ + ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_MODULE_CNT, + ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) +}; + +/* Power Sourcing Equipment */ +enum { + ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, + ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ + ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ +}; + +enum { + ETHTOOL_A_PSE_UNSPEC, + ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ + ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ + ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ + ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ + ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ + ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ + ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ + ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ + ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ + ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ + ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ + + /* add new constants above here */ + __ETHTOOL_A_PSE_CNT, + ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) +}; + +enum { + ETHTOOL_A_RSS_UNSPEC, + ETHTOOL_A_RSS_HEADER, + ETHTOOL_A_RSS_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_HFUNC, /* u32 */ + ETHTOOL_A_RSS_INDIR, /* binary */ + ETHTOOL_A_RSS_HKEY, /* binary */ + ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ + ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ + + __ETHTOOL_A_RSS_CNT, + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), +}; + +/* PLCA */ + +enum { + ETHTOOL_A_PLCA_UNSPEC, + ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PLCA_VERSION, /* u16 */ + ETHTOOL_A_PLCA_ENABLED, /* u8 */ + ETHTOOL_A_PLCA_STATUS, /* u8 */ + ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ + ETHTOOL_A_PLCA_NODE_ID, /* u32 */ + ETHTOOL_A_PLCA_TO_TMR, /* u32 */ + ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ + ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PLCA_CNT, + ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) +}; + +/* MODULE_FW_FLASH */ + +enum { + ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, + ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ + ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ + ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ + ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ + ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ + ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ + + /* add new constants above here */ + __ETHTOOL_A_MODULE_FW_FLASH_CNT, + ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) +}; + 
+enum { + ETHTOOL_A_PHY_UNSPEC, + ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHY_INDEX, /* u32 */ + ETHTOOL_A_PHY_DRVNAME, /* string */ + ETHTOOL_A_PHY_NAME, /* string */ + ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ + ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ + ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ + ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ + + /* add new constants above here */ + __ETHTOOL_A_PHY_CNT, + ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) +}; + +/* message types - userspace to kernel */ +enum { + ETHTOOL_MSG_USER_NONE, + ETHTOOL_MSG_STRSET_GET, + ETHTOOL_MSG_LINKINFO_GET, + ETHTOOL_MSG_LINKINFO_SET, + ETHTOOL_MSG_LINKMODES_GET, + ETHTOOL_MSG_LINKMODES_SET, + ETHTOOL_MSG_LINKSTATE_GET, + ETHTOOL_MSG_DEBUG_GET, + ETHTOOL_MSG_DEBUG_SET, + ETHTOOL_MSG_WOL_GET, + ETHTOOL_MSG_WOL_SET, + ETHTOOL_MSG_FEATURES_GET, + ETHTOOL_MSG_FEATURES_SET, + ETHTOOL_MSG_PRIVFLAGS_GET, + ETHTOOL_MSG_PRIVFLAGS_SET, + ETHTOOL_MSG_RINGS_GET, + ETHTOOL_MSG_RINGS_SET, + ETHTOOL_MSG_CHANNELS_GET, + ETHTOOL_MSG_CHANNELS_SET, + ETHTOOL_MSG_COALESCE_GET, + ETHTOOL_MSG_COALESCE_SET, + ETHTOOL_MSG_PAUSE_GET, + ETHTOOL_MSG_PAUSE_SET, + ETHTOOL_MSG_EEE_GET, + ETHTOOL_MSG_EEE_SET, + ETHTOOL_MSG_TSINFO_GET, + ETHTOOL_MSG_CABLE_TEST_ACT, + ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_FEC_GET, + ETHTOOL_MSG_FEC_SET, + ETHTOOL_MSG_MODULE_EEPROM_GET, + ETHTOOL_MSG_STATS_GET, + ETHTOOL_MSG_PHC_VCLOCKS_GET, + ETHTOOL_MSG_MODULE_GET, + ETHTOOL_MSG_MODULE_SET, + ETHTOOL_MSG_PSE_GET, + ETHTOOL_MSG_PSE_SET, + ETHTOOL_MSG_RSS_GET, + ETHTOOL_MSG_PLCA_GET_CFG, + ETHTOOL_MSG_PLCA_SET_CFG, + ETHTOOL_MSG_PLCA_GET_STATUS, + ETHTOOL_MSG_MM_GET, + ETHTOOL_MSG_MM_SET, + ETHTOOL_MSG_MODULE_FW_FLASH_ACT, + ETHTOOL_MSG_PHY_GET, + + /* add new constants above here */ + __ETHTOOL_MSG_USER_CNT, + ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 +}; + +/* message types - kernel to userspace */ +enum { + ETHTOOL_MSG_KERNEL_NONE, + ETHTOOL_MSG_STRSET_GET_REPLY, + ETHTOOL_MSG_LINKINFO_GET_REPLY, + ETHTOOL_MSG_LINKINFO_NTF, + ETHTOOL_MSG_LINKMODES_GET_REPLY, + ETHTOOL_MSG_LINKMODES_NTF, + ETHTOOL_MSG_LINKSTATE_GET_REPLY, + ETHTOOL_MSG_DEBUG_GET_REPLY, + ETHTOOL_MSG_DEBUG_NTF, + ETHTOOL_MSG_WOL_GET_REPLY, + ETHTOOL_MSG_WOL_NTF, + ETHTOOL_MSG_FEATURES_GET_REPLY, + ETHTOOL_MSG_FEATURES_SET_REPLY, + ETHTOOL_MSG_FEATURES_NTF, + ETHTOOL_MSG_PRIVFLAGS_GET_REPLY, + ETHTOOL_MSG_PRIVFLAGS_NTF, + ETHTOOL_MSG_RINGS_GET_REPLY, + ETHTOOL_MSG_RINGS_NTF, + ETHTOOL_MSG_CHANNELS_GET_REPLY, + ETHTOOL_MSG_CHANNELS_NTF, + ETHTOOL_MSG_COALESCE_GET_REPLY, + ETHTOOL_MSG_COALESCE_NTF, + ETHTOOL_MSG_PAUSE_GET_REPLY, + ETHTOOL_MSG_PAUSE_NTF, + ETHTOOL_MSG_EEE_GET_REPLY, + ETHTOOL_MSG_EEE_NTF, + ETHTOOL_MSG_TSINFO_GET_REPLY, + ETHTOOL_MSG_CABLE_TEST_NTF, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF, + ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, + ETHTOOL_MSG_FEC_GET_REPLY, + ETHTOOL_MSG_FEC_NTF, + ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + ETHTOOL_MSG_STATS_GET_REPLY, + ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + ETHTOOL_MSG_MODULE_GET_REPLY, + ETHTOOL_MSG_MODULE_NTF, + ETHTOOL_MSG_PSE_GET_REPLY, + ETHTOOL_MSG_RSS_GET_REPLY, + ETHTOOL_MSG_PLCA_GET_CFG_REPLY, + ETHTOOL_MSG_PLCA_GET_STATUS_REPLY, + ETHTOOL_MSG_PLCA_NTF, + ETHTOOL_MSG_MM_GET_REPLY, + ETHTOOL_MSG_MM_NTF, + ETHTOOL_MSG_MODULE_FW_FLASH_NTF, + ETHTOOL_MSG_PHY_GET_REPLY, + ETHTOOL_MSG_PHY_NTF, + + /* add new constants above here */ + __ETHTOOL_MSG_KERNEL_CNT, + ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 +}; + +#endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ -- cgit v1.2.3 
From dd7cde36de15b071b5f9163d21d7c9142089b424 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:48 -0800 Subject: ethtool: remove the comments that are not gonna be generated Cleanup the header manually to make it easier to review the changes that ynl generator brings in. No functional changes. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-8-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 678 ++++++++++--------------- 1 file changed, 274 insertions(+), 404 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 4b4bf17d1a88..35a24d490efe 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -2,8 +2,6 @@ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H -/* TUNNEL INFO */ - enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, @@ -12,8 +10,6 @@ enum { __ETHTOOL_UDP_TUNNEL_TYPE_CNT }; -/* request header */ - enum ethtool_header_flags { ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ @@ -28,303 +24,250 @@ enum { enum { ETHTOOL_A_HEADER_UNSPEC, - ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ - ETHTOOL_A_HEADER_DEV_NAME, /* string */ - ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ + ETHTOOL_A_HEADER_DEV_INDEX, + ETHTOOL_A_HEADER_DEV_NAME, + ETHTOOL_A_HEADER_FLAGS, + ETHTOOL_A_HEADER_PHY_INDEX, - /* add new constants above here */ __ETHTOOL_A_HEADER_CNT, ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 }; -/* bit sets */ - enum { ETHTOOL_A_BITSET_BIT_UNSPEC, - ETHTOOL_A_BITSET_BIT_INDEX, /* u32 */ - ETHTOOL_A_BITSET_BIT_NAME, /* string */ - ETHTOOL_A_BITSET_BIT_VALUE, /* flag */ + ETHTOOL_A_BITSET_BIT_INDEX, + ETHTOOL_A_BITSET_BIT_NAME, + ETHTOOL_A_BITSET_BIT_VALUE, - /* add new constants above here */ __ETHTOOL_A_BITSET_BIT_CNT, ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 }; enum { ETHTOOL_A_BITSET_BITS_UNSPEC, - ETHTOOL_A_BITSET_BITS_BIT, /* nest - _A_BITSET_BIT_* */ + ETHTOOL_A_BITSET_BITS_BIT, - /* add new constants above here */ __ETHTOOL_A_BITSET_BITS_CNT, ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 }; enum { ETHTOOL_A_BITSET_UNSPEC, - ETHTOOL_A_BITSET_NOMASK, /* flag */ - ETHTOOL_A_BITSET_SIZE, /* u32 */ - ETHTOOL_A_BITSET_BITS, /* nest - _A_BITSET_BITS_* */ - ETHTOOL_A_BITSET_VALUE, /* binary */ - ETHTOOL_A_BITSET_MASK, /* binary */ + ETHTOOL_A_BITSET_NOMASK, + ETHTOOL_A_BITSET_SIZE, + ETHTOOL_A_BITSET_BITS, + ETHTOOL_A_BITSET_VALUE, + ETHTOOL_A_BITSET_MASK, - /* add new constants above here */ __ETHTOOL_A_BITSET_CNT, ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 }; -/* string sets */ - enum { ETHTOOL_A_STRING_UNSPEC, - ETHTOOL_A_STRING_INDEX, /* u32 */ - ETHTOOL_A_STRING_VALUE, /* string */ + ETHTOOL_A_STRING_INDEX, + ETHTOOL_A_STRING_VALUE, - /* add new constants above here */ __ETHTOOL_A_STRING_CNT, ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 }; enum { ETHTOOL_A_STRINGS_UNSPEC, - ETHTOOL_A_STRINGS_STRING, /* nest - _A_STRINGS_* */ + ETHTOOL_A_STRINGS_STRING, - /* add new constants above here */ __ETHTOOL_A_STRINGS_CNT, ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 }; enum { ETHTOOL_A_STRINGSET_UNSPEC, - ETHTOOL_A_STRINGSET_ID, /* u32 */ - 
ETHTOOL_A_STRINGSET_COUNT, /* u32 */ - ETHTOOL_A_STRINGSET_STRINGS, /* nest - _A_STRINGS_* */ + ETHTOOL_A_STRINGSET_ID, + ETHTOOL_A_STRINGSET_COUNT, + ETHTOOL_A_STRINGSET_STRINGS, - /* add new constants above here */ __ETHTOOL_A_STRINGSET_CNT, ETHTOOL_A_STRINGSET_MAX = __ETHTOOL_A_STRINGSET_CNT - 1 }; enum { ETHTOOL_A_STRINGSETS_UNSPEC, - ETHTOOL_A_STRINGSETS_STRINGSET, /* nest - _A_STRINGSET_* */ + ETHTOOL_A_STRINGSETS_STRINGSET, - /* add new constants above here */ __ETHTOOL_A_STRINGSETS_CNT, ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 }; -/* STRSET */ - enum { ETHTOOL_A_STRSET_UNSPEC, - ETHTOOL_A_STRSET_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STRSET_STRINGSETS, /* nest - _A_STRINGSETS_* */ - ETHTOOL_A_STRSET_COUNTS_ONLY, /* flag */ + ETHTOOL_A_STRSET_HEADER, + ETHTOOL_A_STRSET_STRINGSETS, + ETHTOOL_A_STRSET_COUNTS_ONLY, - /* add new constants above here */ __ETHTOOL_A_STRSET_CNT, ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 }; -/* PRIVFLAGS */ - enum { ETHTOOL_A_PRIVFLAGS_UNSPEC, - ETHTOOL_A_PRIVFLAGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PRIVFLAGS_FLAGS, /* bitset */ + ETHTOOL_A_PRIVFLAGS_HEADER, + ETHTOOL_A_PRIVFLAGS_FLAGS, - /* add new constants above here */ __ETHTOOL_A_PRIVFLAGS_CNT, ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 }; -/* RINGS */ - enum { ETHTOOL_A_RINGS_UNSPEC, - ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_RINGS_RX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO_MAX, /* u32 */ - ETHTOOL_A_RINGS_TX_MAX, /* u32 */ - ETHTOOL_A_RINGS_RX, /* u32 */ - ETHTOOL_A_RINGS_RX_MINI, /* u32 */ - ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ - ETHTOOL_A_RINGS_TX, /* u32 */ - ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ - ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_RX_PUSH, /* u8 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, /* u32 */ - ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_RINGS_HEADER, + ETHTOOL_A_RINGS_RX_MAX, + ETHTOOL_A_RINGS_RX_MINI_MAX, + ETHTOOL_A_RINGS_RX_JUMBO_MAX, + ETHTOOL_A_RINGS_TX_MAX, + ETHTOOL_A_RINGS_RX, + ETHTOOL_A_RINGS_RX_MINI, + ETHTOOL_A_RINGS_RX_JUMBO, + ETHTOOL_A_RINGS_TX, + ETHTOOL_A_RINGS_RX_BUF_LEN, + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, + ETHTOOL_A_RINGS_CQE_SIZE, + ETHTOOL_A_RINGS_TX_PUSH, + ETHTOOL_A_RINGS_RX_PUSH, + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, + ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, + __ETHTOOL_A_RINGS_CNT, ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) }; -/* MAC Merge (802.3) */ - enum { ETHTOOL_A_MM_STAT_UNSPEC, ETHTOOL_A_MM_STAT_PAD, + ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, + ETHTOOL_A_MM_STAT_SMD_ERRORS, + ETHTOOL_A_MM_STAT_REASSEMBLY_OK, + ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, + ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, + ETHTOOL_A_MM_STAT_HOLD_COUNT, - /* aMACMergeFrameAssErrorCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS, /* u64 */ - /* aMACMergeFrameSmdErrorCount */ - ETHTOOL_A_MM_STAT_SMD_ERRORS, /* u64 */ - /* aMACMergeFrameAssOkCount */ - ETHTOOL_A_MM_STAT_REASSEMBLY_OK, /* u64 */ - /* aMACMergeFragCountRx */ - ETHTOOL_A_MM_STAT_RX_FRAG_COUNT, /* u64 */ - /* aMACMergeFragCountTx */ - ETHTOOL_A_MM_STAT_TX_FRAG_COUNT, /* u64 */ - /* aMACMergeHoldCount */ - ETHTOOL_A_MM_STAT_HOLD_COUNT, /* u64 */ - - /* add new constants above here */ __ETHTOOL_A_MM_STAT_CNT, ETHTOOL_A_MM_STAT_MAX = (__ETHTOOL_A_MM_STAT_CNT - 1) }; enum { ETHTOOL_A_MM_UNSPEC, - ETHTOOL_A_MM_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MM_PMAC_ENABLED, /* u8 */ - 
ETHTOOL_A_MM_TX_ENABLED, /* u8 */ - ETHTOOL_A_MM_TX_ACTIVE, /* u8 */ - ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, /* u32 */ - ETHTOOL_A_MM_VERIFY_ENABLED, /* u8 */ - ETHTOOL_A_MM_VERIFY_STATUS, /* u8 */ - ETHTOOL_A_MM_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_MAX_VERIFY_TIME, /* u32 */ - ETHTOOL_A_MM_STATS, /* nest - _A_MM_STAT_* */ - - /* add new constants above here */ + ETHTOOL_A_MM_HEADER, + ETHTOOL_A_MM_PMAC_ENABLED, + ETHTOOL_A_MM_TX_ENABLED, + ETHTOOL_A_MM_TX_ACTIVE, + ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, + ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, + ETHTOOL_A_MM_VERIFY_ENABLED, + ETHTOOL_A_MM_VERIFY_STATUS, + ETHTOOL_A_MM_VERIFY_TIME, + ETHTOOL_A_MM_MAX_VERIFY_TIME, + ETHTOOL_A_MM_STATS, + __ETHTOOL_A_MM_CNT, ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; -/* LINKINFO */ - enum { ETHTOOL_A_LINKINFO_UNSPEC, - ETHTOOL_A_LINKINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKINFO_PORT, /* u8 */ - ETHTOOL_A_LINKINFO_PHYADDR, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX, /* u8 */ - ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, /* u8 */ - ETHTOOL_A_LINKINFO_TRANSCEIVER, /* u8 */ - - /* add new constants above here */ + ETHTOOL_A_LINKINFO_HEADER, + ETHTOOL_A_LINKINFO_PORT, + ETHTOOL_A_LINKINFO_PHYADDR, + ETHTOOL_A_LINKINFO_TP_MDIX, + ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, + ETHTOOL_A_LINKINFO_TRANSCEIVER, + __ETHTOOL_A_LINKINFO_CNT, ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 }; -/* LINKMODES */ - enum { ETHTOOL_A_LINKMODES_UNSPEC, - ETHTOOL_A_LINKMODES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKMODES_AUTONEG, /* u8 */ - ETHTOOL_A_LINKMODES_OURS, /* bitset */ - ETHTOOL_A_LINKMODES_PEER, /* bitset */ - ETHTOOL_A_LINKMODES_SPEED, /* u32 */ - ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ - ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ - ETHTOOL_A_LINKMODES_LANES, /* u32 */ - ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ - - /* add new constants above here */ + ETHTOOL_A_LINKMODES_HEADER, + ETHTOOL_A_LINKMODES_AUTONEG, + ETHTOOL_A_LINKMODES_OURS, + ETHTOOL_A_LINKMODES_PEER, + ETHTOOL_A_LINKMODES_SPEED, + ETHTOOL_A_LINKMODES_DUPLEX, + ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + ETHTOOL_A_LINKMODES_LANES, + ETHTOOL_A_LINKMODES_RATE_MATCHING, + __ETHTOOL_A_LINKMODES_CNT, ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 }; -/* LINKSTATE */ - enum { ETHTOOL_A_LINKSTATE_UNSPEC, - ETHTOOL_A_LINKSTATE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_LINKSTATE_LINK, /* u8 */ - ETHTOOL_A_LINKSTATE_SQI, /* u32 */ - ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ - ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ - ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_LINKSTATE_HEADER, + ETHTOOL_A_LINKSTATE_LINK, + ETHTOOL_A_LINKSTATE_SQI, + ETHTOOL_A_LINKSTATE_SQI_MAX, + ETHTOOL_A_LINKSTATE_EXT_STATE, + ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, + __ETHTOOL_A_LINKSTATE_CNT, ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 }; -/* DEBUG */ - enum { ETHTOOL_A_DEBUG_UNSPEC, - ETHTOOL_A_DEBUG_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_DEBUG_MSGMASK, /* bitset */ + ETHTOOL_A_DEBUG_HEADER, + ETHTOOL_A_DEBUG_MSGMASK, - /* add new constants above here */ __ETHTOOL_A_DEBUG_CNT, ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 }; -/* WOL */ - enum { ETHTOOL_A_WOL_UNSPEC, - ETHTOOL_A_WOL_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_WOL_MODES, /* bitset */ - ETHTOOL_A_WOL_SOPASS, /* binary */ + ETHTOOL_A_WOL_HEADER, + 
ETHTOOL_A_WOL_MODES, + ETHTOOL_A_WOL_SOPASS, - /* add new constants above here */ __ETHTOOL_A_WOL_CNT, ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 }; -/* FEATURES */ - enum { ETHTOOL_A_FEATURES_UNSPEC, - ETHTOOL_A_FEATURES_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEATURES_HW, /* bitset */ - ETHTOOL_A_FEATURES_WANTED, /* bitset */ - ETHTOOL_A_FEATURES_ACTIVE, /* bitset */ - ETHTOOL_A_FEATURES_NOCHANGE, /* bitset */ + ETHTOOL_A_FEATURES_HEADER, + ETHTOOL_A_FEATURES_HW, + ETHTOOL_A_FEATURES_WANTED, + ETHTOOL_A_FEATURES_ACTIVE, + ETHTOOL_A_FEATURES_NOCHANGE, - /* add new constants above here */ __ETHTOOL_A_FEATURES_CNT, ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 }; -/* CHANNELS */ - enum { ETHTOOL_A_CHANNELS_UNSPEC, - ETHTOOL_A_CHANNELS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CHANNELS_RX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_TX_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_MAX, /* u32 */ - ETHTOOL_A_CHANNELS_RX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_TX_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_OTHER_COUNT, /* u32 */ - ETHTOOL_A_CHANNELS_COMBINED_COUNT, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_CHANNELS_HEADER, + ETHTOOL_A_CHANNELS_RX_MAX, + ETHTOOL_A_CHANNELS_TX_MAX, + ETHTOOL_A_CHANNELS_OTHER_MAX, + ETHTOOL_A_CHANNELS_COMBINED_MAX, + ETHTOOL_A_CHANNELS_RX_COUNT, + ETHTOOL_A_CHANNELS_TX_COUNT, + ETHTOOL_A_CHANNELS_OTHER_COUNT, + ETHTOOL_A_CHANNELS_COMBINED_COUNT, + __ETHTOOL_A_CHANNELS_CNT, ETHTOOL_A_CHANNELS_MAX = (__ETHTOOL_A_CHANNELS_CNT - 1) }; enum { ETHTOOL_A_IRQ_MODERATION_UNSPEC, - ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ - ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_USEC, + ETHTOOL_A_IRQ_MODERATION_PKTS, + ETHTOOL_A_IRQ_MODERATION_COMPS, __ETHTOOL_A_IRQ_MODERATION_CNT, ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) @@ -332,111 +275,91 @@ enum { enum { ETHTOOL_A_PROFILE_UNSPEC, - /* nest, _A_IRQ_MODERATION_* */ ETHTOOL_A_PROFILE_IRQ_MODERATION, __ETHTOOL_A_PROFILE_CNT, ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) }; -/* COALESCE */ - enum { ETHTOOL_A_COALESCE_UNSPEC, - ETHTOOL_A_COALESCE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_COALESCE_RX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, /* u32 */ - ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, /* u32 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, /* u8 */ - ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, /* u8 */ - ETHTOOL_A_COALESCE_PKT_RATE_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_LOW, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, /* u32 */ - ETHTOOL_A_COALESCE_PKT_RATE_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ - ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ - ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */ - ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */ - /* nest - _A_PROFILE_IRQ_MODERATION */ 
+ ETHTOOL_A_COALESCE_HEADER, + ETHTOOL_A_COALESCE_RX_USECS, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES, + ETHTOOL_A_COALESCE_RX_USECS_IRQ, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ, + ETHTOOL_A_COALESCE_TX_USECS, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES, + ETHTOOL_A_COALESCE_TX_USECS_IRQ, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ, + ETHTOOL_A_COALESCE_STATS_BLOCK_USECS, + ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX, + ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX, + ETHTOOL_A_COALESCE_PKT_RATE_LOW, + ETHTOOL_A_COALESCE_RX_USECS_LOW, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW, + ETHTOOL_A_COALESCE_TX_USECS_LOW, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW, + ETHTOOL_A_COALESCE_PKT_RATE_HIGH, + ETHTOOL_A_COALESCE_RX_USECS_HIGH, + ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH, + ETHTOOL_A_COALESCE_TX_USECS_HIGH, + ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, + ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, + ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, + ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, + ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, ETHTOOL_A_COALESCE_RX_PROFILE, - /* nest - _A_PROFILE_IRQ_MODERATION */ ETHTOOL_A_COALESCE_TX_PROFILE, - /* add new constants above here */ __ETHTOOL_A_COALESCE_CNT, ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -/* PAUSE */ - enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, - ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, - /* add new constants above here - * adjust ETHTOOL_PAUSE_STAT_CNT if adding non-stats! - */ __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; enum { ETHTOOL_A_PAUSE_UNSPEC, - ETHTOOL_A_PAUSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ - ETHTOOL_A_PAUSE_RX, /* u8 */ - ETHTOOL_A_PAUSE_TX, /* u8 */ - ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ - ETHTOOL_A_PAUSE_STATS_SRC, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_PAUSE_HEADER, + ETHTOOL_A_PAUSE_AUTONEG, + ETHTOOL_A_PAUSE_RX, + ETHTOOL_A_PAUSE_TX, + ETHTOOL_A_PAUSE_STATS, + ETHTOOL_A_PAUSE_STATS_SRC, + __ETHTOOL_A_PAUSE_CNT, ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) }; -/* EEE */ - enum { ETHTOOL_A_EEE_UNSPEC, - ETHTOOL_A_EEE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_EEE_MODES_OURS, /* bitset */ - ETHTOOL_A_EEE_MODES_PEER, /* bitset */ - ETHTOOL_A_EEE_ACTIVE, /* u8 */ - ETHTOOL_A_EEE_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_ENABLED, /* u8 */ - ETHTOOL_A_EEE_TX_LPI_TIMER, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_EEE_HEADER, + ETHTOOL_A_EEE_MODES_OURS, + ETHTOOL_A_EEE_MODES_PEER, + ETHTOOL_A_EEE_ACTIVE, + ETHTOOL_A_EEE_ENABLED, + ETHTOOL_A_EEE_TX_LPI_ENABLED, + ETHTOOL_A_EEE_TX_LPI_TIMER, + __ETHTOOL_A_EEE_CNT, ETHTOOL_A_EEE_MAX = (__ETHTOOL_A_EEE_CNT - 1) }; -/* TSINFO */ - enum { ETHTOOL_A_TS_STAT_UNSPEC, + ETHTOOL_A_TS_STAT_TX_PKTS, + ETHTOOL_A_TS_STAT_TX_LOST, + ETHTOOL_A_TS_STAT_TX_ERR, - ETHTOOL_A_TS_STAT_TX_PKTS, /* uint */ - ETHTOOL_A_TS_STAT_TX_LOST, /* uint */ - ETHTOOL_A_TS_STAT_TX_ERR, /* uint */ - - /* add new constants above here */ __ETHTOOL_A_TS_STAT_CNT, ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) @@ -444,23 +367,22 @@ enum { enum { ETHTOOL_A_TSINFO_UNSPEC, - ETHTOOL_A_TSINFO_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TSINFO_TIMESTAMPING, /* bitset */ - ETHTOOL_A_TSINFO_TX_TYPES, /* bitset */ - ETHTOOL_A_TSINFO_RX_FILTERS, /* bitset */ - ETHTOOL_A_TSINFO_PHC_INDEX, /* u32 */ - ETHTOOL_A_TSINFO_STATS, /* nest - _A_TSINFO_STAT */ - - /* add new constants above here */ + ETHTOOL_A_TSINFO_HEADER, + 
ETHTOOL_A_TSINFO_TIMESTAMPING, + ETHTOOL_A_TSINFO_TX_TYPES, + ETHTOOL_A_TSINFO_RX_FILTERS, + ETHTOOL_A_TSINFO_PHC_INDEX, + ETHTOOL_A_TSINFO_STATS, + __ETHTOOL_A_TSINFO_CNT, ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; enum { ETHTOOL_A_CABLE_RESULT_UNSPEC, - ETHTOOL_A_CABLE_RESULT_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_RESULT_CODE, /* u8 ETHTOOL_A_CABLE_RESULT_CODE_ */ - ETHTOOL_A_CABLE_RESULT_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + ETHTOOL_A_CABLE_RESULT_PAIR, + ETHTOOL_A_CABLE_RESULT_CODE, + ETHTOOL_A_CABLE_RESULT_SRC, __ETHTOOL_A_CABLE_RESULT_CNT, ETHTOOL_A_CABLE_RESULT_MAX = (__ETHTOOL_A_CABLE_RESULT_CNT - 1) @@ -468,9 +390,9 @@ enum { enum { ETHTOOL_A_CABLE_FAULT_LENGTH_UNSPEC, - ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, /* u8 ETHTOOL_A_CABLE_PAIR_ */ - ETHTOOL_A_CABLE_FAULT_LENGTH_CM, /* u32 */ - ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, /* u32 ETHTOOL_A_CABLE_INF_SRC_ */ + ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, + ETHTOOL_A_CABLE_FAULT_LENGTH_CM, + ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, __ETHTOOL_A_CABLE_FAULT_LENGTH_CNT, ETHTOOL_A_CABLE_FAULT_LENGTH_MAX = (__ETHTOOL_A_CABLE_FAULT_LENGTH_CNT - 1) @@ -478,245 +400,204 @@ enum { enum { ETHTOOL_A_CABLE_NEST_UNSPEC, - ETHTOOL_A_CABLE_NEST_RESULT, /* nest - ETHTOOL_A_CABLE_RESULT_ */ - ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, /* nest - ETHTOOL_A_CABLE_FAULT_LENGTH_ */ + ETHTOOL_A_CABLE_NEST_RESULT, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH, + __ETHTOOL_A_CABLE_NEST_CNT, ETHTOOL_A_CABLE_NEST_MAX = (__ETHTOOL_A_CABLE_NEST_CNT - 1) }; -/* CABLE TEST */ - enum { ETHTOOL_A_CABLE_TEST_UNSPEC, - ETHTOOL_A_CABLE_TEST_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_CABLE_TEST_HEADER, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_CNT, ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 }; enum { ETHTOOL_A_CABLE_TEST_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_NTF_NEST, /* nest - of results: */ + ETHTOOL_A_CABLE_TEST_NTF_HEADER, + ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_NEST, __ETHTOOL_A_CABLE_TEST_NTF_CNT, ETHTOOL_A_CABLE_TEST_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_NTF_CNT - 1) }; -/* CABLE TEST TDR */ - enum { ETHTOOL_A_CABLE_TEST_TDR_CFG_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, /* u32 */ - ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, /* u8 */ + ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST, + ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST, + ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP, + ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 }; enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, /* nest - ETHTOOL_A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, /* u8 - _STARTED/_COMPLETE */ - ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, /* nest - of results: */ + ETHTOOL_A_CABLE_TEST_TDR_NTF_HEADER, + ETHTOOL_A_CABLE_TEST_TDR_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, - /* add new constants above here */ __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 }; enum { ETHTOOL_A_CABLE_TEST_TDR_UNSPEC, - ETHTOOL_A_CABLE_TEST_TDR_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_CABLE_TEST_TDR_CFG, /* nest - *_TDR_CFG_* */ + ETHTOOL_A_CABLE_TEST_TDR_HEADER, + ETHTOOL_A_CABLE_TEST_TDR_CFG, - /* add new constants above here */ 
__ETHTOOL_A_CABLE_TEST_TDR_CNT, ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 }; enum { ETHTOOL_A_TUNNEL_UDP_ENTRY_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, + ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, - ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, /* be16 */ - ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, /* u32 */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT, ETHTOOL_A_TUNNEL_UDP_ENTRY_MAX = (__ETHTOOL_A_TUNNEL_UDP_ENTRY_CNT - 1) }; enum { ETHTOOL_A_TUNNEL_UDP_TABLE_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, + ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, + ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, - ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, /* u32 */ - ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES, /* bitset */ - ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY, /* nest - _UDP_ENTRY_* */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_UDP_TABLE_CNT, ETHTOOL_A_TUNNEL_UDP_TABLE_MAX = (__ETHTOOL_A_TUNNEL_UDP_TABLE_CNT - 1) }; enum { ETHTOOL_A_TUNNEL_UDP_UNSPEC, + ETHTOOL_A_TUNNEL_UDP_TABLE, - ETHTOOL_A_TUNNEL_UDP_TABLE, /* nest - _UDP_TABLE_* */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_UDP_CNT, ETHTOOL_A_TUNNEL_UDP_MAX = (__ETHTOOL_A_TUNNEL_UDP_CNT - 1) }; enum { ETHTOOL_A_TUNNEL_INFO_UNSPEC, - ETHTOOL_A_TUNNEL_INFO_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TUNNEL_INFO_HEADER, + ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, - ETHTOOL_A_TUNNEL_INFO_UDP_PORTS, /* nest - _UDP_TABLE */ - - /* add new constants above here */ __ETHTOOL_A_TUNNEL_INFO_CNT, ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; -/* FEC */ - enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, + ETHTOOL_A_FEC_STAT_CORRECTED, + ETHTOOL_A_FEC_STAT_UNCORR, + ETHTOOL_A_FEC_STAT_CORR_BITS, - ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ - ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ - ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ - - /* add new constants above here */ __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) }; enum { ETHTOOL_A_FEC_UNSPEC, - ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_FEC_MODES, /* bitset */ - ETHTOOL_A_FEC_AUTO, /* u8 */ - ETHTOOL_A_FEC_ACTIVE, /* u32 */ - ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ + ETHTOOL_A_FEC_HEADER, + ETHTOOL_A_FEC_MODES, + ETHTOOL_A_FEC_AUTO, + ETHTOOL_A_FEC_ACTIVE, + ETHTOOL_A_FEC_STATS, __ETHTOOL_A_FEC_CNT, ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) }; -/* MODULE EEPROM */ - enum { ETHTOOL_A_MODULE_EEPROM_UNSPEC, - ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ - - ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ - ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ - ETHTOOL_A_MODULE_EEPROM_DATA, /* binary */ + ETHTOOL_A_MODULE_EEPROM_HEADER, + ETHTOOL_A_MODULE_EEPROM_OFFSET, + ETHTOOL_A_MODULE_EEPROM_LENGTH, + ETHTOOL_A_MODULE_EEPROM_PAGE, + ETHTOOL_A_MODULE_EEPROM_BANK, + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, + ETHTOOL_A_MODULE_EEPROM_DATA, __ETHTOOL_A_MODULE_EEPROM_CNT, ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) }; - enum { ETHTOOL_A_STATS_GRP_UNSPEC, ETHTOOL_A_STATS_GRP_PAD, + ETHTOOL_A_STATS_GRP_ID, + ETHTOOL_A_STATS_GRP_SS_ID, + ETHTOOL_A_STATS_GRP_STAT, + ETHTOOL_A_STATS_GRP_HIST_RX, + ETHTOOL_A_STATS_GRP_HIST_TX, + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, + ETHTOOL_A_STATS_GRP_HIST_VAL, - ETHTOOL_A_STATS_GRP_ID, /* u32 */ - ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ - - ETHTOOL_A_STATS_GRP_STAT, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ - 
ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ - - ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ - ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ - - /* add new constants above here */ __ETHTOOL_A_STATS_GRP_CNT, ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_GRP_CNT - 1) }; -/* STATS */ - enum { ETHTOOL_A_STATS_UNSPEC, ETHTOOL_A_STATS_PAD, - ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_STATS_GROUPS, /* bitset */ + ETHTOOL_A_STATS_HEADER, + ETHTOOL_A_STATS_GROUPS, + ETHTOOL_A_STATS_GRP, + ETHTOOL_A_STATS_SRC, - ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ - - ETHTOOL_A_STATS_SRC, /* u32 */ - - /* add new constants above here */ __ETHTOOL_A_STATS_CNT, ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) }; -/* PHC VCLOCKS */ - enum { ETHTOOL_A_PHC_VCLOCKS_UNSPEC, - ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ - ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ + ETHTOOL_A_PHC_VCLOCKS_HEADER, + ETHTOOL_A_PHC_VCLOCKS_NUM, + ETHTOOL_A_PHC_VCLOCKS_INDEX, - /* add new constants above here */ __ETHTOOL_A_PHC_VCLOCKS_CNT, ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) }; -/* MODULE */ - enum { ETHTOOL_A_MODULE_UNSPEC, - ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ - ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ + ETHTOOL_A_MODULE_HEADER, + ETHTOOL_A_MODULE_POWER_MODE_POLICY, + ETHTOOL_A_MODULE_POWER_MODE, - /* add new constants above here */ __ETHTOOL_A_MODULE_CNT, ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) }; -/* Power Sourcing Equipment */ enum { ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, - ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, /* u32 */ + ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, + ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, }; enum { ETHTOOL_A_PSE_UNSPEC, - ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_ADMIN_CONTROL, /* u32 */ - ETHTOOL_A_C33_PSE_PW_D_STATUS, /* u32 */ - ETHTOOL_A_C33_PSE_PW_CLASS, /* u32 */ - ETHTOOL_A_C33_PSE_ACTUAL_PW, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_STATE, /* u32 */ - ETHTOOL_A_C33_PSE_EXT_SUBSTATE, /* u32 */ - ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, /* u32 */ - ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, /* nest - _C33_PSE_PW_LIMIT_* */ - - /* add new constants above here */ + ETHTOOL_A_PSE_HEADER, + ETHTOOL_A_PODL_PSE_ADMIN_STATE, + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, + ETHTOOL_A_PODL_PSE_PW_D_STATUS, + ETHTOOL_A_C33_PSE_ADMIN_STATE, + ETHTOOL_A_C33_PSE_ADMIN_CONTROL, + ETHTOOL_A_C33_PSE_PW_D_STATUS, + ETHTOOL_A_C33_PSE_PW_CLASS, + ETHTOOL_A_C33_PSE_ACTUAL_PW, + ETHTOOL_A_C33_PSE_EXT_STATE, + ETHTOOL_A_C33_PSE_EXT_SUBSTATE, + ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, + ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, + __ETHTOOL_A_PSE_CNT, ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; @@ -724,70 +605,62 @@ enum { enum { ETHTOOL_A_RSS_UNSPEC, ETHTOOL_A_RSS_HEADER, - ETHTOOL_A_RSS_CONTEXT, /* u32 */ - ETHTOOL_A_RSS_HFUNC, /* u32 */ - ETHTOOL_A_RSS_INDIR, /* binary */ - ETHTOOL_A_RSS_HKEY, /* binary */ - ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ - ETHTOOL_A_RSS_START_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_CONTEXT, + ETHTOOL_A_RSS_HFUNC, + ETHTOOL_A_RSS_INDIR, + ETHTOOL_A_RSS_HKEY, + ETHTOOL_A_RSS_INPUT_XFRM, + ETHTOOL_A_RSS_START_CONTEXT, __ETHTOOL_A_RSS_CNT, ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), }; -/* PLCA */ - enum { ETHTOOL_A_PLCA_UNSPEC, - 
ETHTOOL_A_PLCA_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PLCA_VERSION, /* u16 */ - ETHTOOL_A_PLCA_ENABLED, /* u8 */ - ETHTOOL_A_PLCA_STATUS, /* u8 */ - ETHTOOL_A_PLCA_NODE_CNT, /* u32 */ - ETHTOOL_A_PLCA_NODE_ID, /* u32 */ - ETHTOOL_A_PLCA_TO_TMR, /* u32 */ - ETHTOOL_A_PLCA_BURST_CNT, /* u32 */ - ETHTOOL_A_PLCA_BURST_TMR, /* u32 */ - - /* add new constants above here */ + ETHTOOL_A_PLCA_HEADER, + ETHTOOL_A_PLCA_VERSION, + ETHTOOL_A_PLCA_ENABLED, + ETHTOOL_A_PLCA_STATUS, + ETHTOOL_A_PLCA_NODE_CNT, + ETHTOOL_A_PLCA_NODE_ID, + ETHTOOL_A_PLCA_TO_TMR, + ETHTOOL_A_PLCA_BURST_CNT, + ETHTOOL_A_PLCA_BURST_TMR, + __ETHTOOL_A_PLCA_CNT, ETHTOOL_A_PLCA_MAX = (__ETHTOOL_A_PLCA_CNT - 1) }; -/* MODULE_FW_FLASH */ - enum { ETHTOOL_A_MODULE_FW_FLASH_UNSPEC, - ETHTOOL_A_MODULE_FW_FLASH_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS, /* u32 */ - ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, /* string */ - ETHTOOL_A_MODULE_FW_FLASH_DONE, /* uint */ - ETHTOOL_A_MODULE_FW_FLASH_TOTAL, /* uint */ - - /* add new constants above here */ + ETHTOOL_A_MODULE_FW_FLASH_HEADER, + ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME, + ETHTOOL_A_MODULE_FW_FLASH_PASSWORD, + ETHTOOL_A_MODULE_FW_FLASH_STATUS, + ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, + ETHTOOL_A_MODULE_FW_FLASH_DONE, + ETHTOOL_A_MODULE_FW_FLASH_TOTAL, + __ETHTOOL_A_MODULE_FW_FLASH_CNT, ETHTOOL_A_MODULE_FW_FLASH_MAX = (__ETHTOOL_A_MODULE_FW_FLASH_CNT - 1) }; enum { ETHTOOL_A_PHY_UNSPEC, - ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHY_INDEX, /* u32 */ - ETHTOOL_A_PHY_DRVNAME, /* string */ - ETHTOOL_A_PHY_NAME, /* string */ - ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ - ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ - - /* add new constants above here */ + ETHTOOL_A_PHY_HEADER, + ETHTOOL_A_PHY_INDEX, + ETHTOOL_A_PHY_DRVNAME, + ETHTOOL_A_PHY_NAME, + ETHTOOL_A_PHY_UPSTREAM_TYPE, + ETHTOOL_A_PHY_UPSTREAM_INDEX, + ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + __ETHTOOL_A_PHY_CNT, ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) }; -/* message types - userspace to kernel */ enum { ETHTOOL_MSG_USER_NONE, ETHTOOL_MSG_STRSET_GET, @@ -836,12 +709,10 @@ enum { ETHTOOL_MSG_MODULE_FW_FLASH_ACT, ETHTOOL_MSG_PHY_GET, - /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 }; -/* message types - kernel to userspace */ enum { ETHTOOL_MSG_KERNEL_NONE, ETHTOOL_MSG_STRSET_GET_REPLY, @@ -891,7 +762,6 @@ enum { ETHTOOL_MSG_PHY_GET_REPLY, ETHTOOL_MSG_PHY_NTF, - /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 }; -- cgit v1.2.3 From 8d0580c6ebdd27879c83483f53bc71e2e470f6fe Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 4 Dec 2024 07:55:49 -0800 Subject: ethtool: regenerate uapi header from the spec No functional changes. 
Mostly the following formatting: - extra docs - extra enums - XXX_MAX = __XXX_CNT - 1 -> XXX_MAX = (__XXX_CNT - 1) - newlines Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20241204155549.641348-9-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 89 ++++++++++++++++---------- 1 file changed, 56 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 35a24d490efe..b58f352fe4f2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -1,23 +1,43 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ethtool.yaml */ +/* YNL-GEN uapi header */ + #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H +#define ETHTOOL_FAMILY_NAME "ethtool" +#define ETHTOOL_FAMILY_VERSION 1 + enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, ETHTOOL_UDP_TUNNEL_TYPE_GENEVE, ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE, - __ETHTOOL_UDP_TUNNEL_TYPE_CNT + /* private: */ + __ETHTOOL_UDP_TUNNEL_TYPE_CNT, + ETHTOOL_UDP_TUNNEL_TYPE_MAX = (__ETHTOOL_UDP_TUNNEL_TYPE_CNT - 1) }; +/** + * enum ethtool_header_flags - common ethtool header flags + * @ETHTOOL_FLAG_COMPACT_BITSETS: use compact bitsets in reply + * @ETHTOOL_FLAG_OMIT_REPLY: provide optional reply for SET or ACT requests + * @ETHTOOL_FLAG_STATS: request statistics, if supported by the driver + */ enum ethtool_header_flags { - ETHTOOL_FLAG_COMPACT_BITSETS = 1 << 0, /* use compact bitsets in reply */ - ETHTOOL_FLAG_OMIT_REPLY = 1 << 1, /* provide optional reply for SET or ACT requests */ - ETHTOOL_FLAG_STATS = 1 << 2, /* request statistics, if supported by the driver */ + ETHTOOL_FLAG_COMPACT_BITSETS = 1, + ETHTOOL_FLAG_OMIT_REPLY = 2, + ETHTOOL_FLAG_STATS = 4, }; enum { - ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_PHY_UPSTREAM_TYPE_MAC, + ETHTOOL_PHY_UPSTREAM_TYPE_PHY, +}; + +enum ethtool_tcp_data_split { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN, ETHTOOL_TCP_DATA_SPLIT_DISABLED, ETHTOOL_TCP_DATA_SPLIT_ENABLED, }; @@ -30,7 +50,7 @@ enum { ETHTOOL_A_HEADER_PHY_INDEX, __ETHTOOL_A_HEADER_CNT, - ETHTOOL_A_HEADER_MAX = __ETHTOOL_A_HEADER_CNT - 1 + ETHTOOL_A_HEADER_MAX = (__ETHTOOL_A_HEADER_CNT - 1) }; enum { @@ -40,7 +60,7 @@ enum { ETHTOOL_A_BITSET_BIT_VALUE, __ETHTOOL_A_BITSET_BIT_CNT, - ETHTOOL_A_BITSET_BIT_MAX = __ETHTOOL_A_BITSET_BIT_CNT - 1 + ETHTOOL_A_BITSET_BIT_MAX = (__ETHTOOL_A_BITSET_BIT_CNT - 1) }; enum { @@ -48,7 +68,7 @@ enum { ETHTOOL_A_BITSET_BITS_BIT, __ETHTOOL_A_BITSET_BITS_CNT, - ETHTOOL_A_BITSET_BITS_MAX = __ETHTOOL_A_BITSET_BITS_CNT - 1 + ETHTOOL_A_BITSET_BITS_MAX = (__ETHTOOL_A_BITSET_BITS_CNT - 1) }; enum { @@ -60,7 +80,7 @@ enum { ETHTOOL_A_BITSET_MASK, __ETHTOOL_A_BITSET_CNT, - ETHTOOL_A_BITSET_MAX = __ETHTOOL_A_BITSET_CNT - 1 + ETHTOOL_A_BITSET_MAX = (__ETHTOOL_A_BITSET_CNT - 1) }; enum { @@ -69,7 +89,7 @@ enum { ETHTOOL_A_STRING_VALUE, __ETHTOOL_A_STRING_CNT, - ETHTOOL_A_STRING_MAX = __ETHTOOL_A_STRING_CNT - 1 + ETHTOOL_A_STRING_MAX = (__ETHTOOL_A_STRING_CNT - 1) }; enum { @@ -77,7 +97,7 @@ enum { ETHTOOL_A_STRINGS_STRING, __ETHTOOL_A_STRINGS_CNT, - ETHTOOL_A_STRINGS_MAX = __ETHTOOL_A_STRINGS_CNT - 1 + ETHTOOL_A_STRINGS_MAX = (__ETHTOOL_A_STRINGS_CNT - 1) }; enum { @@ -87,7 +107,7 @@ enum { ETHTOOL_A_STRINGSET_STRINGS, __ETHTOOL_A_STRINGSET_CNT, - ETHTOOL_A_STRINGSET_MAX = 
__ETHTOOL_A_STRINGSET_CNT - 1 + ETHTOOL_A_STRINGSET_MAX = (__ETHTOOL_A_STRINGSET_CNT - 1) }; enum { @@ -95,7 +115,7 @@ enum { ETHTOOL_A_STRINGSETS_STRINGSET, __ETHTOOL_A_STRINGSETS_CNT, - ETHTOOL_A_STRINGSETS_MAX = __ETHTOOL_A_STRINGSETS_CNT - 1 + ETHTOOL_A_STRINGSETS_MAX = (__ETHTOOL_A_STRINGSETS_CNT - 1) }; enum { @@ -105,7 +125,7 @@ enum { ETHTOOL_A_STRSET_COUNTS_ONLY, __ETHTOOL_A_STRSET_CNT, - ETHTOOL_A_STRSET_MAX = __ETHTOOL_A_STRSET_CNT - 1 + ETHTOOL_A_STRSET_MAX = (__ETHTOOL_A_STRSET_CNT - 1) }; enum { @@ -114,7 +134,7 @@ enum { ETHTOOL_A_PRIVFLAGS_FLAGS, __ETHTOOL_A_PRIVFLAGS_CNT, - ETHTOOL_A_PRIVFLAGS_MAX = __ETHTOOL_A_PRIVFLAGS_CNT - 1 + ETHTOOL_A_PRIVFLAGS_MAX = (__ETHTOOL_A_PRIVFLAGS_CNT - 1) }; enum { @@ -182,7 +202,7 @@ enum { ETHTOOL_A_LINKINFO_TRANSCEIVER, __ETHTOOL_A_LINKINFO_CNT, - ETHTOOL_A_LINKINFO_MAX = __ETHTOOL_A_LINKINFO_CNT - 1 + ETHTOOL_A_LINKINFO_MAX = (__ETHTOOL_A_LINKINFO_CNT - 1) }; enum { @@ -199,7 +219,7 @@ enum { ETHTOOL_A_LINKMODES_RATE_MATCHING, __ETHTOOL_A_LINKMODES_CNT, - ETHTOOL_A_LINKMODES_MAX = __ETHTOOL_A_LINKMODES_CNT - 1 + ETHTOOL_A_LINKMODES_MAX = (__ETHTOOL_A_LINKMODES_CNT - 1) }; enum { @@ -213,7 +233,7 @@ enum { ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, __ETHTOOL_A_LINKSTATE_CNT, - ETHTOOL_A_LINKSTATE_MAX = __ETHTOOL_A_LINKSTATE_CNT - 1 + ETHTOOL_A_LINKSTATE_MAX = (__ETHTOOL_A_LINKSTATE_CNT - 1) }; enum { @@ -222,7 +242,7 @@ enum { ETHTOOL_A_DEBUG_MSGMASK, __ETHTOOL_A_DEBUG_CNT, - ETHTOOL_A_DEBUG_MAX = __ETHTOOL_A_DEBUG_CNT - 1 + ETHTOOL_A_DEBUG_MAX = (__ETHTOOL_A_DEBUG_CNT - 1) }; enum { @@ -232,7 +252,7 @@ enum { ETHTOOL_A_WOL_SOPASS, __ETHTOOL_A_WOL_CNT, - ETHTOOL_A_WOL_MAX = __ETHTOOL_A_WOL_CNT - 1 + ETHTOOL_A_WOL_MAX = (__ETHTOOL_A_WOL_CNT - 1) }; enum { @@ -244,7 +264,7 @@ enum { ETHTOOL_A_FEATURES_NOCHANGE, __ETHTOOL_A_FEATURES_CNT, - ETHTOOL_A_FEATURES_MAX = __ETHTOOL_A_FEATURES_CNT - 1 + ETHTOOL_A_FEATURES_MAX = (__ETHTOOL_A_FEATURES_CNT - 1) }; enum { @@ -276,6 +296,7 @@ enum { enum { ETHTOOL_A_PROFILE_UNSPEC, ETHTOOL_A_PROFILE_IRQ_MODERATION, + __ETHTOOL_A_PROFILE_CNT, ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) }; @@ -362,7 +383,6 @@ enum { __ETHTOOL_A_TS_STAT_CNT, ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) - }; enum { @@ -412,7 +432,7 @@ enum { ETHTOOL_A_CABLE_TEST_HEADER, __ETHTOOL_A_CABLE_TEST_CNT, - ETHTOOL_A_CABLE_TEST_MAX = __ETHTOOL_A_CABLE_TEST_CNT - 1 + ETHTOOL_A_CABLE_TEST_MAX = (__ETHTOOL_A_CABLE_TEST_CNT - 1) }; enum { @@ -433,7 +453,7 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR, __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT, - ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1 + ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_CFG_CNT - 1) }; enum { @@ -443,7 +463,7 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_NTF_NEST, __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT, - ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = __ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1 + ETHTOOL_A_CABLE_TEST_TDR_NTF_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_NTF_CNT - 1) }; enum { @@ -452,7 +472,7 @@ enum { ETHTOOL_A_CABLE_TEST_TDR_CFG, __ETHTOOL_A_CABLE_TEST_TDR_CNT, - ETHTOOL_A_CABLE_TEST_TDR_MAX = __ETHTOOL_A_CABLE_TEST_TDR_CNT - 1 + ETHTOOL_A_CABLE_TEST_TDR_MAX = (__ETHTOOL_A_CABLE_TEST_TDR_CNT - 1) }; enum { @@ -580,6 +600,9 @@ enum { ETHTOOL_A_C33_PSE_PW_LIMIT_UNSPEC, ETHTOOL_A_C33_PSE_PW_LIMIT_MIN, ETHTOOL_A_C33_PSE_PW_LIMIT_MAX, + + __ETHTOOL_A_C33_PSE_PW_LIMIT_CNT, + __ETHTOOL_A_C33_PSE_PW_LIMIT_MAX = (__ETHTOOL_A_C33_PSE_PW_LIMIT_CNT - 1) }; enum { @@ -613,7 +636,7 @@ enum { ETHTOOL_A_RSS_START_CONTEXT, __ETHTOOL_A_RSS_CNT, - ETHTOOL_A_RSS_MAX = 
(__ETHTOOL_A_RSS_CNT - 1), + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1) }; enum { @@ -662,8 +685,8 @@ enum { }; enum { - ETHTOOL_MSG_USER_NONE, - ETHTOOL_MSG_STRSET_GET, + ETHTOOL_MSG_USER_NONE = 0, + ETHTOOL_MSG_STRSET_GET = 1, ETHTOOL_MSG_LINKINFO_GET, ETHTOOL_MSG_LINKINFO_SET, ETHTOOL_MSG_LINKMODES_GET, @@ -710,12 +733,12 @@ enum { ETHTOOL_MSG_PHY_GET, __ETHTOOL_MSG_USER_CNT, - ETHTOOL_MSG_USER_MAX = __ETHTOOL_MSG_USER_CNT - 1 + ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) }; enum { - ETHTOOL_MSG_KERNEL_NONE, - ETHTOOL_MSG_STRSET_GET_REPLY, + ETHTOOL_MSG_KERNEL_NONE = 0, + ETHTOOL_MSG_STRSET_GET_REPLY = 1, ETHTOOL_MSG_LINKINFO_GET_REPLY, ETHTOOL_MSG_LINKINFO_NTF, ETHTOOL_MSG_LINKMODES_GET_REPLY, @@ -763,7 +786,7 @@ enum { ETHTOOL_MSG_PHY_NTF, __ETHTOOL_MSG_KERNEL_CNT, - ETHTOOL_MSG_KERNEL_MAX = __ETHTOOL_MSG_KERNEL_CNT - 1 + ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) }; #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ -- cgit v1.2.3 From 5765c7f6e3173eb894889a29963a497aeb721c5e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Dec 2024 17:19:50 +0000 Subject: net_sched: sch_fq: add three drop_reason MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three new drop_reason, more precise than generic QDISC_DROP: "tc -s qd" show aggregate counters, it might be more useful to use drop_reason infrastructure for bug hunting. 1) SKB_DROP_REASON_FQ_BAND_LIMIT Whenever a packet is added while its band limit is hit. Corresponding value in "tc -s qd" is bandX_drops XXXX 2) SKB_DROP_REASON_FQ_HORIZON_LIMIT Whenever a packet has a timestamp too far in the future. Corresponding value in "tc -s qd" is horizon_drops XXXX 3) SKB_DROP_REASON_FQ_FLOW_LIMIT Whenever a flow has reached its limit. 
Corresponding value in "tc -s qd" is flows_plimit XXXX Tested: tc qd replace dev eth1 root fq flow_limit 10 limit 100000 perf record -a -e skb:kfree_skb sleep 1; perf script udp_stream 12329 [004] 216.929492: skb:kfree_skb: skbaddr=0xffff888eabe17e00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12385 [006] 216.929593: skb:kfree_skb: skbaddr=0xffff888ef8827f00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12389 [005] 216.929871: skb:kfree_skb: skbaddr=0xffff888ecb9ba500 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12316 [009] 216.930398: skb:kfree_skb: skbaddr=0xffff888eca286b00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT udp_stream 12400 [008] 216.930490: skb:kfree_skb: skbaddr=0xffff888eabf93d00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_FLOW_LIMIT tc qd replace dev eth1 root fq flow_limit 100 limit 10000 perf record -a -e skb:kfree_skb sleep 1; perf script udp_stream 18074 [001] 1058.318040: skb:kfree_skb: skbaddr=0xffffa23c881fc000 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 18126 [005] 1058.320651: skb:kfree_skb: skbaddr=0xffffa23c6aad4000 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 18118 [006] 1058.321065: skb:kfree_skb: skbaddr=0xffffa23df0d48a00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 18074 [001] 1058.321126: skb:kfree_skb: skbaddr=0xffffa23c881ffa00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT udp_stream 15815 [003] 1058.321224: skb:kfree_skb: skbaddr=0xffffa23c9835db00 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x9d9 reason: FQ_BAND_LIMIT tc -s -d qd sh dev eth1 qdisc fq 8023: root refcnt 257 limit 10000p flow_limit 100p buckets 1024 orphan_mask 1023 bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536 quantum 18Kb initial_quantum 92120b low_rate_threshold 550Kbit refill_delay 40ms timer_slack 10us horizon 10s horizon_drop Sent 492439603330 bytes 336953991 pkt (dropped 61724094, overlimits 0 requeues 4463) backlog 14611228b 9995p requeues 4463 flows 2965 (inactive 1151 throttled 0) band0_pkts 0 band1_pkts 9993 band2_pkts 0 gc 6347 highprio 0 fastpath 30 throttled 5 latency 2.32us flows_plimit 7403693 band1_drops 54320401 Signed-off-by: Eric Dumazet Reviewed-by: Victor Nogueira Reviewed-by: Toke Høiland-Jørgensen Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20241204171950.89829-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 18 ++++++++++++++++++ include/net/sch_generic.h | 8 ++++++++ 2 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 6c5a1ea209a2..c29282fabae6 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -58,6 +58,9 @@ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(FQ_BAND_LIMIT) \ + FN(FQ_HORIZON_LIMIT) \ + FN(FQ_FLOW_LIMIT) \ FN(CPU_BACKLOG) \ FN(XDP) \ FN(TC_INGRESS) \ @@ -311,6 +314,21 @@ enum skb_drop_reason { * failed to enqueue to current qdisc) */ SKB_DROP_REASON_QDISC_DROP, + /** + * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band + * limit is reached. 
+ */ + SKB_DROP_REASON_FQ_BAND_LIMIT, + /** + * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet + * timestamp is too far in the future. + */ + SKB_DROP_REASON_FQ_HORIZON_LIMIT, + /** + * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow + * exceeds its limits. + */ + SKB_DROP_REASON_FQ_FLOW_LIMIT, /** * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU * backlog queue. This can be caused by backlog queue full (see diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 5d74fa7e694c..8074322dd636 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1245,6 +1245,14 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_DROP; } +static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free, + enum skb_drop_reason reason) +{ + tcf_set_drop_reason(skb, reason); + return qdisc_drop(skb, sch, to_free); +} + static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { -- cgit v1.2.3 From ca5c94949facce1f67a4a9a9528a27f635ff3e78 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:24 +0100 Subject: xsk: align &xdp_buff_xsk harder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the series "XSk buff on a diet" by Maciej, the greatest pow-2 which &xdp_buff_xsk can be divided got reduced from 16 to 8 on x86_64. Also, sizeof(xdp_buff_xsk) now is 120 bytes, which, taking the previous sentence into account, leads to that it leaves 8 bytes at the end of cacheline, which means an array of buffs will have its elements messed between the cachelines chaotically. Use __aligned_largest for this struct. This alignment is usually 16 bytes, which makes it fill two full cachelines and align an array nicely. ___cacheline_aligned may be excessive here, especially on arches with 128-256 byte CLs, as well as 32-bit arches (76 -> 96 bytes on MIPS32R2), while not doing better than _largest. Signed-off-by: Alexander Lobakin Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241203173733.3181246-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xsk_buff_pool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bb03cee716b3..7637799b6c19 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; struct list_head list_node; -}; +} __aligned_largest; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) #define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) -- cgit v1.2.3 From 7cd1107f48e2a246c6a628c2381e1b8aafa4675a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:25 +0100 Subject: bpf, xdp: constify some bpf_prog * function arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In lots of places, bpf_prog pointer is used only for tracing or other stuff that doesn't modify the structure itself. Same for net_device. Address at least some of them and add `const` attributes there. The object code didn't change, but that may prevent unwanted data modifications and also allow more helpers to have const arguments. 
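[Editorial illustration] A hedged sketch of the pattern this series applies; my_handle_unknown_act() is a made-up driver helper, not something added by the patch, but it uses bpf_warn_invalid_xdp_action() with the constified signature introduced below, so the observational caller can be const end to end:

#include <linux/filter.h>

static void my_handle_unknown_act(const struct net_device *dev,
				  const struct bpf_prog *prog, u32 act)
{
	/*
	 * Would sit in the "default:" arm of a driver's switch over the XDP
	 * verdict; both objects are only read, so both pointers can be const.
	 */
	bpf_warn_invalid_xdp_action(dev, prog, act);
}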
Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Signed-off-by: Jakub Kicinski --- include/linux/bpf.h | 12 ++++++------ include/linux/filter.h | 9 +++++---- include/linux/netdevice.h | 6 +++--- include/linux/skbuff.h | 2 +- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c..ec3acb16359e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2591,10 +2591,10 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog); + const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress); + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -2864,15 +2864,15 @@ struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { return 0; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 3a21947f2fd4..9a5d23ae3855 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1179,17 +1179,18 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, * This does not appear to be a real limitation for existing software. 
*/ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *prog); + struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *prog); + const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, - struct bpf_prog *prog); + const struct bpf_prog *prog); void xdp_do_flush(void); -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ecc686409161..ecca21387a68 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3958,9 +3958,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog); -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); + const struct bpf_prog *xdp_prog); +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 58009fa66102..95452d1a07fc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3627,7 +3627,7 @@ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog); + const struct bpf_prog *prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment -- cgit v1.2.3 From dcf3827cde8621d2317a7f98e069adbdc2112982 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:26 +0100 Subject: xdp, xsk: constify read-only arguments of some static inline helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lots of read-only helpers for &xdp_buff and &xdp_frame, such as getting the frame length, skb_shared_info etc., don't have their arguments marked with `const` for no reason. Add the missing annotations to leave less place for mistakes and more for optimization. 
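[Editorial illustration] A small sketch of the kind of read-only caller this enables; my_log_frame() is hypothetical, but every helper it calls is one of the accessors constified in the hunks below, so it compiles cleanly against a const frame:

#include <linux/printk.h>
#include <net/xdp.h>

static void my_log_frame(const struct xdp_frame *xdpf)
{
	/* Every helper used here only inspects the frame. */
	pr_debug("xdp_frame len=%u has_frags=%d frag_pfmemalloc=%d\n",
		 xdp_get_frame_len(xdpf),
		 xdp_frame_has_frags(xdpf),
		 xdp_frame_is_frag_pfmemalloc(xdpf));
}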
Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 29 +++++++++++++++++------------ include/net/xdp_sock_drv.h | 11 ++++++----- include/net/xsk_buff_pool.h | 2 +- 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c91..197808df1ee1 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -88,7 +88,7 @@ struct xdp_buff { u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } @@ -103,7 +103,8 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline bool +xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -144,15 +145,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * -xdp_get_shared_info_from_buff(struct xdp_buff *xdp) +xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } -static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +static __always_inline unsigned int +xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; @@ -177,12 +179,13 @@ struct xdp_frame { u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +static __always_inline bool +xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -201,7 +204,7 @@ static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) } static inline struct skb_shared_info * -xdp_get_shared_info_from_frame(struct xdp_frame *frame) +xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); @@ -249,7 +252,8 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline -void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +void xdp_convert_frame_to_buff(const struct xdp_frame *frame, + struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; @@ -260,7 +264,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) } static inline -int xdp_update_frame_from_buff(struct xdp_buff *xdp, +int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; @@ -317,9 +321,10 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, 
struct xdp_frame_bulk *bq); -static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +static __always_inline unsigned int +xdp_get_frame_len(const struct xdp_frame *xdpf) { - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd9160..f3175a5d28f7 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -101,7 +101,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } @@ -143,7 +143,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) list_add_tail(&frag->list_node, &frag->pool->xskb_list); } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; @@ -200,7 +200,8 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) XDP_TXMD_FLAGS_CHECKSUM | \ 0) -static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +static inline bool +xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } @@ -337,7 +338,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } @@ -360,7 +361,7 @@ static inline void xsk_buff_add_frag(struct xdp_buff *xdp) { } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 7637799b6c19..50779406bc2d 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -183,7 +183,7 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } -static inline bool xp_mb_desc(struct xdp_desc *desc) +static inline bool xp_mb_desc(const struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } -- cgit v1.2.3 From f65966fe0178c06065d354c22fb456fc4370b527 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:27 +0100 Subject: xdp: allow attaching already registered memory model to xdp_rxq_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One may need to register memory model separately from xdp_rxq_info. One simple example may be XDP test run code, but in general, it might be useful when memory model registering is managed by one layer and then XDP RxQ info by a different one. Allow such scenarios by adding a simple helper which "attaches" already registered memory model to the desired xdp_rxq_info. As this is mostly needed for Page Pool, add a special function to do that for a &page_pool pointer. 
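[Editorial illustration] A minimal driver-side sketch of the intended call order, assuming a made-up my_rxq structure and with error handling trimmed to the essentials; the xdp_reg_page_pool() / xdp_rxq_info_attach_page_pool() helpers are the ones declared in the hunk below:

#include <net/xdp.h>

struct my_rxq {
	struct xdp_rxq_info xdp_rxq;
};

static int my_rxq_init(struct my_rxq *rq, struct page_pool *pp,
		       struct net_device *dev, u32 qidx)
{
	int err;

	/* Register the pool as an XDP memory model once, up front. */
	err = xdp_reg_page_pool(pp);
	if (err)
		return err;

	err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, qidx, 0);
	if (err) {
		xdp_unreg_page_pool(pp);
		return err;
	}

	/* Attach the already-registered pool instead of reg_mem_model(). */
	xdp_rxq_info_attach_page_pool(&rq->xdp_rxq, pp);

	return 0;
}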
Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 197808df1ee1..1253fe21ede7 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -356,6 +356,38 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); +int xdp_reg_page_pool(struct page_pool *pool); +void xdp_unreg_page_pool(const struct page_pool *pool); +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool); + +/** + * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info + * @xdp_rxq: XDP RxQ info to attach the memory info to + * @mem: already registered memory info + * + * If the driver registers its memory providers manually, it must use this + * function instead of xdp_rxq_info_reg_mem_model(). + */ +static inline void +xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, + const struct xdp_mem_info *mem) +{ + xdp_rxq->mem = *mem; +} + +/** + * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info + * @xdp_rxq: XDP RxQ info to detach the memory info from + * + * If the driver registers its memory providers manually and then attaches it + * via xdp_rxq_info_attach_mem_model(), it must call this function before + * xdp_rxq_info_unreg(). + */ +static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->mem = (struct xdp_mem_info){ }; +} /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. -- cgit v1.2.3 From e77d9aee951341119be16a991fcfc76d1154d22a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 3 Dec 2024 18:37:29 +0100 Subject: xdp: register system page pool as an XDP memory model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To make the system page pool usable as a source for allocating XDP frames, we need to register it with xdp_reg_mem_model(), so that page return works correctly. This is done in preparation for using the system page_pool to convert XDP_PASS XSk frames to skbs; for the same reason, make the per-cpu variable non-static so we can access it from other source files as well (but w/o exporting). 
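Once the per-cpu variable is visible to other core networking files (declaration below), a caller could grab the local CPU's pool like this; a sketch only, assuming it runs in a context where migration is not possible (e.g. NAPI poll / softirq):

	struct page_pool *pool = this_cpu_read(system_page_pool);
	struct page *page;

	if (pool)
		page = page_pool_dev_alloc_pages(pool);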
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241203173733.3181246-7-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ecca21387a68..d1a8d98b132c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3322,6 +3322,7 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DECLARE_PER_CPU(struct page_pool *, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) -- cgit v1.2.3 From 9bd9f72a74344b54cfb6fcabf1173e6c6e5c6952 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:30 +0100 Subject: netmem: add a couple of page helper wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the following netmem counterparts: * virt_to_netmem() -- simple page_to_netmem(virt_to_page()) wrapper; * netmem_is_pfmemalloc() -- page_is_pfmemalloc() for page-backed netmems, false otherwise; and the following "unsafe" versions: * __netmem_to_page() * __netmem_get_pp() * __netmem_address() They do the same as their non-underscored buddies, but assume the netmem is always page-backed. When working with header &page_pools, you don't need to check whether netmem belongs to the host memory and you can never get NULL instead of &page. Checks for the LSB, clearing the LSB, branches take cycles and increase object code size, sometimes significantly. When you're sure your PP is always host, you can avoid this by using the underscored counterparts. Signed-off-by: Alexander Lobakin Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241203173733.3181246-8-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index 8a6e20be4b9d..1b58faa4f20f 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -72,6 +72,22 @@ static inline bool netmem_is_net_iov(const netmem_ref netmem) return (__force unsigned long)netmem & NET_IOV; } +/** + * __netmem_to_page - unsafely get pointer to the &page backing @netmem + * @netmem: netmem reference to convert + * + * Unsafe version of netmem_to_page(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB, no WARN). When @netmem points to IOV, + * provokes undefined behaviour. + * + * Return: pointer to the &page (garbage if @netmem is not page-backed). + */ +static inline struct page *__netmem_to_page(netmem_ref netmem) +{ + return (__force struct page *)netmem; +} + /* This conversion fails (returns NULL) if the netmem_ref is not struct page * backed. 
*/ @@ -80,7 +96,7 @@ static inline struct page *netmem_to_page(netmem_ref netmem) if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return NULL; - return (__force struct page *)netmem; + return __netmem_to_page(netmem); } static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem) @@ -103,6 +119,17 @@ static inline netmem_ref page_to_netmem(struct page *page) return (__force netmem_ref)page; } +/** + * virt_to_netmem - convert virtual memory pointer to a netmem reference + * @data: host memory pointer to convert + * + * Return: netmem reference to the &page backing this virtual address. + */ +static inline netmem_ref virt_to_netmem(const void *data) +{ + return page_to_netmem(virt_to_page(data)); +} + static inline int netmem_ref_count(netmem_ref netmem) { /* The non-pp refcount of net_iov is always 1. On net_iov, we only @@ -127,6 +154,22 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); } +/** + * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem + * @netmem: netmem reference to get the pointer from + * + * Unsafe version of netmem_get_pp(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (avoids clearing the LSB). When @netmem points to IOV, + * provokes invalid memory access. + * + * Return: pointer to the &page_pool (garbage if @netmem is not page-backed). + */ +static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) +{ + return __netmem_to_page(netmem)->pp; +} + static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { return __netmem_clear_lsb(netmem)->pp; @@ -158,12 +201,43 @@ static inline netmem_ref netmem_compound_head(netmem_ref netmem) return page_to_netmem(compound_head(netmem_to_page(netmem))); } +/** + * __netmem_address - unsafely get pointer to the memory backing @netmem + * @netmem: netmem reference to get the pointer for + * + * Unsafe version of netmem_address(). When @netmem is always page-backed, + * e.g. when it's a header buffer, performs faster and generates smaller + * object code (no check for the LSB). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the memory (garbage if @netmem is not page-backed). + */ +static inline void *__netmem_address(netmem_ref netmem) +{ + return page_address(__netmem_to_page(netmem)); +} + static inline void *netmem_address(netmem_ref netmem) { if (netmem_is_net_iov(netmem)) return NULL; - return page_address(netmem_to_page(netmem)); + return __netmem_address(netmem); +} + +/** + * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure + * @netmem: netmem reference to check + * + * Return: true if @netmem is page-backed and the page was allocated under + * memory pressure, false otherwise. + */ +static inline bool netmem_is_pfmemalloc(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + return false; + + return page_is_pfmemalloc(netmem_to_page(netmem)); } static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) -- cgit v1.2.3 From 024bfd2e9d80d7131f1178eb2235030b96f7ef0e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 3 Dec 2024 18:37:31 +0100 Subject: page_pool: make page_pool_put_page_bulk() handle array of netmems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, page_pool_put_page_bulk() indeed takes an array of pointers to the data, not pages, despite the name. 
As one side effect, when you're freeing frags from &skb_shared_info, xdp_return_frame_bulk() converts page pointers to virtual addresses and then page_pool_put_page_bulk() converts them back. Moreover, data pointers assume every frag is placed in the host memory, making this function non-universal. Make page_pool_put_page_bulk() handle array of netmems. Pass frag netmems directly and use virt_to_netmem() when freeing xdpf->data, so that the PP core will then get the compound netmem and take care of the rest. Signed-off-by: Alexander Lobakin Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241203173733.3181246-9-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 8 ++++---- include/net/xdp.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe3..1ea16b0e9c79 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -259,8 +259,8 @@ void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); +void page_pool_put_netmem_bulk(struct page_pool *pool, netmem_ref *data, + u32 count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -272,8 +272,8 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, { } -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static inline void page_pool_put_netmem_bulk(struct page_pool *pool, + netmem_ref *data, u32 count) { } #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index 1253fe21ede7..f4020b29122f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -194,7 +194,7 @@ xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) struct xdp_frame_bulk { int count; void *xa; - void *q[XDP_BULK_QUEUE_SIZE]; + netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) -- cgit v1.2.3 From 8f1c716090a7ed20fea802b63b37758169d59b81 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Dec 2024 10:42:10 +0000 Subject: net: phy: remove genphy_c45_eee_is_active()'s is_enabled arg All callers to genphy_c45_eee_is_active() now pass NULL as the is_enabled argument, which means we never use the value computed in this function. Remove the argument and clean up this function. 
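A hypothetical before/after for a caller (illustrative only, not part of this patch):

	- ret = genphy_c45_eee_is_active(phydev, adv, lp, NULL);
	+ ret = genphy_c45_eee_is_active(phydev, adv, lp);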
Signed-off-by: Russell King (Oracle) Reviewed-by: Heiner Kallweit Link: https://patch.msgid.link/E1tJ9JC-006LIt-Ne@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 61a1bc81f597..bb157136351e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1991,7 +1991,7 @@ int genphy_c45_plca_set_cfg(struct phy_device *phydev, int genphy_c45_plca_get_status(struct phy_device *phydev, struct phy_plca_status *plca_st); int genphy_c45_eee_is_active(struct phy_device *phydev, unsigned long *adv, - unsigned long *lp, bool *is_enabled); + unsigned long *lp); int genphy_c45_ethtool_get_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_ethtool_set_eee(struct phy_device *phydev, -- cgit v1.2.3 From 18eabadd73ae60023ab05e376246bd725fb0c113 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 4 Dec 2024 13:11:21 +0100 Subject: vrf: Make pcpu_dstats update functions available to other modules. Currently vrf is the only module that uses NETDEV_PCPU_STAT_DSTATS. In order to make this kind of statistics available to other modules, we need to define the update functions in netdevice.h. Therefore, let's define dev_dstats_*() functions for RX and TX packet updates (packets, bytes and drops). Use these new functions in vrf.c instead of vrf_rx_stats() and the other manual counter updates. While there, update the type of the "len" variables to "unsigned int", so that there're aligned with both skb->len and the new dstats update functions. Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/d7a552ee382c79f4854e7fcc224cf176cd21150d.1733313925.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a8d98b132c..135105441681 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2854,6 +2854,46 @@ static inline void dev_lstats_add(struct net_device *dev, unsigned int len) u64_stats_update_end(&lstats->syncp); } +static inline void dev_dstats_rx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_packets); + u64_stats_add(&dstats->rx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_rx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->rx_drops); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_add(struct net_device *dev, + unsigned int len) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_packets); + u64_stats_add(&dstats->tx_bytes, len); + u64_stats_update_end(&dstats->syncp); +} + +static inline void dev_dstats_tx_dropped(struct net_device *dev) +{ + struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_inc(&dstats->tx_drops); + u64_stats_update_end(&dstats->syncp); +} + #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ -- cgit v1.2.3 From 435004291c9aebeb940c4bb5b0f5764c012ea5ff Mon Sep 
17 00:00:00 2001 From: David Lechner Date: Wed, 13 Nov 2024 10:55:19 -0600 Subject: iio: adc: ad4695: move dt-bindings header Move the dt-bindings header file to the include/dt-bindings/iio/adc/ directory. ad4695 is an ADC driver, so it should be in the adc/ subdirectory for better organization. Previously, it was in the iio/ subdirectory. Signed-off-by: David Lechner Link: https://patch.msgid.link/20241113-iio-adc-ad4695-move-dt-bindings-header-v1-1-aba1f0f9b628@baylibre.com Signed-off-by: Jonathan Cameron --- include/dt-bindings/iio/adc/adi,ad4695.h | 9 +++++++++ include/dt-bindings/iio/adi,ad4695.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 include/dt-bindings/iio/adc/adi,ad4695.h delete mode 100644 include/dt-bindings/iio/adi,ad4695.h (limited to 'include') diff --git a/include/dt-bindings/iio/adc/adi,ad4695.h b/include/dt-bindings/iio/adc/adi,ad4695.h new file mode 100644 index 000000000000..9fbef542bf67 --- /dev/null +++ b/include/dt-bindings/iio/adc/adi,ad4695.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ + +#ifndef _DT_BINDINGS_ADI_AD4695_H +#define _DT_BINDINGS_ADI_AD4695_H + +#define AD4695_COMMON_MODE_REFGND 0xFF +#define AD4695_COMMON_MODE_COM 0xFE + +#endif /* _DT_BINDINGS_ADI_AD4695_H */ diff --git a/include/dt-bindings/iio/adi,ad4695.h b/include/dt-bindings/iio/adi,ad4695.h deleted file mode 100644 index 9fbef542bf67..000000000000 --- a/include/dt-bindings/iio/adi,ad4695.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ - -#ifndef _DT_BINDINGS_ADI_AD4695_H -#define _DT_BINDINGS_ADI_AD4695_H - -#define AD4695_COMMON_MODE_REFGND 0xFF -#define AD4695_COMMON_MODE_COM 0xFE - -#endif /* _DT_BINDINGS_ADI_AD4695_H */ -- cgit v1.2.3 From cb3e9a446763da8850c8280be009d95f61e11a94 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 22 Nov 2024 11:42:48 -0600 Subject: iio: adc: ad_sigma_delta: add tab to align irq_line Align the irq_line field in struct ad_sigma_delta with the other fields. Signed-off-by: David Lechner Link: https://patch.msgid.link/20241122-iio-adc-ad_signal_delta-fix-align-v1-1-d0a071d2dc83@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index f8c1d2505940..1851f8fed3a4 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -96,7 +96,7 @@ struct ad_sigma_delta { unsigned int active_slots; unsigned int current_slot; unsigned int num_slots; - int irq_line; + int irq_line; bool status_appended; /* map slots to channels in order to know what to expect from devices */ unsigned int *slots; -- cgit v1.2.3 From 0c39208bc3af6f7afcbcdb675ad8fbe6b3022865 Mon Sep 17 00:00:00 2001 From: Robert Budai Date: Mon, 25 Nov 2024 15:35:08 +0200 Subject: iio: imu: adis: Remove documented not used elements This patch removes elements from adis.h that are documented but not used anymore. 
Signed-off-by: Robert Budai Link: https://patch.msgid.link/20241125133520.24328-2-robert.budai@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index e6a75356567a..4bb98d9731de 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -99,7 +99,6 @@ struct adis_data { * @spi: Reference to SPI device which owns this ADIS IIO device * @trig: IIO trigger object data * @data: ADIS chip variant specific data - * @burst: ADIS burst transfer information * @burst_extra_len: Burst extra length. Should only be used by devices that can * dynamically change their burst mode length. * @state_lock: Lock used by the device to protect state -- cgit v1.2.3 From e895f2edfe4820b671c700ccc17be35b1de3d295 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 25 Nov 2024 22:16:19 +0100 Subject: iio: core: fix doc reference to iio_push_to_buffers_with_ts_unaligned Use the right name of the function, which is defined in drivers/iio/industrialio-buffer.c Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241125-iio_memset_scan_holes-v1-11-0cb6e98d895c@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-opaque.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index a89e7e43e441..4247497f3f8b 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -28,7 +28,7 @@ * @groupcounter: index of next attribute group * @legacy_scan_el_group: attribute group for legacy scan elements attribute group * @legacy_buffer_group: attribute group for legacy buffer attributes group - * @bounce_buffer: for devices that call iio_push_to_buffers_with_timestamp_unaligned() + * @bounce_buffer: for devices that call iio_push_to_buffers_with_ts_unaligned() * @bounce_buffer_size: size of currently allocate bounce buffer * @scan_index_timestamp: cache of the index to the timestamp * @clock_id: timestamping clock posix identifier -- cgit v1.2.3 From d87daa18535d38385f1da460c557c004e0b66347 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Mon, 26 Aug 2024 20:31:14 +0300 Subject: dt-bindings: clk: at91: Add clock IDs for the slow clock controller Add clock IDs for the slow clock controller. Previously, raw numbers were used (0 or 1) for clocks generated by the slow clock controller. This leads to confusion and wrong IDs were used on few device trees. To avoid this add macros. Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240826173116.3628337-2-claudiu.beznea@tuxon.dev Signed-off-by: Claudiu Beznea --- include/dt-bindings/clock/at91.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/at91.h b/include/dt-bindings/clock/at91.h index 6ede88c3992d..99f4767ff6bb 100644 --- a/include/dt-bindings/clock/at91.h +++ b/include/dt-bindings/clock/at91.h @@ -55,4 +55,8 @@ #define AT91_PMC_GCKRDY 24 /* Generated Clocks */ #endif +/* Slow clock. 
*/ +#define SCKC_MD_SLCK 0 +#define SCKC_TD_SLCK 1 + #endif -- cgit v1.2.3 From ebe559609d7829b52c6642b581860760984faf9d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Nov 2024 10:30:14 -0500 Subject: fs: get rid of __FMODE_NONOTIFY kludge All it takes to get rid of the __FMODE_NONOTIFY kludge is switching fanotify from anon_inode_getfd() to anon_inode_getfile_fmode() and adding a dentry_open_nonotify() helper to be used by fanotify on the other path. That's it - no more weird shit in OPEN_FMODE(), etc. Signed-off-by: Al Viro Link: https://lore.kernel.org/linux-fsdevel/20241113043003.GH3387508@ZenIV/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/d1231137e7b661a382459e79a764259509a4115d.1731684329.git.josef@toxicpanda.com --- include/linux/fs.h | 6 +++--- include/uapi/asm-generic/fcntl.h | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..93c2b720271e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2751,6 +2751,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); +struct file *dentry_open_nonotify(const struct path *path, int flags, + const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); @@ -3707,11 +3709,9 @@ struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) -#define __FMODE_NONOTIFY ((__force int) FMODE_NONOTIFY) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) -#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ - (flag & __FMODE_NONOTIFY))) +#define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 80f37a0d40d7..613475285643 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -6,7 +6,6 @@ /* * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x4000000 * These cannot be used by userspace O_* until internal and external open * flags are split. * -Eric Paris -- cgit v1.2.3 From e419ddeabe7edd89650a19f411f928eea12b35b1 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Mon, 13 Nov 2023 23:32:09 +1000 Subject: m68k: Use kernel's generic muldi3 libgcc function Use the kernels own generic lib/muldi3.c implementation of muldi3 for 68K machines. Some 68K CPUs support 64bit multiplies so move the arch specific umul_ppmm() macro into a header file that is included by lib/muldi3.c. That way it can take advantage of the single instruction when available. There does not appear to be any existing mechanism for the generic lib/muldi3.c code to pick up an external arch definition of umul_ppmm(). Create an arch specific libgcc.h that can optionally be included by the system include/linux/libgcc.h to allow for this. Somewhat oddly there is also a similar definition of umul_ppmm() in the non-architecture code in lib/crypto/mpi/longlong.h for a wide range or machines. Its presence ends up complicating the include setup and means not being able to use something like compiler.h instead. Actually there is a few other defines of umul_ppmm() macros spread around in various architectures, but not directly usable for the m68k case. 
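Conceptually, umul_ppmm(hi, lo, a, b) yields the full product of two machine words as a high/low pair, which is all lib/muldi3.c needs to compose the 64x64 multiply. A rough C sketch of the idea, assuming 32-bit words (not the actual library code):

	unsigned long long muldi3_sketch(unsigned long long u, unsigned long long v)
	{
		unsigned int u_lo = u, u_hi = u >> 32;
		unsigned int v_lo = v, v_hi = v >> 32;
		unsigned int hi, lo;

		umul_ppmm(hi, lo, u_lo, v_lo);		/* low x low gives all 64 bits */
		hi += u_hi * v_lo + u_lo * v_hi;	/* cross terms only reach the high word */

		return ((unsigned long long)hi << 32) | lo;
	}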
Signed-off-by: Greg Ungerer Link: https://lore.kernel.org/20231113133209.1367286-1-gerg@linux-m68k.org Reviewed-by: Geert Uytterhoeven Reviewed-by: Arnd Bergmann Signed-off-by: Geert Uytterhoeven --- include/linux/libgcc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/libgcc.h b/include/linux/libgcc.h index fc388da6a027..0d68f9d6a6a7 100644 --- a/include/linux/libgcc.h +++ b/include/linux/libgcc.h @@ -34,4 +34,8 @@ long long notrace __lshrdi3(long long u, word_type b); long long notrace __muldi3(long long u, long long v); word_type notrace __ucmpdi2(unsigned long long a, unsigned long long b); +#ifdef CONFIG_HAVE_ARCH_LIBGCC_H +#include +#endif + #endif /* __ASM_LIBGCC_H */ -- cgit v1.2.3 From e9705da8472f306b44cbe1992ea2161bb96ece6e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 9 Dec 2024 10:44:42 +0100 Subject: ASoC: dt-bindings: qcom,wcd9335: Drop number of DAIs from the header Number of DAIs in the codec is not really a binding constant, because it could grow, e.g. when we implement missing features. Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20241209094442.38900-2-krzysztof.kozlowski@linaro.org Signed-off-by: Mark Brown --- include/dt-bindings/sound/qcom,wcd9335.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/sound/qcom,wcd9335.h b/include/dt-bindings/sound/qcom,wcd9335.h index f5e9f1db091e..4fc68aeb9e04 100644 --- a/include/dt-bindings/sound/qcom,wcd9335.h +++ b/include/dt-bindings/sound/qcom,wcd9335.h @@ -10,6 +10,5 @@ #define AIF3_PB 4 #define AIF3_CAP 5 #define AIF4_PB 6 -#define NUM_CODEC_DAIS 7 #endif -- cgit v1.2.3 From 017b76fb8e5b6066f6791e7ad2387deb2c9c9a14 Mon Sep 17 00:00:00 2001 From: Joy Zou Date: Thu, 5 Dec 2024 11:51:12 -0500 Subject: regulator: pca9450: Add PMIC pca9452 support Add the PMIC pca9452 support, which add ldo3 compared with pca9451a. Signed-off-by: Joy Zou Signed-off-by: Frank Li Link: https://patch.msgid.link/20241205-pca9450-v1-4-aab448b74e78@nxp.com Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 243633c8dceb..b427b5873de1 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -10,6 +10,7 @@ enum pca9450_chip_type { PCA9450_TYPE_PCA9450A = 0, PCA9450_TYPE_PCA9450BC, PCA9450_TYPE_PCA9451A, + PCA9450_TYPE_PCA9452, PCA9450_TYPE_AMOUNT, }; -- cgit v1.2.3 From 2ff913ab3f472321ac1931b663314edd6c211a0c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:14 -0800 Subject: uprobes: Simplify session consumer tracking In practice, each return_instance will typically contain either zero or one return_consumer, depending on whether it has any uprobe session consumer attached or not. It's highly unlikely that more than one uprobe session consumers will be attached to any given uprobe, so there is no need to optimize for that case. But the way we currently do memory allocation and accounting is by pre-allocating the space for 4 session consumers in contiguous block of memory next to struct return_instance fixed part. This is unnecessarily wasteful. This patch changes this to keep struct return_instance fixed-sized with one pre-allocated return_consumer, while (in a highly unlikely scenario) allowing for more session consumers in a separate dynamically allocated and reallocated array. 
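With that layout, looking up the i-th session consumer stays trivial; an illustrative accessor (hypothetical helper name, not necessarily what the patch adds):

	static struct return_consumer *ri_consumer(struct return_instance *ri, int idx)
	{
		/* slot 0 lives inline, the rest (if any) in the krealloc'ed array */
		return idx == 0 ? &ri->consumer : &ri->extra_consumers[idx - 1];
	}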
We also simplify accounting a bit by not maintaining a separate temporary capacity for consumers array, and, instead, relying on krealloc() to be a no-op if underlying memory can accommodate a slightly bigger allocation (but again, it's very uncommon scenario to even have to do this reallocation). All this gets rid of ri_size(), simplifies push_consumer() and removes confusing ri->consumers_cnt re-assignment, while containing this singular preallocated consumer logic contained within a few simple preexisting helpers. Having fixed-sized struct return_instance simplifies and speeds up return_instance reuse that we ultimately add later in this patch set, see follow up patches. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-2-andrii@kernel.org --- include/linux/uprobes.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e0a4c2082245..1d449978558d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -154,12 +154,18 @@ struct return_instance { unsigned long stack; /* stack pointer */ unsigned long orig_ret_vaddr; /* original return address */ bool chained; /* true, if instance is nested */ - int consumers_cnt; + int cons_cnt; /* total number of session consumers */ struct return_instance *next; /* keep as stack */ struct rcu_head rcu; - struct return_consumer consumers[] __counted_by(consumers_cnt); + /* singular pre-allocated return_consumer instance for common case */ + struct return_consumer consumer; + /* + * extra return_consumer instances for rare cases of multiple session consumers, + * contains (cons_cnt - 1) elements + */ + struct return_consumer *extra_consumers; } ____cacheline_aligned; enum rp_check { -- cgit v1.2.3 From 8622e45b5da17e777e0e45f16296072494452318 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 5 Dec 2024 16:24:17 -0800 Subject: uprobes: Reuse return_instances between multiple uretprobes within task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of constantly allocating and freeing very short-lived struct return_instance, reuse it as much as possible within current task. For that, store a linked list of reusable return_instances within current->utask. The only complication is that ri_timer() might be still processing such return_instance. And so while the main uretprobe processing logic might be already done with return_instance and would be OK to immediately reuse it for the next uretprobe instance, it's not correct to unconditionally reuse it just like that. Instead we make sure that ri_timer() can't possibly be processing it by using seqcount_t, with ri_timer() being "a writer", while free_ret_instance() being "a reader". If, after we unlink return instance from utask->return_instances list, we know that ri_timer() hasn't gotten to processing utask->return_instances yet, then we can be sure that immediate return_instance reuse is OK, and so we put it onto utask->ri_pool for future (potentially, almost immediate) reuse. 
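The handshake is the usual seqcount pattern; schematically (illustrative only, names and exact helpers are assumptions, not the patch code):

	/* writer side (ri_timer()) */
	write_seqcount_begin(&utask->ri_seqcount);
	/* ... walk utask->return_instances ... */
	write_seqcount_end(&utask->ri_seqcount);

	/* reader side: 'seq' was sampled from utask->ri_seqcount before ri
	 * was unlinked from utask->return_instances
	 */
	static void free_ret_instance_sketch(struct uprobe_task *utask,
					     struct return_instance *ri, unsigned int seq)
	{
		if (!read_seqcount_retry(&utask->ri_seqcount, seq)) {
			/* ri_timer() can't be touching ri: recycle it */
			ri->next = utask->ri_pool;
			utask->ri_pool = ri;
		} else {
			/* a timer tick may still be walking the list: defer */
			kfree_rcu(ri, rcu);
		}
	}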
This change shows improvements both in single CPU performance (by avoiding relatively expensive kmalloc/free combon) and in terms of multi-CPU scalability, where you can see that per-CPU throughput doesn't decline as steeply with increased number of CPUs (which were previously attributed to kmalloc()/free() through profiling): BASELINE (latest perf/core) =========================== uretprobe-nop ( 1 cpus): 1.898 ± 0.002M/s ( 1.898M/s/cpu) uretprobe-nop ( 2 cpus): 3.574 ± 0.011M/s ( 1.787M/s/cpu) uretprobe-nop ( 3 cpus): 5.279 ± 0.066M/s ( 1.760M/s/cpu) uretprobe-nop ( 4 cpus): 6.824 ± 0.047M/s ( 1.706M/s/cpu) uretprobe-nop ( 5 cpus): 8.339 ± 0.060M/s ( 1.668M/s/cpu) uretprobe-nop ( 6 cpus): 9.812 ± 0.047M/s ( 1.635M/s/cpu) uretprobe-nop ( 7 cpus): 11.030 ± 0.048M/s ( 1.576M/s/cpu) uretprobe-nop ( 8 cpus): 12.453 ± 0.126M/s ( 1.557M/s/cpu) uretprobe-nop (10 cpus): 14.838 ± 0.044M/s ( 1.484M/s/cpu) uretprobe-nop (12 cpus): 17.092 ± 0.115M/s ( 1.424M/s/cpu) uretprobe-nop (14 cpus): 19.576 ± 0.022M/s ( 1.398M/s/cpu) uretprobe-nop (16 cpus): 22.264 ± 0.015M/s ( 1.391M/s/cpu) uretprobe-nop (24 cpus): 33.534 ± 0.078M/s ( 1.397M/s/cpu) uretprobe-nop (32 cpus): 43.262 ± 0.127M/s ( 1.352M/s/cpu) uretprobe-nop (40 cpus): 53.252 ± 0.080M/s ( 1.331M/s/cpu) uretprobe-nop (48 cpus): 55.778 ± 0.045M/s ( 1.162M/s/cpu) uretprobe-nop (56 cpus): 56.850 ± 0.227M/s ( 1.015M/s/cpu) uretprobe-nop (64 cpus): 62.005 ± 0.077M/s ( 0.969M/s/cpu) uretprobe-nop (72 cpus): 66.445 ± 0.236M/s ( 0.923M/s/cpu) uretprobe-nop (80 cpus): 68.353 ± 0.180M/s ( 0.854M/s/cpu) THIS PATCHSET (on top of latest perf/core) ========================================== uretprobe-nop ( 1 cpus): 2.253 ± 0.004M/s ( 2.253M/s/cpu) uretprobe-nop ( 2 cpus): 4.281 ± 0.003M/s ( 2.140M/s/cpu) uretprobe-nop ( 3 cpus): 6.389 ± 0.027M/s ( 2.130M/s/cpu) uretprobe-nop ( 4 cpus): 8.328 ± 0.005M/s ( 2.082M/s/cpu) uretprobe-nop ( 5 cpus): 10.353 ± 0.001M/s ( 2.071M/s/cpu) uretprobe-nop ( 6 cpus): 12.513 ± 0.010M/s ( 2.086M/s/cpu) uretprobe-nop ( 7 cpus): 14.525 ± 0.017M/s ( 2.075M/s/cpu) uretprobe-nop ( 8 cpus): 15.633 ± 0.013M/s ( 1.954M/s/cpu) uretprobe-nop (10 cpus): 19.532 ± 0.011M/s ( 1.953M/s/cpu) uretprobe-nop (12 cpus): 21.405 ± 0.009M/s ( 1.784M/s/cpu) uretprobe-nop (14 cpus): 24.857 ± 0.020M/s ( 1.776M/s/cpu) uretprobe-nop (16 cpus): 26.466 ± 0.018M/s ( 1.654M/s/cpu) uretprobe-nop (24 cpus): 40.513 ± 0.222M/s ( 1.688M/s/cpu) uretprobe-nop (32 cpus): 54.180 ± 0.074M/s ( 1.693M/s/cpu) uretprobe-nop (40 cpus): 66.100 ± 0.082M/s ( 1.652M/s/cpu) uretprobe-nop (48 cpus): 70.544 ± 0.068M/s ( 1.470M/s/cpu) uretprobe-nop (56 cpus): 74.494 ± 0.055M/s ( 1.330M/s/cpu) uretprobe-nop (64 cpus): 79.317 ± 0.029M/s ( 1.239M/s/cpu) uretprobe-nop (72 cpus): 84.875 ± 0.020M/s ( 1.179M/s/cpu) uretprobe-nop (80 cpus): 92.318 ± 0.224M/s ( 1.154M/s/cpu) For reference, with uprobe-nop we hit the following throughput: uprobe-nop (80 cpus): 143.485 ± 0.035M/s ( 1.794M/s/cpu) So now uretprobe stays a bit closer to that performance. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20241206002417.3295533-5-andrii@kernel.org --- include/linux/uprobes.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 1d449978558d..b1df7d792fa1 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include #include +#include struct uprobe; struct vm_area_struct; @@ -124,6 +125,10 @@ struct uprobe_task { unsigned int depth; struct return_instance *return_instances; + struct return_instance *ri_pool; + struct timer_list ri_timer; + seqcount_t ri_seqcount; + union { struct { struct arch_uprobe_task autask; @@ -137,7 +142,6 @@ struct uprobe_task { }; struct uprobe *active_uprobe; - struct timer_list ri_timer; unsigned long xol_vaddr; struct arch_uprobe *auprobe; -- cgit v1.2.3 From 6057b90ecc84f232dd32a047a086a4c4c271765f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:40 -0800 Subject: perf/core: Export perf_exclude_event() While at it, rename the same function in s390 cpum_sf PMU. Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Acked-by: Thomas Richter Link: https://lore.kernel.org/r/20241203180441.1634709-2-namhyung@kernel.org --- include/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bf831b1485ff..8333f132f4a9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1690,6 +1690,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1895,6 +1897,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) -- cgit v1.2.3 From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Tue, 17 Sep 2024 11:09:17 +0530 Subject: x86/hyperv: Fix hv tsc page based sched_clock for hibernation read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is bigger than the variable hv_sched_clock_offset, which is cached during early boot, but depending on the timing this assumption may be false when a hibernated VM starts again (the clock counter starts from 0 again) and is resuming back (Note: hv_init_tsc_clocksource() is not called during hibernation/resume); consequently, read_hv_sched_clock_tsc() may return a negative integer (which is interpreted as a huge positive integer since the return type is u64) and new kernel messages are prefixed with huge timestamps before read_hv_sched_clock_tsc() grows big enough (which typically takes several seconds). Fix the issue by saving the Hyper-V clock counter just before the suspend, and using it to correct the hv_sched_clock_offset in resume. This makes hv tsc page based sched_clock continuous and ensures that post resume, it starts from where it left off during suspend. 
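The save/restore pairing could look roughly like the sketch below (names and the exact offset arithmetic are illustrative; the real hooks are the x86_platform overrides described in the next paragraph):

	static u64 hv_saved_tsc_ref;	/* hypothetical */

	static void hv_save_sched_clock_state(void)
	{
		hv_saved_tsc_ref = hv_read_reference_counter();
	}

	static void hv_restore_sched_clock_state(void)
	{
		/* compensate for the counter restarting across hibernation */
		hv_adj_sched_clock_offset(hv_saved_tsc_ref - hv_read_reference_counter());
	}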
Override x86_platform.save_sched_clock_state and x86_platform.restore_sched_clock_state routines to correct this as soon as possible. Note: if Invariant TSC is available, the issue doesn't happen because 1) we don't register read_hv_sched_clock_tsc() for sched clock: See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework clocksource and sched clock setup"); 2) the common x86 code adjusts TSC similarly: see __restore_processor_state() -> tsc_verify_tsc_adjust(true) and x86_platform.restore_sched_clock_state(). Cc: stable@vger.kernel.org Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation") Co-developed-by: Dexuan Cui Signed-off-by: Dexuan Cui Signed-off-by: Naman Jain Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com> --- include/clocksource/hyperv_timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index 6cdc873ac907..aa5233b1eba9 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(void); extern unsigned long hv_get_tsc_pfn(void); extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); +extern void hv_adj_sched_clock_offset(u64 offset); + static __always_inline bool hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time) -- cgit v1.2.3 From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe()-> vmbus_open() may have not initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. 
Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening. Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb91042..02a226bcf0ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); -- cgit v1.2.3 From df8e78607d4795806b59564ba7a3e2e125d119fc Mon Sep 17 00:00:00 2001 From: Bastien Curutchet Date: Wed, 4 Dec 2024 10:43:16 +0100 Subject: memory: ti-aemif: Export aemif_*_cs_timings() Export the aemif_set_cs_timing() and aemif_check_cs_timing() symbols so they can be used by other drivers Add a mutex to protect the CS configuration register from concurrent accesses between the AEMIF and its 'children'. Signed-off-by: Bastien Curutchet Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20241204094319.1050826-7-bastien.curutchet@bootlin.com [krzysztof: wrap aemif_set_cs_timings() at 80-char] Signed-off-by: Krzysztof Kozlowski --- include/linux/memory/ti-aemif.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 include/linux/memory/ti-aemif.h (limited to 'include') diff --git a/include/linux/memory/ti-aemif.h b/include/linux/memory/ti-aemif.h new file mode 100644 index 000000000000..da94a9d985e7 --- /dev/null +++ b/include/linux/memory/ti-aemif.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __MEMORY_TI_AEMIF_H +#define __MEMORY_TI_AEMIF_H + +/** + * struct aemif_cs_timings: structure to hold CS timing configuration + * values are expressed in number of clock cycles - 1 + * @ta: minimum turn around time + * @rhold: read hold width + * @rstrobe: read strobe width + * @rsetup: read setup width + * @whold: write hold width + * @wstrobe: write strobe width + * @wsetup: write setup width + */ +struct aemif_cs_timings { + u32 ta; + u32 rhold; + u32 rstrobe; + u32 rsetup; + u32 whold; + u32 wstrobe; + u32 wsetup; +}; + +struct aemif_device; + +int aemif_set_cs_timings(struct aemif_device *aemif, u8 cs, struct aemif_cs_timings *timings); +int aemif_check_cs_timings(struct aemif_cs_timings *timings); + +#endif // __MEMORY_TI_AEMIF_H -- cgit v1.2.3 From d1fd972914239996dbd15c5142d7f6e09d95a002 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:29 +0000 Subject: ktime: Add us_to_ktime() Add a us_to_ktime() helper to go with ms_to_ktime() and ns_to_ktime(). 
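A trivial usage example (hypothetical caller):

	/* arm a timer 500us from now instead of open-coding 500 * NSEC_PER_USEC */
	hrtimer_start(&timer, us_to_ktime(500), HRTIMER_MODE_REL);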
Signed-off-by: David Howells cc: Thomas Gleixner cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 3a4e723eae0f..383ed9985802 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -222,6 +222,11 @@ static inline ktime_t ns_to_ktime(u64 ns) return ns; } +static inline ktime_t us_to_ktime(u64 us) +{ + return us * NSEC_PER_USEC; +} + static inline ktime_t ms_to_ktime(u64 ms) { return ms * NSEC_PER_MSEC; -- cgit v1.2.3 From 0e56ebde245e4799ce74d38419426f2a80d39950 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:30 +0000 Subject: rxrpc: Fix handling of received connection abort Fix the handling of a connection abort that we've received. Though the abort is at the connection level, it needs propagating to the calls on that connection. Whilst the propagation bit is performed, the calls aren't then woken up to go and process their termination, and as no further input is forthcoming, they just hang. Also add some tracing for the logging of connection aborts. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d03e0bd8c028..27c23873c881 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -117,6 +117,7 @@ #define rxrpc_call_poke_traces \ EM(rxrpc_call_poke_abort, "Abort") \ EM(rxrpc_call_poke_complete, "Compl") \ + EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ @@ -282,6 +283,7 @@ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ EM(rxrpc_call_see_connected, "SEE connect ") \ + EM(rxrpc_call_see_conn_abort, "SEE conn-abt") \ EM(rxrpc_call_see_disconnected, "SEE disconn ") \ EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ EM(rxrpc_call_see_input, "SEE input ") \ @@ -981,6 +983,29 @@ TRACE_EVENT(rxrpc_rx_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_rx_conn_abort, + TP_PROTO(const struct rxrpc_connection *conn, const struct sk_buff *skb), + + TP_ARGS(conn, skb), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, abort_code) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = rxrpc_skb(skb)->hdr.serial; + __entry->abort_code = skb->priority; + ), + + TP_printk("C=%08x ABORT %08x ac=%d", + __entry->conn, + __entry->serial, + __entry->abort_code) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), -- cgit v1.2.3 From efa95c32352b2ac7ff09d680144e22c0f25244cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:32 +0000 Subject: rxrpc: Clean up Tx header flags generation handling Clean up the generation of the header flags when building packet headers for transmission: (1) Assemble the flags in a local variable rather than in the 
txb->flags. (2) Do the flags masking and JUMBO-PACKET setting in one bit of code for both the main header and the jumbo headers. (3) Generate the REQUEST-ACK flag afresh each time. There's a possibility we might want to do jumbo retransmission packets in future. (4) Pass the local flags variable to the rxrpc_tx_data tracepoint rather than the combination of the txb flags and the wire header flags (the latter belong only to the first subpacket). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 27c23873c881..62064f63d6eb 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -452,7 +452,6 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ - EM(rxrpc_reqack_already_on, "ALREADY-ON") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ -- cgit v1.2.3 From 8b5823ea437624b53ecf084b6dd582760f110394 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:35 +0000 Subject: rxrpc: Request an ACK on impending Tx stall Set the REQUEST-ACK flag on the DATA packet we're about to send if we're about to stall transmission because the app layer isn't keeping up supplying us with data to transmit. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-8-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 62064f63d6eb..d86b5f07d292 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -452,6 +452,7 @@ #define rxrpc_req_ack_traces \ EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ + EM(rxrpc_reqack_app_stall, "APP-STALL ") \ EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ -- cgit v1.2.3 From eeaedc5449d9fccf2b56e844a018df9d3720d59e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:37 +0000 Subject: rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899) Implement path-MTU probing (along the lines of RFC8899) by padding some of the PING ACKs we send. PING ACKs get their own individual responses quite apart from the acking of data (though, as ACKs, they fulfil that role also). The probing concentrates on packet sizes that correspond how many subpackets can be stuffed inside a jumbo packet as jumbo DATA packets are just aggregations of individual DATA packets and can be split easily for retransmission purposes. If we want to perform probing, we advertise this by setting the maximum number of jumbo subpackets to 0 in the ack trailer when we send an ACK and see if the peer is also advertising the service. This is interpreted by non-supporting Rx stacks as an indication that jumbo packets aren't supported. The MTU sizes advertised in the ACK trailer AF_RXRPC transmits are pegged at a maximum of 1444 unless pmtud is supported by both sides. 
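For illustration, the probe sizes of interest are the ones that exactly fit N subpackets into a single jumbo DATA packet; a rough helper under that assumption (the helper itself is hypothetical, the wire-protocol structures are rxrpc's existing ones):

	static unsigned int pmtud_probe_size_sketch(unsigned int nr_subpackets)
	{
		/* wire header + N data segments + a jumbo sub-header between segments */
		return sizeof(struct rxrpc_wire_header) +
		       nr_subpackets * RXRPC_JUMBO_DATALEN +
		       (nr_subpackets - 1) * sizeof(struct rxrpc_jumbo_header);
	}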
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 124 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d86b5f07d292..9dcadad88e76 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -364,6 +364,7 @@ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \ EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \ + EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \ EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \ EM(rxrpc_propose_ack_ping_for_params, "Params ") \ EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \ @@ -478,6 +479,11 @@ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") +#define rxrpc_pmtud_reduce_traces \ + EM(rxrpc_pmtud_reduce_ack, "Ack ") \ + EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ + E_(rxrpc_pmtud_reduce_route, "Route") + /* * Generate enums for tracing information. */ @@ -498,6 +504,7 @@ enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte); enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte); enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte); +enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte); enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); @@ -534,6 +541,7 @@ rxrpc_congest_changes; rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; +rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; @@ -2040,6 +2048,122 @@ TRACE_EVENT(rxrpc_sack, __entry->sack) ); +TRACE_EVENT(rxrpc_pmtud_tx, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(unsigned short, pmtud_trial) + __field(unsigned short, pmtud_good) + __field(unsigned short, pmtud_bad) + ), + + TP_fast_assign( + __entry->peer_debug_id = call->peer->debug_id; + __entry->call_debug_id = call->debug_id; + __entry->ping_serial = call->conn->pmtud_probe; + __entry->pmtud_trial = call->peer->pmtud_trial; + __entry->pmtud_good = call->peer->pmtud_good; + __entry->pmtud_bad = call->peer->pmtud_bad; + ), + + TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->pmtud_good, + __entry->pmtud_trial, + __entry->pmtud_bad) + ); + +TRACE_EVENT(rxrpc_pmtud_rx, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + __field(unsigned short, max_data) + __field(u8, jumbo_max) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + __entry->max_data = conn->peer->max_data; + __entry->jumbo_max = conn->peer->pmtud_jumbo; + ), + + 
TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial, + __entry->max_data, + __entry->jumbo_max) + ); + +TRACE_EVENT(rxrpc_pmtud_lost, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial), + + TP_ARGS(conn, resp_serial), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(unsigned int, call_debug_id) + __field(rxrpc_serial_t, ping_serial) + __field(rxrpc_serial_t, resp_serial) + ), + + TP_fast_assign( + __entry->peer_debug_id = conn->peer->debug_id; + __entry->call_debug_id = conn->pmtud_call; + __entry->ping_serial = conn->pmtud_probe; + __entry->resp_serial = resp_serial; + ), + + TP_printk("P=%08x c=%08x pr=%08x rr=%08x", + __entry->peer_debug_id, + __entry->call_debug_id, + __entry->ping_serial, + __entry->resp_serial) + ); + +TRACE_EVENT(rxrpc_pmtud_reduce, + TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial, + unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason), + + TP_ARGS(peer, serial, max_data, reason), + + TP_STRUCT__entry( + __field(unsigned int, peer_debug_id) + __field(rxrpc_serial_t, serial) + __field(unsigned int, max_data) + __field(enum rxrpc_pmtud_reduce_trace, reason) + ), + + TP_fast_assign( + __entry->peer_debug_id = peer->debug_id; + __entry->serial = serial; + __entry->max_data = max_data; + __entry->reason = reason; + ), + + TP_printk("P=%08x %s r=%08x m=%u", + __entry->peer_debug_id, + __print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces), + __entry->serial, __entry->max_data) + ); + #undef EM #undef E_ -- cgit v1.2.3 From 149d002bee706f51772bd320cda90c922844bb0e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:40 +0000 Subject: rxrpc: Add a tracepoint to show variables pertinent to jumbo packet size Add a tracepoint to be called right before packets are transmitted for the first time that shows variable values that are pertinent to how many subpackets will be added to a jumbo DATA packet. 
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-13-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 9dcadad88e76..71f07e726a90 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -903,6 +903,47 @@ TRACE_EVENT(rxrpc_txqueue, __entry->tx_winsize) ); +TRACE_EVENT(rxrpc_transmit, + TP_PROTO(struct rxrpc_call *call, int space), + + TP_ARGS(call, space), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(u16, space) + __field(u16, tx_winsize) + __field(u16, cong_cwnd) + __field(u16, cong_extra) + __field(u16, in_flight) + __field(u16, prepared) + __field(u16, pmtud_jumbo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = call->tx_bottom; + __entry->space = space; + __entry->tx_winsize = call->tx_winsize; + __entry->cong_cwnd = call->cong_cwnd; + __entry->cong_extra = call->cong_extra; + __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->in_flight = call->tx_top - call->acks_hard_ack; + __entry->pmtud_jumbo = call->peer->pmtud_jumbo; + ), + + TP_printk("c=%08x q=%08x sp=%u tw=%u cw=%u+%u pr=%u if=%u pj=%u", + __entry->call, + __entry->seq, + __entry->space, + __entry->tx_winsize, + __entry->cong_cwnd, + __entry->cong_extra, + __entry->prepared, + __entry->in_flight, + __entry->pmtud_jumbo) + ); + TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), -- cgit v1.2.3 From 9e3cccd176b5ec6ff78693287fb03097e453e69c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:41 +0000 Subject: rxrpc: Fix CPU time starvation in I/O thread Starvation can happen in the rxrpc I/O thread because it goes back to the top of the I/O loop after it does any one thing without trying to give any other connection or call CPU time. Also, because it processes one call packet at a time, it tries to do the retransmission loop after each ACK without checking to see if there are other ACKs already in the queue that can update the SACK state. Fix this by: (1) Add a received-packet queue on each call. (2) Distribute packets from the master Rx queue to the individual call, conn and error queues and 'poking' calls to add them to the attend queue first thing in the I/O thread. (3) Go through all the attention-seeking connections and calls before going back to the top of the I/O thread. Each queue is extracted as a whole and then gone through so that new additions to insert themselves into the queue. (4) Make the call event handler go through all the packets currently on the call's rx_queue before transmitting and retransmitting DATA packets. (5) Drop the skb argument from the call event handler as this is now replaced with the rx_queue. Instead, keep track of whether we received a packet or an ACK for the tests that used to rely on that. 
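As an illustration of point (3), extracting each queue as a whole before processing it, here is a minimal userspace model with invented types (the kernel uses locked sk_buff queues): detaching the head in one step means packets that arrive during processing accumulate on the now-empty queue and are picked up on the next pass.

#include <stdio.h>
#include <stdlib.h>

struct pkt {
	struct pkt *next;
	int seq;
};

struct rx_queue {
	struct pkt *head;
};

/* Arrivals are pushed at the head for brevity; ordering is not the point. */
static void rxq_add(struct rx_queue *q, int seq)
{
	struct pkt *p = malloc(sizeof(*p));

	p->seq = seq;
	p->next = q->head;
	q->head = p;
}

/* Detach the whole queue so later arrivals don't extend this pass. */
static struct pkt *rxq_detach(struct rx_queue *q)
{
	struct pkt *chain = q->head;

	q->head = NULL;
	return chain;
}

int main(void)
{
	struct rx_queue q = { NULL };

	rxq_add(&q, 1);
	rxq_add(&q, 2);
	for (struct pkt *p = rxq_detach(&q); p;) {
		struct pkt *next = p->next;

		printf("process seq %d\n", p->seq);
		free(p);
		p = next;
	}
	return 0;
}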
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-14-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 71f07e726a90..28fa7be31ff8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -120,6 +120,7 @@ EM(rxrpc_call_poke_conn_abort, "Conn-abort") \ EM(rxrpc_call_poke_error, "Error") \ EM(rxrpc_call_poke_idle, "Idle") \ + EM(rxrpc_call_poke_rx_packet, "Rx-packet") \ EM(rxrpc_call_poke_set_timeout, "Set-timo") \ EM(rxrpc_call_poke_start, "Start") \ EM(rxrpc_call_poke_timer, "Timer") \ @@ -128,6 +129,7 @@ #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ + EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_last_nack, "GET last-nack") \ @@ -139,6 +141,7 @@ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ + EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ -- cgit v1.2.3 From b341a0263b1b804d329f864c2dc24815364510ec Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:46 +0000 Subject: rxrpc: Implement progressive transmission queue struct We need to scan the buffers in the transmission queue occasionally when processing ACKs, but the transmission queue is currently a linked list of transmission buffers which, when we eventually expand the Tx window to 8192 packets will be very slow to walk. Instead, pull the fields we need to examine a lot (last sent time, retransmitted flag) into a new struct rxrpc_txqueue and make each one hold an array of 32 or 64 packets. The transmission queue is then a list of these structs, each pointing to a contiguous set of packets. Scanning is then a lot faster as the flags and timestamps are concentrated in the CPU dcache. The transmission timestamps are stored as a number of microseconds from a base ktime to reduce memory requirements. This should be fine provided we manage to transmit an entire buffer within an hour. This will make implementing RACK-TLP [RFC8985] easier as it will be less costly to scan the transmission buffers. 
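A rough standalone sketch of the idea follows; the field names are illustrative, not the kernel's. One node covers up to BITS_PER_LONG packets, keeps per-packet send timestamps as 32-bit microsecond offsets from a base, and lets a resend scan walk a contiguous array instead of chasing per-buffer list pointers.

#include <stdint.h>
#include <stdio.h>

#define TXQ_SIZE (8 * sizeof(unsigned long))

struct txqueue {
	uint32_t qbase;				/* seq number of slot 0 */
	unsigned int nr;			/* slots in use */
	unsigned long segment_retransmitted;	/* one bit per packet */
	uint32_t xmit_ts_us[TXQ_SIZE];		/* send time, offset from a per-call base */
};

/* Bitmask of in-use segments sent before 'deadline_us' and not already
 * retransmitted - the candidates a resend scan would pick up. */
static unsigned long txq_scan_old(const struct txqueue *tq, uint32_t deadline_us)
{
	unsigned long old = 0;

	for (unsigned int i = 0; i < tq->nr; i++)
		if (tq->xmit_ts_us[i] < deadline_us)
			old |= 1UL << i;
	return old & ~tq->segment_retransmitted;
}

int main(void)
{
	struct txqueue tq = { .qbase = 100, .nr = 2 };

	tq.xmit_ts_us[0] = 10;		/* sent early - stale */
	tq.xmit_ts_us[1] = 5000;	/* sent recently */
	printf("stale segments: %#lx\n", txq_scan_old(&tq, 1000));
	return 0;
}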
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-19-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 98 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 83 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 28fa7be31ff8..e6cf9ec940aa 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -297,7 +297,6 @@ #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ - EM(rxrpc_txqueue_dequeue, "DEQ") \ EM(rxrpc_txqueue_end, "END") \ EM(rxrpc_txqueue_queue, "QUE") \ EM(rxrpc_txqueue_queue_last, "QLS") \ @@ -482,6 +481,19 @@ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") +#define rxrpc_tq_traces \ + EM(rxrpc_tq_alloc, "ALLOC") \ + EM(rxrpc_tq_cleaned, "CLEAN") \ + EM(rxrpc_tq_decant, "DCNT ") \ + EM(rxrpc_tq_decant_advance, "DCNT>") \ + EM(rxrpc_tq_queue, "QUEUE") \ + EM(rxrpc_tq_queue_dup, "QUE!!") \ + EM(rxrpc_tq_rotate, "ROT ") \ + EM(rxrpc_tq_rotate_and_free, "ROT-F") \ + EM(rxrpc_tq_rotate_and_keep, "ROT-K") \ + EM(rxrpc_tq_transmit, "XMIT ") \ + E_(rxrpc_tq_transmit_advance, "XMIT>") + #define rxrpc_pmtud_reduce_traces \ EM(rxrpc_pmtud_reduce_ack, "Ack ") \ EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ @@ -518,6 +530,7 @@ enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); +enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); @@ -554,6 +567,7 @@ rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; +rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; rxrpc_txqueue_traces; @@ -881,7 +895,7 @@ TRACE_EVENT(rxrpc_txqueue, __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_bottom) __field(rxrpc_seq_t, tx_top) - __field(rxrpc_seq_t, tx_prepared) + __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) ), @@ -891,7 +905,7 @@ TRACE_EVENT(rxrpc_txqueue, __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; - __entry->tx_prepared = call->tx_prepared; + __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; ), @@ -902,14 +916,14 @@ TRACE_EVENT(rxrpc_txqueue, __entry->acks_hard_ack, __entry->tx_top - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, - __entry->tx_prepared - __entry->tx_bottom, + __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); TRACE_EVENT(rxrpc_transmit, - TP_PROTO(struct rxrpc_call *call, int space), + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t send_top, int space), - TP_ARGS(call, space), + TP_ARGS(call, send_top, space), TP_STRUCT__entry( __field(unsigned int, call) @@ -925,12 +939,12 @@ TRACE_EVENT(rxrpc_transmit, TP_fast_assign( __entry->call = call->debug_id; - __entry->seq = call->tx_bottom; + __entry->seq = call->tx_top + 1; __entry->space = space; __entry->tx_winsize = call->tx_winsize; __entry->cong_cwnd = call->cong_cwnd; __entry->cong_extra = call->cong_extra; - __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->prepared = send_top - call->tx_bottom; __entry->in_flight = 
call->tx_top - call->acks_hard_ack; __entry->pmtud_jumbo = call->peer->pmtud_jumbo; ), @@ -947,6 +961,32 @@ TRACE_EVENT(rxrpc_transmit, __entry->pmtud_jumbo) ); +TRACE_EVENT(rxrpc_tx_rotate, + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_seq_t to), + + TP_ARGS(call, seq, to), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, seq) + __field(rxrpc_seq_t, to) + __field(rxrpc_seq_t, top) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->seq = seq; + __entry->to = to; + __entry->top = call->tx_top; + ), + + TP_printk("c=%08x q=%08x-%08x-%08x", + __entry->call, + __entry->seq, + __entry->to, + __entry->top) + ); + TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags), @@ -1621,10 +1661,11 @@ TRACE_EVENT(rxrpc_drop_ack, ); TRACE_EVENT(rxrpc_retransmit, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, ktime_t expiry), + TP_PROTO(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, ktime_t expiry), - TP_ARGS(call, seq, serial, expiry), + TP_ARGS(call, req, txb, expiry), TP_STRUCT__entry( __field(unsigned int, call) @@ -1635,8 +1676,8 @@ TRACE_EVENT(rxrpc_retransmit, TP_fast_assign( __entry->call = call->debug_id; - __entry->seq = seq; - __entry->serial = serial; + __entry->seq = req->seq; + __entry->serial = txb->serial; __entry->expiry = expiry; ), @@ -1714,9 +1755,9 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; - __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); - __entry->has_data = !list_empty(&call->tx_sendmsg); + __entry->has_data = call->tx_bottom != call->tx_top; ), TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", @@ -2024,6 +2065,33 @@ TRACE_EVENT(rxrpc_txbuf, __entry->ref) ); +TRACE_EVENT(rxrpc_tq, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, + rxrpc_seq_t seq, enum rxrpc_tq_trace trace), + + TP_ARGS(call, tq, seq, trace), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_tq_trace, trace) + ), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + __entry->qbase = tq ? tq->qbase : call->tx_qbase; + __entry->seq = seq; + __entry->trace = trace; + ), + + TP_printk("c=%08x bq=%08x q=%08x %s", + __entry->call_debug_id, + __entry->qbase, + __entry->seq, + __print_symbolic(__entry->trace, rxrpc_tq_traces)) + ); + TRACE_EVENT(rxrpc_poke_call, TP_PROTO(struct rxrpc_call *call, bool busy, enum rxrpc_call_poke_trace what), -- cgit v1.2.3 From 692c4caa074c0d6092bd713babc6fc3872b5592a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:47 +0000 Subject: rxrpc: call->acks_hard_ack is now the same call->tx_bottom, so remove it Now that packets are removed from the Tx queue in the rotation function rather than being cleaned up later, call->acks_hard_ack now advances in step with call->tx_bottom, so remove it. Some of the places call->acks_hard_ack is used in the rxrpc tracepoints are replaced by call->acks_first_seq instead as that's the peer's reported idea of the hard-ACK point. 
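For orientation, the per-call window counters these tracepoints print are plain 32-bit sequence numbers, and the distances between them are taken with wrapping unsigned subtraction, exactly as in the TP_printk() arithmetic above. The values in this small sketch are made up purely to show the wrap behaviour.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t rxrpc_seq_t;

struct tx_window {
	rxrpc_seq_t tx_bottom;	/* lowest packet still held */
	rxrpc_seq_t hard_ack;	/* highest packet hard-ACK'd by the peer */
	rxrpc_seq_t tx_top;	/* highest packet transmitted */
	rxrpc_seq_t send_top;	/* highest packet prepared for sending */
};

int main(void)
{
	struct tx_window w = {
		.tx_bottom = 0xfffffff0,	/* the window wraps here */
		.hard_ack  = 0xfffffff8,
		.tx_top    = 0x00000004,
		.send_top  = 0x00000009,
	};

	printf("hard-acked=%u in-flight=%u unsent=%u\n",
	       w.hard_ack - w.tx_bottom,	/* 8  */
	       w.tx_top - w.hard_ack,		/* 12 */
	       w.send_top - w.tx_top);		/* 5  */
	return 0;
}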
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-20-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e6cf9ec940aa..0f253287de00 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -892,8 +892,8 @@ TRACE_EVENT(rxrpc_txqueue, TP_STRUCT__entry( __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) - __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_bottom) + __field(rxrpc_seq_t, acks_first_seq) __field(rxrpc_seq_t, tx_top) __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) @@ -902,8 +902,8 @@ TRACE_EVENT(rxrpc_txqueue, TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; - __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; + __entry->acks_first_seq = call->acks_first_seq; __entry->tx_top = call->tx_top; __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; @@ -913,9 +913,9 @@ TRACE_EVENT(rxrpc_txqueue, __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, - __entry->acks_hard_ack, - __entry->tx_top - __entry->tx_bottom, - __entry->tx_top - __entry->acks_hard_ack, + __entry->acks_first_seq, + __entry->acks_first_seq - __entry->tx_bottom, + __entry->tx_top - __entry->acks_first_seq, __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); @@ -945,7 +945,7 @@ TRACE_EVENT(rxrpc_transmit, __entry->cong_cwnd = call->cong_cwnd; __entry->cong_extra = call->cong_extra; __entry->prepared = send_top - call->tx_bottom; - __entry->in_flight = call->tx_top - call->acks_hard_ack; + __entry->in_flight = call->tx_top - call->tx_bottom; __entry->pmtud_jumbo = call->peer->pmtud_jumbo; ), @@ -1707,7 +1707,7 @@ TRACE_EVENT(rxrpc_congest, TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; - __entry->hard_ack = call->acks_hard_ack; + __entry->hard_ack = call->acks_first_seq; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; @@ -1754,7 +1754,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->mode = call->cong_mode; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; - __entry->hard_ack = call->acks_hard_ack; + __entry->hard_ack = call->acks_first_seq; __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); __entry->has_data = call->tx_bottom != call->tx_top; @@ -1855,7 +1855,7 @@ TRACE_EVENT(rxrpc_resend, TP_fast_assign( struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; - __entry->seq = call->acks_hard_ack; + __entry->seq = call->acks_first_seq; __entry->transmitted = call->tx_transmitted; __entry->ack_serial = sp ? 
sp->hdr.serial : 0; ), @@ -1944,7 +1944,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; - __entry->tx_seq = call->acks_hard_ack; + __entry->tx_seq = call->acks_first_seq; __entry->rx_seq = call->rx_highest_seq; ), -- cgit v1.2.3 From 203457e11b591f80ada571f981dd5f4d683b0009 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:48 +0000 Subject: rxrpc: Replace call->acks_first_seq with tracking of the hard ACK point Replace the call->acks_first_seq variable (which holds ack.firstPacket from the latest ACK packet and indicates the sequence number of the first ack slot in the SACK table) with call->acks_hard_ack which will hold the highest sequence hard ACK'd. This is 1 less than call->acks_first_seq, but it fits in the same schema as the other tracking variables which hold the sequence of a packet, not one past it. This will fix the rxrpc_congest tracepoint's calculation of SACK window size which shows one fewer than it should - and will occasionally go to -1. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-21-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 68 +++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0f253287de00..91108e0de3af 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -893,7 +893,7 @@ TRACE_EVENT(rxrpc_txqueue, __field(unsigned int, call) __field(enum rxrpc_txqueue_trace, why) __field(rxrpc_seq_t, tx_bottom) - __field(rxrpc_seq_t, acks_first_seq) + __field(rxrpc_seq_t, acks_hard_ack) __field(rxrpc_seq_t, tx_top) __field(rxrpc_seq_t, send_top) __field(int, tx_winsize) @@ -903,19 +903,19 @@ TRACE_EVENT(rxrpc_txqueue, __entry->call = call->debug_id; __entry->why = why; __entry->tx_bottom = call->tx_bottom; - __entry->acks_first_seq = call->acks_first_seq; + __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_top = call->tx_top; __entry->send_top = call->send_top; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u", + TP_printk("c=%08x %s b=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, - __entry->acks_first_seq, - __entry->acks_first_seq - __entry->tx_bottom, - __entry->tx_top - __entry->acks_first_seq, + __entry->acks_hard_ack, + __entry->acks_hard_ack - __entry->tx_bottom, + __entry->tx_top - __entry->acks_hard_ack, __entry->send_top - __entry->tx_top, __entry->tx_winsize) ); @@ -1015,11 +1015,9 @@ TRACE_EVENT(rxrpc_rx_data, ); TRACE_EVENT(rxrpc_rx_ack, - TP_PROTO(struct rxrpc_call *call, - rxrpc_serial_t serial, rxrpc_serial_t ack_serial, - rxrpc_seq_t first, rxrpc_seq_t prev, u8 reason, u8 n_acks), + TP_PROTO(struct rxrpc_call *call, struct rxrpc_skb_priv *sp), - TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks), + TP_ARGS(call, sp), TP_STRUCT__entry( __field(unsigned int, call) @@ -1032,13 +1030,13 @@ TRACE_EVENT(rxrpc_rx_ack, ), TP_fast_assign( - __entry->call = call->debug_id; - __entry->serial = serial; - __entry->ack_serial = ack_serial; - __entry->first = first; - __entry->prev = prev; - __entry->reason = reason; - __entry->n_acks = n_acks; + __entry->call = call->debug_id; + __entry->serial = sp->hdr.serial; + 
__entry->ack_serial = sp->ack.acked_serial; + __entry->first = sp->ack.first_ack; + __entry->prev = sp->ack.prev_ack; + __entry->reason = sp->ack.reason; + __entry->n_acks = sp->ack.nr_acks; ), TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", @@ -1707,7 +1705,7 @@ TRACE_EVENT(rxrpc_congest, TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; - __entry->hard_ack = call->acks_first_seq; + __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; @@ -1754,7 +1752,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->mode = call->cong_mode; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; - __entry->hard_ack = call->acks_first_seq; + __entry->hard_ack = call->acks_hard_ack; __entry->prepared = call->send_top - call->tx_bottom; __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); __entry->has_data = call->tx_bottom != call->tx_top; @@ -1855,7 +1853,7 @@ TRACE_EVENT(rxrpc_resend, TP_fast_assign( struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; - __entry->seq = call->acks_first_seq; + __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; __entry->ack_serial = sp ? sp->hdr.serial : 0; ), @@ -1944,7 +1942,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; - __entry->tx_seq = call->acks_first_seq; + __entry->tx_seq = call->acks_hard_ack; __entry->rx_seq = call->rx_highest_seq; ), @@ -1976,38 +1974,36 @@ TRACE_EVENT(rxrpc_notify_socket, ); TRACE_EVENT(rxrpc_rx_discard_ack, - TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial, - rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first, - rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt), - TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first, - prev_pkt, call_ackr_prev), + TP_ARGS(call, serial, hard_ack, prev_pkt), TP_STRUCT__entry( __field(unsigned int, debug_id) __field(rxrpc_serial_t, serial) - __field(rxrpc_seq_t, first_soft_ack) - __field(rxrpc_seq_t, call_ackr_first) + __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prev_pkt) - __field(rxrpc_seq_t, call_ackr_prev) + __field(rxrpc_seq_t, acks_hard_ack) + __field(rxrpc_seq_t, acks_prev_seq) ), TP_fast_assign( - __entry->debug_id = debug_id; + __entry->debug_id = call->debug_id; __entry->serial = serial; - __entry->first_soft_ack = first_soft_ack; - __entry->call_ackr_first = call_ackr_first; + __entry->hard_ack = hard_ack; __entry->prev_pkt = prev_pkt; - __entry->call_ackr_prev = call_ackr_prev; + __entry->acks_hard_ack = call->acks_hard_ack; + __entry->acks_prev_seq = call->acks_prev_seq; ), TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x", __entry->debug_id, __entry->serial, - __entry->first_soft_ack, - __entry->call_ackr_first, + __entry->hard_ack, + __entry->acks_hard_ack, __entry->prev_pkt, - __entry->call_ackr_prev) + __entry->acks_prev_seq) ); TRACE_EVENT(rxrpc_req_ack, -- cgit v1.2.3 From f7dd0dc9651326f609579fa81cbdda69b0467c2a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:50 +0000 Subject: rxrpc: Adjust names and types of congestion-related fields Adjust some of the names of fields and constants to make them look a bit more like the TCP congestion symbol names, such as flight_size -> in_flight and congest_mode to ca_state. 
Move the persistent congestion-related fields from the rxrpc_ack_summary struct into the rxrpc_call struct rather than copying them out and back in again. The rxrpc_congest tracepoint can fetch them from the call struct. Rename the counters for soft acks and nacks to have an 's' on the front to reflect the softness, e.g. nr_acks -> nr_sacks. Make fields counting numbers of packets or numbers of acks u16 rather than u8 to allow for windows of up to 8192 DATA packets in flight in future. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-23-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 58 ++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 91108e0de3af..d47b8235fad3 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -378,11 +378,11 @@ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") -#define rxrpc_congest_modes \ - EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \ - EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \ - EM(RXRPC_CALL_PACKET_LOSS, "PktLoss ") \ - E_(RXRPC_CALL_SLOW_START, "SlowStart") +#define rxrpc_ca_states \ + EM(RXRPC_CA_CONGEST_AVOIDANCE, "CongAvoid") \ + EM(RXRPC_CA_FAST_RETRANSMIT, "FastReTx ") \ + EM(RXRPC_CA_PACKET_LOSS, "PktLoss ") \ + E_(RXRPC_CA_SLOW_START, "SlowStart") #define rxrpc_congest_changes \ EM(rxrpc_cong_begin_retransmission, " Retrans") \ @@ -550,11 +550,11 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); rxrpc_abort_reasons; rxrpc_bundle_traces; +rxrpc_ca_states; rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; -rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; rxrpc_pmtud_reduce_traces; @@ -1688,27 +1688,39 @@ TRACE_EVENT(rxrpc_retransmit, TRACE_EVENT(rxrpc_congest, TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - rxrpc_serial_t ack_serial, enum rxrpc_congest_change change), + rxrpc_serial_t ack_serial), - TP_ARGS(call, summary, ack_serial, change), + TP_ARGS(call, summary, ack_serial), TP_STRUCT__entry( __field(unsigned int, call) - __field(enum rxrpc_congest_change, change) + __field(enum rxrpc_ca_state, ca_state) __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) __field(rxrpc_serial_t, ack_serial) + __field(u16, nr_sacks) + __field(u16, nr_snacks) + __field(u16, cwnd) + __field(u16, ssthresh) + __field(u16, cumul_acks) + __field(u16, dup_acks) __field_struct(struct rxrpc_ack_summary, sum) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->change = change; + __entry->ca_state = call->cong_ca_state; __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; + __entry->nr_sacks = call->acks_nr_sacks; + __entry->nr_snacks = call->acks_nr_snacks; + __entry->cwnd = call->cong_cwnd; + __entry->ssthresh = call->cong_ssthresh; + __entry->cumul_acks = call->cong_cumul_acks; + __entry->dup_acks = call->cong_dup_acks; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), @@ -1717,17 +1729,17 @@ TRACE_EVENT(rxrpc_congest, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, - __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), - __entry->sum.cwnd, - 
__entry->sum.ssthresh, - __entry->sum.nr_acks, __entry->sum.nr_retained_nacks, - __entry->sum.nr_new_acks, - __entry->sum.nr_new_nacks, + __print_symbolic(__entry->ca_state, rxrpc_ca_states), + __entry->cwnd, + __entry->ssthresh, + __entry->nr_sacks, __entry->sum.nr_retained_snacks, + __entry->sum.nr_new_sacks, + __entry->sum.nr_new_snacks, __entry->top - __entry->hard_ack, - __entry->sum.cumulative_acks, - __entry->sum.dup_acks, - __entry->lowest_nak, __entry->sum.new_low_nack ? "!" : "", - __print_symbolic(__entry->change, rxrpc_congest_changes), + __entry->cumul_acks, + __entry->dup_acks, + __entry->lowest_nak, __entry->sum.new_low_snack ? "!" : "", + __print_symbolic(__entry->sum.change, rxrpc_congest_changes), __entry->sum.retrans_timeo ? " rTxTo" : "") ); @@ -1738,7 +1750,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, TP_STRUCT__entry( __field(unsigned int, call) - __field(enum rxrpc_congest_mode, mode) + __field(enum rxrpc_ca_state, ca_state) __field(unsigned short, cwnd) __field(unsigned short, extra) __field(rxrpc_seq_t, hard_ack) @@ -1749,7 +1761,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, TP_fast_assign( __entry->call = call->debug_id; - __entry->mode = call->cong_mode; + __entry->ca_state = call->cong_ca_state; __entry->cwnd = call->cong_cwnd; __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; @@ -1761,7 +1773,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", __entry->call, __entry->hard_ack, - __print_symbolic(__entry->mode, rxrpc_congest_modes), + __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->extra, __entry->prepared, -- cgit v1.2.3 From 9b052c6b92f9316d670bf50566f70e183d0d19cb Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:51 +0000 Subject: rxrpc: Use the new rxrpc_tx_queue struct to more efficiently process ACKs With the change in the structure of the transmission buffer to store buffers in bunches of 32 or 64 (BITS_PER_LONG) we can place sets of per-buffer flags into the rxrpc_tx_queue struct rather than storing them in rxrpc_tx_buf, thereby vastly increasing efficiency when assessing the SACK table in an ACK packet. 
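To see why one word of flags per queue helps, here is a standalone sketch (constant and field names invented) that folds an ACK packet's SACK byte table into a bitmask and diffs it against the previously recorded mask with word-wide operations rather than a per-buffer walk.

#include <stdio.h>

#define TXQ_SIZE (8 * sizeof(unsigned long))
#define SACK_ACKED 1	/* stand-in for the on-the-wire "acked" byte */

static unsigned long sack_to_mask(const unsigned char *sacks, unsigned int n)
{
	unsigned long mask = 0;

	for (unsigned int i = 0; i < n && i < TXQ_SIZE; i++)
		if (sacks[i] == SACK_ACKED)
			mask |= 1UL << i;
	return mask;
}

int main(void)
{
	unsigned char sacks[8] = { 1, 1, 0, 1, 0, 1, 1, 1 };
	unsigned long prev_acked = 0x23;	/* mask recorded from the last ACK */
	unsigned long now_acked = sack_to_mask(sacks, 8);
	unsigned long changed = prev_acked ^ now_acked;

	printf("new sacks=%d new snacks=%d\n",
	       __builtin_popcountl(changed & now_acked),
	       __builtin_popcountl(changed & ~now_acked));
	return 0;
}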
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-24-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 86 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d47b8235fad3..609522a5bd0f 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -132,7 +132,6 @@ EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ - EM(rxrpc_skb_get_last_nack, "GET last-nack") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ @@ -147,7 +146,6 @@ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ - EM(rxrpc_skb_put_last_nack, "PUT last-nack") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ @@ -499,6 +497,11 @@ EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \ E_(rxrpc_pmtud_reduce_route, "Route") +#define rxrpc_rotate_traces \ + EM(rxrpc_rotate_trace_hack, "hard-ack") \ + EM(rxrpc_rotate_trace_sack, "soft-ack") \ + E_(rxrpc_rotate_trace_snak, "soft-nack") + /* * Generate enums for tracing information. */ @@ -525,6 +528,7 @@ enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); +enum rxrpc_rotate_trace { rxrpc_rotate_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); @@ -562,6 +566,7 @@ rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; +rxrpc_rotate_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_sack_traces; @@ -1667,6 +1672,7 @@ TRACE_EVENT(rxrpc_retransmit, TP_STRUCT__entry( __field(unsigned int, call) + __field(unsigned int, qbase) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) __field(ktime_t, expiry) @@ -1674,13 +1680,15 @@ TRACE_EVENT(rxrpc_retransmit, TP_fast_assign( __entry->call = call->debug_id; + __entry->qbase = req->tq->qbase; __entry->seq = req->seq; __entry->serial = txb->serial; __entry->expiry = expiry; ), - TP_printk("c=%08x q=%x r=%x xp=%lld", + TP_printk("c=%08x tq=%x q=%x r=%x xp=%lld", __entry->call, + __entry->qbase, __entry->seq, __entry->serial, ktime_to_us(__entry->expiry)) @@ -1724,7 +1732,7 @@ TRACE_EVENT(rxrpc_congest, memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u A=%u+%u/%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1732,9 +1740,9 @@ TRACE_EVENT(rxrpc_congest, __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->ssthresh, - __entry->nr_sacks, __entry->sum.nr_retained_snacks, - __entry->sum.nr_new_sacks, - __entry->sum.nr_new_snacks, + __entry->nr_sacks, 
__entry->sum.nr_new_sacks, + __entry->nr_snacks, __entry->sum.nr_new_snacks, + __entry->sum.nr_new_hacks, __entry->top - __entry->hard_ack, __entry->cumul_acks, __entry->dup_acks, @@ -1850,10 +1858,36 @@ TRACE_EVENT(rxrpc_connect_call, &__entry->srx.transport) ); +TRACE_EVENT(rxrpc_apply_acks, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq), + + TP_ARGS(call, tq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(unsigned int, nr_rep) + __field(rxrpc_seq_t, qbase) + __field(unsigned long, acks) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->acks = tq->segment_acked; + __entry->nr_rep = tq->nr_reported_acks; + ), + + TP_printk("c=%08x tq=%x acks=%016lx rep=%u", + __entry->call, + __entry->qbase, + __entry->acks, + __entry->nr_rep) + ); + TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack), + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t ack_serial), - TP_ARGS(call, ack), + TP_ARGS(call, ack_serial), TP_STRUCT__entry( __field(unsigned int, call) @@ -1863,11 +1897,10 @@ TRACE_EVENT(rxrpc_resend, ), TP_fast_assign( - struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; __entry->transmitted = call->tx_transmitted; - __entry->ack_serial = sp ? sp->hdr.serial : 0; + __entry->ack_serial = ack_serial; ), TP_printk("c=%08x r=%x q=%x tq=%x", @@ -1877,6 +1910,37 @@ TRACE_EVENT(rxrpc_resend, __entry->transmitted) ); +TRACE_EVENT(rxrpc_rotate, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, + struct rxrpc_ack_summary *summary, rxrpc_seq_t seq, + enum rxrpc_rotate_trace trace), + + TP_ARGS(call, tq, summary, seq, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, seq) + __field(unsigned int, nr_rep) + __field(enum rxrpc_rotate_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->seq = seq; + __entry->nr_rep = tq->nr_reported_acks; + __entry->trace = trace; + ), + + TP_printk("c=%08x tq=%x q=%x nr=%x %s", + __entry->call, + __entry->qbase, + __entry->seq, + __entry->nr_rep, + __print_symbolic(__entry->trace, rxrpc_rotate_traces)) + ); + TRACE_EVENT(rxrpc_rx_icmp, TP_PROTO(struct rxrpc_peer *peer, struct sock_extended_err *ee, struct sockaddr_rxrpc *srx), -- cgit v1.2.3 From dcdff0d8e3b61033b28c72926997d458949fcc05 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:52 +0000 Subject: rxrpc: Store the DATA serial in the txqueue and use this in RTT calc Store the serial number set on a DATA packet at the point of transmission in the rxrpc_txqueue struct and when an ACK is received, match the reference number in the ACK by trawling the txqueue rather than sharing an RTT table with ACK RTT. This can be done as part of Tx queue rotation. This means we have a lot more RTT samples available and is faster to search with all the serial numbers packed together into a few cachelines rather than being hung off different txbufs. 
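A minimal sketch of the matching step described above, with invented field names: the serial numbers recorded per transmission-queue slot are scanned for the ACK's reference serial, and the send timestamp held in the same slot yields the RTT sample.

#include <stdint.h>
#include <stdio.h>

#define TXQ_SIZE (8 * sizeof(unsigned long))

struct txqueue {
	uint32_t segment_serial[TXQ_SIZE];	/* serial used when the slot was sent */
	uint64_t segment_xmit_ts[TXQ_SIZE];	/* send time, ns */
};

/* Return the RTT sample in ns, or 0 if the serial isn't in this queue. */
static uint64_t txq_rtt_sample(const struct txqueue *tq, uint32_t acked_serial,
			       uint64_t ack_time_ns)
{
	for (unsigned int i = 0; i < TXQ_SIZE; i++)
		if (tq->segment_serial[i] == acked_serial)
			return ack_time_ns - tq->segment_xmit_ts[i];
	return 0;
}

int main(void)
{
	struct txqueue tq = { { 0 } };

	tq.segment_serial[3] = 0x5001;
	tq.segment_xmit_ts[3] = 1000000;
	printf("rtt=%lluns\n",
	       (unsigned long long)txq_rtt_sample(&tq, 0x5001, 1234567));
	return 0;
}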
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20241204074710.990092-25-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 609522a5bd0f..798bea0853c4 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -337,11 +337,10 @@ E_(rxrpc_rtt_tx_ping, "PING") #define rxrpc_rtt_rx_traces \ - EM(rxrpc_rtt_rx_other_ack, "OACK") \ + EM(rxrpc_rtt_rx_data_ack, "DACK") \ EM(rxrpc_rtt_rx_obsolete, "OBSL") \ EM(rxrpc_rtt_rx_lost, "LOST") \ - EM(rxrpc_rtt_rx_ping_response, "PONG") \ - E_(rxrpc_rtt_rx_requested_ack, "RACK") + E_(rxrpc_rtt_rx_ping_response, "PONG") #define rxrpc_timer_traces \ EM(rxrpc_timer_trace_delayed_ack, "DelayAck ") \ @@ -1695,10 +1694,9 @@ TRACE_EVENT(rxrpc_retransmit, ); TRACE_EVENT(rxrpc_congest, - TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - rxrpc_serial_t ack_serial), + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), - TP_ARGS(call, summary, ack_serial), + TP_ARGS(call, summary), TP_STRUCT__entry( __field(unsigned int, call) @@ -1706,7 +1704,6 @@ TRACE_EVENT(rxrpc_congest, __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, top) __field(rxrpc_seq_t, lowest_nak) - __field(rxrpc_serial_t, ack_serial) __field(u16, nr_sacks) __field(u16, nr_snacks) __field(u16, cwnd) @@ -1722,7 +1719,6 @@ TRACE_EVENT(rxrpc_congest, __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; - __entry->ack_serial = ack_serial; __entry->nr_sacks = call->acks_nr_sacks; __entry->nr_snacks = call->acks_nr_snacks; __entry->cwnd = call->cong_cwnd; @@ -1734,7 +1730,7 @@ TRACE_EVENT(rxrpc_congest, TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u A=%u+%u/%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, - __entry->ack_serial, + __entry->sum.acked_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), __entry->hard_ack, __print_symbolic(__entry->ca_state, rxrpc_ca_states), -- cgit v1.2.3 From 93dfca65a1df42a3c8b1094299dc42ab8f18e5c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:55 +0000 Subject: rxrpc: Adjust the rxrpc_rtt_rx tracepoint Adjust the rxrpc_rtt_rx tracepoint in the following ways: (1) Display the collected RTT sample in the rxrpc_rtt_rx trace. (2) Move the division of srtt by 8 to the TP_printk() rather doing it before invoking the trace point. (3) Display the min_rtt value. 
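For context on point (2): like TCP, rxrpc keeps the smoothed RTT scaled by 8 so the running average can be maintained in integer arithmetic, which is why only the trace output divides it back down. A standalone sketch of that bookkeeping is shown below; the smoothing constant follows RFC 6298 and the kernel's exact code may differ.

#include <stdio.h>

static unsigned int srtt_x8;	/* smoothed RTT, microseconds * 8 */

static void rtt_sample(unsigned int rtt_us)
{
	if (!srtt_x8)
		srtt_x8 = rtt_us * 8;			/* first sample */
	else
		srtt_x8 += rtt_us - (srtt_x8 >> 3);	/* srtt += (R - srtt)/8 */
}

int main(void)
{
	unsigned int samples[] = { 500, 700, 400 };

	for (unsigned int i = 0; i < 3; i++) {
		rtt_sample(samples[i]);
		printf("rtt=%u srtt=%u\n", samples[i], srtt_x8 / 8);
	}
	return 0;
}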
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 798bea0853c4..6e929f4448ac 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1415,9 +1415,9 @@ TRACE_EVENT(rxrpc_rtt_rx, TP_PROTO(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, - u32 rtt, u32 rto), + u32 rtt, u32 srtt, u32 rto), - TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, rto), + TP_ARGS(call, why, slot, send_serial, resp_serial, rtt, srtt, rto), TP_STRUCT__entry( __field(unsigned int, call) @@ -1426,7 +1426,9 @@ TRACE_EVENT(rxrpc_rtt_rx, __field(rxrpc_serial_t, send_serial) __field(rxrpc_serial_t, resp_serial) __field(u32, rtt) + __field(u32, srtt) __field(u32, rto) + __field(u32, min_rtt) ), TP_fast_assign( @@ -1436,17 +1438,21 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; __entry->rtt = rtt; + __entry->srtt = srtt; __entry->rto = rto; + __entry->min_rtt = minmax_get(&call->peer->min_rtt) ), - TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u rto=%u", + TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u srtt=%u rto=%u min=%u", __entry->call, __entry->slot, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, __entry->resp_serial, __entry->rtt, - __entry->rto) + __entry->srtt / 8, + __entry->rto, + __entry->min_rtt) ); TRACE_EVENT(rxrpc_timer_set, -- cgit v1.2.3 From a3d7f46d983fb2ed528b9cceb457c067fe4277a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:46:56 +0000 Subject: rxrpc: Display userStatus in rxrpc_rx_ack trace Display the userStatus field from the Rx packet header in the rxrpc_rx_ack trace line. This is used for flow control purposes by FS.StoreData-type kafs RPC calls. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6e929f4448ac..7681c67f7d65 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1031,11 +1031,13 @@ TRACE_EVENT(rxrpc_rx_ack, __field(rxrpc_seq_t, prev) __field(u8, reason) __field(u8, n_acks) + __field(u8, user_status) ), TP_fast_assign( __entry->call = call->debug_id; __entry->serial = sp->hdr.serial; + __entry->user_status = sp->hdr.userStatus; __entry->ack_serial = sp->ack.acked_serial; __entry->first = sp->ack.first_ack; __entry->prev = sp->ack.prev_ack; @@ -1043,11 +1045,12 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks = sp->ack.nr_acks; ), - TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", + TP_printk("c=%08x %08x %s r=%08x us=%02x f=%08x p=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_serial, + __entry->user_status, __entry->first, __entry->prev, __entry->n_acks) -- cgit v1.2.3 From 08d55d7cf3f33c730ce2694393efe16b7983a9c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:00 +0000 Subject: rxrpc: Don't allocate a txbuf for an ACK transmission Don't allocate an rxrpc_txbuf struct for an ACK transmission. 
There's now no need as the memory to hold the ACK content is allocated with a page frag allocator. The allocation and freeing of a txbuf is just unnecessary overhead. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 7681c67f7d65..326a4c257aea 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -462,13 +462,11 @@ /* ---- Must update size of stat_why_req_ack[] if more are added! */ #define rxrpc_txbuf_traces \ - EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ - EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ -- cgit v1.2.3 From 372d12d191cb80720319e224d401fd82c602e9e4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:03 +0000 Subject: rxrpc: Add a reason indicator to the tx_data tracepoint Add an indicator to the rxrpc_tx_data tracepoint to indicate what triggered the transmission of a particular packet. At this point, it's only normal transmission and retransmission, plus the tracepoint is also used to record loss injection, but in a future patch, TLP-induced (re-)transmission will also be a thing. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 326a4c257aea..d79623fff746 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -302,6 +302,11 @@ EM(rxrpc_txqueue_rotate_last, "RLS") \ E_(rxrpc_txqueue_wait, "WAI") +#define rxrpc_txdata_traces \ + EM(rxrpc_txdata_inject_loss, " *INJ-LOSS*") \ + EM(rxrpc_txdata_new_data, " ") \ + E_(rxrpc_txdata_retransmit, " *RETRANS*") + #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ EM(rxrpc_receive_front, "FRN") \ @@ -534,6 +539,7 @@ enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); +enum rxrpc_txdata_trace { rxrpc_txdata_traces } __mode(byte); enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -572,6 +578,7 @@ rxrpc_timer_traces; rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; +rxrpc_txdata_traces; rxrpc_txqueue_traces; /* @@ -1222,9 +1229,10 @@ TRACE_EVENT(rxrpc_tx_packet, TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, - rxrpc_serial_t serial, unsigned int flags, bool lose), + rxrpc_serial_t serial, unsigned int flags, + enum rxrpc_txdata_trace trace), - TP_ARGS(call, seq, serial, flags, lose), + TP_ARGS(call, seq, serial, flags, trace), TP_STRUCT__entry( __field(unsigned int, call) @@ -1233,7 +1241,7 @@ TRACE_EVENT(rxrpc_tx_data, __field(u32, cid) __field(u32, call_id) __field(u16, flags) - __field(bool, lose) + __field(enum rxrpc_txdata_trace, trace) ), 
TP_fast_assign( @@ -1243,18 +1251,17 @@ TRACE_EVENT(rxrpc_tx_data, __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; - __entry->lose = lose; + __entry->trace = trace; ), - TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s%s", + TP_printk("c=%08x DATA %08x:%08x %08x q=%08x fl=%02x%s", __entry->call, __entry->cid, __entry->call_id, __entry->serial, __entry->seq, __entry->flags & RXRPC_TXBUF_WIRE_FLAGS, - __entry->flags & RXRPC_TXBUF_RESENT ? " *RETRANS*" : "", - __entry->lose ? " *LOSE*" : "") + __print_symbolic(__entry->trace, rxrpc_txdata_traces)) ); TRACE_EVENT(rxrpc_tx_ack, -- cgit v1.2.3 From b509934094fd52ac3a49ee2a2c144e885517069f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:04 +0000 Subject: rxrpc: Add a reason indicator to the tx_ack tracepoint Record the reason for the transmission of an ACK in the rxrpc_tx_ack tracepoint, and not just in the rxrpc_propose_ack tracepoint. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d79623fff746..0cfc8e1baf1f 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -375,6 +375,7 @@ EM(rxrpc_propose_ack_processing_op, "ProcOp ") \ EM(rxrpc_propose_ack_respond_to_ack, "Rsp2Ack") \ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ + EM(rxrpc_propose_ack_retransmit, "Retrans") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ @@ -1267,9 +1268,10 @@ TRACE_EVENT(rxrpc_tx_data, TRACE_EVENT(rxrpc_tx_ack, TP_PROTO(unsigned int call, rxrpc_serial_t serial, rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial, - u8 reason, u8 n_acks, u16 rwind), + u8 reason, u8 n_acks, u16 rwind, + enum rxrpc_propose_ack_trace trace), - TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind), + TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks, rwind, trace), TP_STRUCT__entry( __field(unsigned int, call) @@ -1279,6 +1281,7 @@ TRACE_EVENT(rxrpc_tx_ack, __field(u8, reason) __field(u8, n_acks) __field(u16, rwind) + __field(enum rxrpc_propose_ack_trace, trace) ), TP_fast_assign( @@ -1289,16 +1292,18 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->reason = reason; __entry->n_acks = n_acks; __entry->rwind = rwind; + __entry->trace = trace; ), - TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u", + TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u rw=%u %s", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), __entry->ack_first, __entry->ack_serial, __entry->n_acks, - __entry->rwind) + __entry->rwind, + __print_symbolic(__entry->trace, rxrpc_propose_ack_traces)) ); TRACE_EVENT(rxrpc_receive, -- cgit v1.2.3 From b40ef2b85a7d117dd323b5910e504899e0a3e7dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:05 +0000 Subject: rxrpc: Manage RTT per-call rather than per-peer Manage the determination of RTT on a per-call (ie. per-RPC op) basis rather than on a per-peer basis, averaging across all calls going to that peer. 
The problem is that the RTT measurements from the initial packets on a call may be off because the server may do some setting up (such as getting a lock on a file) before accepting the rest of the data in the RPC and, further, the RTT may be affected by server-side file operations, for instance if a large amount of data is being written or read. Note: When handling the FS.StoreData-type RPCs, for example, the server uses the userStatus field in the header of ACK packets as supplementary flow control to aid in managing this. AF_RXRPC does not yet support this, but it should be added. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0cfc8e1baf1f..71df5c48a413 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1453,7 +1453,7 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->rtt = rtt; __entry->srtt = srtt; __entry->rto = rto; - __entry->min_rtt = minmax_get(&call->peer->min_rtt) + __entry->min_rtt = minmax_get(&call->min_rtt) ), TP_printk("c=%08x [%d] %s sr=%08x rr=%08x rtt=%u srtt=%u rto=%u min=%u", -- cgit v1.2.3 From 7c482665931b6ce7bc72fa5feae6c35567070296 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Dec 2024 07:47:07 +0000 Subject: rxrpc: Implement RACK/TLP to deal with transmission stalls [RFC8985] When an rxrpc call is in its transmission phase and is sending a lot of packets, stalls occasionally occur that cause severe performance degradation (eg. increasing the transmission time for a 256MiB payload from 0.7s to 2.5s over a 10G link). rxrpc already implements TCP-style congestion control [RFC5681] and this helps mitigate the effects, but occasionally we're missing a time event that deals with a missing ACK, leading to a stall until the RTO expires. Fix this by implementing RACK/TLP in rxrpc. 
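A highly simplified model of the RACK loss rule being adopted is sketched below; see RFC 8985 for the real algorithm, which also runs a reordering timer and adapts its reordering window. Everything in this sketch is invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pkt {
	uint64_t xmit_ts_us;
	bool acked;
};

/* A still-unacknowledged packet is treated as lost once some packet sent
 * after it has been acknowledged and more than rack_rtt + reo_wnd has
 * elapsed since the older packet was sent. */
static bool rack_lost(const struct pkt *p, uint64_t newest_acked_xmit_ts_us,
		      uint64_t now_us, uint64_t rack_rtt_us, uint64_t reo_wnd_us)
{
	if (p->acked || p->xmit_ts_us > newest_acked_xmit_ts_us)
		return false;	/* acked, or sent after the newest acked packet */
	return now_us - p->xmit_ts_us > rack_rtt_us + reo_wnd_us;
}

int main(void)
{
	struct pkt old = { .xmit_ts_us = 1000, .acked = false };

	/* A later packet (sent at t=1500) has been acked; it is now t=2600. */
	printf("lost=%d\n", rack_lost(&old, 1500, 2600, 1200, 300));
	return 0;
}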
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 342 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 323 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 71df5c48a413..2f119d18a061 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -305,7 +305,9 @@ #define rxrpc_txdata_traces \ EM(rxrpc_txdata_inject_loss, " *INJ-LOSS*") \ EM(rxrpc_txdata_new_data, " ") \ - E_(rxrpc_txdata_retransmit, " *RETRANS*") + EM(rxrpc_txdata_retransmit, " *RETRANS*") \ + EM(rxrpc_txdata_tlp_new_data, " *TLP-NEW*") \ + E_(rxrpc_txdata_tlp_retransmit, " *TLP-RETRANS*") #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ @@ -353,11 +355,12 @@ EM(rxrpc_timer_trace_hard, "HardLimit") \ EM(rxrpc_timer_trace_idle, "IdleLimit") \ EM(rxrpc_timer_trace_keepalive, "KeepAlive") \ - EM(rxrpc_timer_trace_lost_ack, "LostAck ") \ EM(rxrpc_timer_trace_ping, "DelayPing") \ - EM(rxrpc_timer_trace_resend, "Resend ") \ - EM(rxrpc_timer_trace_resend_reset, "ResendRst") \ - E_(rxrpc_timer_trace_resend_tx, "ResendTx ") + EM(rxrpc_timer_trace_rack_off, "RACK-OFF ") \ + EM(rxrpc_timer_trace_rack_zwp, "RACK-ZWP ") \ + EM(rxrpc_timer_trace_rack_reo, "RACK-Reo ") \ + EM(rxrpc_timer_trace_rack_tlp_pto, "TLP-PTO ") \ + E_(rxrpc_timer_trace_rack_rto, "RTO ") #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ @@ -478,9 +481,9 @@ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ + EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ - EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ - E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") + E_(rxrpc_txbuf_see_send_more, "SEE SEND+ ") #define rxrpc_tq_traces \ EM(rxrpc_tq_alloc, "ALLOC") \ @@ -505,6 +508,24 @@ EM(rxrpc_rotate_trace_sack, "soft-ack") \ E_(rxrpc_rotate_trace_snak, "soft-nack") +#define rxrpc_rack_timer_modes \ + EM(RXRPC_CALL_RACKTIMER_OFF, "---") \ + EM(RXRPC_CALL_RACKTIMER_RACK_REORDER, "REO") \ + EM(RXRPC_CALL_RACKTIMER_TLP_PTO, "TLP") \ + E_(RXRPC_CALL_RACKTIMER_RTO, "RTO") + +#define rxrpc_tlp_probe_traces \ + EM(rxrpc_tlp_probe_trace_busy, "busy") \ + EM(rxrpc_tlp_probe_trace_transmit_new, "transmit-new") \ + E_(rxrpc_tlp_probe_trace_retransmit, "retransmit") + +#define rxrpc_tlp_ack_traces \ + EM(rxrpc_tlp_ack_trace_acked, "acked") \ + EM(rxrpc_tlp_ack_trace_dup_acked, "dup-acked") \ + EM(rxrpc_tlp_ack_trace_hard_beyond, "hard-beyond") \ + EM(rxrpc_tlp_ack_trace_incomplete, "incomplete") \ + E_(rxrpc_tlp_ack_trace_new_data, "new-data") + /* * Generate enums for tracing information. 
*/ @@ -537,6 +558,8 @@ enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_sack_trace { rxrpc_sack_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); +enum rxrpc_tlp_ack_trace { rxrpc_tlp_ack_traces } __mode(byte); +enum rxrpc_tlp_probe_trace { rxrpc_tlp_probe_traces } __mode(byte); enum rxrpc_tq_trace { rxrpc_tq_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); @@ -567,6 +590,7 @@ rxrpc_conn_traces; rxrpc_local_traces; rxrpc_pmtud_reduce_traces; rxrpc_propose_ack_traces; +rxrpc_rack_timer_modes; rxrpc_receive_traces; rxrpc_recvmsg_traces; rxrpc_req_ack_traces; @@ -576,6 +600,8 @@ rxrpc_rtt_tx_traces; rxrpc_sack_traces; rxrpc_skb_traces; rxrpc_timer_traces; +rxrpc_tlp_ack_traces; +rxrpc_tlp_probe_traces; rxrpc_tq_traces; rxrpc_tx_points; rxrpc_txbuf_traces; @@ -618,6 +644,20 @@ TRACE_EVENT(rxrpc_local, __entry->usage) ); +TRACE_EVENT(rxrpc_iothread_rx, + TP_PROTO(struct rxrpc_local *local, unsigned int nr_rx), + TP_ARGS(local, nr_rx), + TP_STRUCT__entry( + __field(unsigned int, local) + __field(unsigned int, nr_rx) + ), + TP_fast_assign( + __entry->local = local->debug_id; + __entry->nr_rx = nr_rx; + ), + TP_printk("L=%08x nrx=%u", __entry->local, __entry->nr_rx) + ); + TRACE_EVENT(rxrpc_peer, TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), @@ -1684,16 +1724,15 @@ TRACE_EVENT(rxrpc_drop_ack, TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, struct rxrpc_send_data_req *req, - struct rxrpc_txbuf *txb, ktime_t expiry), + struct rxrpc_txbuf *txb), - TP_ARGS(call, req, txb, expiry), + TP_ARGS(call, req, txb), TP_STRUCT__entry( __field(unsigned int, call) __field(unsigned int, qbase) __field(rxrpc_seq_t, seq) __field(rxrpc_serial_t, serial) - __field(ktime_t, expiry) ), TP_fast_assign( @@ -1701,15 +1740,13 @@ TRACE_EVENT(rxrpc_retransmit, __entry->qbase = req->tq->qbase; __entry->seq = req->seq; __entry->serial = txb->serial; - __entry->expiry = expiry; ), - TP_printk("c=%08x tq=%x q=%x r=%x xp=%lld", + TP_printk("c=%08x tq=%x q=%x r=%x", __entry->call, __entry->qbase, __entry->seq, - __entry->serial, - ktime_to_us(__entry->expiry)) + __entry->serial) ); TRACE_EVENT(rxrpc_congest, @@ -1767,9 +1804,9 @@ TRACE_EVENT(rxrpc_congest, ); TRACE_EVENT(rxrpc_reset_cwnd, - TP_PROTO(struct rxrpc_call *call, ktime_t now), + TP_PROTO(struct rxrpc_call *call, ktime_t since_last_tx, ktime_t rtt), - TP_ARGS(call, now), + TP_ARGS(call, since_last_tx, rtt), TP_STRUCT__entry( __field(unsigned int, call) @@ -1779,6 +1816,7 @@ TRACE_EVENT(rxrpc_reset_cwnd, __field(rxrpc_seq_t, hard_ack) __field(rxrpc_seq_t, prepared) __field(ktime_t, since_last_tx) + __field(ktime_t, rtt) __field(bool, has_data) ), @@ -1789,18 +1827,20 @@ TRACE_EVENT(rxrpc_reset_cwnd, __entry->extra = call->cong_extra; __entry->hard_ack = call->acks_hard_ack; __entry->prepared = call->send_top - call->tx_bottom; - __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); + __entry->since_last_tx = since_last_tx; + __entry->rtt = rtt; __entry->has_data = call->tx_bottom != call->tx_top; ), - TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", + TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu/%llu d=%u", __entry->call, __entry->hard_ack, __print_symbolic(__entry->ca_state, rxrpc_ca_states), __entry->cwnd, __entry->extra, __entry->prepared, - ktime_to_ns(__entry->since_last_tx), + 
ktime_to_us(__entry->since_last_tx), + ktime_to_us(__entry->rtt), __entry->has_data) ); @@ -1925,6 +1965,32 @@ TRACE_EVENT(rxrpc_resend, __entry->transmitted) ); +TRACE_EVENT(rxrpc_resend_lost, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, unsigned long lost), + + TP_ARGS(call, tq, lost), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(u8, nr_rep) + __field(unsigned long, lost) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->nr_rep = tq->nr_reported_acks; + __entry->lost = lost; + ), + + TP_printk("c=%08x tq=%x lost=%016lx nr=%u", + __entry->call, + __entry->qbase, + __entry->lost, + __entry->nr_rep) + ); + TRACE_EVENT(rxrpc_rotate, TP_PROTO(struct rxrpc_call *call, struct rxrpc_txqueue *tq, struct rxrpc_ack_summary *summary, rxrpc_seq_t seq, @@ -2363,6 +2429,244 @@ TRACE_EVENT(rxrpc_pmtud_reduce, __entry->serial, __entry->max_data) ); +TRACE_EVENT(rxrpc_rack, + TP_PROTO(struct rxrpc_call *call, ktime_t timo), + + TP_ARGS(call, timo), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_rack_timer_mode, mode) + __field(unsigned short, nr_sent) + __field(unsigned short, nr_lost) + __field(unsigned short, nr_resent) + __field(unsigned short, nr_sacked) + __field(ktime_t, timo) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = call->rack_end_seq; + __entry->mode = call->rack_timer_mode; + __entry->nr_sent = call->tx_nr_sent; + __entry->nr_lost = call->tx_nr_lost; + __entry->nr_resent = call->tx_nr_resent; + __entry->nr_sacked = call->acks_nr_sacks; + __entry->timo = timo; + ), + + TP_printk("c=%08x r=%08x q=%08x %s slrs=%u,%u,%u,%u t=%lld", + __entry->call, __entry->ack_serial, __entry->seq, + __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), + __entry->nr_sent, __entry->nr_lost, + __entry->nr_resent, __entry->nr_sacked, + ktime_to_us(__entry->timo)) + ); + +TRACE_EVENT(rxrpc_rack_update, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary), + + TP_ARGS(call, summary), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + __field(int, xmit_ts) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = call->rack_end_seq; + __entry->xmit_ts = ktime_sub(call->acks_latest_ts, call->rack_xmit_ts); + ), + + TP_printk("c=%08x r=%08x q=%08x xt=%lld", + __entry->call, __entry->ack_serial, __entry->seq, + ktime_to_us(__entry->xmit_ts)) + ); + +TRACE_EVENT(rxrpc_rack_scan_loss, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(ktime_t, rack_rtt) + __field(ktime_t, rack_reo_wnd) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->rack_rtt = call->rack_rtt; + __entry->rack_reo_wnd = call->rack_reo_wnd; + ), + + TP_printk("c=%08x rtt=%lld reow=%lld", + __entry->call, ktime_to_us(__entry->rack_rtt), + ktime_to_us(__entry->rack_reo_wnd)) + ); + +TRACE_EVENT(rxrpc_rack_scan_loss_tq, + TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq, + unsigned long nacks), + + TP_ARGS(call, tq, nacks), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(unsigned long, nacks) + __field(unsigned long, lost) + __field(unsigned long, retrans) + ), + + TP_fast_assign( + __entry->call 
= call->debug_id; + __entry->qbase = tq->qbase; + __entry->nacks = nacks; + __entry->lost = tq->segment_lost; + __entry->retrans = tq->segment_retransmitted; + ), + + TP_printk("c=%08x q=%08x n=%lx l=%lx r=%lx", + __entry->call, __entry->qbase, + __entry->nacks, __entry->lost, __entry->retrans) + ); + +TRACE_EVENT(rxrpc_rack_detect_loss, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + rxrpc_seq_t seq), + + TP_ARGS(call, summary, seq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, ack_serial) + __field(rxrpc_seq_t, seq) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->ack_serial = call->rx_serial; + __entry->seq = seq; + ), + + TP_printk("c=%08x r=%08x q=%08x", + __entry->call, __entry->ack_serial, __entry->seq) + ); + +TRACE_EVENT(rxrpc_rack_mark_loss_tq, + TP_PROTO(struct rxrpc_call *call, const struct rxrpc_txqueue *tq), + + TP_ARGS(call, tq), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_seq_t, qbase) + __field(rxrpc_seq_t, trans) + __field(unsigned long, acked) + __field(unsigned long, lost) + __field(unsigned long, retrans) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->qbase = tq->qbase; + __entry->trans = call->tx_transmitted; + __entry->acked = tq->segment_acked; + __entry->lost = tq->segment_lost; + __entry->retrans = tq->segment_retransmitted; + ), + + TP_printk("c=%08x tq=%08x txq=%08x a=%lx l=%lx r=%lx", + __entry->call, __entry->qbase, __entry->trans, + __entry->acked, __entry->lost, __entry->retrans) + ); + +TRACE_EVENT(rxrpc_tlp_probe, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_tlp_probe_trace trace), + + TP_ARGS(call, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, serial) + __field(rxrpc_seq_t, seq) + __field(enum rxrpc_tlp_probe_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->serial = call->tlp_serial; + __entry->seq = call->tlp_seq; + __entry->trace = trace; + ), + + TP_printk("c=%08x r=%08x pq=%08x %s", + __entry->call, __entry->serial, __entry->seq, + __print_symbolic(__entry->trace, rxrpc_tlp_probe_traces)) + ); + +TRACE_EVENT(rxrpc_tlp_ack, + TP_PROTO(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, + enum rxrpc_tlp_ack_trace trace), + + TP_ARGS(call, summary, trace), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(rxrpc_serial_t, serial) + __field(rxrpc_seq_t, tlp_seq) + __field(rxrpc_seq_t, hard_ack) + __field(enum rxrpc_tlp_ack_trace, trace) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->serial = call->tlp_serial; + __entry->tlp_seq = call->tlp_seq; + __entry->hard_ack = call->acks_hard_ack; + __entry->trace = trace; + ), + + TP_printk("c=%08x r=%08x pq=%08x hq=%08x %s", + __entry->call, __entry->serial, + __entry->tlp_seq, __entry->hard_ack, + __print_symbolic(__entry->trace, rxrpc_tlp_ack_traces)) + ); + +TRACE_EVENT(rxrpc_rack_timer, + TP_PROTO(struct rxrpc_call *call, ktime_t delay, bool exp), + + TP_ARGS(call, delay, exp), + + TP_STRUCT__entry( + __field(unsigned int, call) + __field(bool, exp) + __field(enum rxrpc_rack_timer_mode, mode) + __field(ktime_t, delay) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->exp = exp; + __entry->mode = call->rack_timer_mode; + __entry->delay = delay; + ), + + TP_printk("c=%08x %s %s to=%lld", + __entry->call, + __entry->exp ? 
"Exp" : "Set", + __print_symbolic(__entry->mode, rxrpc_rack_timer_modes), + ktime_to_us(__entry->delay)) + ); + #undef EM #undef E_ -- cgit v1.2.3 From 3f330db30638b6489d548084a7e8843374d41ad0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 5 Dec 2024 08:59:14 -0800 Subject: net: reformat kdoc return statements kernel-doc -Wall warns about missing Return: statement for non-void functions. We have a number of kdocs in our headers which are missing the colon, IOW they use * Return some value or * Returns some value Having the colon makes some sense, it should help kdoc parser avoid false positives. So add them. This is mostly done with a sed script, and removing the unnecessary cases (mostly the comments which aren't kdoc). Acked-by: Johannes Berg Acked-by: Richard Cochran Acked-by: Sergey Ryazanov Reviewed-by: Edward Cree Acked-by: Alexandra Winter Acked-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20241205165914.1071102-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 18 ++++++++-------- include/linux/ethtool.h | 6 +++--- include/linux/if_vlan.h | 28 ++++++++++++------------ include/linux/netdevice.h | 14 ++++++------ include/linux/netfilter/x_tables.h | 2 +- include/linux/netfilter_netdev.h | 3 ++- include/linux/ptp_clock_kernel.h | 4 ++-- include/linux/rfkill.h | 2 +- include/linux/rtnetlink.h | 2 +- include/linux/skbuff.h | 16 +++++++------- include/linux/wwan.h | 2 +- include/net/cfg80211.h | 2 +- include/net/dst.h | 2 +- include/net/genetlink.h | 6 +++--- include/net/ipv6.h | 2 +- include/net/iucv/iucv.h | 30 +++++++++++++------------- include/net/netfilter/nf_tproxy.h | 4 ++-- include/net/netlink.h | 44 +++++++++++++++++++------------------- include/net/page_pool/helpers.h | 9 +++----- include/net/pkt_cls.h | 4 ++-- include/net/tcp.h | 2 +- 21 files changed, 101 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index ecf203f01034..9a1eacf35d37 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -81,7 +81,7 @@ static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) = * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per + * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. @@ -104,7 +104,7 @@ static inline bool is_link_local_ether_addr(const u8 *addr) * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is all zeroes. + * Return: true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ @@ -123,7 +123,7 @@ static inline bool is_zero_ether_addr(const u8 *addr) * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a multicast address. + * Return: true if the address is a multicast address. * By definition the broadcast address is also a multicast address. 
*/ static inline bool is_multicast_ether_addr(const u8 *addr) @@ -157,7 +157,7 @@ static inline bool is_multicast_ether_addr_64bits(const u8 *addr) * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a local address. + * Return: true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { @@ -168,7 +168,7 @@ static inline bool is_local_ether_addr(const u8 *addr) * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is the broadcast address. + * Return: true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ @@ -183,7 +183,7 @@ static inline bool is_broadcast_ether_addr(const u8 *addr) * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return true if the address is a unicast address. + * Return: true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { @@ -197,7 +197,7 @@ static inline bool is_unicast_ether_addr(const u8 *addr) * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * - * Return true if the address is valid. + * Return: true if the address is valid. * * Please note: addr must be aligned to u16. */ @@ -214,7 +214,7 @@ static inline bool is_valid_ether_addr(const u8 *addr) * * Check that the value from the Ethertype/length field is a valid Ethertype. * - * Return true if the valid is an 802.3 supported Ethertype. + * Return: true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { @@ -458,7 +458,7 @@ static inline bool ether_addr_is_ip_mcast(const u8 *addr) * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * - * Return a u64 value of the address + * Return: a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index b8b935b52603..e217c6321ed0 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,7 +257,7 @@ struct ethtool_link_ksettings { * @mode : one of the ETHTOOL_LINK_MODE_*_BIT * (not atomic, no bound checking) * - * Returns true/false. + * Returns: true/false. */ #define ethtool_link_ksettings_test_link_mode(ptr, name, mode) \ test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name) @@ -1199,7 +1199,7 @@ ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, * @dev: pointer to net_device structure * @vclock_index: pointer to pointer of vclock index * - * Return number of phc vclocks + * Return: number of phc vclocks */ int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); @@ -1253,7 +1253,7 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. * @dev: pointer to net_device structure * @info: buffer to hold the result - * Returns zero on success, non-zero otherwise. + * Returns: zero on success, non-zero otherwise. 
*/ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d6326b53e336 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -310,7 +310,7 @@ static inline bool vlan_uses_dev(const struct net_device *dev) * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * - * Returns true if the ether type is a vlan ether type. + * Returns: true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { @@ -341,9 +341,9 @@ static inline bool vlan_hw_offload_capable(netdev_features_t features, * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, @@ -390,9 +390,9 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload - * Returns error if skb_cow_head fails. - * * Does not change skb->protocol so this function can be used during receive. + * + * Returns: error if skb_cow_head fails. */ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -533,7 +533,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not of VLAN type + * Returns: error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -551,7 +551,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if @skb->vlan_tci is not set correctly + * Returns: error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) @@ -570,7 +570,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, * @skb: skbuff to query * @vlan_tci: buffer to store value * - * Returns error if the skb is not VLAN tagged + * Returns: error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { @@ -587,7 +587,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * @type: first vlan protocol * @depth: buffer to store length of eth and vlan tags in bytes * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, @@ -629,7 +629,7 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * - * Returns the EtherType of the packet, regardless of whether it is + * Returns: the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. 
*/ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) @@ -710,7 +710,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. * - * Returns a new pointer to skb->data, or NULL on failure to pull. + * Returns: a new pointer to skb->data, or NULL on failure to pull. */ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { @@ -727,7 +727,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * - * Returns true if the skb is tagged, regardless of whether it is hardware + * Returns: true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) @@ -743,7 +743,7 @@ static inline bool skb_vlan_tagged(const struct sk_buff *skb) * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * - * Returns true if the skb is tagged with multiple vlan headers, regardless + * Returns: true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) @@ -774,7 +774,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) * @skb: skbuff to query * @features: features to be checked * - * Returns features without unsafe ones if the skb has multiple tags. + * Returns: features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 135105441681..d917949bba03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -509,7 +509,7 @@ static inline bool napi_prefer_busy_poll(struct napi_struct *n) * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * - * Return True if NAPI is scheduled, False otherwise. + * Return: True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { @@ -524,7 +524,7 @@ bool napi_schedule_prep(struct napi_struct *n); * * Schedule NAPI poll routine to be called if it is not already * running. - * Return true if we schedule a NAPI or false if not. + * Return: true if we schedule a NAPI or false if not. * Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ @@ -558,7 +558,7 @@ static inline void napi_schedule_irqoff(struct napi_struct *n) * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). - * Return false if device should avoid rearming interrupts. + * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); @@ -3851,7 +3851,7 @@ static inline bool netif_attr_test_mask(unsigned long j, * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * - * Returns true if a CPU/Rx queue is online. + * Returns: true if a CPU/Rx queue is online. 
*/ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, @@ -3871,7 +3871,8 @@ static inline bool netif_attr_test_online(unsigned long j, * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set. + * Returns: next (after n) CPU/Rx queue index in the mask; + * >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) @@ -3893,7 +3894,8 @@ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * - * Returns >= nr_bits if no further CPUs/Rx queues set in both. + * Returns: next (after n) CPU/Rx queue index set in both masks; + * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 5897f3dbaf7c..f39f688d7285 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -357,7 +357,7 @@ extern struct static_key xt_tee_enabled; * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) - * Returns : + * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h index 8676316547cc..3175073a66ba 100644 --- a/include/linux/netfilter_netdev.h +++ b/include/linux/netfilter_netdev.h @@ -66,7 +66,6 @@ static inline bool nf_hook_egress_active(void) * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * - * Returns @skb on success or %NULL if the packet was consumed or filtered. * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. @@ -81,6 +80,8 @@ static inline bool nf_hook_egress_active(void) * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. + * + * Returns: @skb on success or %NULL if the packet was consumed or filtered. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index c892d22ce0a7..0d68d09bedd1 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -307,7 +307,7 @@ static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) * @info: Structure describing the new clock. * @parent: Pointer to the parent device of the new clock. * - * Returns a valid pointer on success or PTR_ERR on failure. If PHC + * Returns: a valid pointer on success or PTR_ERR on failure. If PHC * support is missing at the configuration level, this function * returns NULL, and drivers are expected to gracefully handle that * case separately. @@ -445,7 +445,7 @@ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); * @hwtstamp: timestamp * @vclock_index: phc index of ptp vclock. * - * Returns converted timestamp, or 0 on error. + * Returns: converted timestamp, or 0 on error. 
*/ ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, int vclock_index); #else diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 997b34197385..6816e4c5f3f0 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -241,7 +241,7 @@ bool rfkill_soft_blocked(struct rfkill *rfkill); * rfkill_find_type - Helper for finding rfkill type by name * @name: the name of the type * - * Returns enum rfkill_type that corresponds to the name. + * Returns: enum rfkill_type that corresponds to the name. */ enum rfkill_type rfkill_find_type(const char *name); diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 14b88f551920..811ce44113f6 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -78,7 +78,7 @@ static inline bool lockdep_rtnl_is_held(void) * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit + * Return: the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. */ #define rtnl_dereference(p) \ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95452d1a07fc..69624b394cd9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1134,7 +1134,7 @@ static inline bool skb_pfmemalloc(const struct sk_buff *skb) * skb_dst - returns skb dst_entry * @skb: buffer * - * Returns skb dst_entry, regardless of reference taken or not. + * Returns: skb dst_entry, regardless of reference taken or not. */ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) { @@ -1222,7 +1222,7 @@ static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) * skb_unref - decrement the skb's reference count * @skb: buffer * - * Returns true if we can free the skb. + * Returns: true if we can free the skb. */ static inline bool skb_unref(struct sk_buff *skb) { @@ -1344,7 +1344,7 @@ struct sk_buff_fclones { * @sk: socket * @skb: buffer * - * Returns true if skb is a fast clone, and its clone is not freed. + * Returns: true if skb is a fast clone, and its clone is not freed. * Some drivers call skb_orphan() in their ndo_start_xmit(), * so we also check that didn't happen. */ @@ -3516,7 +3516,7 @@ static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask) * A page shouldn't be considered for reusing/recycling if it was allocated * under memory pressure or at a distant memory node. * - * Returns false if this page should be returned to page allocator, true + * Returns: false if this page should be returned to page allocator, true * otherwise. */ static inline bool dev_page_is_reusable(const struct page *page) @@ -3633,7 +3633,7 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. The page must already + * Returns: the address of the data within @frag. The page must already * be mapped. */ static inline void *skb_frag_address(const skb_frag_t *frag) @@ -3648,7 +3648,7 @@ static inline void *skb_frag_address(const skb_frag_t *frag) * skb_frag_address_safe - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer * - * Returns the address of the data within @frag. Checks that the page + * Returns: the address of the data within @frag. Checks that the page * is mapped and returns %NULL otherwise. 
*/ static inline void *skb_frag_address_safe(const skb_frag_t *frag) @@ -3890,7 +3890,7 @@ static inline int skb_linearize(struct sk_buff *skb) * skb_has_shared_frag - can any frag be overwritten * @skb: buffer to test * - * Return true if the skb has at least one frag that might be modified + * Return: true if the skb has at least one frag that might be modified * by an external entity (as in vmsplice()/sendfile()) */ static inline bool skb_has_shared_frag(const struct sk_buff *skb) @@ -4612,7 +4612,7 @@ static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) /* Check if we need to perform checksum complete validation. * - * Returns true if checksum complete is needed, false otherwise + * Returns: true if checksum complete is needed, false otherwise * (either checksum is unnecessary or zero checksum is allowed). */ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb, diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 79c781875c09..a4d6cc0c9f68 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -97,7 +97,7 @@ struct wwan_port_caps { * * This function must be balanced with a call to wwan_remove_port(). * - * Returns a valid pointer to wwan_port on success or PTR_ERR on failure + * Returns: a valid pointer to wwan_port on success or PTR_ERR on failure */ struct wwan_port *wwan_create_port(struct device *parent, enum wwan_port_type type, diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 27acf1292a5c..182f7965048f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5957,7 +5957,7 @@ int wiphy_register(struct wiphy *wiphy); * @wiphy: the wiphy to check the locking on * @p: The pointer to read, prior to dereferencing * - * Return the value of the specified RCU-protected pointer, but omit the + * Return: the value of the specified RCU-protected pointer, but omit the * READ_ONCE(), because caller holds the wiphy mutex used for updates. */ #define wiphy_dereference(wiphy, p) \ diff --git a/include/net/dst.h b/include/net/dst.h index 08647c99d79c..78c78cdce0e9 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -307,7 +307,7 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. - * Returns true if dst is refcounted. + * Returns: true if dst is refcounted. */ static inline bool skb_dst_force(struct sk_buff *skb) { diff --git a/include/net/genetlink.h b/include/net/genetlink.h index d096cc6352de..a03d56765832 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -354,7 +354,7 @@ __genlmsg_iput(struct sk_buff *skb, const struct genl_info *info, int flags) * such requests) or a struct initialized by genl_info_init_ntf() * when constructing notifications. * - * Returns pointer to new genetlink header. + * Returns: pointer to new genetlink header. */ static inline void * genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) @@ -366,7 +366,7 @@ genlmsg_iput(struct sk_buff *skb, const struct genl_info *info) * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * - * Returns pointer to netlink header. + * Returns: pointer to netlink header. 
*/ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { @@ -435,7 +435,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb, * @flags: netlink message flags * @cmd: generic netlink command * - * Returns pointer to user specific header + * Returns: pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 248bfb26e2af..f5c43ad1565e 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -471,7 +471,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, /* This helper is specialized for BIG TCP needs. * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. * It assumes headers are already in skb->head. - * Returns 0, or IPPROTO_TCP if a BIG TCP packet is there. + * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there. */ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) { diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index dd9e93c12260..9804fa5d9c67 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -202,7 +202,7 @@ struct iucv_handler { * * Registers a driver with IUCV. * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp); @@ -224,7 +224,7 @@ void iucv_unregister(struct iucv_handler *handle, int smp); * * Allocate a new path structure for use with iucv_connect. * - * Returns NULL if the memory allocation failed or a pointer to the + * Returns: NULL if the memory allocation failed or a pointer to the * path structure. */ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) @@ -260,7 +260,7 @@ static inline void iucv_path_free(struct iucv_path *path) * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private); @@ -278,7 +278,7 @@ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -292,7 +292,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); @@ -304,7 +304,7 @@ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. 
*/ int iucv_path_resume(struct iucv_path *path, u8 *userdata); @@ -315,7 +315,7 @@ int iucv_path_resume(struct iucv_path *path, u8 *userdata); * * This function terminates an IUCV path. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata); @@ -327,7 +327,7 @@ int iucv_path_sever(struct iucv_path *path, u8 *userdata); * * Cancels a message you have sent. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls); @@ -347,7 +347,7 @@ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); @@ -367,7 +367,7 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, @@ -382,7 +382,7 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); @@ -399,7 +399,7 @@ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size); @@ -419,7 +419,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -439,7 +439,7 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * * Locking: no locking. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); @@ -461,7 +461,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. 
*/ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 5adf6fda11e8..06985530517b 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -49,7 +49,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr); * * nf_tproxy_handle_time_wait4() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * @@ -108,7 +108,7 @@ nf_tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, * * nf_tproxy_handle_time_wait6() consumes the socket reference passed in. * - * Returns the listener socket if there's one, the TIME_WAIT socket if + * Returns: the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ struct sock * diff --git a/include/net/netlink.h b/include/net/netlink.h index 39eaa6be6ca8..e015ffbed819 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -649,7 +649,7 @@ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * - * Returns the next netlink message in the message stream and + * Returns: the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * @@ -676,7 +676,7 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -701,7 +701,7 @@ static inline int nla_parse(struct nlattr **tb, int maxtype, * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, @@ -726,7 +726,7 @@ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, @@ -833,7 +833,7 @@ nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) @@ -854,7 +854,7 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * specified policy. Validation is done in liberal mode. 
* See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, @@ -877,7 +877,7 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, @@ -914,7 +914,7 @@ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, * nlmsg_report - need to report back to application? * @nlh: netlink message header * - * Returns 1 if a report back to the application is requested. + * Returns: 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { @@ -925,7 +925,7 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * - * Returns 0 if netlink message is NULL + * Returns: 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { @@ -952,7 +952,7 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, @@ -971,7 +971,7 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) @@ -993,7 +993,7 @@ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) * @payload: length of message payload * @flags: message flags * - * Returns NULL if the tailroom of the skb is insufficient to store + * Returns: NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, @@ -1050,7 +1050,7 @@ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * - * Returns a pointer to the current tail of the message. + * Returns: a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { @@ -1276,7 +1276,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining) * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * - * Returns the next netlink attribute in the attribute stream and + * Returns: the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. 
*/ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) @@ -1292,7 +1292,7 @@ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * - * Returns the first attribute which matches the specified type. + * Returns: the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) @@ -2091,7 +2091,7 @@ static inline int nla_get_flag(const struct nlattr *nla) * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * - * Returns the number of milliseconds in jiffies. + * Returns: the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { @@ -2183,7 +2183,7 @@ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) @@ -2204,7 +2204,7 @@ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * - * Returns the container attribute or NULL on error + * Returns: the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { @@ -2219,7 +2219,7 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) * Corrects the container attribute header to include the all * appended attributes. * - * Returns the total data length of the skb. + * Returns: the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { @@ -2252,7 +2252,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * - * Returns 0 on success or a negative error code. + * Returns: 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, @@ -2285,7 +2285,7 @@ nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * - * Return true if padding is needed to align the next attribute (nla_data()) to + * Return: true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) @@ -2312,7 +2312,7 @@ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * - * Returns zero on success or a negative error code. + * Returns: zero on success or a negative error code. 
*/ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 793e6fd78bc5..26caa2c20912 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -104,8 +104,7 @@ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) * * Get a page fragment from the page allocator or page_pool caches. * - * Return: - * Return allocated page fragment, otherwise return NULL. + * Return: allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, @@ -155,8 +154,7 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * - * Return: - * Return allocated page or page fragment, otherwise return NULL. + * Return: allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, @@ -190,8 +188,7 @@ static inline void *page_pool_alloc_va(struct page_pool *pool, * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * - * Return: - * Return the va for the allocated page or page fragment, otherwise return NULL. + * Return: the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index cf199af85c52..22c5ab4269d7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -319,7 +319,7 @@ tcf_exts_hw_stats_update(const struct tcf_exts *exts, * tcf_exts_has_actions - check if at least one action is present * @exts: tc filter extensions handle * - * Returns true if at least one action is present. + * Returns: true if at least one action is present. */ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) { @@ -501,7 +501,7 @@ int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *, * through all ematches respecting their logic relations returning * as soon as the result is obvious. * - * Returns 1 if the ematch tree as-one matches, no ematches are configured + * Returns: 1 if the ematch tree as-one matches, no ematches are configured * or ematch is not enabled in the kernel, otherwise 0 is returned. */ static inline int tcf_em_tree_match(struct sk_buff *skb, diff --git a/include/net/tcp.h b/include/net/tcp.h index e9b37b76e894..5b2b04835688 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1817,7 +1817,7 @@ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @c: returned tcp_sigpool for usage (uninitialized on failure) * - * Returns 0 on success, error otherwise. + * Returns: 0 on success, error otherwise. */ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); /** -- cgit v1.2.3 From e4f8647767cfac0291def86ddfac23b925294701 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:54 +0100 Subject: vxlan: Track reserved bits explicitly as part of the configuration In order to make it possible to configure which bits in VXLAN header should be considered reserved, introduce a new field vxlan_config::reserved_bits. 
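[Editor's illustration, not part of the patch: struct vxlanhdr already describes the two 32-bit words of the on-wire header, so the new field can be pictured as the complement of a "used bits" template. The sketch below only approximates the construction described in the next paragraph; the field and macro names (vx_flags, vx_vni, VXLAN_HF_VNI, VXLAN_VNI_MASK) are the existing ones from include/net/vxlan.h, while the surrounding code is assumed.]

    struct vxlanhdr used_bits = {
        .vx_flags = VXLAN_HF_VNI,
        .vx_vni   = VXLAN_VNI_MASK,
    };

    /* enabled features (GBP, GPE, remote checksum offload, ...) would
     * OR their own flag bits into used_bits here */

    conf->reserved_bits = (struct vxlanhdr) {
        .vx_flags = ~used_bits.vx_flags,
        .vx_vni   = ~used_bits.vx_vni,
    };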
Have it cover the whole header, except for the VNI-present bit and the bits for VNI itself, and have individual enabled features clear more bits off reserved_bits. (This is expressed as first constructing a used_bits set, and then inverting it to get the reserved_bits. The set of used_bits will be useful on its own for validation of user-set reserved_bits in a following patch.) The patch also moves a comment relevant to the validation from the unparsed validation site up to the new site. Logically this patch should add the new comment, and a later patch that removes the unparsed bits would remove the old comment. But keeping both legs in the same patch is better from the history spelunking point of view. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/984dbf98d5940d3900268dbffaf70961f731d4a4.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/vxlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 33ba6fc151cf..2dd23ee2bacd 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,6 +227,7 @@ struct vxlan_config { unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; + struct vxlanhdr reserved_bits; }; enum { -- cgit v1.2.3 From 6c11379b104e3718135fd7fc37bb254b41e4cf65 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 5 Dec 2024 16:40:57 +0100 Subject: vxlan: Add an attribute to make VXLAN header validation configurable The set of bits that the VXLAN netdevice currently considers reserved is defined by the features enabled at the netdevice construction. In order to make this configurable, add an attribute, IFLA_VXLAN_RESERVED_BITS. The payload is a pair of big-endian u32's covering the VXLAN header. This is validated against the set of flags used by the various enabled VXLAN features, and attempts to override bits used by an enabled feature are bounced. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/c657275e5ceed301e62c69fe8e559e32909442e2.1733412063.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2575e0cd9b48..77730c340c8f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1394,6 +1394,7 @@ enum { IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ + IFLA_VXLAN_RESERVED_BITS, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From 31cdd8418234e70043abd26894b57eb201489cba Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:42:58 +0100 Subject: net: stmmac: Fix CSR divider comment The comment in declaration of STMMAC_CSR_250_300M incorrectly describes the constant as '/* MDC = clk_scr_i/122 */' but the DWC Ether QOS Handbook version 5.20a says it is CSR clock/124. 
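[Editor's note, not part of the patch: these dividers exist to keep the MDIO management clock at or below the 2.5 MHz maximum that IEEE 802.3 clause 22 allows for MDC. With the documented /124 divider even the top of the range stays legal, e.g. 300 MHz / 124 ≈ 2.42 MHz. Only the comment was wrong; the register value 0x5 is unchanged.]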
Signed-off-by: Jan Petrous (OSS) Reviewed-by: Jacob Keller Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-1-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d79ff252cfdc..75cbfb576358 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -33,7 +33,7 @@ #define STMMAC_CSR_20_35M 0x2 /* MDC = clk_scr_i/16 */ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ -#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ +#define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 -- cgit v1.2.3 From c8fab05d021dfc04401102f9fa1de07fc8f75d8d Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:42:59 +0100 Subject: net: stmmac: Extend CSR calc support Add support for CSR clock range up to 800 MHz. Reviewed-by: Jacob Keller Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-2-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 75cbfb576358..865d0fe26f98 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -34,6 +34,8 @@ #define STMMAC_CSR_35_60M 0x3 /* MDC = clk_scr_i/26 */ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ #define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/124 */ +#define STMMAC_CSR_300_500M 0x6 /* MDC = clk_scr_i/204 */ +#define STMMAC_CSR_500_800M 0x7 /* MDC = clk_scr_i/324 */ /* MTL algorithms identifiers */ #define MTL_TX_ALGORITHM_WRR 0x0 -- cgit v1.2.3 From cb09f61a9ab84369c62f2ef7f8a2b797f596f6d1 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:00 +0100 Subject: net: stmmac: Fix clock rate variables size The clock API clk_get_rate() returns an unsigned long value. Expand the affected members of the stmmac platform data and convert the stmmac_clk_csr_set() and dwmac4_core_init() functions to define their clk_rate local variables as unsigned long.
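[Editor's sketch, not part of the patch: clk_get_rate() returns unsigned long, so the widened members and locals simply match the clk API type instead of silently mixing widths. A minimal illustration, where stmmac_clk is visible in the context of the diff below and priv->clk_csr is assumed from the driver rather than shown here:]

    unsigned long clk_rate = clk_get_rate(priv->plat->stmmac_clk);

    if (clk_rate > 500000000 && clk_rate <= 800000000)
        priv->clk_csr = STMMAC_CSR_500_800M;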
Reviewed-by: Andrew Lunn Reviewed-by: Serge Semin Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-3-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 865d0fe26f98..c9878a612e53 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -252,8 +252,8 @@ struct plat_stmmacenet_data { struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - unsigned int clk_ptp_rate; - unsigned int clk_ref_rate; + unsigned long clk_ptp_rate; + unsigned long clk_ref_rate; unsigned int mult_fact_100ns; s32 ptp_max_adj; u32 cdc_error_adj; @@ -265,7 +265,7 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned int eee_usecs_rate; + unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; -- cgit v1.2.3 From 386aa60abdb600a4e5ad818e6dba171685942e54 Mon Sep 17 00:00:00 2001 From: "Jan Petrous (OSS)" Date: Thu, 5 Dec 2024 17:43:01 +0100 Subject: net: phy: Add helper for mapping RGMII link speed to clock rate The RGMII interface supports three data rates: 10/100 Mbps and 1 Gbps. These speeds correspond to clock frequencies of 2.5/25 MHz and 125 MHz, respectively. Many Ethernet drivers, including glues in stmmac, follow a similar pattern of converting RGMII speed to clock frequency. To simplify code, define the helper rgmii_clock(speed) to convert connection speed to clock frequency. Suggested-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Signed-off-by: Jan Petrous (OSS) Link: https://patch.msgid.link/20241205-upstream_s32cc_gmac-v8-4-ec1d180df815@oss.nxp.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index bb157136351e..e597a32cc787 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -298,6 +298,29 @@ static inline const char *phy_modes(phy_interface_t interface) } } +/** + * rgmii_clock - map link speed to the clock rate + * @speed: link speed value + * + * Description: maps RGMII supported link speeds + * into the clock rates. + * + * Returns: clock rate or negative errno + */ +static inline long rgmii_clock(int speed) +{ + switch (speed) { + case SPEED_10: + return 2500000; + case SPEED_100: + return 25000000; + case SPEED_1000: + return 125000000; + default: + return -EINVAL; + } +} + #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 -- cgit v1.2.3 From c17618cf664ddf54b264ea74df9e8ab3e3ceda3b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 4 Dec 2024 20:18:39 -0800 Subject: scsi: Eliminate scsi_register() and scsi_unregister() usage & docs scsi_mid_low_api.rst refers to scsi_register() and scsi_unregister() but these functions don't exist. They have been replaced by more meaningful names. Update one driver (megaraid_mbox.c) that uses "scsi_unregister" in a warning message. Update scsi_mid_low_api.rst to eliminate references to scsi_register() and scsi_unregister(). Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20241205041839.164404-1-rdunlap@infradead.org Cc: James E.J. Bottomley Cc: Martin K. 
Petersen Cc: Bart Van Assche Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: Kashyap Desai Cc: Sumit Saxena Cc: Shivasharan S Cc: Chandrakanth patil Cc: megaraidlinux.pdl@broadcom.com Signed-off-by: Martin K. Petersen --- include/scsi/scsi_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 8fccfd27393e..fac90ae884c7 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -595,7 +595,7 @@ struct Scsi_Host { * have some way of identifying each detected host adapter properly * and uniquely. For hosts that do not support more than one card * in the system at one time, this does not need to be set. It is - * initialized to 0 in scsi_register. + * initialized to 0 in scsi_host_alloc. */ unsigned int unique_id; -- cgit v1.2.3 From 09463346b6c23672cdd451f500d2a23b792bd6f0 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 15 Nov 2024 19:26:50 +0800 Subject: crypto: hisilicon/zip - add data aggregation feature The zip device adds a data aggregation feature: data with the same key can be combined. This patch enables the device's data aggregation feature. The new feature is registered with the uacce subsystem under the name "hashagg", allowing applications to submit data aggregation operations from user space. Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 6dbd0d49628f..3a13fb719dd0 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -97,6 +97,8 @@ /* page number for queue file region */ #define QM_DOORBELL_PAGE_NR 1 +#define QM_DEV_ALG_MAX_LEN 256 + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ @@ -156,6 +158,7 @@ enum qm_cap_bits { QM_SUPPORT_MB_COMMAND, QM_SUPPORT_SVA_PREFETCH, QM_SUPPORT_RPM, + QM_SUPPORT_DAE, }; struct qm_dev_alg { -- cgit v1.2.3 From 771ba5c982a28ede1d33de9702c0f3501f1f9e1c Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Fri, 15 Nov 2024 19:26:51 +0800 Subject: crypto: hisilicon/zip - support new error report The error detection of the data aggregation feature is separated from the compression/decompression feature. This patch enables the error detection and reporting of the data aggregation feature. When an unrecoverable error occurs in the algorithm core, the device reports the error to the driver, and the driver will reset the device.
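[Editor's sketch, not part of the patch: the include/ diff below only adds the two new hooks to struct hisi_qm_err_ini; the device driver supplies the implementations. The function bodies, register name and mask here are assumptions for illustration, not the real zip driver code:]

    static bool hisi_zip_dev_is_abnormal(struct hisi_qm *qm)
    {
        /* e.g. check an algorithm-core error status register */
        return !!(readl(qm->io_base + HZIP_CORE_INT_STATUS) & HZIP_DAE_ERR_MASK);
    }

    static int hisi_zip_set_priv_status(struct hisi_qm *qm)
    {
        /* e.g. flag the data aggregation engine as needing a reset */
        return 0;
    }

A driver would then wire these up through the new .dev_is_abnormal and .set_priv_status members of its hisi_qm_err_ini.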
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 3a13fb719dd0..c1dafbabbd6b 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -269,6 +269,8 @@ struct hisi_qm_err_ini { void (*show_last_dfx_regs)(struct hisi_qm *qm); void (*err_info_init)(struct hisi_qm *qm); enum acc_err_result (*get_err_result)(struct hisi_qm *qm); + bool (*dev_is_abnormal)(struct hisi_qm *qm); + int (*set_priv_status)(struct hisi_qm *qm); }; struct hisi_qm_cap_info { -- cgit v1.2.3 From fbef60de6c753253e1337ea60cf818d079108974 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 3 Dec 2024 15:57:11 +0200 Subject: RDMA/mlx5: Extend ODP statistics with operation count The current ODP counters represent the total number of pages handled, but it is not enough to understand the effectiveness of these operations. Extend the ODP counters to include the number of times page fault and invalidation events were handled. Example for a single page fault handling 512 pages: - page_fault: incremented by 512 (total pages) - page_fault_handled: incremented by 1 (operation count) The same example is applicable for page invalidation too. Previous output: $ rdma stat mr dev rocep8s0f0 mrn 8 page_faults 27 page_invalidations 0 page_prefetch 29 New output: $ rdma stat mr dev rocep8s0f0 mrn 21 page_faults 512 page_faults_handled 1 page_invalidations 0 page_invalidations_handled 0 page_prefetch 51200 Signed-off-by: Chiara Meiohas Reviewed-by: Michael Guralnik Link: https://patch.msgid.link/b18f29ed1392996ade66e9e6c45f018925253f6a.1733234165.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3417636da960..6ddd5e3bb884 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2256,7 +2256,9 @@ struct rdma_netdev_alloc_params { struct ib_odp_counters { atomic64_t faults; + atomic64_t faults_handled; atomic64_t invalidations; + atomic64_t invalidations_handled; atomic64_t prefetch; }; -- cgit v1.2.3 From ea79df10331218b04d90f028a605f33c879c518d Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 25 Nov 2024 14:23:11 +0100 Subject: mmc: core: Drop the MMC_RSP_R1_NO_CRC response The MMC_RSP_R1_NO_CRC type of response is not being used by the mmc core for any commands. Let's therefore drop it, together with the corresponding code in the host drivers. 
Signed-off-by: Ulf Hansson Acked-by: Wolfram Sang # for TMIO Reviewed-by: Avri Altman Message-ID: <20241125132311.23939-1-ulf.hansson@linaro.org> --- include/linux/mmc/core.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 56972bd78462..e13856ab6ad0 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -64,9 +64,6 @@ struct mmc_command { #define MMC_RSP_R6 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) #define MMC_RSP_R7 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) -/* Can be used by core to poll after switch to MMC HS mode */ -#define MMC_RSP_R1_NO_CRC (MMC_RSP_PRESENT|MMC_RSP_OPCODE) - #define mmc_resp_type(cmd) ((cmd)->flags & (MMC_RSP_PRESENT|MMC_RSP_136|MMC_RSP_CRC|MMC_RSP_BUSY|MMC_RSP_OPCODE)) /* -- cgit v1.2.3 From ed97550d470d00ebafe9de888fd100cb82d3abb6 Mon Sep 17 00:00:00 2001 From: Andy-ld Lu Date: Tue, 26 Nov 2024 20:48:22 +0800 Subject: mmc: core: Introduce the MMC_RSP_R1B_NO_CRC response The R1B response type with ignoring CRC is used in the mmc_cqe_recovery(), introduce the MMC_RSP_R1B_NO_CRC response type to simplify the code. Signed-off-by: Andy-ld Lu Message-ID: <20241126125041.16071-2-andy-ld.lu@mediatek.com> Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index e13856ab6ad0..01e0f591a20b 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -57,6 +57,7 @@ struct mmc_command { #define MMC_RSP_NONE (0) #define MMC_RSP_R1 (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE) #define MMC_RSP_R1B (MMC_RSP_PRESENT|MMC_RSP_CRC|MMC_RSP_OPCODE|MMC_RSP_BUSY) +#define MMC_RSP_R1B_NO_CRC (MMC_RSP_PRESENT|MMC_RSP_OPCODE|MMC_RSP_BUSY) #define MMC_RSP_R2 (MMC_RSP_PRESENT|MMC_RSP_136|MMC_RSP_CRC) #define MMC_RSP_R3 (MMC_RSP_PRESENT) #define MMC_RSP_R4 (MMC_RSP_PRESENT) -- cgit v1.2.3 From d4cc8912cbff4990940b33cc61a9b09ddaee9704 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Mon, 9 Dec 2024 16:49:56 +0000 Subject: firmware: arm_scmi: Add module aliases to i.MX vendor protocols Using the pattern 'scmi-protocol-0x-' as MODULE_ALIAS allows the SCMI core to autoload this protocol, if built as a module, when its protocol operations are requested by an SCMI driver. Cc: Peng Fan Acked-by: Peng Fan Signed-off-by: Cristian Marussi Message-Id: <20241209164957.1801886-3-cristian.marussi@arm.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_imx_protocol.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h index 066216f1357a..53b356a26414 100644 --- a/include/linux/scmi_imx_protocol.h +++ b/include/linux/scmi_imx_protocol.h @@ -13,10 +13,11 @@ #include #include -enum scmi_nxp_protocol { - SCMI_PROTOCOL_IMX_BBM = 0x81, - SCMI_PROTOCOL_IMX_MISC = 0x84, -}; +#define SCMI_PROTOCOL_IMX_BBM 0x81 +#define SCMI_PROTOCOL_IMX_MISC 0x84 + +#define SCMI_IMX_VENDOR "NXP" +#define SCMI_IMX_SUBVENDOR "IMX" struct scmi_imx_bbm_proto_ops { int (*rtc_time_set)(const struct scmi_protocol_handle *ph, u32 id, -- cgit v1.2.3 From a94204f4d48e28a711b7ed10399f749286c433e3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:15 -0500 Subject: fsnotify: opt-in for permission events at file open time Legacy inotify/fanotify listeners can add watches for events on inode, parent or mount and expect to get events (e.g. 
FS_MODIFY) on files that were already open at the time of setting up the watches. fanotify permission events are typically used by Anti-malware sofware, that is watching the entire mount and it is not common to have more that one Anti-malware engine installed on a system. To reduce the overhead of the fsnotify_file_perm() hooks on every file access, relax the semantics of the legacy FAN_ACCESS_PERM event to generate events only if there were *any* permission event listeners on the filesystem at the time that the file was opened. The new semantic is implemented by extending the FMODE_NONOTIFY bit into two FMODE_NONOTIFY_* bits, that are used to store a mode for which of the events types to report. This is going to apply to the new fanotify pre-content events in order to reduce the cost of the new pre-content event vfs hooks. [Thanks to Bert Karwatzki for reporting a bug in this code with CONFIG_FANOTIFY_ACCESS_PERMISSIONS disabled] Suggested-by: Linus Torvalds Link: https://lore.kernel.org/linux-fsdevel/CAHk-=wj8L=mtcRTi=NECHMGfZQgXOp_uix1YVh04fEmrKaMnXA@mail.gmail.com/ Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/5ea5f8e283d1edb55aa79c35187bfe344056af14.1731684329.git.josef@toxicpanda.com --- include/linux/fs.h | 43 ++++++++++++++++++++++++++++++++++++++----- include/linux/fsnotify.h | 39 +++++++++++++++++++++++---------------- 2 files changed, 61 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 93c2b720271e..5f7ac5b548a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -173,13 +173,20 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) -/* FMODE_* bit 24 */ - /* File is embedded in backing_file object */ -#define FMODE_BACKING ((__force fmode_t)(1 << 25)) +#define FMODE_BACKING ((__force fmode_t)(1 << 24)) + +/* + * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) -/* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)(1 << 26)) +/* + * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be + * generated (see below) + */ +#define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) @@ -190,6 +197,32 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) +/* + * The two FMODE_NONOTIFY* define which fsnotify events should not be generated + * for a file. These are the possible values of (f->f_mode & + * FMODE_FSNOTIFY_MASK) and their meaning: + * + * FMODE_NONOTIFY - suppress all (incl. non-permission) events. + * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. + * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. 
+ */ +#define FMODE_FSNOTIFY_MASK \ + (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) + +#define FMODE_FSNOTIFY_NONE(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS +#define FMODE_FSNOTIFY_PERM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ + (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) +#define FMODE_FSNOTIFY_HSM(mode) \ + ((mode & FMODE_FSNOTIFY_MASK) == 0) +#else +#define FMODE_FSNOTIFY_PERM(mode) 0 +#define FMODE_FSNOTIFY_HSM(mode) 0 +#endif + + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 278620e063ab..8d1849137a96 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -108,38 +108,35 @@ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask) fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY); } -static inline int fsnotify_file(struct file *file, __u32 mask) +static inline int fsnotify_path(const struct path *path, __u32 mask) { - const struct path *path; + return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); +} +static inline int fsnotify_file(struct file *file, __u32 mask) +{ /* * FMODE_NONOTIFY are fds generated by fanotify itself which should not * generate new events. We also don't want to generate events for * FMODE_PATH fds (involves open & close events) as they are just * handle creation / destruction events and not "real" file events. */ - if (file->f_mode & (FMODE_NONOTIFY | FMODE_PATH)) + if (FMODE_FSNOTIFY_NONE(file->f_mode)) return 0; - path = &file->f_path; - /* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */ - if (mask & ALL_FSNOTIFY_PERM_EVENTS && - !fsnotify_sb_has_priority_watchers(path->dentry->d_sb, - FSNOTIFY_PRIO_CONTENT)) - return 0; - - return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); + return fsnotify_path(&file->f_path, mask); } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + +void file_set_fsnotify_mode(struct file *file); + /* * fsnotify_file_area_perm - permission hook before access to file range */ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { - __u32 fsnotify_mask = FS_ACCESS_PERM; - /* * filesystem may be modified in the context of permission events * (e.g. 
by HSM filling a file on access), so sb freeze protection @@ -150,7 +147,10 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, if (!(perm_mask & MAY_READ)) return 0; - return fsnotify_file(file, fsnotify_mask); + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + + return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } /* @@ -168,16 +168,23 @@ static inline int fsnotify_open_perm(struct file *file) { int ret; + if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) + return 0; + if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM); if (ret) return ret; } - return fsnotify_file(file, FS_OPEN_PERM); + return fsnotify_path(&file->f_path, FS_OPEN_PERM); } #else +static inline void file_set_fsnotify_mode(struct file *file) +{ +} + static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, const loff_t *ppos, size_t count) { -- cgit v1.2.3 From 318652e07fa5b1743d08eeccd69a1f47f2c15710 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:16 -0500 Subject: fsnotify: check if file is actually being watched for pre-content events on open So far, we set FMODE_NONOTIFY_ flags at open time if we know that there are no permission event watchers at all on the filesystem, but lack of FMODE_NONOTIFY_ flags does not mean that the file is actually watched. For pre-content events, it is possible to optimize things so that we don't bother trying to send pre-content events if file was not watched (through sb, mnt, parent or inode itself) on open. Set FMODE_NONOTIFY_ flags according to that. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/2ddcc9f8d1fde48d085318a6b5a889289d8871d8.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify_backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3ecf7768e577..9c105244815d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -77,6 +77,9 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Pre-content events can be used to fill file content */ +#define FSNOTIFY_PRE_CONTENT_EVENTS 0 + #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) -- cgit v1.2.3 From 0a076036b631f086a6bce93a45eaa216f234f121 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:19 -0500 Subject: fanotify: reserve event bit of deprecated FAN_DIR_MODIFY Avoid reusing it, because we would like to reserve it for future FAN_PATH_MODIFY pre-content event. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/632d9f80428e2e7a6b6a8ccc2925d87c92bbb518.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify_backend.h | 1 + include/uapi/linux/fanotify.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9c105244815d..c38762b62bf1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -55,6 +55,7 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ +/* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ /* * Set on inode mark that cares about things that happen to its children. diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 34f221d3a1b9..79072b6894f2 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -25,6 +25,7 @@ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ +/* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ -- cgit v1.2.3 From f156524e5d72c81792eee81f828784dc8a37a7f2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:20 -0500 Subject: fsnotify: introduce pre-content permission events The new FS_PRE_ACCESS permission event is similar to FS_ACCESS_PERM, but it is meant for a different use case of filling file content before access to a file range, so it has slightly different semantics. Generate FS_PRE_ACCESS/FS_ACCESS_PERM as two separate events, so content scanners can inspect the content filled by the pre-content event handler. Unlike FS_ACCESS_PERM, FS_PRE_ACCESS is also called before a file is modified by syscalls such as write() and fallocate(). FS_ACCESS_PERM is also reported on blockdevs and pipes, but the new pre-content events are only reported for regular files and dirs. The pre-content events are meant to be used by hierarchical storage managers that want to fill the content of files on first access. There are some specific requirements from filesystems that could be used with pre-content events, so add a flag for a filesystem to opt in to pre-content events explicitly before they can be used.
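As a rough illustration of the opt-in described above, a filesystem that supports pre-content events would set the SB_I_ALLOW_HSM flag (added in the fs.h hunk below) on its superblock during setup. The function below is a hypothetical sketch, not taken from any in-tree filesystem:

#include <linux/fs.h>
#include <linux/fs_context.h>

static int example_fill_super(struct super_block *sb, struct fs_context *fc)
{
	/* Opt in to pre-content (HSM) events on this superblock. */
	sb->s_iflags |= SB_I_ALLOW_HSM;

	/* ... the rest of the usual fill_super() setup goes here ... */
	return 0;
}
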
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b934c5e3af205abc4e0e4709f6486815937ddfdf.1731684329.git.josef@toxicpanda.com --- include/linux/fs.h | 1 + include/linux/fsnotify.h | 19 ++++++++++++++++++- include/linux/fsnotify_backend.h | 11 ++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5f7ac5b548a4..3f4d59464965 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1265,6 +1265,7 @@ extern int send_sigurg(struct file *file); #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 8d1849137a96..d91aa064f0e4 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -144,12 +144,29 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, */ lockdep_assert_once(file_write_not_started(file)); - if (!(perm_mask & MAY_READ)) + if (!(perm_mask & (MAY_READ | MAY_WRITE | MAY_ACCESS))) return 0; if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode))) return 0; + /* + * read()/write() and other types of access generate pre-content events. + */ + if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { + int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + + if (ret) + return ret; + } + + if (!(perm_mask & MAY_READ)) + return 0; + + /* + * read() also generates the legacy FS_ACCESS_PERM event, so content + * scanners can inspect the content filled by pre-content event. + */ return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c38762b62bf1..9bda354b5538 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -57,6 +57,8 @@ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ /* #define FS_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. @@ -78,11 +80,14 @@ */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME) +/* Content events can be used to inspect file content */ +#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \ + FS_ACCESS_PERM) /* Pre-content events can be used to fill file content */ -#define FSNOTIFY_PRE_CONTENT_EVENTS 0 +#define FSNOTIFY_PRE_CONTENT_EVENTS (FS_PRE_ACCESS) -#define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ - FS_OPEN_EXEC_PERM) +#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \ + FSNOTIFY_PRE_CONTENT_EVENTS) /* * This is a list of all events that may get sent to a parent that is watching -- cgit v1.2.3 From 9740d17162deca7138fad7dcf3ef52324832c32b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:21 -0500 Subject: fsnotify: pass optional file access range in pre-content event We would like to add file range information to pre-content events. Pass a struct file_range with offset and length to event handler along with pre-content permission event. 
The offset and length are aligned to page size, but we may need to align them to minimum folio size for filesystems with large block size. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/88eddee301231d814aede27fb4d5b41ae37c9702.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify.h | 4 ++-- include/linux/fsnotify_backend.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index d91aa064f0e4..87044acf8e79 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -154,7 +154,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, * read()/write() and other types of access generate pre-content events. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { - int ret = fsnotify_path(&file->f_path, FS_PRE_ACCESS); + int ret = fsnotify_pre_content(&file->f_path, ppos, count); if (ret) return ret; @@ -171,7 +171,7 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_perm - permission hook before file access (unknown range) */ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 9bda354b5538..0d24a21a8e60 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -294,6 +294,7 @@ static inline void fsnotify_group_assert_locked(struct fsnotify_group *group) /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, + FSNOTIFY_EVENT_FILE_RANGE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, FSNOTIFY_EVENT_DENTRY, @@ -306,6 +307,17 @@ struct fs_error_report { struct super_block *sb; }; +struct file_range { + const struct path *path; + loff_t pos; + size_t count; +}; + +static inline const struct path *file_range_path(const struct file_range *range) +{ + return range->path; +} + static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { @@ -315,6 +327,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type) return d_inode(data); case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); + case FSNOTIFY_EVENT_FILE_RANGE: + return d_inode(file_range_path(data)->dentry); case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *)data)->inode; default: @@ -330,6 +344,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ return (struct dentry *)data; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry; default: return NULL; } @@ -341,6 +357,8 @@ static inline const struct path *fsnotify_data_path(const void *data, switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data); default: return NULL; } @@ -356,6 +374,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data, return ((struct dentry *)data)->d_sb; case FSNOTIFY_EVENT_PATH: return ((const struct path *)data)->dentry->d_sb; + case FSNOTIFY_EVENT_FILE_RANGE: + return file_range_path(data)->dentry->d_sb; case FSNOTIFY_EVENT_ERROR: return ((struct fs_error_report *) data)->sb; default: @@ -375,6 +395,18 @@ static inline struct fs_error_report 
*fsnotify_data_error_report( } } +static inline const struct file_range *fsnotify_data_file_range( + const void *data, + int data_type) +{ + switch (data_type) { + case FSNOTIFY_EVENT_FILE_RANGE: + return (struct file_range *)data; + default: + return NULL; + } +} + /* * Index to merged marks iterator array that correlates to a type of watch. * The type of watched object can be deduced from the iterator type, but not @@ -863,9 +895,17 @@ static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); } +int fsnotify_pre_content(const struct path *path, const loff_t *ppos, + size_t count); #else +static inline int fsnotify_pre_content(const struct path *path, + const loff_t *ppos, size_t count) +{ + return 0; +} + static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) -- cgit v1.2.3 From 4acf3bc76e521b47acebcefc6312c97992f4ca29 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:22 -0500 Subject: fsnotify: generate pre-content permission event on truncate Generate FS_PRE_ACCESS event before truncate, without sb_writers held. Move the security hooks also before sb_start_write() to conform with other security hooks (e.g. in write, fallocate). The event will have a range info of the page surrounding the new size to provide an opportunity to fill the conetnt at the end of file before truncating to non-page aligned size. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/23af8201db6ac2efdea94f09ab067d81ba5de7a7.1731684329.git.josef@toxicpanda.com --- include/linux/fsnotify.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 87044acf8e79..1a9ef8f6784d 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -170,6 +170,21 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return fsnotify_path(&file->f_path, FS_ACCESS_PERM); } +/* + * fsnotify_truncate_perm - permission hook before file truncate + */ +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + struct inode *inode = d_inode(path->dentry); + + if (!(inode->i_sb->s_iflags & SB_I_ALLOW_HSM) || + !fsnotify_sb_has_priority_watchers(inode->i_sb, + FSNOTIFY_PRIO_PRE_CONTENT)) + return 0; + + return fsnotify_pre_content(path, &length, 0); +} + /* * fsnotify_file_perm - permission hook before file access (unknown range) */ @@ -208,6 +223,11 @@ static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, return 0; } +static inline int fsnotify_truncate_perm(const struct path *path, loff_t length) +{ + return 0; +} + static inline int fsnotify_file_perm(struct file *file, int perm_mask) { return 0; -- cgit v1.2.3 From 4f8afa33817a6420398d1c177c6e220a05081f51 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:23 -0500 Subject: fanotify: introduce FAN_PRE_ACCESS permission event Similar to FAN_ACCESS_PERM permission event, but it is only allowed with class FAN_CLASS_PRE_CONTENT and only allowed on regular files and dirs. Unlike FAN_ACCESS_PERM, it is safe to write to the file being accessed in the context of the event handler. This pre-content event is meant to be used by hierarchical storage managers that want to fill the content of files on first read access. 
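For context, a hedged userspace sketch of how a hierarchical storage manager might request the new event: it assumes a FAN_CLASS_PRE_CONTENT group and an inode mark on a regular file, with the FAN_PRE_ACCESS flag taken from the fanotify.h hunk below. The path and error handling are illustrative only:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

int main(void)
{
	int fd = fanotify_init(FAN_CLASS_PRE_CONTENT | FAN_CLOEXEC, O_RDONLY);

	if (fd < 0) {
		perror("fanotify_init");
		return 1;
	}

	/* Ask for pre-content access events on one (hypothetical) file. */
	if (fanotify_mark(fd, FAN_MARK_ADD, FAN_PRE_ACCESS,
			  AT_FDCWD, "/mnt/hsm/data.bin") < 0) {
		perror("fanotify_mark");
		return 1;
	}

	/* ... read events from fd and reply with FAN_ALLOW / FAN_DENY ... */
	return 0;
}
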
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b80986f8d5b860acea2c9a73c0acd93587be5fe4.1731684329.git.josef@toxicpanda.com --- include/linux/fanotify.h | 14 ++++++++++---- include/uapi/linux/fanotify.h | 2 ++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 89ff45bd6f01..c747af064d2c 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -89,6 +89,16 @@ #define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \ FAN_RENAME) +/* Content events can be used to inspect file content */ +#define FANOTIFY_CONTENT_PERM_EVENTS (FAN_OPEN_PERM | FAN_OPEN_EXEC_PERM | \ + FAN_ACCESS_PERM) +/* Pre-content events can be used to fill file content */ +#define FANOTIFY_PRE_CONTENT_EVENTS (FAN_PRE_ACCESS) + +/* Events that require a permission response from user */ +#define FANOTIFY_PERM_EVENTS (FANOTIFY_CONTENT_PERM_EVENTS | \ + FANOTIFY_PRE_CONTENT_EVENTS) + /* Events that can be reported with event->fd */ #define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS) @@ -104,10 +114,6 @@ FANOTIFY_INODE_EVENTS | \ FANOTIFY_ERROR_EVENTS) -/* Events that require a permission response from user */ -#define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ - FAN_OPEN_EXEC_PERM) - /* Extra flags that may be reported with event or control handling of events */ #define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 79072b6894f2..7596168c80eb 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -27,6 +27,8 @@ #define FAN_OPEN_EXEC_PERM 0x00040000 /* File open/exec in perm check */ /* #define FAN_DIR_MODIFY 0x00080000 */ /* Deprecated (reserved) */ +#define FAN_PRE_ACCESS 0x00100000 /* Pre-content access hook */ + #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */ #define FAN_RENAME 0x10000000 /* File was renamed */ -- cgit v1.2.3 From 870499bc1d4dc04cba1f63dd5e7bc02b983e2458 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:24 -0500 Subject: fanotify: report file range info with pre-content events With group class FAN_CLASS_PRE_CONTENT, report offset and length info along with FAN_PRE_ACCESS pre-content events. This information is meant to be used by hierarchical storage managers that want to fill partial content of files on first access to range. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/b90a9e6c809dd3cad5684da90f23ea93ec6ce8c8.1731684329.git.josef@toxicpanda.com --- include/uapi/linux/fanotify.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 7596168c80eb..0636a9c85dd0 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -146,6 +146,7 @@ struct fanotify_event_metadata { #define FAN_EVENT_INFO_TYPE_DFID 3 #define FAN_EVENT_INFO_TYPE_PIDFD 4 #define FAN_EVENT_INFO_TYPE_ERROR 5 +#define FAN_EVENT_INFO_TYPE_RANGE 6 /* Special info types for FAN_RENAME */ #define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10 @@ -192,6 +193,13 @@ struct fanotify_event_info_error { __u32 error_count; }; +struct fanotify_event_info_range { + struct fanotify_event_info_header hdr; + __u32 pad; + __u64 offset; + __u64 count; +}; + /* * User space may need to record additional information about its decision. 
* The extra information type records what kind of information is included. -- cgit v1.2.3 From b4b2ff4f61ded819bfa22e50fdec7693f51cbbee Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 15 Nov 2024 10:30:25 -0500 Subject: fanotify: allow to set errno in FAN_DENY permission response With the FAN_DENY response, a user trying to perform the filesystem operation gets an error with errno set to EPERM. It is useful for a hierarchical storage management (HSM) service to be able to deny access for reasons more diverse than EPERM, for example EAGAIN, if the HSM could retry the operation later. Allow fanotify groups with priority FAN_CLASS_PRE_CONTENT to respond to permission events with the response value FAN_DENY_ERRNO(errno) instead of FAN_DENY, to return a custom error. Limit custom error values to errors expected on read(2)/write(2) and open(2) of regular files. This list could be extended in the future. Userspace can test for legitimate values of FAN_DENY_ERRNO(errno) by writing a response to a fanotify group fd with a value of FAN_NOFD in the fd field of the response. The change in fanotify_response is backward compatible, because errno is written in the high 8 bits of the 32-bit response field and old kernels reject response values with high bits set. Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/1e5fb6af84b69ca96b5c849fa5f10bdf4d1dc414.1731684329.git.josef@toxicpanda.com --- include/linux/fanotify.h | 4 +++- include/uapi/linux/fanotify.h | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index c747af064d2c..78f660ebc318 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -132,7 +132,9 @@ /* These masks check for invalid bits in permission responses. */ #define FANOTIFY_RESPONSE_ACCESS (FAN_ALLOW | FAN_DENY) #define FANOTIFY_RESPONSE_FLAGS (FAN_AUDIT | FAN_INFO) -#define FANOTIFY_RESPONSE_VALID_MASK (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS) +#define FANOTIFY_RESPONSE_VALID_MASK \ + (FANOTIFY_RESPONSE_ACCESS | FANOTIFY_RESPONSE_FLAGS | \ + (FAN_ERRNO_MASK << FAN_ERRNO_SHIFT)) /* Do not use these old uapi constants internally */ #undef FAN_ALL_CLASS_BITS diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 0636a9c85dd0..bd8167979707 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -235,6 +235,13 @@ struct fanotify_response_info_audit_rule { /* Legit userspace responses to a _PERM event */ #define FAN_ALLOW 0x01 #define FAN_DENY 0x02 +/* errno other than EPERM can specified in upper byte of deny response */ +#define FAN_ERRNO_BITS 8 +#define FAN_ERRNO_SHIFT (32 - FAN_ERRNO_BITS) +#define FAN_ERRNO_MASK ((1 << FAN_ERRNO_BITS) - 1) +#define FAN_DENY_ERRNO(err) \ + (FAN_DENY | ((((__u32)(err)) & FAN_ERRNO_MASK) << FAN_ERRNO_SHIFT)) + #define FAN_AUDIT 0x10 /* Bitmask to create audit record for result */ #define FAN_INFO 0x20 /* Bitmask to indicate additional information */ -- cgit v1.2.3 From b04b981f3a842ef63e06048fafaa8d20c20334c6 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 30 Nov 2024 17:39:37 +0100 Subject: pmdomain: core: Support naming idle states Commit 422f2d418186 ("arm64: dts: qcom: Drop undocumented domain "idle-state-name"") brought to light the common misbelief that idle-state-names also applies to e.g. PSCI power domain idle states. Make that a reality, mimicking the property name used by cpuidle states.
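To illustrate the FAN_DENY_ERRNO() response macro added in the fanotify patch above, a hedged userspace sketch of denying a permission event with a custom error; the surrounding event-reading loop is omitted and the helper name is hypothetical:

#include <errno.h>
#include <unistd.h>
#include <sys/fanotify.h>

/* Deny the access described by 'metadata' and ask the kernel to return
 * EAGAIN to the blocked caller instead of the default EPERM. */
static int deny_with_eagain(int group_fd,
			    const struct fanotify_event_metadata *metadata)
{
	struct fanotify_response response = {
		.fd = metadata->fd,
		.response = FAN_DENY_ERRNO(EAGAIN),
	};

	if (write(group_fd, &response, sizeof(response)) != sizeof(response))
		return -1;

	return 0;
}
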
Signed-off-by: Konrad Dybcio Message-ID: <20241130-topic-idle_state_name-v1-2-d0ff67b0c8e9@oss.qualcomm.com> Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 45646bfcaf1a..1aab31370065 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -147,6 +147,7 @@ struct genpd_governor_data { }; struct genpd_power_state { + const char *name; s64 power_off_latency_ns; s64 power_on_latency_ns; s64 residency_ns; -- cgit v1.2.3 From 7d5265ffcd8b41da5e09066360540d6e0716e9cd Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 12 Nov 2024 10:28:26 -0500 Subject: rseq: Validate read-only fields under DEBUG_RSEQ config The rseq uapi requires cooperation between users of the rseq fields to ensure that all libraries and applications using rseq within a process do not interfere with each other. This is especially important for fields which are meant to be read-only from user-space, as documented in uapi/linux/rseq.h: - cpu_id_start, - cpu_id, - node_id, - mm_cid. Storing to those fields from a user-space library prevents any sharing of the rseq ABI with other libraries and applications, as other users are not aware that the content of those fields has been altered by a third-party library. This is unfortunately the current behavior of tcmalloc: it purposefully overlaps part of a cached value with the cpu_id_start upper bits to get notified about preemption, because the kernel clears those upper bits before returning to user-space. This behavior does not conform to the rseq uapi header ABI. This prevents tcmalloc from using rseq when rseq is registered by the GNU C library 2.35+. It requires tcmalloc users to disable glibc rseq registration with a glibc tunable, which is a sad state of affairs. Considering that tcmalloc and the GNU C library are the two first upstream projects using rseq, and that they are already incompatible due to use of this hack, adding kernel-level validation of all read-only fields content is necessary to ensure future users of rseq abide by the rseq ABI requirements. Validate that user-space does not corrupt the read-only fields and conform to the rseq uapi header ABI when the kernel is built with CONFIG_DEBUG_RSEQ=y. This is done by storing a copy of the read-only fields in the task_struct, and validating the prior values present in user-space before updating them. If the values do not match, print a warning on the console (printk_ratelimited()). This is a first step to identify misuses of the rseq ABI by printing a warning on the console. After a giving some time to userspace to correct its use of rseq, the plan is to eventually terminate offending processes with SIGSEGV. This change is expected to produce warnings for the upstream tcmalloc implementation, but tcmalloc developers mentioned they were open to adapt their implementation to kernel-level change. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://github.com/google/tcmalloc/issues/144 --- include/linux/sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d380bffee2ef..b5916be49f62 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1367,6 +1367,15 @@ struct task_struct { * with respect to preemption. 
*/ unsigned long rseq_event_mask; +# ifdef CONFIG_DEBUG_RSEQ + /* + * This is a place holder to save a copy of the rseq fields for + * validation of read-only fields. The struct rseq has a + * variable-length array at the end, so it cannot be used + * directly. Reserve a size large enough for the known fields. + */ + char rseq_fields[sizeof(struct rseq)]; +# endif #endif #ifdef CONFIG_SCHED_MM_CID -- cgit v1.2.3 From 3c48780d48df029cf9d5f42b8971663e6fb975ae Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 4 Dec 2024 11:48:05 -0800 Subject: of: Hide of_default_bus_match_table[] This isn't used outside this file. Hide the array in the C file. Signed-off-by: Stephen Boyd Acked-by: Saravana Kannan Link: https://lore.kernel.org/r/20241204194806.2665589-1-swboyd@chromium.org Signed-off-by: Rob Herring (Arm) --- include/linux/of_platform.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index a2ff1ad48f7f..17471ef8e092 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -47,8 +47,6 @@ struct of_dev_auxdata { { .compatible = _compat, .phys_addr = _phys, .name = _name, \ .platform_data = _pdata } -extern const struct of_device_id of_default_bus_match_table[]; - /* Platform drivers register/unregister */ extern struct platform_device *of_device_alloc(struct device_node *np, const char *bus_id, -- cgit v1.2.3 From 549de562d794a42bb647952e965e588390e16fe0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:18:57 -0600 Subject: ACPI: platform-profile: Add a name member to handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to prepare for allowing multiple handlers, introduce a name field that can be used to distinguish between different handlers. Tested-by: Mark Pearson Tested-by: Matthew Schwartz Reviewed-by: Hans de Goede Reviewed-by: Mark Pearson Reviewed-by: Maximilian Luz Reviewed-by: Ilpo Järvinen Reviewed-by: Armin Wolf Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-2-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index f5492ed413f3..6fa988e41742 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -27,6 +27,7 @@ enum platform_profile_option { }; struct platform_profile_handler { + const char *name; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, enum platform_profile_option *profile); -- cgit v1.2.3 From 6f5e63ddc333dae371be6f8a8f70a82043697a4c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:18:59 -0600 Subject: ACPI: platform_profile: Add device pointer into platform profile handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to let platform profile handlers manage platform profile for their driver the core code will need a pointer to the device. Add this to the structure and use it in the trivial driver cases. 
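A hedged sketch of how a platform driver might fill in the extended handler described above: the probe function, callback bodies and driver name are hypothetical, and the profile_set() signature is assumed to mirror the profile_get() callback shown in the hunks:

#include <linux/bitops.h>
#include <linux/platform_device.h>
#include <linux/platform_profile.h>

static int example_profile_get(struct platform_profile_handler *pprof,
			       enum platform_profile_option *profile)
{
	*profile = PLATFORM_PROFILE_BALANCED;	/* query the firmware here */
	return 0;
}

static int example_profile_set(struct platform_profile_handler *pprof,
			       enum platform_profile_option profile)
{
	return 0;				/* program the firmware here */
}

static struct platform_profile_handler example_pprof = {
	.name = "example-driver",		/* hypothetical driver name */
	.profile_get = example_profile_get,
	.profile_set = example_profile_set,
};

static int example_probe(struct platform_device *pdev)
{
	example_pprof.dev = &pdev->dev;
	set_bit(PLATFORM_PROFILE_BALANCED, example_pprof.choices);
	set_bit(PLATFORM_PROFILE_PERFORMANCE, example_pprof.choices);

	return platform_profile_register(&example_pprof);
}
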
Reviewed-by: Armin Wolf Reviewed-by: Ilpo Järvinen Reviewed-by: Maximilian Luz Reviewed-by: Mark Pearson Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-4-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 6fa988e41742..daec6b9bad81 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,6 +28,7 @@ enum platform_profile_option { struct platform_profile_handler { const char *name; + struct device *dev; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, enum platform_profile_option *profile); -- cgit v1.2.3 From 9b3bb37b44a317626464e79da8b39989b421963f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:00 -0600 Subject: ACPI: platform_profile: Add platform handler argument to platform_profile_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To allow registering and unregistering multiple platform handlers calls to platform_profile_remove() will need to know which handler is to be removed. Add an argument for this. Tested-by: Mark Pearson Tested-by: Matthew Schwartz Reviewed-by: Hans de Goede Reviewed-by: Mark Pearson Reviewed-by: Maximilian Luz Reviewed-by: Ilpo Järvinen Reviewed-by: Armin Wolf Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-5-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index daec6b9bad81..bcaf3aa39160 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -37,7 +37,7 @@ struct platform_profile_handler { }; int platform_profile_register(struct platform_profile_handler *pprof); -int platform_profile_remove(void); +int platform_profile_remove(struct platform_profile_handler *pprof); int platform_profile_cycle(void); void platform_profile_notify(void); -- cgit v1.2.3 From 4d5c027bf55661da2621c694ea39908ae2d3a46a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:01 -0600 Subject: ACPI: platform_profile: Pass the profile handler into platform_profile_notify() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The profile handler will be used to notify the appropriate class devices. 
Reviewed-by: Armin Wolf Reviewed-by: Mark Pearson Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-6-mario.limonciello@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index bcaf3aa39160..8ec0b8da56db 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -39,6 +39,6 @@ struct platform_profile_handler { int platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_remove(struct platform_profile_handler *pprof); int platform_profile_cycle(void); -void platform_profile_notify(void); +void platform_profile_notify(struct platform_profile_handler *pprof); #endif /*_PLATFORM_PROFILE_H_*/ -- cgit v1.2.3 From 77be5cacb2c2d8c3ddd069f0b4e9408f553af1d8 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:06 -0600 Subject: ACPI: platform_profile: Create class for ACPI platform profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When registering a platform profile handler create a class device that will allow changing a single platform profile handler. The class and sysfs group are no longer needed when the platform profile core is a module and unloaded, so remove them at that time as well. Reviewed-by: Armin Wolf Tested-by: Mark Pearson Reviewed-by: Mark Pearson Reviewed-by: Ilpo Järvinen Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-11-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ec0b8da56db..a888fd085c51 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -29,6 +29,8 @@ enum platform_profile_option { struct platform_profile_handler { const char *name; struct device *dev; + struct device *class_dev; + int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, enum platform_profile_option *profile); -- cgit v1.2.3 From 494637cf5bf098ac0fe125dd6d23368419fe9da4 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 5 Dec 2024 21:19:12 -0600 Subject: ACPI: platform_profile: Add concept of a "custom" profile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two profile handlers don't agree on the current profile it's ambiguous what to show to the legacy sysfs interface. Add a "custom" profile string that userspace will be able to use the legacy sysfs interface to distinguish this situation.. Additionally drivers can choose to use this to indicate that a user has modified driver settings in a way that the platform profile advertised by a driver is not accurate. 
Reviewed-by: Armin Wolf Tested-by: Mark Pearson Reviewed-by: Mark Pearson Reviewed-by: Ilpo Järvinen Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20241206031918.1537-17-mario.limonciello@amd.com Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index a888fd085c51..0682bb4c57e5 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -23,6 +23,7 @@ enum platform_profile_option { PLATFORM_PROFILE_BALANCED, PLATFORM_PROFILE_BALANCED_PERFORMANCE, PLATFORM_PROFILE_PERFORMANCE, + PLATFORM_PROFILE_CUSTOM, PLATFORM_PROFILE_LAST, /*must always be last */ }; -- cgit v1.2.3 From 9029409d1a250da19f1086ab1113752411c5163d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 10 Dec 2024 22:55:49 +0100 Subject: power: supply: core: introduce power_supply_for_each_psy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All existing callers of power_supply_for_each_device() want to iterate over 'struct power_supply', not 'struct device'. The power_supply_for_each_device() forces each caller to duplicate the logic to go from one to the other. Introduce power_supply_for_each_psy() to simplify the callers. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241210-power-supply-dev_to_psy-v2-2-9d8c9d24cfe4@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index b98106e1a90f..11d54270eaa9 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -882,6 +882,7 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); extern void *power_supply_get_drvdata(struct power_supply *psy); extern int power_supply_for_each_device(void *data, int (*fn)(struct device *dev, void *data)); +extern int power_supply_for_each_psy(void *data, int (*fn)(struct power_supply *psy, void *data)); static inline bool power_supply_is_amp_property(enum power_supply_property psp) { -- cgit v1.2.3 From bfc330323cf3ea6d5c9985179384c0b56f2d5372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 10 Dec 2024 22:55:53 +0100 Subject: power: supply: core: remove power_supply_for_each_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users anymore. All potential future users are expected to use power_supply_for_each_psy(). 
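A minimal usage sketch for the new iterator, assuming the usual class_for_each_device() convention that a non-zero return from the callback stops the walk; the callback and counter are illustrative only:

#include <linux/power_supply.h>

static int count_psy_cb(struct power_supply *psy, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;
}

static unsigned int count_power_supplies(void)
{
	unsigned int count = 0;

	power_supply_for_each_psy(&count, count_psy_cb);
	return count;
}
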
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241210-power-supply-dev_to_psy-v2-6-9d8c9d24cfe4@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 11d54270eaa9..3d67f4a6a1c9 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -881,7 +881,6 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); #define to_power_supply(device) container_of(device, struct power_supply, dev) extern void *power_supply_get_drvdata(struct power_supply *psy); -extern int power_supply_for_each_device(void *data, int (*fn)(struct device *dev, void *data)); extern int power_supply_for_each_psy(void *data, int (*fn)(struct power_supply *psy, void *data)); static inline bool power_supply_is_amp_property(enum power_supply_property psp) -- cgit v1.2.3 From f52204036326bd9c07db08bab6607f423c801716 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 10 Dec 2024 22:55:54 +0100 Subject: power: supply: core: introduce dev_to_psy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The psy core and drivers currently use dev_get_drvdata() to go from a 'struct device' to its 'struct power_supply'. This is neither typesafe nor documented. Introduce a new helper to make this pattern explicit. Instead of using dev_get_drvdata(), use container_of_const(), which also preserves the constness. Furthermore 'dev' does not need to be dereferenced anymore, and at some point the drvdata could be reused for something else. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241210-power-supply-dev_to_psy-v2-7-9d8c9d24cfe4@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 3d67f4a6a1c9..17fc383785bf 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -318,6 +318,8 @@ struct power_supply { #endif }; +#define dev_to_psy(__dev) container_of_const(__dev, struct power_supply, dev) + /* * This is recommended structure to specify static power supply parameters. * Generic one, parametrizable for different power supplies. Power supply -- cgit v1.2.3 From be325f08c432ae5ac6d6594d163e1899cdf202df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:45 +0000 Subject: rtnetlink: add ndo_fdb_dump_context rtnl_fdb_dump() and various ndo_fdb_dump() helpers share a hidden layout of cb->ctx. Before switching rtnl_fdb_dump() to for_each_netdev_dump() in the following patch, make this more explicit. Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 811ce44113f6..c43cffb014a7 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -178,6 +178,13 @@ void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); +/* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers.
*/ +struct ndo_fdb_dump_context { + unsigned long s_h; + unsigned long s_idx; + unsigned long fdb_idx; +}; + extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, -- cgit v1.2.3 From 53970a05f799087e2dd2005973609188504e7fcc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:46 +0000 Subject: rtnetlink: switch rtnl_fdb_dump() to for_each_netdev_dump() This is the last netdev iterator still using net->dev_index_head[]. Convert to modern for_each_netdev_dump() for better scalability, and use common patterns in our stack. Following patch in this series removes the pad field in struct ndo_fdb_dump_context. Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c43cffb014a7..5546571c2553 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -180,8 +180,8 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ struct ndo_fdb_dump_context { - unsigned long s_h; - unsigned long s_idx; + unsigned long ifindex; + unsigned long pad; unsigned long fdb_idx; }; -- cgit v1.2.3 From 53a6d8912372fc23ea82cc7a49eb59047aa0a650 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Dec 2024 10:07:47 +0000 Subject: rtnetlink: remove pad field in ndo_fdb_dump_context I chose to remove this field in a separate patch to ease potential bisection, in case one ndo_fdb_dump() is still using the old way (cb->args[2] instead of ctx->fdb_idx) Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241209100747.2269613-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 5546571c2553..3b9d132cbc9e 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -181,7 +181,6 @@ void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); /* Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers. */ struct ndo_fdb_dump_context { unsigned long ifindex; - unsigned long pad; unsigned long fdb_idx; }; -- cgit v1.2.3 From 51d20d1dacbec589d459e11fc88fbca419f84a99 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:40 +0800 Subject: iomap: fix zero padding data issue in concurrent append writes During concurrent append writes to XFS filesystem, zero padding data may appear in the file after power failure. This happens due to imprecise disk size updates when handling write completion. 
Consider this scenario with concurrent append writes same file: Thread 1: Thread 2: ------------ ----------- write [A, A+B] update inode size to A+B submit I/O [A, A+BS] write [A+B, A+B+C] update inode size to A+B+C After reboot: 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C] |< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000| ^ ^ ^ A A+B A+B+C (EOF) 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS] |< Block Size (BS) >|< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000| ^ ^ ^ ^ A A+B A+BS A+B+C (EOF) D = Valid Data 0 = Zero Padding The issue stems from disk size being set to min(io_offset + io_size, inode->i_size) at I/O completion. Since io_offset+io_size is block size granularity, it may exceed the actual valid file data size. In the case of concurrent append writes, inode->i_size may be larger than the actual range of valid file data written to disk, leading to inaccurate disk size updates. This patch modifies the meaning of io_size to represent the size of valid data within EOF in an ioend. If the ioend spans beyond i_size, io_size will be trimmed to provide the file with more accurate size information. This is particularly useful for on-disk size updates at completion time. After this change, ioends that span i_size will not grow or merge with other ioends in concurrent scenarios. However, these cases that need growth/merging rarely occur and it seems no noticeable performance impact. Although rounding up io_size could enable ioend growth/merging in these scenarios, we decided to keep the code simple after discussion [1]. Another benefit is that it makes the xfs_ioend_is_append() check more accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio() in certain scenarios, such as repeated writes at the file tail without extending the file size. Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c..75bf54e76f3b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ -- cgit v1.2.3 From 5aec7c065fba0c56d6c1ea5d629395210f174be8 Mon Sep 17 00:00:00 2001 From: James Clark Date: Thu, 28 Nov 2024 12:14:14 +0000 Subject: coresight: Drop atomics in connection refcounts These belong to the device being enabled or disabled and are only ever used inside the device's spinlock. Remove the atomics to not imply that there are any other concurrent accesses. If atomics were necessary I don't think they would have been enough anyway. There would be nothing to prevent an enable or disable running concurrently if not for the spinlock. 
Signed-off-by: James Clark Reviewed-by: Yeoreum Yun Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241128121414.2425119-1-james.clark@linaro.org --- include/linux/coresight.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c13342594278..834029cb9ba2 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -200,8 +200,8 @@ struct coresight_connection { struct coresight_device *dest_dev; struct coresight_sysfs_link *link; struct coresight_device *src_dev; - atomic_t src_refcnt; - atomic_t dest_refcnt; + int src_refcnt; + int dest_refcnt; }; /** -- cgit v1.2.3 From fd9b7e8e9fbc23d69fa4accc881dea2cf13a2e2e Mon Sep 17 00:00:00 2001 From: Mao Jinlong Date: Thu, 21 Nov 2024 14:28:28 +0800 Subject: coresight: Add support to get static id for system trace sources Dynamic trace id was introduced in coresight subsystem, so trace id is allocated dynamically. However, some hardware ATB source has static trace id and it cannot be changed via software programming. For such source, it can call coresight_get_static_trace_id to get the fixed trace id from device node and pass id to coresight_trace_id_get_static_system_id to reserve the id. Signed-off-by: Mao Jinlong Reviewed-by: Mike Leach Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241121062829.11571-3-quic_jinlmao@quicinc.com --- include/linux/coresight.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 834029cb9ba2..055ce5cd5c44 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -662,6 +662,7 @@ void coresight_relaxed_write64(struct coresight_device *csdev, void coresight_write64(struct coresight_device *csdev, u64 val, u32 offset); extern int coresight_get_cpu(struct device *dev); +extern int coresight_get_static_trace_id(struct device *dev, u32 *id); struct coresight_platform_data *coresight_get_platform_data(struct device *dev); struct coresight_connection * -- cgit v1.2.3 From eb708cd631a8dca17ff004ccc39bbeb096c1db22 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 11 Dec 2024 13:35:58 +0000 Subject: regmap: regmap_multi_reg_read(): make register list const Mark the list of registers passed into regmap_multi_reg_read() as a pointer to const. This allows the caller to define the register list as const data. This requires making the same change to _regmap_bulk_read(), which is called by regmap_multi_reg_read(). 
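As an aside (not part of this patch), the const-correct prototype lets callers keep their register lists in rodata. A minimal sketch, with a made-up device and register addresses:

  /* Hypothetical register list; addresses are illustrative only. */
  static const unsigned int thermal_regs[] = { 0x10, 0x14, 0x18 };

  static int read_thermal(struct regmap *map, unsigned int *vals)
  {
          /* One bulk read for all three registers; the const array can
           * now live in rodata thanks to this change.
           */
          return regmap_multi_reg_read(map, thermal_regs, vals,
                                       ARRAY_SIZE(thermal_regs));
  }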
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241211133558.884669-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index fd41baccbf3e..3871c74f7677 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1244,7 +1244,7 @@ int regmap_noinc_read(struct regmap *map, unsigned int reg, void *val, size_t val_len); int regmap_bulk_read(struct regmap *map, unsigned int reg, void *val, size_t val_count); -int regmap_multi_reg_read(struct regmap *map, unsigned int *reg, void *val, +int regmap_multi_reg_read(struct regmap *map, const unsigned int *reg, void *val, size_t val_count); int regmap_update_bits_base(struct regmap *map, unsigned int reg, unsigned int mask, unsigned int val, -- cgit v1.2.3 From 8392bc2ff8c8bf7c4c5e6dfa71ccd893a3c046f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Nov 2024 10:30:29 -0500 Subject: fsnotify: generate pre-content permission event on page fault FS_PRE_ACCESS will be generated on page fault depending on the faulting method. This pre-content event is meant to be used by hierarchical storage managers that want to fill in the file content on first read access. Export a simple helper that file systems that have their own ->fault() will use, and a more complicated helper to do fancy things in filemap_fault. Signed-off-by: Josef Bacik Signed-off-by: Jan Kara Link: https://patch.msgid.link/aa56c50ce81b1fd18d7f5d71dd2dfced5eba9687.1731684329.git.josef@toxicpanda.com --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..e6c3c9cbcfe5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3420,6 +3420,7 @@ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ -- cgit v1.2.3 From 0357ef03c94ef835bd44a0658b8edb672a9dbf51 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Nov 2024 15:25:32 +0100 Subject: fs: don't block write during exec on pre-content watched files Commit 2a010c412853 ("fs: don't block i_writecount during exec") removed the legacy behavior of getting ETXTBSY on an attempt to open an executable file for write while it is being executed. That commit was reverted because an application that depends on this legacy behavior was broken by the change. We need to allow HSM writing into executable files while they are executed, to fill their content on-the-fly. To that end, disable the ETXTBSY legacy behavior for files that are watched by pre-content events. This change is not expected to cause regressions with existing systems which do not have any pre-content event listeners.
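A rough sketch of an exec-path caller switching to the new wrapper (not part of this patch; the function name is hypothetical and the surrounding logic is trimmed):

  static int sketch_take_exec_file(struct file *exe_file)
  {
          int err;

          /* Returns 0 without bumping the deny count when the file is
           * watched by pre-content (HSM) events, so writes stay allowed.
           */
          err = exe_file_deny_write_access(exe_file);
          if (err)
                  return err;     /* legacy ETXTBSY path: file open for write */

          /* exec the file; release later with exe_file_allow_write_access() */
          return 0;
  }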
Signed-off-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Jan Kara Link: https://patch.msgid.link/20241128142532.465176-1-amir73il@gmail.com --- include/linux/fs.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f4d59464965..a1230c40fef1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3095,6 +3095,28 @@ static inline void allow_write_access(struct file *file) if (file) atomic_inc(&file_inode(file)->i_writecount); } + +/* + * Do not prevent write to executable file when watched by pre-content events. + * + * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at + * the time of file open and remains constant for entire lifetime of the file, + * so if pre-content watches are added post execution or removed before the end + * of the execution, it will not cause i_writecount reference leak. + */ +static inline int exe_file_deny_write_access(struct file *exe_file) +{ + if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) + return 0; + return deny_write_access(exe_file); +} +static inline void exe_file_allow_write_access(struct file *exe_file) +{ + if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) + return; + allow_write_access(exe_file); +} + static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; -- cgit v1.2.3 From a87ef09b1fdf75fdc2d6b386ff23a35589173055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 Dec 2024 18:28:36 +0100 Subject: iio: adc: ad_sigma_delta: Add support for reading irq status using a GPIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the ADCs by Analog signal their irq condition on the MISO line. So typically that line is connected to an SPI controller and a GPIO. The GPIO is used as input and the respective interrupt is enabled when the last SPI transfer is completed. Depending on the GPIO controller the toggling MISO line might make the interrupt pending even while it's masked. In that case the irq handler is called immediately after irq_enable() and so before the device actually pulls that line low which results in non-sense values being reported to the upper layers. The only way to find out if the line was actually pulled low is to read the GPIO. (There is a flag in AD7124's status register that also signals if an interrupt was asserted, but reading that register toggles the MISO line and so might trigger another spurious interrupt.) Add the possibility to specify an interrupt GPIO in the machine description in addition to the plain interrupt. This GPIO is used then to check if the irq line is actually active in the irq handler. 
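A sketch of the handler-side check described above (not part of this patch; the handler name is hypothetical and the GPIO is assumed to be annotated as active-low in the machine description):

  static irqreturn_t sketch_data_rdy_irq(int irq, void *private)
  {
          struct ad_sigma_delta *sigma_delta = private;

          /* With an active-low annotation, a logical 0 means the line is
           * still high, i.e. the device has not pulled RDY low yet and
           * the interrupt was spurious.
           */
          if (sigma_delta->rdy_gpiod &&
              !gpiod_get_value(sigma_delta->rdy_gpiod))
                  return IRQ_NONE;

          complete(&sigma_delta->completion);
          disable_irq_nosync(irq);
          sigma_delta->irq_dis = true;

          return IRQ_HANDLED;
  }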
Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/5be9a4cc4dc600ec384c88db01dd661a21506b9c.1733504533.git.u.kleine-koenig@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 1851f8fed3a4..895b7ebf4be5 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -29,6 +29,7 @@ struct ad_sd_calib_data { struct ad_sigma_delta; struct device; +struct gpio_desc; struct iio_dev; /** @@ -96,6 +97,7 @@ struct ad_sigma_delta { unsigned int active_slots; unsigned int current_slot; unsigned int num_slots; + struct gpio_desc *rdy_gpiod; int irq_line; bool status_appended; /* map slots to channels in order to know what to expect from devices */ -- cgit v1.2.3 From f522589c139debb8af56dbead0c6e9dfca2d5ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 Dec 2024 18:28:38 +0100 Subject: iio: adc: ad_sigma_delta: Fix a race condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ad_sigma_delta driver helper uses irq_disable_nosync(). With that one it is possible that the irq handler still runs after the irq_disable_nosync() function call returns. Also to properly synchronize irq disabling in the different threads proper locking is needed and because it's unclear if the irq handler's irq_disable_nosync() call comes first or the one in the enabler's error path, all code locations that disable the irq must check for .irq_dis first to ensure there is exactly one disable call per enable call. So add a spinlock to the struct ad_sigma_delta and use it to synchronize irq enabling and disabling. Also only act in the irq handler if the irq is still enabled. Fixes: af3008485ea0 ("iio:adc: Add common code for ADI Sigma Delta devices") Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/9e6def47e2e773e0e15b7a2c29d22629b53d91b1.1733504533.git.u.kleine-koenig@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 895b7ebf4be5..200130e4244d 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -86,6 +86,7 @@ struct ad_sigma_delta { /* private: */ struct completion completion; + spinlock_t irq_lock; /* protects .irq_dis and irq en/disable state */ bool irq_dis; bool bus_locked; -- cgit v1.2.3 From 07a28874bb49700036a3ab435dd95ae31afd21ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 6 Dec 2024 18:28:39 +0100 Subject: iio: adc: ad_sigma_delta: Store information about reset sequence length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The various chips can be reset using a sequence of SPI transfers with MOSI = 1. The length of such a sequence varies from chip to chip. Store that length in struct ad_sigma_delta_info and replace the respective parameter to ad_sd_reset() with it. Note the ad7192 used to pass 48 as length but the documentation specifies 40 as the required length. Assuming the latter is right. (Using a too long sequence doesn't hurt apart from using a longer spi transfer than necessary, so this is no relevant fix.) 
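As an illustration (not part of this patch), a chip driver would describe its reset length roughly like this, with the other callbacks and fields omitted and 40 matching the length discussed above:

  static const struct ad_sigma_delta_info ad7192_sd_info_sketch = {
          /* 40 SPI clock cycles with MOSI=1 reset the chip */
          .num_resetclks = 40,
          .irq_flags = IRQF_TRIGGER_FALLING,
  };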
The motivation for storing this information is that this is useful to clear a pending R̅D̅Y̅ signal in the next change. Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/9750db62fce638bf140ff48172c23bff7f785e5b.1733504533.git.u.kleine-koenig@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/ad_sigma_delta.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 200130e4244d..417073c52380 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -54,6 +54,7 @@ struct iio_dev; * @irq_flags: flags for the interrupt used by the triggered buffer * @num_slots: Number of sequencer slots * @irq_line: IRQ for reading conversions. If 0, spi->irq will be used + * @num_resetclks: Number of SPI clk cycles with MOSI=1 to reset the chip. */ struct ad_sigma_delta_info { int (*set_channel)(struct ad_sigma_delta *, unsigned int channel); @@ -70,6 +71,7 @@ struct ad_sigma_delta_info { unsigned long irq_flags; unsigned int num_slots; int irq_line; + unsigned int num_resetclks; }; /** @@ -181,8 +183,7 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); -int ad_sd_reset(struct ad_sigma_delta *sigma_delta, - unsigned int reset_length); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta); int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); -- cgit v1.2.3 From 22ccb0a1c57c436de899ccd3170d6d2ce7238836 Mon Sep 17 00:00:00 2001 From: Matteo Martelli Date: Mon, 2 Dec 2024 16:11:07 +0100 Subject: iio: consumers: ensure read buffers for labels and ext_info are page aligned Attributes of iio providers are exposed via sysfs. Typically, providers pass attribute values to the iio core, which handles formatting and printing to sysfs. However, some attributes, such as labels or extended info, are directly formatted and printed to sysfs by provider drivers using sysfs_emit() and sysfs_emit_at(). These helpers assume the read buffer, allocated by sysfs fop, is page-aligned. When these attributes are accessed by consumer drivers, the read buffer is allocated by the consumer and may not be page-aligned, leading to failures in the provider's callback that utilizes sysfs_emit*. Add a check to ensure that read buffers for labels and external info attributes are page-aligned. Update the prototype documentation as well. Signed-off-by: Matteo Martelli Link: https://patch.msgid.link/20241202-iio-kmalloc-align-v1-1-aa9568c03937@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 333d1d8ccb37..6a4479616479 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -418,7 +418,7 @@ unsigned int iio_get_channel_ext_info_count(struct iio_channel *chan); * @chan: The channel being queried. * @attr: The ext_info attribute to read. * @buf: Where to store the attribute value. Assumed to hold - * at least PAGE_SIZE bytes. + * at least PAGE_SIZE bytes and to be aligned at PAGE_SIZE. * * Returns the number of bytes written to buf (perhaps w/o zero termination; * it need not even be a string), or an error code. 
@@ -445,7 +445,7 @@ ssize_t iio_write_channel_ext_info(struct iio_channel *chan, const char *attr, * iio_read_channel_label() - read label for a given channel * @chan: The channel being queried. * @buf: Where to store the attribute value. Assumed to hold - * at least PAGE_SIZE bytes. + * at least PAGE_SIZE bytes and to be aligned at PAGE_SIZE. * * Returns the number of bytes written to buf, or an error code. */ -- cgit v1.2.3 From bad6722e478f5b17a5ceb039dfb4c680cf2c0b48 Mon Sep 17 00:00:00 2001 From: Eliav Farber Date: Wed, 4 Dec 2024 14:20:02 +0000 Subject: kexec: Consolidate machine_kexec_mask_interrupts() implementation Consolidate the machine_kexec_mask_interrupts implementation into a common function located in a new file: kernel/irq/kexec.c. This removes duplicate implementations from architecture-specific files in arch/arm, arch/arm64, arch/powerpc, and arch/riscv, reducing code duplication and improving maintainability. The new implementation retains architecture-specific behavior for CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD, which was previously implemented for ARM64. When enabled (currently for ARM64), it clears the active state of interrupts forwarded to virtual machines (VMs) before handling other interrupt masking operations. Signed-off-by: Eliav Farber Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241204142003.32859-2-farbere@amazon.com --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index fa711f80957b..25f51bf3c351 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -694,6 +694,9 @@ extern int irq_chip_request_resources_parent(struct irq_data *data); extern void irq_chip_release_resources_parent(struct irq_data *data); #endif +/* Disable or mask interrupts during a kernel kexec */ +extern void machine_kexec_mask_interrupts(void); + /* Handling of unhandled and spurious interrupts: */ extern void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret); -- cgit v1.2.3 From 41d7ea30494cc0dde3e124a75ce0add93f988ba9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 10 Dec 2024 12:27:12 -0800 Subject: lib: packing: add pack_fields() and unpack_fields() This is new API which caters to the following requirements: - Pack or unpack a large number of fields to/from a buffer with a small code footprint. The current alternative is to open-code a large number of calls to pack() and unpack(), or to use packing() to reduce that number to half. But packing() is not const-correct. - Use unpacked numbers stored in variables smaller than u64. This reduces the rodata footprint of the stored field arrays. - Perform error checking at compile time, rather than runtime, and return void from the API functions. Because the C preprocessor can't generate variable length code (loops), this is a bit tricky to do with macros. To handle this, implement macros which sanity check the packed field definitions based on their size. Finally, a single macro with a chain of __builtin_choose_expr() is used to select the appropriate macros. We enforce the use of ascending or descending order to avoid O(N^2) scaling when checking for overlap. Note that the macros are written with care to ensure that the compilers can correctly evaluate the resulting code at compile time. In particular, care was taken with avoiding too many nested statement expressions. 
Nested statement expressions trip up some compilers, especially when passing down variables created in previous statement expressions. There are two key design choices intended to keep the overall macro code size small. First, the definition of each CHECK_PACKED_FIELDS_N macro is implemented recursively, by calling the N-1 macro. This avoids needing the code to repeat multiple times. Second, the CHECK_PACKED_FIELD macro enforces that the fields in the array are sorted in order. This allows checking for overlap only with neighboring fields, rather than the general overlap case where each field would need to be checked against other fields. The overlap checks use the first two fields to determine the order of the remaining fields, thus allowing either ascending or descending order. This enables drivers the flexibility to keep the fields ordered in which ever order most naturally fits their hardware design and its associated documentation. The CHECK_PACKED_FIELDS macro is directly called from within pack_fields and unpack_fields, ensuring that all drivers using the API receive the benefits of the compile-time checks. Users do not need to directly call any of the macros directly. The CHECK_PACKED_FIELDS and its helper macros CHECK_PACKED_FIELDS_(0..50) are generated using a simple C program in scripts/gen_packed_field_checks.c This program can be compiled on demand and executed to generate the macro code in include/linux/packing.h. This will aid in the event that a driver needs more than 50 fields. The generator can be updated with a new size, and used to update the packing.h header file. In practice, the ice driver will need to support 27 fields, and the sja1105 driver will need to support 0 fields. This on-demand generation avoids the need to modify Kbuild. We do not anticipate the maximum number of fields to grow very often. - Reduced rodata footprint for the storage of the packed field arrays. To that end, we have struct packed_field_u8 and packed_field_u16, which define the fields with the associated type. More can be added as needed (unlikely for now). On these types, the same generic pack_fields() and unpack_fields() API can be used, thanks to the new C11 _Generic() selection feature, which can call pack_fields_u8() or pack_fields_16(), depending on the type of the "fields" array - a simplistic form of polymorphism. It is evaluated at compile time which function will actually be called. Over time, packing() is expected to be completely replaced either with pack() or with pack_fields(). Signed-off-by: Vladimir Oltean Co-developed-by: Jacob Keller Signed-off-by: Jacob Keller Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20241210-packing-pack-fields-and-ice-implementation-v10-3-ee56a47479ac@intel.com Signed-off-by: Jakub Kicinski --- include/linux/packing.h | 425 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 425 insertions(+) (limited to 'include') diff --git a/include/linux/packing.h b/include/linux/packing.h index 5d36dcd06f60..0589d70bbe04 100644 --- a/include/linux/packing.h +++ b/include/linux/packing.h @@ -8,6 +8,83 @@ #include #include +#define GEN_PACKED_FIELD_STRUCT(__type) \ + struct packed_field_ ## __type { \ + __type startbit; \ + __type endbit; \ + __type offset; \ + __type size; \ + } + +/* struct packed_field_u8. Use with bit offsets < 256, buffers < 32B and + * unpacked structures < 256B. + */ +GEN_PACKED_FIELD_STRUCT(u8); + +/* struct packed_field_u16. Use with bit offsets < 65536, buffers < 8KB and + * unpacked structures < 64KB. 
+ */ +GEN_PACKED_FIELD_STRUCT(u16); + +#define PACKED_FIELD(start, end, struct_name, struct_field) \ +{ \ + (start), \ + (end), \ + offsetof(struct_name, struct_field), \ + sizeof_field(struct_name, struct_field), \ +} + +#define CHECK_PACKED_FIELD_OVERLAP(fields, index1, index2) ({ \ + typeof(&(fields)[0]) __f = (fields); \ + typeof(__f[0]) _f1 = __f[index1]; typeof(__f[0]) _f2 = __f[index2]; \ + const bool _ascending = __f[0].startbit < __f[1].startbit; \ + BUILD_BUG_ON_MSG(_ascending && _f1.startbit >= _f2.startbit, \ + __stringify(fields) " field " __stringify(index2) \ + " breaks ascending order"); \ + BUILD_BUG_ON_MSG(!_ascending && _f1.startbit <= _f2.startbit, \ + __stringify(fields) " field " __stringify(index2) \ + " breaks descending order"); \ + BUILD_BUG_ON_MSG(max(_f1.endbit, _f2.endbit) <= \ + min(_f1.startbit, _f2.startbit), \ + __stringify(fields) " field " __stringify(index2) \ + " overlaps with previous field"); \ +}) + +#define CHECK_PACKED_FIELD(fields, index) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(_f[0]) __f = _f[index]; \ + BUILD_BUG_ON_MSG(__f.startbit < __f.endbit, \ + __stringify(fields) " field " __stringify(index) \ + " start bit must not be smaller than end bit"); \ + BUILD_BUG_ON_MSG(__f.size != 1 && __f.size != 2 && \ + __f.size != 4 && __f.size != 8, \ + __stringify(fields) " field " __stringify(index) \ + " has unsupported unpacked storage size"); \ + BUILD_BUG_ON_MSG(__f.startbit - __f.endbit >= BITS_PER_BYTE * __f.size, \ + __stringify(fields) " field " __stringify(index) \ + " exceeds unpacked storage size"); \ + __builtin_choose_expr(index != 0, \ + CHECK_PACKED_FIELD_OVERLAP(fields, index - 1, index), \ + 1); \ +}) + +/* Note that the packed fields may be either in ascending or descending order. + * Thus, we must check that both the first and last field wit within the + * packed buffer size. + */ +#define CHECK_PACKED_FIELDS_SIZE(fields, pbuflen) ({ \ + typeof(&(fields)[0]) _f = (fields); \ + typeof(pbuflen) _len = (pbuflen); \ + const size_t num_fields = ARRAY_SIZE(fields); \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_len), \ + __stringify(fields) " pbuflen " __stringify(pbuflen) \ + " must be a compile time constant"); \ + BUILD_BUG_ON_MSG(_f[0].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " first field exceeds packed buffer size"); \ + BUILD_BUG_ON_MSG(_f[num_fields - 1].startbit >= BITS_PER_BYTE * _len, \ + __stringify(fields) " last field exceeds packed buffer size"); \ +}) + #define QUIRK_MSB_ON_THE_RIGHT BIT(0) #define QUIRK_LITTLE_ENDIAN BIT(1) #define QUIRK_LSW32_IS_FIRST BIT(2) @@ -26,4 +103,352 @@ int pack(void *pbuf, u64 uval, size_t startbit, size_t endbit, size_t pbuflen, int unpack(const void *pbuf, u64 *uval, size_t startbit, size_t endbit, size_t pbuflen, u8 quirks); +void pack_fields_u8(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void pack_fields_u16(void *pbuf, size_t pbuflen, const void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u8(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u8 *fields, size_t num_fields, + u8 quirks); + +void unpack_fields_u16(const void *pbuf, size_t pbuflen, void *ustruct, + const struct packed_field_u16 *fields, size_t num_fields, + u8 quirks); + +/* Do not hand-edit the following packed field check macros! 
+ * + * They are generated using scripts/gen_packed_field_checks.c, which may be + * built via "make scripts_gen_packed_field_checks". If larger macro sizes are + * needed in the future, please use this program to re-generate the macros and + * insert them here. + */ + +#define CHECK_PACKED_FIELDS_1(fields) \ + CHECK_PACKED_FIELD(fields, 0) + +#define CHECK_PACKED_FIELDS_2(fields) do { \ + CHECK_PACKED_FIELDS_1(fields); \ + CHECK_PACKED_FIELD(fields, 1); \ +} while (0) + +#define CHECK_PACKED_FIELDS_3(fields) do { \ + CHECK_PACKED_FIELDS_2(fields); \ + CHECK_PACKED_FIELD(fields, 2); \ +} while (0) + +#define CHECK_PACKED_FIELDS_4(fields) do { \ + CHECK_PACKED_FIELDS_3(fields); \ + CHECK_PACKED_FIELD(fields, 3); \ +} while (0) + +#define CHECK_PACKED_FIELDS_5(fields) do { \ + CHECK_PACKED_FIELDS_4(fields); \ + CHECK_PACKED_FIELD(fields, 4); \ +} while (0) + +#define CHECK_PACKED_FIELDS_6(fields) do { \ + CHECK_PACKED_FIELDS_5(fields); \ + CHECK_PACKED_FIELD(fields, 5); \ +} while (0) + +#define CHECK_PACKED_FIELDS_7(fields) do { \ + CHECK_PACKED_FIELDS_6(fields); \ + CHECK_PACKED_FIELD(fields, 6); \ +} while (0) + +#define CHECK_PACKED_FIELDS_8(fields) do { \ + CHECK_PACKED_FIELDS_7(fields); \ + CHECK_PACKED_FIELD(fields, 7); \ +} while (0) + +#define CHECK_PACKED_FIELDS_9(fields) do { \ + CHECK_PACKED_FIELDS_8(fields); \ + CHECK_PACKED_FIELD(fields, 8); \ +} while (0) + +#define CHECK_PACKED_FIELDS_10(fields) do { \ + CHECK_PACKED_FIELDS_9(fields); \ + CHECK_PACKED_FIELD(fields, 9); \ +} while (0) + +#define CHECK_PACKED_FIELDS_11(fields) do { \ + CHECK_PACKED_FIELDS_10(fields); \ + CHECK_PACKED_FIELD(fields, 10); \ +} while (0) + +#define CHECK_PACKED_FIELDS_12(fields) do { \ + CHECK_PACKED_FIELDS_11(fields); \ + CHECK_PACKED_FIELD(fields, 11); \ +} while (0) + +#define CHECK_PACKED_FIELDS_13(fields) do { \ + CHECK_PACKED_FIELDS_12(fields); \ + CHECK_PACKED_FIELD(fields, 12); \ +} while (0) + +#define CHECK_PACKED_FIELDS_14(fields) do { \ + CHECK_PACKED_FIELDS_13(fields); \ + CHECK_PACKED_FIELD(fields, 13); \ +} while (0) + +#define CHECK_PACKED_FIELDS_15(fields) do { \ + CHECK_PACKED_FIELDS_14(fields); \ + CHECK_PACKED_FIELD(fields, 14); \ +} while (0) + +#define CHECK_PACKED_FIELDS_16(fields) do { \ + CHECK_PACKED_FIELDS_15(fields); \ + CHECK_PACKED_FIELD(fields, 15); \ +} while (0) + +#define CHECK_PACKED_FIELDS_17(fields) do { \ + CHECK_PACKED_FIELDS_16(fields); \ + CHECK_PACKED_FIELD(fields, 16); \ +} while (0) + +#define CHECK_PACKED_FIELDS_18(fields) do { \ + CHECK_PACKED_FIELDS_17(fields); \ + CHECK_PACKED_FIELD(fields, 17); \ +} while (0) + +#define CHECK_PACKED_FIELDS_19(fields) do { \ + CHECK_PACKED_FIELDS_18(fields); \ + CHECK_PACKED_FIELD(fields, 18); \ +} while (0) + +#define CHECK_PACKED_FIELDS_20(fields) do { \ + CHECK_PACKED_FIELDS_19(fields); \ + CHECK_PACKED_FIELD(fields, 19); \ +} while (0) + +#define CHECK_PACKED_FIELDS_21(fields) do { \ + CHECK_PACKED_FIELDS_20(fields); \ + CHECK_PACKED_FIELD(fields, 20); \ +} while (0) + +#define CHECK_PACKED_FIELDS_22(fields) do { \ + CHECK_PACKED_FIELDS_21(fields); \ + CHECK_PACKED_FIELD(fields, 21); \ +} while (0) + +#define CHECK_PACKED_FIELDS_23(fields) do { \ + CHECK_PACKED_FIELDS_22(fields); \ + CHECK_PACKED_FIELD(fields, 22); \ +} while (0) + +#define CHECK_PACKED_FIELDS_24(fields) do { \ + CHECK_PACKED_FIELDS_23(fields); \ + CHECK_PACKED_FIELD(fields, 23); \ +} while (0) + +#define CHECK_PACKED_FIELDS_25(fields) do { \ + CHECK_PACKED_FIELDS_24(fields); \ + CHECK_PACKED_FIELD(fields, 24); \ +} while (0) + +#define 
CHECK_PACKED_FIELDS_26(fields) do { \ + CHECK_PACKED_FIELDS_25(fields); \ + CHECK_PACKED_FIELD(fields, 25); \ +} while (0) + +#define CHECK_PACKED_FIELDS_27(fields) do { \ + CHECK_PACKED_FIELDS_26(fields); \ + CHECK_PACKED_FIELD(fields, 26); \ +} while (0) + +#define CHECK_PACKED_FIELDS_28(fields) do { \ + CHECK_PACKED_FIELDS_27(fields); \ + CHECK_PACKED_FIELD(fields, 27); \ +} while (0) + +#define CHECK_PACKED_FIELDS_29(fields) do { \ + CHECK_PACKED_FIELDS_28(fields); \ + CHECK_PACKED_FIELD(fields, 28); \ +} while (0) + +#define CHECK_PACKED_FIELDS_30(fields) do { \ + CHECK_PACKED_FIELDS_29(fields); \ + CHECK_PACKED_FIELD(fields, 29); \ +} while (0) + +#define CHECK_PACKED_FIELDS_31(fields) do { \ + CHECK_PACKED_FIELDS_30(fields); \ + CHECK_PACKED_FIELD(fields, 30); \ +} while (0) + +#define CHECK_PACKED_FIELDS_32(fields) do { \ + CHECK_PACKED_FIELDS_31(fields); \ + CHECK_PACKED_FIELD(fields, 31); \ +} while (0) + +#define CHECK_PACKED_FIELDS_33(fields) do { \ + CHECK_PACKED_FIELDS_32(fields); \ + CHECK_PACKED_FIELD(fields, 32); \ +} while (0) + +#define CHECK_PACKED_FIELDS_34(fields) do { \ + CHECK_PACKED_FIELDS_33(fields); \ + CHECK_PACKED_FIELD(fields, 33); \ +} while (0) + +#define CHECK_PACKED_FIELDS_35(fields) do { \ + CHECK_PACKED_FIELDS_34(fields); \ + CHECK_PACKED_FIELD(fields, 34); \ +} while (0) + +#define CHECK_PACKED_FIELDS_36(fields) do { \ + CHECK_PACKED_FIELDS_35(fields); \ + CHECK_PACKED_FIELD(fields, 35); \ +} while (0) + +#define CHECK_PACKED_FIELDS_37(fields) do { \ + CHECK_PACKED_FIELDS_36(fields); \ + CHECK_PACKED_FIELD(fields, 36); \ +} while (0) + +#define CHECK_PACKED_FIELDS_38(fields) do { \ + CHECK_PACKED_FIELDS_37(fields); \ + CHECK_PACKED_FIELD(fields, 37); \ +} while (0) + +#define CHECK_PACKED_FIELDS_39(fields) do { \ + CHECK_PACKED_FIELDS_38(fields); \ + CHECK_PACKED_FIELD(fields, 38); \ +} while (0) + +#define CHECK_PACKED_FIELDS_40(fields) do { \ + CHECK_PACKED_FIELDS_39(fields); \ + CHECK_PACKED_FIELD(fields, 39); \ +} while (0) + +#define CHECK_PACKED_FIELDS_41(fields) do { \ + CHECK_PACKED_FIELDS_40(fields); \ + CHECK_PACKED_FIELD(fields, 40); \ +} while (0) + +#define CHECK_PACKED_FIELDS_42(fields) do { \ + CHECK_PACKED_FIELDS_41(fields); \ + CHECK_PACKED_FIELD(fields, 41); \ +} while (0) + +#define CHECK_PACKED_FIELDS_43(fields) do { \ + CHECK_PACKED_FIELDS_42(fields); \ + CHECK_PACKED_FIELD(fields, 42); \ +} while (0) + +#define CHECK_PACKED_FIELDS_44(fields) do { \ + CHECK_PACKED_FIELDS_43(fields); \ + CHECK_PACKED_FIELD(fields, 43); \ +} while (0) + +#define CHECK_PACKED_FIELDS_45(fields) do { \ + CHECK_PACKED_FIELDS_44(fields); \ + CHECK_PACKED_FIELD(fields, 44); \ +} while (0) + +#define CHECK_PACKED_FIELDS_46(fields) do { \ + CHECK_PACKED_FIELDS_45(fields); \ + CHECK_PACKED_FIELD(fields, 45); \ +} while (0) + +#define CHECK_PACKED_FIELDS_47(fields) do { \ + CHECK_PACKED_FIELDS_46(fields); \ + CHECK_PACKED_FIELD(fields, 46); \ +} while (0) + +#define CHECK_PACKED_FIELDS_48(fields) do { \ + CHECK_PACKED_FIELDS_47(fields); \ + CHECK_PACKED_FIELD(fields, 47); \ +} while (0) + +#define CHECK_PACKED_FIELDS_49(fields) do { \ + CHECK_PACKED_FIELDS_48(fields); \ + CHECK_PACKED_FIELD(fields, 48); \ +} while (0) + +#define CHECK_PACKED_FIELDS_50(fields) do { \ + CHECK_PACKED_FIELDS_49(fields); \ + CHECK_PACKED_FIELD(fields, 49); \ +} while (0) + +#define CHECK_PACKED_FIELDS(fields) \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 1, ({ CHECK_PACKED_FIELDS_1(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 2, ({ CHECK_PACKED_FIELDS_2(fields); 
}), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 3, ({ CHECK_PACKED_FIELDS_3(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 4, ({ CHECK_PACKED_FIELDS_4(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 5, ({ CHECK_PACKED_FIELDS_5(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 6, ({ CHECK_PACKED_FIELDS_6(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 7, ({ CHECK_PACKED_FIELDS_7(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 8, ({ CHECK_PACKED_FIELDS_8(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 9, ({ CHECK_PACKED_FIELDS_9(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 10, ({ CHECK_PACKED_FIELDS_10(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 11, ({ CHECK_PACKED_FIELDS_11(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 12, ({ CHECK_PACKED_FIELDS_12(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 13, ({ CHECK_PACKED_FIELDS_13(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 14, ({ CHECK_PACKED_FIELDS_14(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 15, ({ CHECK_PACKED_FIELDS_15(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 16, ({ CHECK_PACKED_FIELDS_16(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 17, ({ CHECK_PACKED_FIELDS_17(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 18, ({ CHECK_PACKED_FIELDS_18(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 19, ({ CHECK_PACKED_FIELDS_19(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 20, ({ CHECK_PACKED_FIELDS_20(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 21, ({ CHECK_PACKED_FIELDS_21(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 22, ({ CHECK_PACKED_FIELDS_22(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 23, ({ CHECK_PACKED_FIELDS_23(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 24, ({ CHECK_PACKED_FIELDS_24(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 25, ({ CHECK_PACKED_FIELDS_25(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 26, ({ CHECK_PACKED_FIELDS_26(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 27, ({ CHECK_PACKED_FIELDS_27(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 28, ({ CHECK_PACKED_FIELDS_28(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 29, ({ CHECK_PACKED_FIELDS_29(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 30, ({ CHECK_PACKED_FIELDS_30(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 31, ({ CHECK_PACKED_FIELDS_31(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 32, ({ CHECK_PACKED_FIELDS_32(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 33, ({ CHECK_PACKED_FIELDS_33(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 34, ({ CHECK_PACKED_FIELDS_34(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 35, ({ CHECK_PACKED_FIELDS_35(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 36, ({ CHECK_PACKED_FIELDS_36(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 37, ({ CHECK_PACKED_FIELDS_37(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 38, ({ CHECK_PACKED_FIELDS_38(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 39, ({ CHECK_PACKED_FIELDS_39(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 40, ({ CHECK_PACKED_FIELDS_40(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 41, ({ CHECK_PACKED_FIELDS_41(fields); }), \ + 
__builtin_choose_expr(ARRAY_SIZE(fields) == 42, ({ CHECK_PACKED_FIELDS_42(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 43, ({ CHECK_PACKED_FIELDS_43(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 44, ({ CHECK_PACKED_FIELDS_44(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 45, ({ CHECK_PACKED_FIELDS_45(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 46, ({ CHECK_PACKED_FIELDS_46(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 47, ({ CHECK_PACKED_FIELDS_47(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 48, ({ CHECK_PACKED_FIELDS_48(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 49, ({ CHECK_PACKED_FIELDS_49(fields); }), \ + __builtin_choose_expr(ARRAY_SIZE(fields) == 50, ({ CHECK_PACKED_FIELDS_50(fields); }), \ + ({ BUILD_BUG_ON_MSG(1, "CHECK_PACKED_FIELDS() must be regenerated to support array sizes larger than 50."); }) \ +)))))))))))))))))))))))))))))))))))))))))))))))))) + +/* End of generated content */ + +#define pack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : pack_fields_u8, \ + const struct packed_field_u16 * : pack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + +#define unpack_fields(pbuf, pbuflen, ustruct, fields, quirks) \ + ({ \ + CHECK_PACKED_FIELDS(fields); \ + CHECK_PACKED_FIELDS_SIZE((fields), (pbuflen)); \ + _Generic((fields), \ + const struct packed_field_u8 * : unpack_fields_u8, \ + const struct packed_field_u16 * : unpack_fields_u16 \ + )((pbuf), (pbuflen), (ustruct), (fields), ARRAY_SIZE(fields), (quirks)); \ + }) + #endif -- cgit v1.2.3 From 19ce8cd3046587efbd2c6253947be7c22dfccc18 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 9 Dec 2024 20:38:03 +0100 Subject: tcp: Measure TIME-WAIT reuse delay with millisecond precision Prepare ground for TIME-WAIT socket reuse with subsecond delay. Today the last TS.Recent update timestamp, recorded in seconds and stored tp->ts_recent_stamp and tw->tw_ts_recent_stamp fields, has two purposes. Firstly, it is used to track the age of the last recorded TS.Recent value to detect when that value becomes outdated due to potential wrap-around of the other TCP timestamp clock (RFC 7323, section 5.5). For this purpose a second-based timestamp is completely sufficient as even in the worst case scenario of a peer using a high resolution microsecond timestamp, the wrap-around interval is ~36 minutes long. Secondly, it serves as a threshold value for allowing TIME-WAIT socket reuse. A TIME-WAIT socket can be reused only once the virtual 1 Hz clock, ktime_get_seconds, is past the TS.Recent update timestamp. The purpose behind delaying the TIME-WAIT socket reuse is to wait for the other TCP timestamp clock to tick at least once before reusing the connection. It is only then that the PAWS mechanism for the reopened connection can detect old duplicate segments from the previous connection incarnation (RFC 7323, appendix B.2). In this case using a timestamp with second resolution not only blocks the way toward allowing faster TIME-WAIT reuse after shorter subsecond delay, but also makes it impossible to reliably delay TW reuse by one second. As Eric Dumazet has pointed out [1], due to timestamp rounding, the TW reuse delay will actually be between (0, 1] seconds, and 0.5 seconds on average. 
We delay TW reuse for one full second only when last TS.Recent update coincides with our virtual 1 Hz clock tick. Considering the above, introduce a dedicated field to store a millisecond timestamp of transition into the TIME-WAIT state. Place it in an existing 4-byte hole inside inet_timewait_sock structure to avoid an additional memory cost. Use the new timestamp to (i) reliably delay TIME-WAIT reuse by one second, and (ii) prepare for configurable subsecond reuse delay in the subsequent change. We assume here that a full one second delay was the original intention in [2] because it accounts for the worst case scenario of the other TCP using the slowest recommended 1 Hz timestamp clock. A more involved alternative would be to change the resolution of the last TS.Recent update timestamp, tw->tw_ts_recent_stamp, to milliseconds. [1] https://lore.kernel.org/netdev/CANn89iKB4GFd8sVzCbRttqw_96o3i2wDhX-3DraQtsceNGYwug@mail.gmail.com/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b8439924316d5bcb266d165b93d632a4b4b859af Signed-off-by: Jakub Sitnicki Reviewed-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-1-66aca0eed03e@cloudflare.com Signed-off-by: Jakub Kicinski --- include/net/inet_timewait_sock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 62c0a7e65d6b..67a313575780 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -74,6 +74,10 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; + /** + * @tw_reuse_stamp: Time of entry into %TCP_TIME_WAIT state in msec. + */ + u32 tw_entry_stamp; struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; -- cgit v1.2.3 From ca6a6f93867a9763bdf8685c788e2e558d10975f Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 9 Dec 2024 20:38:04 +0100 Subject: tcp: Add sysctl to configure TIME-WAIT reuse delay Today we have a hardcoded delay of 1 sec before a TIME-WAIT socket can be reused by reopening a connection. This is a safe choice based on an assumption that the other TCP timestamp clock frequency, which is unknown to us, may be as low as 1 Hz (RFC 7323, section 5.4). However, this means that in the presence of short lived connections with an RTT of couple of milliseconds, the time during which a 4-tuple is blocked from reuse can be orders of magnitude longer that the connection lifetime. Combined with a reduced pool of ephemeral ports, when using IP_LOCAL_PORT_RANGE to share an egress IP address between hosts [1], the long TIME-WAIT reuse delay can lead to port exhaustion, where all available 4-tuples are tied up in TIME-WAIT state. Turn the reuse delay into a per-netns setting so that sysadmins can make more aggressive assumptions about remote TCP timestamp clock frequency and shorten the delay in order to allow connections to reincarnate faster. Note that applications can completely bypass the TIME-WAIT delay protection already today by locking the local port with bind() before connecting. Such immediate connection reuse may result in PAWS failing to detect old duplicate segments, leaving us with just the sequence number check as a safety net. This new configurable offers a trade off where the sysadmin can balance between the risk of PAWS detection failing to act versus exhausting ports by having sockets tied up in TIME-WAIT state for too long. 
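For illustration only (not from this patch; the surrounding logic is simplified and the real check lives elsewhere in the TCP code), the per-netns delay and the millisecond entry stamp combine roughly as:

  static bool sketch_tw_reuse_delay_passed(const struct net *net,
                                           const struct inet_timewait_sock *tw,
                                           u32 now_msec)
  {
          u32 delay = READ_ONCE(net->ipv4.sysctl_tcp_tw_reuse_delay);

          /* tw_entry_stamp is recorded in msec on entry to TIME-WAIT */
          return time_after32(now_msec, tw->tw_entry_stamp + delay);
  }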
[1] https://lpc.events/event/16/contributions/1349/ Signed-off-by: Jakub Sitnicki Reviewed-by: Eric Dumazet Reviewed-by: Jason Xing Link: https://patch.msgid.link/20241209-jakub-krn-909-poc-msec-tw-tstamp-v2-2-66aca0eed03e@cloudflare.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3c014170e001..46452da35206 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -175,6 +175,7 @@ struct netns_ipv4 { u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; + unsigned int sysctl_tcp_tw_reuse_delay; int sysctl_tcp_fin_timeout; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; -- cgit v1.2.3 From 9723a77318b7c0cfd06ea207e52a042f8c815318 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:16 +0000 Subject: net: dsa: add hook to determine whether EEE is supported Add a hook to determine whether the switch supports EEE. This will return false if the switch does not, or true if it does. If the method is not implemented, we assume (currently) that the switch supports EEE. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL144-006cZD-El@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 72ae65e7246a..aaa75bbaa0ea 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -988,6 +988,7 @@ struct dsa_switch_ops { /* * Port's MAC EEE settings */ + bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, -- cgit v1.2.3 From 99379f587278c818777cb4778e2c79c6c1440c65 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 10 Dec 2024 14:18:21 +0000 Subject: net: dsa: provide implementation of .support_eee() Provide a trivial implementation for the .support_eee() method which switch drivers can use to simply indicate that they support EEE on all their user ports. Signed-off-by: Russell King (Oracle) Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/E1tL149-006cZJ-JJ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index aaa75bbaa0ea..4aeedb296d67 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1384,5 +1384,6 @@ static inline bool dsa_user_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +bool dsa_supports_eee(struct dsa_switch *ds, int port); #endif -- cgit v1.2.3 From 082e8f6db9092d19ae84549874daaef240c2207b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 11 Nov 2024 11:03:45 +0100 Subject: turris-omnia-mcu-interface.h: Move command execution function to global header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the command execution functions from the turris-omnia-mcu platform driver private header to the global turris-omnia-mcu-interface.h header, so that they can be used by the LED driver. 
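As a usage illustration (not part of this patch), a consumer such as the LED driver can now call the helpers below directly, for example to read the MCU status word:

  static int sketch_read_status(const struct i2c_client *client)
  {
          u16 status;
          int err;

          err = omnia_cmd_read_u16(client, OMNIA_CMD_GET_STATUS_WORD, &status);
          if (err)
                  return err;

          return status;
  }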
Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20241111100355.6978-2-kabel@kernel.org Signed-off-by: Lee Jones --- include/linux/turris-omnia-mcu-interface.h | 136 ++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h index 2da8cbeb158a..7f24cc682780 100644 --- a/include/linux/turris-omnia-mcu-interface.h +++ b/include/linux/turris-omnia-mcu-interface.h @@ -9,7 +9,10 @@ #define __TURRIS_OMNIA_MCU_INTERFACE_H #include -#include +#include +#include +#include +#include enum omnia_commands_e { OMNIA_CMD_GET_STATUS_WORD = 0x01, /* slave sends status word back */ @@ -246,4 +249,135 @@ enum omnia_cmd_usb_ovc_prot_e { OMNIA_CMD_xET_USB_OVC_PROT_ENABLE = BIT(4), }; +/* Command execution functions */ + +struct i2c_client; + +int omnia_cmd_write_read(const struct i2c_client *client, + void *cmd, unsigned int cmd_len, + void *reply, unsigned int reply_len); + +static inline int omnia_cmd_write(const struct i2c_client *client, void *cmd, + unsigned int len) +{ + return omnia_cmd_write_read(client, cmd, len, NULL, 0); +} + +static inline int omnia_cmd_write_u8(const struct i2c_client *client, u8 cmd, + u8 val) +{ + u8 buf[2] = { cmd, val }; + + return omnia_cmd_write(client, buf, sizeof(buf)); +} + +static inline int omnia_cmd_write_u16(const struct i2c_client *client, u8 cmd, + u16 val) +{ + u8 buf[3]; + + buf[0] = cmd; + put_unaligned_le16(val, &buf[1]); + + return omnia_cmd_write(client, buf, sizeof(buf)); +} + +static inline int omnia_cmd_write_u32(const struct i2c_client *client, u8 cmd, + u32 val) +{ + u8 buf[5]; + + buf[0] = cmd; + put_unaligned_le32(val, &buf[1]); + + return omnia_cmd_write(client, buf, sizeof(buf)); +} + +static inline int omnia_cmd_read(const struct i2c_client *client, u8 cmd, + void *reply, unsigned int len) +{ + return omnia_cmd_write_read(client, &cmd, 1, reply, len); +} + +static inline unsigned int +omnia_compute_reply_length(unsigned long mask, bool interleaved, + unsigned int offset) +{ + if (!mask) + return 0; + + return ((__fls(mask) >> 3) << interleaved) + 1 + offset; +} + +/* Returns 0 on success */ +static inline int omnia_cmd_read_bits(const struct i2c_client *client, u8 cmd, + unsigned long bits, unsigned long *dst) +{ + __le32 reply; + int err; + + if (!bits) { + *dst = 0; + return 0; + } + + err = omnia_cmd_read(client, cmd, &reply, + omnia_compute_reply_length(bits, false, 0)); + if (err) + return err; + + *dst = le32_to_cpu(reply) & bits; + + return 0; +} + +static inline int omnia_cmd_read_bit(const struct i2c_client *client, u8 cmd, + unsigned long bit) +{ + unsigned long reply; + int err; + + err = omnia_cmd_read_bits(client, cmd, bit, &reply); + if (err) + return err; + + return !!reply; +} + +static inline int omnia_cmd_read_u32(const struct i2c_client *client, u8 cmd, + u32 *dst) +{ + __le32 reply; + int err; + + err = omnia_cmd_read(client, cmd, &reply, sizeof(reply)); + if (err) + return err; + + *dst = le32_to_cpu(reply); + + return 0; +} + +static inline int omnia_cmd_read_u16(const struct i2c_client *client, u8 cmd, + u16 *dst) +{ + __le16 reply; + int err; + + err = omnia_cmd_read(client, cmd, &reply, sizeof(reply)); + if (err) + return err; + + *dst = le16_to_cpu(reply); + + return 0; +} + +static inline int omnia_cmd_read_u8(const struct i2c_client *client, u8 cmd, + u8 *reply) +{ + return omnia_cmd_read(client, cmd, reply, sizeof(*reply)); +} + #endif /* __TURRIS_OMNIA_MCU_INTERFACE_H 
*/ -- cgit v1.2.3 From d665d7f2800fff5da9311e4c8c236966ba57d440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Mon, 11 Nov 2024 11:03:47 +0100 Subject: turris-omnia-mcu-interface.h: Add LED commands related definitions to global header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add definitions for contents of the OMNIA_CMD_LED_MODE and OMNIA_CMD_LED_STATE commands to the global turris-omnia-mcu-interface.h header. Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20241111100355.6978-4-kabel@kernel.org Signed-off-by: Lee Jones --- include/linux/turris-omnia-mcu-interface.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h index 7f24cc682780..06c94e032c6f 100644 --- a/include/linux/turris-omnia-mcu-interface.h +++ b/include/linux/turris-omnia-mcu-interface.h @@ -239,6 +239,18 @@ enum omnia_int_e { OMNIA_INT_LAN5_LED1 = BIT(31), }; +enum omnia_cmd_led_mode_e { + OMNIA_CMD_LED_MODE_LED_MASK = GENMASK(3, 0), +#define OMNIA_CMD_LED_MODE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_MODE_LED_MASK, _l) + OMNIA_CMD_LED_MODE_USER = BIT(4), +}; + +enum omnia_cmd_led_state_e { + OMNIA_CMD_LED_STATE_LED_MASK = GENMASK(3, 0), +#define OMNIA_CMD_LED_STATE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_STATE_LED_MASK, _l) + OMNIA_CMD_LED_STATE_ON = BIT(4), +}; + enum omnia_cmd_poweroff_e { OMNIA_CMD_POWER_OFF_POWERON_BUTTON = BIT(0), OMNIA_CMD_POWER_OFF_MAGIC = 0xdead, -- cgit v1.2.3 From 76850b54943ffd5037c97cc27794449ce05c31e9 Mon Sep 17 00:00:00 2001 From: Rick Wertenbroek Date: Thu, 12 Dec 2024 17:25:47 +0100 Subject: PCI: endpoint: Replace magic number '6' by PCI_STD_NUM_BARS Replace the constant "6" by PCI_STD_NUM_BARS, as defined in include/uapi/linux/pci_regs.h: #define PCI_STD_NUM_BARS 6 /* Number of standard BARs */ Link: https://lore.kernel.org/r/20241212162547.225880-1-rick.wertenbroek@gmail.com Signed-off-by: Rick Wertenbroek Signed-off-by: Bjorn Helgaas --- include/linux/pci-epf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 18a3aeb62ae4..ee6156bcbbd0 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -157,7 +157,7 @@ struct pci_epf { struct device dev; const char *name; struct pci_epf_header *header; - struct pci_epf_bar bar[6]; + struct pci_epf_bar bar[PCI_STD_NUM_BARS]; u8 msi_interrupts; u16 msix_interrupts; u8 func_no; @@ -174,7 +174,7 @@ struct pci_epf { /* Below members are to attach secondary EPC to an endpoint function */ struct pci_epc *sec_epc; struct list_head sec_epc_list; - struct pci_epf_bar sec_epc_bar[6]; + struct pci_epf_bar sec_epc_bar[PCI_STD_NUM_BARS]; u8 sec_epc_func_no; struct config_group *group; unsigned int is_bound; -- cgit v1.2.3 From d24bf99214b199c25f9c2cb04b3a4993d1c7ab60 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 11 Dec 2024 18:44:49 +0100 Subject: power: supply: core: Add new "charge_types" property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new "charge_types" property, this is identical to "charge_type" but reading returns a list of supported charge-types with the currently active type surrounded by square brackets, e.g.: Fast [Standard] "Long_Life" This has the advantage over the existing "charge_type" property that this allows userspace to find out which charge-types are supported for writable 
charge_type properties. Drivers which already support "charge_type" can easily add support for this by setting power_supply_desc.charge_types to a bitmask representing valid charge_type values. The existing "charge_type" get_property() and set_property() code paths can be re-used for "charge_types". Signed-off-by: Hans de Goede Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20241211174451.355421-3-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 17fc383785bf..0d96657d1a2b 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -40,7 +40,7 @@ enum { }; /* What algorithm is the charger using? */ -enum { +enum power_supply_charge_type { POWER_SUPPLY_CHARGE_TYPE_UNKNOWN = 0, POWER_SUPPLY_CHARGE_TYPE_NONE, POWER_SUPPLY_CHARGE_TYPE_TRICKLE, /* slow speed */ @@ -99,6 +99,7 @@ enum power_supply_property { /* Properties of type `int' */ POWER_SUPPLY_PROP_STATUS = 0, POWER_SUPPLY_PROP_CHARGE_TYPE, + POWER_SUPPLY_PROP_CHARGE_TYPES, POWER_SUPPLY_PROP_HEALTH, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_ONLINE, @@ -245,6 +246,7 @@ struct power_supply_desc { const char *name; enum power_supply_type type; u8 charge_behaviours; + u32 charge_types; u32 usb_types; const enum power_supply_property *properties; size_t num_properties; @@ -946,6 +948,11 @@ ssize_t power_supply_charge_behaviour_show(struct device *dev, char *buf); int power_supply_charge_behaviour_parse(unsigned int available_behaviours, const char *buf); +ssize_t power_supply_charge_types_show(struct device *dev, + unsigned int available_types, + enum power_supply_charge_type current_type, + char *buf); +int power_supply_charge_types_parse(unsigned int available_types, const char *buf); #else static inline ssize_t power_supply_charge_behaviour_show(struct device *dev, @@ -961,6 +968,20 @@ static inline int power_supply_charge_behaviour_parse(unsigned int available_beh { return -EOPNOTSUPP; } + +static inline +ssize_t power_supply_charge_types_show(struct device *dev, + unsigned int available_types, + enum power_supply_charge_type current_type, + char *buf) +{ + return -EOPNOTSUPP; +} + +static inline int power_supply_charge_types_parse(unsigned int available_types, const char *buf) +{ + return -EOPNOTSUPP; +} #endif #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3 From a42d71e322a8066dcfa228ce8529bb073c521ae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 11 Dec 2024 11:17:09 +0100 Subject: net_sched: sch_cake: Add drop reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add three qdisc-specific drop reasons and use them in sch_cake: 1) SKB_DROP_REASON_QDISC_OVERLIMIT Whenever the total queue limit for a qdisc instance is exceeded and a packet is dropped to make room. 2) SKB_DROP_REASON_QDISC_CONGESTED Whenever a packet is dropped by the qdisc AQM algorithm because congestion is detected. 3) SKB_DROP_REASON_CAKE_FLOOD Whenever a packet is dropped by the flood protection part of the CAKE AQM algorithm (BLUE). Also use the existing SKB_DROP_REASON_QUEUE_PURGE in cake_clear_tin(). 
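A sketch of how a qdisc reports the new reasons when dropping a packet (not part of this patch; the actual sch_cake changes live in net/sched and may route the reason through other drop helpers):

  static void sketch_qdisc_drop(struct Qdisc *sch, struct sk_buff *skb,
                                bool aqm_drop)
  {
          enum skb_drop_reason reason = aqm_drop ?
                  SKB_DROP_REASON_QDISC_CONGESTED :
                  SKB_DROP_REASON_QDISC_OVERLIMIT;

          qdisc_qstats_drop(sch);
          kfree_skb_reason(skb, reason);
  }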
Reasons show up as: perf record -a -e skb:kfree_skb sleep 1; perf script iperf3 665 [005] 848.656964: skb:kfree_skb: skbaddr=0xffff98168a333500 rx_sk=(nil) protocol=34525 location=__dev_queue_xmit+0x10f0 reason: QDISC_OVERLIMIT swapper 0 [001] 909.166055: skb:kfree_skb: skbaddr=0xffff98168280cee0 rx_sk=(nil) protocol=34525 location=cake_dequeue+0x5ef reason: QDISC_CONGESTED Reviewed-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Acked-by: Dave Taht Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241211-cake-drop-reason-v2-1-920afadf4d1b@redhat.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index c29282fabae6..ead4170a1d0a 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -58,6 +58,9 @@ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(QDISC_OVERLIMIT) \ + FN(QDISC_CONGESTED) \ + FN(CAKE_FLOOD) \ FN(FQ_BAND_LIMIT) \ FN(FQ_HORIZON_LIMIT) \ FN(FQ_FLOW_LIMIT) \ @@ -314,6 +317,21 @@ enum skb_drop_reason { * failed to enqueue to current qdisc) */ SKB_DROP_REASON_QDISC_DROP, + /** + * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc + * instance exceeds its total buffer size limit. + */ + SKB_DROP_REASON_QDISC_OVERLIMIT, + /** + * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm + * due to congestion. + */ + SKB_DROP_REASON_QDISC_CONGESTED, + /** + * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of + * CAKE qdisc AQM algorithm (BLUE). + */ + SKB_DROP_REASON_CAKE_FLOOD, /** * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band * limit is reached. -- cgit v1.2.3 From fcc680a647ba77370480fe753664cc10d572b240 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:38 +0100 Subject: page_pool: allow mixing PPs within one bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main reason for this change was to allow mixing pages from different &page_pools within one &xdp_buff/&xdp_frame. Why not? With stuff like devmem and io_uring zerocopy Rx, it's required to have separate PPs for header buffers and payload buffers. Adjust xdp_return_frame_bulk() and page_pool_put_netmem_bulk(), so that they won't be tied to a particular pool. Let the latter create a separate bulk of pages which's PP is different from the first netmem of the bulk and process it after the main loop. This greatly optimizes xdp_return_frame_bulk(): no more hashtable lookups and forced flushes on PP mismatch. Also make xdp_flush_frame_bulk() inline, as it's just one if + function call + one u32 read, not worth extending the call ladder. 
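To illustrate (not from the patch), a Tx completion loop using the now pool-agnostic bulk API might look roughly like this; the example_* ring helpers are placeholders:

  static void example_tx_clean(struct example_ring *ring)
  {
          struct xdp_frame_bulk bq;

          xdp_frame_bulk_init(&bq);

          /* Frames completed here may come from different page_pools;
           * the bulk no longer needs a flush when the pool changes.
           */
          while (example_ring_has_completed(ring))
                  xdp_return_frame_bulk(example_ring_next_frame(ring), &bq);

          xdp_flush_frame_bulk(&bq);
  }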
Co-developed-by: Toke Høiland-Jørgensen # iterative Signed-off-by: Toke Høiland-Jørgensen Suggested-by: Jakub Kicinski # while (count) Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 6 ++---- include/net/xdp.h | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 1ea16b0e9c79..05a864031271 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -259,8 +259,7 @@ void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem); -void page_pool_put_netmem_bulk(struct page_pool *pool, netmem_ref *data, - u32 count); +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -272,8 +271,7 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, { } -static inline void page_pool_put_netmem_bulk(struct page_pool *pool, - netmem_ref *data, u32 count) +static inline void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { } #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index f4020b29122f..9e7eb8223513 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -11,6 +11,8 @@ #include #include /* skb_shared_info */ +#include + /** * DOC: XDP RX-queue information * @@ -193,14 +195,12 @@ xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; - void *xa; netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { - /* bq->count will be zero'ed when bq->xa gets updated */ - bq->xa = NULL; + bq->count = 0; } static inline struct skb_shared_info * @@ -317,10 +317,18 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); +static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + if (unlikely(!bq->count)) + return; + + page_pool_put_netmem_bulk(bq->q, bq->count); + bq->count = 0; +} + static __always_inline unsigned int xdp_get_frame_len(const struct xdp_frame *xdpf) { -- cgit v1.2.3 From 56d95b0adfa224bb1c67733dbcad30dd8debd39e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:39 +0100 Subject: xdp: get rid of xdp_frame::mem.id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initially, xdp_frame::mem.id was used to search for the corresponding &page_pool to return the page correctly. However, after that struct page was extended to have a direct pointer to its PP (netmem has it as well), further keeping of this field makes no sense. xdp_return_frame_bulk() still used it to do a lookup, and this leftover is now removed. Remove xdp_frame::mem and replace it with ::mem_type, as only memory type still matters and we need to know it to be able to free the frame correctly. 
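For driver code that inspects the memory type directly, the conversion is mechanical, e.g. (illustrative, not from the patch):

  /* before */ if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) { ... }
  /* after  */ if (xdpf->mem_type  == MEM_TYPE_PAGE_POOL) { ... }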
As a cute side effect, we can now make every scalar field in &xdp_frame of 4 byte width, speeding up accesses to them. Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 9e7eb8223513..1c260869a353 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -169,13 +169,13 @@ out: struct xdp_frame { void *data; - u16 len; - u16 headroom; + u32 len; + u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. + * while mem_type is valid on remote CPU. */ - struct xdp_mem_info mem; + enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ @@ -306,13 +306,13 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_frame->mem = xdp->rxq->mem; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ + xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, +void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); -- cgit v1.2.3 From 207ff83cecaeaacf0d47c8ccbe927c8354ac1280 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:40 +0100 Subject: xdp: make __xdp_return() MP-agnostic Currently, __xdp_return() takes pointer to the virtual memory to free a buffer. Apart from that this sometimes provokes redundant data <--> page conversions, taking data pointer effectively prevents lots of XDP code to support non-page-backed buffers, as there's no mapping for the non-host memory (data is always NULL). Just convert it to always take netmem reference. For xdp_return_{buff,frame*}(), this chops off one page_address() per each frag and adds one virt_to_netmem() (same as virt_to_page()) per header buffer. For __xdp_return() itself, it removes one virt_to_page() for MEM_TYPE_PAGE_POOL and another one for MEM_TYPE_PAGE_ORDER0, adding one page_address() for [not really common nowadays] MEM_TYPE_PAGE_SHARED, but the main effect is that the abovementioned functions won't die or memleak anymore if the frame has non-host memory attached and will correctly free those. 
Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 1c260869a353..d2089cfecefd 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -312,8 +312,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) return xdp_frame; } -void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, - struct xdp_buff *xdp); +void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, + bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -- cgit v1.2.3 From 0dffdb3b3366c932fb7d210f5032476c552f7000 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Dec 2024 18:26:47 +0100 Subject: skbuff: allow 2-4-argument skb_frag_dma_map() skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE) is repeated across dozens of drivers and really wants a shorthand. Add a macro which will count args and handle all possible number from 2 to 5. Semantics: skb_frag_dma_map(dev, frag) -> __skb_frag_dma_map(dev, frag, 0, skb_frag_size(frag), DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset) -> __skb_frag_dma_map(dev, frag, offset, skb_frag_size(frag) - offset, DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset, size) -> __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) skb_frag_dma_map(dev, frag, offset, size, dir) -> __skb_frag_dma_map(dev, frag, offset, size, dir) No object code size changes for the existing callers. Users passing less arguments also won't have bigger size comparing to the full equivalent call. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241211172649.761483-11-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69624b394cd9..b2509cd0b930 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3674,7 +3674,7 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto, bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** - * skb_frag_dma_map - maps a paged fragment via the DMA API + * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the @@ -3684,15 +3684,36 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); * * Maps the page associated with @frag to @device. */ -static inline dma_addr_t skb_frag_dma_map(struct device *dev, - const skb_frag_t *frag, - size_t offset, size_t size, - enum dma_data_direction dir) +static inline dma_addr_t __skb_frag_dma_map(struct device *dev, + const skb_frag_t *frag, + size_t offset, size_t size, + enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } +#define skb_frag_dma_map(dev, frag, ...) 
\ + CONCATENATE(_skb_frag_dma_map, \ + COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) + +#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ + const skb_frag_t *uf = (frag); \ + size_t uo = (offset); \ + \ + __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ + DMA_TO_DEVICE); \ +}) +#define _skb_frag_dma_map1(dev, frag, offset) \ + __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ + __UNIQUE_ID(offset_)) +#define _skb_frag_dma_map0(dev, frag) \ + _skb_frag_dma_map1(dev, frag, 0) +#define _skb_frag_dma_map2(dev, frag, offset, size) \ + __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) +#define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ + __skb_frag_dma_map(dev, frag, offset, size, dir) + static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { -- cgit v1.2.3 From 91a152cbb49c26609d217cf2f116d46143b9b8be Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 11 Dec 2024 21:20:28 +0000 Subject: net: page_pool: rename page_pool_alloc_netmem to *_netmems page_pool_alloc_netmem (without an s) was the mirror of page_pool_alloc_pages (with an s), which was confusing. Rename to page_pool_alloc_netmems so it's the mirror of page_pool_alloc_pages. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241211212033.1684197-2-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 05a864031271..3270c92841b4 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -242,7 +242,7 @@ struct page_pool { }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, -- cgit v1.2.3 From 8156c310499a34c8f42b2e2b7360abb805683bbe Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 11 Dec 2024 21:20:29 +0000 Subject: net: page_pool: create page_pool_alloc_netmem Create page_pool_alloc_netmem to be the mirror of page_pool_alloc. This enables drivers that currently use page_pool_alloc to transition to netmem by converting the call sites to page_pool_alloc_netmem.
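A rough usage sketch (not from the patch; the example_* names and buffer size are placeholders) of the new helper in an Rx fill path:

  static netmem_ref example_rx_alloc(struct page_pool *pool,
                                     unsigned int *offset, unsigned int *size)
  {
          netmem_ref netmem;

          *size = EXAMPLE_RX_BUF_SIZE;
          netmem = page_pool_alloc_netmem(pool, offset, size, GFP_ATOMIC);
          if (!netmem)
                  return 0;

          /* *offset and *size now describe the fragment actually reserved;
           * *size may come back larger than requested (see the helper).
           */
          return netmem;
  }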
Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241211212033.1684197-3-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 26caa2c20912..95af7f0b029e 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -115,22 +115,22 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } -static inline struct page *page_pool_alloc(struct page_pool *pool, - unsigned int *offset, - unsigned int *size, gfp_t gfp) +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page; + netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; - return page_pool_alloc_pages(pool, gfp); + return page_pool_alloc_netmems(pool, gfp); } - page = page_pool_alloc_frag(pool, offset, *size, gfp); - if (unlikely(!page)) - return NULL; + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize @@ -141,7 +141,14 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, pool->frag_offset = max_size; } - return page; + return netmem; +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** -- cgit v1.2.3 From 7dba339faae991a23c54f7b93a58798c58f8c16f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Wed, 11 Dec 2024 21:20:31 +0000 Subject: page_pool: disable sync for cpu for dmabuf memory provider dmabuf dma-addresses should not be dma_sync'd for CPU/device. Typically it's the driver's responsibility to dma_sync for CPU, but the driver should not dma_sync for CPU if the netmem is actually coming from a dmabuf memory provider. The page_pool already exposes a helper for dma_sync_for_cpu: page_pool_dma_sync_for_cpu. Upgrade this existing helper to handle netmem, and have it skip dma_sync if the memory is from a dmabuf memory provider. Drivers should migrate to using this helper when adding support for netmem. Also minimize the impact on the dma syncing performance for pages. Special case the dma-sync path for pages to not go through the overhead checks for dma-syncing and conversion to netmem.
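Illustration only (not from the patch): an Rx path that already holds a netmem_ref can sync just the received bytes for CPU access with the helper added below, and the call degrades to a no-op for dmabuf-provided memory; the wrapper name is a placeholder.

  static void example_rx_sync_for_cpu(struct page_pool *pool, netmem_ref netmem,
                                      u32 offset, u32 frame_len)
  {
          /* Skipped internally when pool->dma_sync_for_cpu is false. */
          page_pool_dma_sync_netmem_for_cpu(pool, netmem, offset, frame_len);
  }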
Cc: Alexander Lobakin Cc: Jason Gunthorpe Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20241211212033.1684197-5-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 35 ++++++++++++++++++++++++++++++----- include/net/page_pool/types.h | 3 ++- 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 95af7f0b029e..e555921e5233 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -422,7 +422,21 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); } /** @@ -441,10 +455,21 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { - dma_sync_single_range_for_cpu(pool->p.dev, - page_pool_get_dma_addr(page), - offset + pool->p.offset, dma_sync_size, - page_pool_get_dma_dir(pool)); + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); } static inline bool page_pool_put(struct page_pool *pool) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 3270c92841b4..ed4cd114180a 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -164,7 +164,8 @@ struct page_pool { bool has_init_callback:1; /* slow::init_callback is set */ bool dma_map:1; /* Perform DMA mapping */ - bool dma_sync:1; /* Perform DMA sync */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ #ifdef CONFIG_PAGE_POOL_STATS bool system:1; /* This is a global percpu pool */ #endif -- cgit v1.2.3 From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. 
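A hedged sketch of the intended use (not from this patch, which only shows the 'include' side): assuming static_call_update_early() takes the same (name, target) pair as static_call_update(), an early-boot caller could look like this, with all example_* names being placeholders:

  static int example_default_op(void) { return 0; }
  static int example_xen_op(void) { return 1; }
  DEFINE_STATIC_CALL(example_op, example_default_op);

  void __init example_early_init(void)
  {
          /* Retarget the call before static_call_init() has run. */
          static_call_update_early(example_op, example_xen_op);
  }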
Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- include/linux/compiler.h | 37 ++++++++++++++++++++++++++----------- include/linux/static_call.h | 1 + 2 files changed, 27 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..240c632c5b95 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,28 +216,43 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. */ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -/** - * offset_to_ptr - convert a relative memory offset to an absolute pointer - * @off: the address of the 32-bit offset value - */ -static inline void *offset_to_ptr(const int *off) -{ - return (void *)((unsigned long)off + *off); -} +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; -#endif /* __ASSEMBLY__ */ +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b..785980af8972 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ -- cgit v1.2.3 From f4425e3ab2f796d442a44f31262eade9b6427ff7 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Thu, 12 Dec 2024 15:45:04 +0800 Subject: ALSA: compress: Add output rate and output format support Add 'pcm_format' for struct snd_codec, add 'pcm_formats' for struct snd_codec_desc; these are used for accelerator usage. Current accelerator example is sample rate converter (SRC). Define struct snd_codec_desc_src to describe the minimum and maximum sample rates. And add 'src_d' in union snd_codec_options structure. These are mainly used for capability queries.
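For illustration (not from the patch): a client setting up an SRC accelerator stream with the fields added below might fill struct snd_codec roughly like this; the codec id, rates and sample format chosen here are arbitrary assumptions:

  struct snd_codec codec = {
          .id = SND_AUDIOCODEC_PCM,
          .ch_in = 2,
          .ch_out = 2,
          .sample_rate = 48000,
          .pcm_format = SNDRV_PCM_FORMAT_S24_LE,   /* new pcm_format field */
          .options.src_d.out_sample_rate = 44100,  /* new src_d option */
  };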
Signed-off-by: Jaroslav Kysela Signed-off-by: Shengjiu Wang Acked-by: Jaroslav Kysela Acked-by: Vinod Koul Link: https://patch.msgid.link/20241212074509.3445859-2-shengjiu.wang@nxp.com Signed-off-by: Mark Brown --- include/uapi/sound/compress_params.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/sound/compress_params.h b/include/uapi/sound/compress_params.h index ddc77322d571..bc7648a30746 100644 --- a/include/uapi/sound/compress_params.h +++ b/include/uapi/sound/compress_params.h @@ -334,6 +334,14 @@ union snd_codec_options { struct snd_dec_wma wma_d; struct snd_dec_alac alac_d; struct snd_dec_ape ape_d; + struct { + __u32 out_sample_rate; + } src_d; +} __attribute__((packed, aligned(4))); + +struct snd_codec_desc_src { + __u32 out_sample_rate_min; + __u32 out_sample_rate_max; } __attribute__((packed, aligned(4))); /** struct snd_codec_desc - description of codec capabilities @@ -347,6 +355,9 @@ union snd_codec_options { * @modes: Supported modes. See SND_AUDIOMODE defines * @formats: Supported formats. See SND_AUDIOSTREAMFORMAT defines * @min_buffer: Minimum buffer size handled by codec implementation + * @pcm_formats: Output (for decoders) or input (for encoders) + * PCM formats (required to accel mode, 0 for other modes) + * @u_space: union space (for codec dependent data) * @reserved: reserved for future use * * This structure provides a scalar value for profiles, modes and stream @@ -370,7 +381,12 @@ struct snd_codec_desc { __u32 modes; __u32 formats; __u32 min_buffer; - __u32 reserved[15]; + __u32 pcm_formats; + union { + __u32 u_space[6]; + struct snd_codec_desc_src src; + } __attribute__((packed, aligned(4))); + __u32 reserved[8]; } __attribute__((packed, aligned(4))); /** struct snd_codec @@ -395,6 +411,8 @@ struct snd_codec_desc { * @align: Block alignment in bytes of an audio sample. * Only required for PCM or IEC formats. * @options: encoder-specific settings + * @pcm_format: Output (for decoders) or input (for encoders) + * PCM formats (required to accel mode, 0 for other modes) * @reserved: reserved for future use */ @@ -411,7 +429,8 @@ struct snd_codec { __u32 format; __u32 align; union snd_codec_options options; - __u32 reserved[3]; + __u32 pcm_format; + __u32 reserved[2]; } __attribute__((packed, aligned(4))); #endif -- cgit v1.2.3 From 25458fdd39a18a5ce00c36f38992da54bb7453f3 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 3 Dec 2024 10:49:31 +0000 Subject: dt-bindings: clock: renesas: Document RZ/G3E SoC CPG Document the device tree bindings for the Renesas RZ/G3E SoC Clock Pulse Generator (CPG). Also define constants for the core clocks of the RZ/G3E SoC. Acked-by: Conor Dooley Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20241203105005.103927-5-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g047-cpg.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/dt-bindings/clock/renesas,r9a09g047-cpg.h (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h new file mode 100644 index 000000000000..1d031bf6bf03 --- /dev/null +++ b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) + * + * Copyright (C) 2024 Renesas Electronics Corp. 
+ */ +#ifndef __DT_BINDINGS_CLOCK_RENESAS_R9A09G047_CPG_H__ +#define __DT_BINDINGS_CLOCK_RENESAS_R9A09G047_CPG_H__ + +#include + +/* Core Clock list */ +#define R9A09G047_SYS_0_PCLK 0 +#define R9A09G047_CA55_0_CORECLK0 1 +#define R9A09G047_CA55_0_CORECLK1 2 +#define R9A09G047_CA55_0_CORECLK2 3 +#define R9A09G047_CA55_0_CORECLK3 4 +#define R9A09G047_CA55_0_PERIPHCLK 5 +#define R9A09G047_CM33_CLK0 6 +#define R9A09G047_CST_0_SWCLKTCK 7 +#define R9A09G047_IOTOP_0_SHCLK 8 + +#endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G047_CPG_H__ */ -- cgit v1.2.3 From 62374ce1876be26b3f33575680e67ca69a59db54 Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Fri, 13 Dec 2024 18:07:29 +0800 Subject: coresight: Add a helper to check if a device is source Since there are a lot of places in the code to check whether the device is source, add a helper to check it. Signed-off-by: Tao Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241213100731.25914-3-quic_taozha@quicinc.com --- include/linux/coresight.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 055ce5cd5c44..c50d128e8d93 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -588,9 +588,14 @@ static inline void csdev_access_write64(struct csdev_access *csa, u64 val, u32 o } #endif /* CONFIG_64BIT */ +static inline bool coresight_is_device_source(struct coresight_device *csdev) +{ + return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE); +} + static inline bool coresight_is_percpu_source(struct coresight_device *csdev) { - return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) && + return csdev && coresight_is_device_source(csdev) && (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC); } -- cgit v1.2.3 From ec9903d6cc34e61b77e609a0425e7a0a804fb95a Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Fri, 13 Dec 2024 18:07:30 +0800 Subject: coresight: Add support for trace filtering by source Some replicators have hard coded filtering of "trace" data, based on the source device. This is different from the trace filtering based on TraceID, available in the standard programmable replicators. e.g., Qualcomm replicators have filtering based on custom trace protocol format and is not programmable. The source device could be connected to the replicator via intermediate components (e.g., a funnel). Thus we need platform information from the firmware tables to decide the source device corresponding to a given output port from the replicator. Given this affects "trace path building" and traversing the path back from the sink to source, add the concept of "filtering by source" to the generic coresight connection. The specified source will be marked like below in the Devicetree. test-replicator { ... ... ... ... out-ports { ... ... ... ... port@0 { reg = <0>; xyz: endpoint { remote-endpoint = <&zyx>; filter-source = <&source_1>; <-- To specify the source to }; be filtered out here. }; port@1 { reg = <1>; abc: endpoint { remote-endpoint = <&cba>; filter-source = <&source_2>; <-- To specify the source to }; be filtered out here. 
}; }; }; Signed-off-by: Tao Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20241213100731.25914-4-quic_taozha@quicinc.com --- include/linux/coresight.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c50d128e8d93..17276965ff1d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -172,6 +172,9 @@ struct coresight_desc { * @dest_dev: a @coresight_device representation of the component connected to @src_port. NULL until the device is created * @link: Representation of the connection as a sysfs link. + * @filter_src_fwnode: filter source component's fwnode handle. + * @filter_src_dev: a @coresight_device representation of the component that + needs to be filtered. * * The full connection structure looks like this, where in_conns store * references to same connection as the source device's out_conns. @@ -200,6 +203,8 @@ struct coresight_connection { struct coresight_device *dest_dev; struct coresight_sysfs_link *link; struct coresight_device *src_dev; + struct fwnode_handle *filter_src_fwnode; + struct coresight_device *filter_src_dev; int src_refcnt; int dest_refcnt; }; -- cgit v1.2.3 From 70a667d70cce338ab8552dd762ae114a5ab96500 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 13 Dec 2024 12:11:22 +0200 Subject: ASoC: SOF: Add support for pause supported tokens from topology New tokens are added to topology: 1202: SOF_TKN_STREAM_PLAYBACK_PAUSE_SUPPORTED 1203: SOF_TKN_STREAM_CAPTURE_PAUSE_SUPPORTED The new tokens are used to advertise support for PAUSE/RESUME operation on a PCM device depending on firmware product, use case, pipeline topology. The snd_sof_pcm_stream.pause_supported is updated to reflect the advertised value for the PCM device. If the token does not exist then the pause_supported is set to false. Note: it is up to the platform code to use this flag to decide to advertise the PAUSE support for user space or not. Signed-off-by: Peter Ujfalusi Reviewed-by: Pierre-Louis Bossart Reviewed-by: Bard Liao Reviewed-by: Kai Vehmanen Reviewed-by: Liam Girdwood Link: https://patch.msgid.link/20241213101123.27318-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- include/uapi/sound/sof/tokens.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h index 0a246bc218d3..c28c766270de 100644 --- a/include/uapi/sound/sof/tokens.h +++ b/include/uapi/sound/sof/tokens.h @@ -153,6 +153,8 @@ /* Stream */ #define SOF_TKN_STREAM_PLAYBACK_COMPATIBLE_D0I3 1200 #define SOF_TKN_STREAM_CAPTURE_COMPATIBLE_D0I3 1201 +#define SOF_TKN_STREAM_PLAYBACK_PAUSE_SUPPORTED 1202 +#define SOF_TKN_STREAM_CAPTURE_PAUSE_SUPPORTED 1203 /* Led control for mute switches */ #define SOF_TKN_MUTE_LED_USE 1300 -- cgit v1.2.3 From d54a3fc6bf3db0db0e16cfdf7f48a8bbb803f6b0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:14 +0000 Subject: firmware: cs_dsp: Add mock regmap for KUnit testing Add a mock regmap implementation to act as a simulated DSP for KUnit testing. This is built as a utility module so that it could be used by clients of cs_dsp to create a mock "DSP" for their own testing. cs_dsp interacts with the DSP only through registers. Most of the register space of the DSP is RAM. ADSP cores have a small set of control registers. HALO Core DSPs have a much larger set of control registers but only a small subset are used. 
Most writes are "blind" in the sense that cs_dsp does not expect to receive any sort of response from the DSP. So there isn't any need to emulate a "DSP", only a set of registers that can be written and read back. The idea of the mock regmap is to use the cache to accumulate writes which can then be tested against the values that are expected to be in the registers. Stray writes can be detected by dropping the cache entries for all addresses that should have been written and then issuing a regcache_sync(). If this causes bus writes it means there were writes to unexpected registers. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/firmware/cirrus/cs_dsp_test_utils.h (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h new file mode 100644 index 000000000000..ac6b03f4c084 --- /dev/null +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Support utilities for cs_dsp testing. + * + * Copyright (C) 2024 Cirrus Logic, Inc. and + * Cirrus Logic International Semiconductor Ltd. + */ + +#include +#include + +struct kunit; +struct cs_dsp_test; +struct cs_dsp_test_local; + +/** + * struct cs_dsp_test - base class for test utilities + * + * @test: Pointer to struct kunit instance. + * @dsp: Pointer to struct cs_dsp instance. + * @local: Private data for each test suite. + */ +struct cs_dsp_test { + struct kunit *test; + struct cs_dsp *dsp; + + struct cs_dsp_test_local *local; + + /* Following members are private */ + bool saw_bus_write; +}; + +extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; +extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; +extern const unsigned int cs_dsp_mock_halo_core_base; +extern const unsigned int cs_dsp_mock_halo_sysinfo_base; + +int cs_dsp_mock_regmap_init(struct cs_dsp_test *priv); +void cs_dsp_mock_regmap_drop_range(struct cs_dsp_test *priv, + unsigned int first_reg, unsigned int last_reg); +void cs_dsp_mock_regmap_drop_regs(struct cs_dsp_test *priv, + unsigned int first_reg, size_t num_regs); +void cs_dsp_mock_regmap_drop_bytes(struct cs_dsp_test *priv, + unsigned int first_reg, size_t num_bytes); +void cs_dsp_mock_regmap_drop_system_regs(struct cs_dsp_test *priv); +bool cs_dsp_mock_regmap_is_dirty(struct cs_dsp_test *priv, bool drop_system_regs); -- cgit v1.2.3 From 41e78c0f44f97c958afcda3f82b23f4f4a05b968 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:15 +0000 Subject: firmware: cs_dsp: Add mock DSP memory map for KUnit testing Add helper functions to implement an emulation of the DSP memory map. There are three main groups of functionality: 1. Define a mock cs_dsp_region table. 2. Calculate the addresses of memory and algorithms from the firmware header in XM. 3. Build a mock XM header in emulated XM. 
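As a rough example of how a test can use these helpers (declared in the hunks that follow; the algorithm id, version and sizes are arbitrary):

  static void example_xm_header_test(struct kunit *test)
  {
          struct cs_dsp_test *priv = test->priv;
          static const struct cs_dsp_mock_alg_def algs[] = {
                  { .id = 0xfafa, .ver = 0x100000,
                    .xm_base_words = 60, .xm_size_words = 32,
                    .ym_base_words = 0, .ym_size_words = 32 },
          };
          struct cs_dsp_mock_xm_header *hdr;

          /* Build a fake firmware header and place it in emulated XM. */
          hdr = cs_dsp_create_mock_xm_header(priv, algs, ARRAY_SIZE(algs));
          KUNIT_ASSERT_NOT_ERR_OR_NULL(test, hdr);
          KUNIT_ASSERT_EQ(test, 0, cs_dsp_mock_xm_header_write_to_regmap(hdr));
  }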
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 63 +++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index ac6b03f4c084..899ae94198aa 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -30,11 +30,74 @@ struct cs_dsp_test { bool saw_bus_write; }; +/** + * struct cs_dsp_mock_alg_def - Info for creating a mock algorithm entry. + * + * @id Algorithm ID. + * @ver; Algorithm version. + * @xm_base_words XM base address in DSP words. + * @xm_size_words XM size in DSP words. + * @ym_base_words YM base address in DSP words. + * @ym_size_words YM size in DSP words. + * @zm_base_words ZM base address in DSP words. + * @zm_size_words ZM size in DSP words. + */ +struct cs_dsp_mock_alg_def { + unsigned int id; + unsigned int ver; + unsigned int xm_base_words; + unsigned int xm_size_words; + unsigned int ym_base_words; + unsigned int ym_size_words; + unsigned int zm_base_words; + unsigned int zm_size_words; +}; + +/** + * struct cs_dsp_mock_xm_header - XM header builder + * + * @test_priv: Pointer to the struct cs_dsp_test. + * @blob_data: Pointer to the created blob data. + * @blob_size_bytes: Size of the data at blob_data. + */ +struct cs_dsp_mock_xm_header { + struct cs_dsp_test *test_priv; + void *blob_data; + size_t blob_size_bytes; +}; + extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; extern const unsigned int cs_dsp_mock_halo_core_base; extern const unsigned int cs_dsp_mock_halo_sysinfo_base; +extern const struct cs_dsp_region cs_dsp_mock_halo_dsp1_regions[]; +extern const unsigned int cs_dsp_mock_halo_dsp1_region_sizes[]; +extern const struct cs_dsp_region cs_dsp_mock_adsp2_32bit_dsp1_regions[]; +extern const unsigned int cs_dsp_mock_adsp2_32bit_dsp1_region_sizes[]; +extern const struct cs_dsp_region cs_dsp_mock_adsp2_16bit_dsp1_regions[]; +extern const unsigned int cs_dsp_mock_adsp2_16bit_dsp1_region_sizes[]; +int cs_dsp_mock_count_regions(const unsigned int *region_sizes); +unsigned int cs_dsp_mock_size_of_region(const struct cs_dsp *dsp, int mem_type); +unsigned int cs_dsp_mock_base_addr_for_mem(struct cs_dsp_test *priv, int mem_type); +unsigned int cs_dsp_mock_reg_addr_inc_per_unpacked_word(struct cs_dsp_test *priv); +unsigned int cs_dsp_mock_reg_block_length_bytes(struct cs_dsp_test *priv, int mem_type); +unsigned int cs_dsp_mock_reg_block_length_registers(struct cs_dsp_test *priv, int mem_type); +unsigned int cs_dsp_mock_reg_block_length_dsp_words(struct cs_dsp_test *priv, int mem_type); +bool cs_dsp_mock_has_zm(struct cs_dsp_test *priv); +int cs_dsp_mock_packed_to_unpacked_mem_type(int packed_mem_type); +unsigned int cs_dsp_mock_num_dsp_words_to_num_packed_regs(unsigned int num_dsp_words); +unsigned int cs_dsp_mock_xm_header_get_alg_base_in_words(struct cs_dsp_test *priv, + unsigned int alg_id, + int mem_type); +unsigned int cs_dsp_mock_xm_header_get_fw_version_from_regmap(struct cs_dsp_test *priv); +unsigned int cs_dsp_mock_xm_header_get_fw_version(struct cs_dsp_mock_xm_header *header); +void cs_dsp_mock_xm_header_drop_from_regmap_cache(struct cs_dsp_test *priv); +int cs_dsp_mock_xm_header_write_to_regmap(struct cs_dsp_mock_xm_header *header); +struct 
cs_dsp_mock_xm_header *cs_dsp_create_mock_xm_header(struct cs_dsp_test *priv, + const struct cs_dsp_mock_alg_def *algs, + size_t num_algs); + int cs_dsp_mock_regmap_init(struct cs_dsp_test *priv); void cs_dsp_mock_regmap_drop_range(struct cs_dsp_test *priv, unsigned int first_reg, unsigned int last_reg); -- cgit v1.2.3 From 5cf1b7b471803f7cc654a29ee16cb085ad69c097 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:16 +0000 Subject: firmware: cs_dsp: Add mock wmfw file generator for KUnit testing Add a mock firmware file that emulates what the firmware build tools would normally create. This will be used by KUnit tests to generate a test wmfw file. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index 899ae94198aa..fde7e95a33e9 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -53,6 +53,17 @@ struct cs_dsp_mock_alg_def { unsigned int zm_size_words; }; +struct cs_dsp_mock_coeff_def { + const char *shortname; + const char *fullname; + const char *description; + u16 type; + u16 flags; + u16 mem_type; + unsigned int offset_dsp_words; + unsigned int length_bytes; +}; + /** * struct cs_dsp_mock_xm_header - XM header builder * @@ -66,6 +77,8 @@ struct cs_dsp_mock_xm_header { size_t blob_size_bytes; }; +struct cs_dsp_mock_wmfw_builder; + extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; extern const unsigned int cs_dsp_mock_halo_core_base; @@ -107,3 +120,23 @@ void cs_dsp_mock_regmap_drop_bytes(struct cs_dsp_test *priv, unsigned int first_reg, size_t num_bytes); void cs_dsp_mock_regmap_drop_system_regs(struct cs_dsp_test *priv); bool cs_dsp_mock_regmap_is_dirty(struct cs_dsp_test *priv, bool drop_system_regs); + +struct cs_dsp_mock_wmfw_builder *cs_dsp_mock_wmfw_init(struct cs_dsp_test *priv, + int format_version); +void cs_dsp_mock_wmfw_add_raw_block(struct cs_dsp_mock_wmfw_builder *builder, + int mem_region, unsigned int mem_offset_dsp_words, + const void *payload_data, size_t payload_len_bytes); +void cs_dsp_mock_wmfw_add_info(struct cs_dsp_mock_wmfw_builder *builder, + const char *info); +void cs_dsp_mock_wmfw_add_data_block(struct cs_dsp_mock_wmfw_builder *builder, + int mem_region, unsigned int mem_offset_dsp_words, + const void *payload_data, size_t payload_len_bytes); +void cs_dsp_mock_wmfw_start_alg_info_block(struct cs_dsp_mock_wmfw_builder *builder, + unsigned int alg_id, + const char *name, + const char *description); +void cs_dsp_mock_wmfw_add_coeff_desc(struct cs_dsp_mock_wmfw_builder *builder, + const struct cs_dsp_mock_coeff_def *def); +void cs_dsp_mock_wmfw_end_alg_info_block(struct cs_dsp_mock_wmfw_builder *builder); +struct firmware *cs_dsp_mock_wmfw_get_firmware(struct cs_dsp_mock_wmfw_builder *builder); +int cs_dsp_mock_wmfw_format_version(struct cs_dsp_mock_wmfw_builder *builder); -- cgit v1.2.3 From 7c052c6615297ff32032105130cd5f02059f7ae4 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Dec 2024 14:37:17 +0000 Subject: firmware: cs_dsp: Add mock bin file generator for KUnit testing Add a mock firmware file that emulates what the firmware build tools would 
normally create. This will be used by KUnit tests to generate a test bin file. The data payload in a bin is an opaque blob, so the mock bin only needs to generate the appropriate file header and description block for each payload blob. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20241212143725.1381013-5-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index fde7e95a33e9..4f87a908ab4f 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -78,6 +78,7 @@ struct cs_dsp_mock_xm_header { }; struct cs_dsp_mock_wmfw_builder; +struct cs_dsp_mock_bin_builder; extern const unsigned int cs_dsp_mock_adsp2_32bit_sysbase; extern const unsigned int cs_dsp_mock_adsp2_16bit_sysbase; @@ -121,6 +122,23 @@ void cs_dsp_mock_regmap_drop_bytes(struct cs_dsp_test *priv, void cs_dsp_mock_regmap_drop_system_regs(struct cs_dsp_test *priv); bool cs_dsp_mock_regmap_is_dirty(struct cs_dsp_test *priv, bool drop_system_regs); +struct cs_dsp_mock_bin_builder *cs_dsp_mock_bin_init(struct cs_dsp_test *priv, + int format_version, + unsigned int fw_version); +void cs_dsp_mock_bin_add_raw_block(struct cs_dsp_mock_bin_builder *builder, + unsigned int alg_id, unsigned int alg_ver, + int type, unsigned int offset, + const void *payload_data, size_t payload_len_bytes); +void cs_dsp_mock_bin_add_info(struct cs_dsp_mock_bin_builder *builder, + const char *info); +void cs_dsp_mock_bin_add_name(struct cs_dsp_mock_bin_builder *builder, + const char *name); +void cs_dsp_mock_bin_add_patch(struct cs_dsp_mock_bin_builder *builder, + unsigned int alg_id, unsigned int alg_ver, + int mem_region, unsigned int reg_addr_offset, + const void *payload_data, size_t payload_len_bytes); +struct firmware *cs_dsp_mock_bin_get_firmware(struct cs_dsp_mock_bin_builder *builder); + struct cs_dsp_mock_wmfw_builder *cs_dsp_mock_wmfw_init(struct cs_dsp_test *priv, int format_version); void cs_dsp_mock_wmfw_add_raw_block(struct cs_dsp_mock_wmfw_builder *builder, -- cgit v1.2.3 From b76d32422c09bc9310f61a5a89671975db34bd2a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 9 Dec 2024 16:09:44 +0000 Subject: kref: Improve documentation There is already kernel-doc written for many of the functions in kref.h but it's not linked into the html docs anywhere. Add it to kref.rst. Improve the kref documentation by using the standard Return: section, rewording some unclear verbiage and adding docs for some undocumented functions. Update Thomas' email address to his current one. Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20241209160953.757673-1-willy@infradead.org --- include/linux/kref.h | 48 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/kref.h b/include/linux/kref.h index d32e21a2538c..88e82ab1367c 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -46,18 +46,18 @@ static inline void kref_get(struct kref *kref) } /** - * kref_put - decrement refcount for object. - * @kref: object. 
- * @release: pointer to the function that will clean up the object when the + * kref_put - Decrement refcount for object + * @kref: Object + * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. - * This pointer is required, and it is not acceptable to pass kfree - * in as this function. * - * Decrement the refcount, and if 0, call release(). - * Return 1 if the object was removed, otherwise return 0. Beware, if this - * function returns 0, you still can not count on the kref from remaining in - * memory. Only use the return value if you want to see if the kref is now - * gone, not present. + * Decrement the refcount, and if 0, call @release. The caller may not + * pass NULL or kfree() as the release function. + * + * Return: 1 if this call removed the object, otherwise return 0. Beware, + * if this function returns 0, another caller may have removed the object + * by the time this function returns. The return value is only certain + * if you want to see if the object is definitely released. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { @@ -68,17 +68,37 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref) return 0; } +/** + * kref_put_mutex - Decrement refcount for object + * @kref: Object + * @release: Pointer to the function that will clean up the object when the + * last reference to the object is released. + * @mutex: Mutex which protects the release function. + * + * This variant of kref_lock() calls the @release function with the @mutex + * held. The @release function will release the mutex. + */ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), - struct mutex *lock) + struct mutex *mutex) { - if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) { + if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) { release(kref); return 1; } return 0; } +/** + * kref_put_lock - Decrement refcount for object + * @kref: Object + * @release: Pointer to the function that will clean up the object when the + * last reference to the object is released. + * @lock: Spinlock which protects the release function. + * + * This variant of kref_lock() calls the @release function with the @lock + * held. The @release function will release the lock. + */ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) @@ -94,8 +114,6 @@ static inline int kref_put_lock(struct kref *kref, * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * - * Return non-zero if the increment succeeded. Otherwise return 0. - * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. @@ -105,6 +123,8 @@ static inline int kref_put_lock(struct kref *kref, * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. + * + * Return: non-zero if the increment succeeded. Otherwise return 0. 
*/ static inline int __must_check kref_get_unless_zero(struct kref *kref) { -- cgit v1.2.3 From 7f78c081d44ce0f9d73dcef3204df5936ddbfea7 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Sun, 1 Sep 2024 14:32:59 +0000 Subject: soc: mediatek: cmdq: Remove cmdq_pkt_finalize() helper function In order to have fine-grained control, use cmdq_pkt_eoc() and cmdq_pkt_jump_rel() to replace cmdq_pkt_finalize(). Signed-off-by: Chun-Kuang Hu Acked-by: Matthias Brugger Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Sebastian Fricke Signed-off-by: Mauro Carvalho Chehab --- include/linux/soc/mediatek/mtk-cmdq.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 5bee6f7fc400..0c3906e8ad19 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -391,14 +391,6 @@ int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa); */ int cmdq_pkt_eoc(struct cmdq_pkt *pkt); -/** - * cmdq_pkt_finalize() - Append EOC and jump command to pkt. - * @pkt: the CMDQ packet - * - * Return: 0 for success; else the error code is returned - */ -int cmdq_pkt_finalize(struct cmdq_pkt *pkt); - #else /* IS_ENABLED(CONFIG_MTK_CMDQ) */ static inline int cmdq_dev_get_client_reg(struct device *dev, @@ -519,11 +511,6 @@ static inline int cmdq_pkt_eoc(struct cmdq_pkt *pkt) return -EINVAL; } -static inline int cmdq_pkt_finalize(struct cmdq_pkt *pkt) -{ - return -EINVAL; -} - #endif /* IS_ENABLED(CONFIG_MTK_CMDQ) */ #endif /* __MTK_CMDQ_H__ */ -- cgit v1.2.3 From 4e885fab7164689f031a6c73522a3d91674c5bdc Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:28 +0000 Subject: bpf: Add a __btf_get_by_fd helper Add a new helper to get a pointer to a struct btf from a file descriptor. This helper doesn't increase a refcnt. Add a comment explaining this and pointing to a corresponding function which does take a reference. Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-2-aspsk@isovalent.com --- include/linux/bpf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaee2a819f4c..ac44b857b2f9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2301,6 +2301,14 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); +/* + * The __bpf_map_get() and __btf_get_by_fd() functions parse a file + * descriptor and return a corresponding map or btf object. + * Their names are double underscored to emphasize the fact that they + * do not increase refcnt. To also increase refcnt use corresponding + * bpf_map_get() and btf_get_by_fd() functions. 
+ */ + static inline struct bpf_map *__bpf_map_get(struct fd f) { if (fd_empty(f)) @@ -2310,6 +2318,15 @@ static inline struct bpf_map *__bpf_map_get(struct fd f) return fd_file(f)->private_data; } +static inline struct btf *__btf_get_by_fd(struct fd f) +{ + if (fd_empty(f)) + return ERR_PTR(-EBADF); + if (unlikely(fd_file(f)->f_op != &btf_fops)) + return ERR_PTR(-EINVAL); + return fd_file(f)->private_data; +} + void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); -- cgit v1.2.3 From 4d3ae294f900fb7232fb6c890dbd3176b8a5f121 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Fri, 13 Dec 2024 13:09:31 +0000 Subject: bpf: Add fd_array_cnt attribute for prog_load The fd_array attribute of the BPF_PROG_LOAD syscall may contain a set of file descriptors: maps or btfs. This field was introduced as a sparse array. Introduce a new attribute, fd_array_cnt, which, if present, indicates that the fd_array is a continuous array of the corresponding length. If fd_array_cnt is non-zero, then every map in the fd_array will be bound to the program, as if it was used by the program. This functionality is similar to the BPF_PROG_BIND_MAP syscall, but such maps can be used by the verifier during the program load. Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20241213130934.1087929-5-aspsk@isovalent.com --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4162afc6b5d0..2acf9b336371 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1573,6 +1573,16 @@ union bpf_attr { * If provided, prog_flags should have BPF_F_TOKEN_FD flag set. */ __s32 prog_token_fd; + /* The fd_array_cnt can be used to pass the length of the + * fd_array array. In this case all the [map] file descriptors + * passed in this array will be bound to the program, even if + * the maps are not referenced directly. The functionality is + * similar to the BPF_PROG_BIND_MAP syscall, but maps can be + * used by the verifier during the program load. If provided, + * then the fd_array[0,...,fd_array_cnt-1] is expected to be + * continuous. + */ + __u32 fd_array_cnt; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 6037802bbae892f3ad0c7b4c4faee39b967e32b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 11 Dec 2024 20:57:55 +0100 Subject: power: supply: core: implement extension API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various drivers, mostly in platform/x86 extend the ACPI battery driver with additional sysfs attributes to implement more UAPIs than are exposed through ACPI by using various side-channels, like WMI, nonstandard ACPI or EC communication. While the created sysfs attributes look similar to the attributes provided by the powersupply core, there are various deficiencies: * They don't show up in uevent payload. * They can't be queried with the standard in-kernel APIs. * They don't work with triggers. * The extending driver has to reimplement all of the parsing, formatting and sysfs display logic. * Writing a extension driver is completely different from writing a normal power supply driver. This extension API avoids all of these issues. 
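A minimal sketch of the API described below (illustrative only; struct power_supply_ext, its callback signatures and power_supply_register_extension() come from this patch, while every example_* name is a placeholder for a vendor side channel):

  static const enum power_supply_property example_props[] = {
          POWER_SUPPLY_PROP_CHARGE_CONTROL_START_THRESHOLD,
  };

  static int example_get_property(struct power_supply *psy,
                                  const struct power_supply_ext *ext,
                                  void *data, enum power_supply_property psp,
                                  union power_supply_propval *val)
  {
          /* Fetch the value over the vendor side channel (WMI, EC, ...). */
          return example_read_start_threshold(data, &val->intval);
  }

  static const struct power_supply_ext example_ext = {
          .properties = example_props,
          .num_properties = ARRAY_SIZE(example_props),
          .get_property = example_get_property,
  };

  /* From a battery hook: power_supply_register_extension(battery, &example_ext, drvdata); */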
An extension is just a "struct power_supply_ext" with the same kind of callbacks as in a normal "struct power_supply_desc". The API is meant to be used via battery_hook_register(), the same way as the current extensions. Signed-off-by: Thomas Weißschuh Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241211-power-supply-extensions-v6-1-9d9dc3f3d387@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 0d96657d1a2b..a877518cd963 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -283,6 +285,27 @@ struct power_supply_desc { int use_for_apm; }; +struct power_supply_ext { + u8 charge_behaviours; + const enum power_supply_property *properties; + size_t num_properties; + + int (*get_property)(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data, + enum power_supply_property psp, + union power_supply_propval *val); + int (*set_property)(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data, + enum power_supply_property psp, + const union power_supply_propval *val); + int (*property_is_writeable)(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data, + enum power_supply_property psp); +}; + struct power_supply { const struct power_supply_desc *desc; @@ -302,10 +325,13 @@ struct power_supply { struct delayed_work deferred_register_work; spinlock_t changed_lock; bool changed; + bool update_groups; bool initialized; bool removing; atomic_t use_cnt; struct power_supply_battery_info *battery_info; + struct rw_semaphore extensions_sem; /* protects "extensions" */ + struct list_head extensions; #ifdef CONFIG_THERMAL struct thermal_zone_device *tzd; struct thermal_cooling_device *tcd; @@ -882,6 +908,13 @@ devm_power_supply_register(struct device *parent, extern void power_supply_unregister(struct power_supply *psy); extern int power_supply_powers(struct power_supply *psy, struct device *dev); +extern int __must_check +power_supply_register_extension(struct power_supply *psy, + const struct power_supply_ext *ext, + void *data); +extern void power_supply_unregister_extension(struct power_supply *psy, + const struct power_supply_ext *ext); + #define to_power_supply(device) container_of(device, struct power_supply, dev) extern void *power_supply_get_drvdata(struct power_supply *psy); -- cgit v1.2.3 From 07d58e0a60f70b3cc176c9427d1ea856d1756820 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 7 Dec 2024 11:05:03 -0800 Subject: crypto: skcipher - remove support for physical address walks Since the physical address support in skcipher_walk is not used anymore, remove all the code associated with it. This includes: - The skcipher_walk_async() and skcipher_walk_complete() functions; - The SKCIPHER_WALK_PHYS flag and everything conditional on it; - The buffers, phys, and virt.page fields in struct skcipher_walk; - struct skcipher_walk_buffer. As a result, skcipher_walk now just supports virtual addresses. Physical address support in skcipher_walk is unneeded because drivers that need physical addresses just use the scatterlists directly. 
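For context, the virtual-address walk that remains is the pattern sketched below (simplified, hypothetical driver code; my_cipher_process() is a placeholder and not part of this patch):

        static int my_crypt(struct skcipher_request *req)
        {
                struct skcipher_walk walk;
                int err;

                err = skcipher_walk_virt(&walk, req, false);
                while (walk.nbytes) {
                        unsigned int n = walk.nbytes;

                        /* data is always mapped at kernel virtual addresses now */
                        my_cipher_process(walk.dst.virt.addr, walk.src.virt.addr, n);
                        err = skcipher_walk_done(&walk, walk.nbytes - n);
                }
                return err;
        }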
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/internal/skcipher.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 7ae42afdcf3e..08d1e8c63afc 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -11,7 +11,6 @@ #include #include #include -#include #include /* @@ -58,12 +57,6 @@ struct crypto_lskcipher_spawn { struct skcipher_walk { union { struct { - struct page *page; - unsigned long offset; - } phys; - - struct { - u8 *page; void *addr; } virt; } src, dst; @@ -74,8 +67,6 @@ struct skcipher_walk { struct scatter_walk out; unsigned int total; - struct list_head buffers; - u8 *page; u8 *buffer; u8 *oiv; @@ -209,13 +200,10 @@ int skcipher_walk_done(struct skcipher_walk *walk, int err); int skcipher_walk_virt(struct skcipher_walk *walk, struct skcipher_request *req, bool atomic); -int skcipher_walk_async(struct skcipher_walk *walk, - struct skcipher_request *req); int skcipher_walk_aead_encrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic); int skcipher_walk_aead_decrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic); -void skcipher_walk_complete(struct skcipher_walk *walk, int err); static inline void skcipher_walk_abort(struct skcipher_walk *walk) { -- cgit v1.2.3 From 5feae3e79dbe2d357b223fc48ae907ba0aedb271 Mon Sep 17 00:00:00 2001 From: Igor Belwon Date: Mon, 9 Dec 2024 15:45:21 +0100 Subject: dt-bindings: clock: samsung: Add Exynos990 SoC CMU bindings Add dt-schema documentation for the Exynos990 SoC CMU. This clock management unit has a topmost block (CMU_TOP) that generates top clocks for other blocks. Currently the only other block implemented is CMU_HSI0, which provides clocks for the USB part of the SoC. Also, device-tree binding definitions added for these blocks: - CMU_TOP - CMU_HSI0 Reviewed-by: Krzysztof Kozlowski Signed-off-by: Igor Belwon Link: https://lore.kernel.org/r/20241209-exynos990-cmu-v4-1-57f07080f9e4@mentallysanemainliners.org Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/samsung,exynos990.h | 236 ++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 include/dt-bindings/clock/samsung,exynos990.h (limited to 'include') diff --git a/include/dt-bindings/clock/samsung,exynos990.h b/include/dt-bindings/clock/samsung,exynos990.h new file mode 100644 index 000000000000..307215a3f3ed --- /dev/null +++ b/include/dt-bindings/clock/samsung,exynos990.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (C) 2024 Igor Belwon + * + * Device Tree binding constants for Exynos990 clock controller. 
+ */ + +#ifndef _DT_BINDINGS_CLOCK_EXYNOS_990_H +#define _DT_BINDINGS_CLOCK_EXYNOS_990_H + +/* CMU_TOP */ +#define CLK_FOUT_SHARED0_PLL 1 +#define CLK_FOUT_SHARED1_PLL 2 +#define CLK_FOUT_SHARED2_PLL 3 +#define CLK_FOUT_SHARED3_PLL 4 +#define CLK_FOUT_SHARED4_PLL 5 +#define CLK_FOUT_G3D_PLL 6 +#define CLK_FOUT_MMC_PLL 7 +#define CLK_MOUT_PLL_SHARED0 8 +#define CLK_MOUT_PLL_SHARED1 9 +#define CLK_MOUT_PLL_SHARED2 10 +#define CLK_MOUT_PLL_SHARED3 11 +#define CLK_MOUT_PLL_SHARED4 12 +#define CLK_MOUT_PLL_MMC 13 +#define CLK_MOUT_PLL_G3D 14 +#define CLK_MOUT_CMU_APM_BUS 15 +#define CLK_MOUT_CMU_AUD_CPU 16 +#define CLK_MOUT_CMU_BUS0_BUS 17 +#define CLK_MOUT_CMU_BUS1_BUS 18 +#define CLK_MOUT_CMU_BUS1_SSS 19 +#define CLK_MOUT_CMU_CIS_CLK0 20 +#define CLK_MOUT_CMU_CIS_CLK1 21 +#define CLK_MOUT_CMU_CIS_CLK2 22 +#define CLK_MOUT_CMU_CIS_CLK3 23 +#define CLK_MOUT_CMU_CIS_CLK4 24 +#define CLK_MOUT_CMU_CIS_CLK5 25 +#define CLK_MOUT_CMU_CMU_BOOST 26 +#define CLK_MOUT_CMU_CORE_BUS 27 +#define CLK_MOUT_CMU_CPUCL0_DBG_BUS 28 +#define CLK_MOUT_CMU_CPUCL0_SWITCH 29 +#define CLK_MOUT_CMU_CPUCL1_SWITCH 30 +#define CLK_MOUT_CMU_CPUCL2_BUSP 31 +#define CLK_MOUT_CMU_CPUCL2_SWITCH 32 +#define CLK_MOUT_CMU_CSIS_BUS 33 +#define CLK_MOUT_CMU_CSIS_OIS_MCU 34 +#define CLK_MOUT_CMU_DNC_BUS 35 +#define CLK_MOUT_CMU_DNC_BUSM 36 +#define CLK_MOUT_CMU_DNS_BUS 37 +#define CLK_MOUT_CMU_DPU 38 +#define CLK_MOUT_CMU_DPU_ALT 39 +#define CLK_MOUT_CMU_DSP_BUS 40 +#define CLK_MOUT_CMU_G2D_G2D 41 +#define CLK_MOUT_CMU_G2D_MSCL 42 +#define CLK_MOUT_CMU_HPM 43 +#define CLK_MOUT_CMU_HSI0_BUS 44 +#define CLK_MOUT_CMU_HSI0_DPGTC 45 +#define CLK_MOUT_CMU_HSI0_USB31DRD 46 +#define CLK_MOUT_CMU_HSI0_USBDP_DEBUG 47 +#define CLK_MOUT_CMU_HSI1_BUS 48 +#define CLK_MOUT_CMU_HSI1_MMC_CARD 49 +#define CLK_MOUT_CMU_HSI1_PCIE 50 +#define CLK_MOUT_CMU_HSI1_UFS_CARD 51 +#define CLK_MOUT_CMU_HSI1_UFS_EMBD 52 +#define CLK_MOUT_CMU_HSI2_BUS 53 +#define CLK_MOUT_CMU_HSI2_PCIE 54 +#define CLK_MOUT_CMU_IPP_BUS 55 +#define CLK_MOUT_CMU_ITP_BUS 56 +#define CLK_MOUT_CMU_MCSC_BUS 57 +#define CLK_MOUT_CMU_MCSC_GDC 58 +#define CLK_MOUT_CMU_CMU_BOOST_CPU 59 +#define CLK_MOUT_CMU_MFC0_MFC0 60 +#define CLK_MOUT_CMU_MFC0_WFD 61 +#define CLK_MOUT_CMU_MIF_BUSP 62 +#define CLK_MOUT_CMU_MIF_SWITCH 63 +#define CLK_MOUT_CMU_NPU_BUS 64 +#define CLK_MOUT_CMU_PERIC0_BUS 65 +#define CLK_MOUT_CMU_PERIC0_IP 66 +#define CLK_MOUT_CMU_PERIC1_BUS 67 +#define CLK_MOUT_CMU_PERIC1_IP 68 +#define CLK_MOUT_CMU_PERIS_BUS 69 +#define CLK_MOUT_CMU_SSP_BUS 70 +#define CLK_MOUT_CMU_TNR_BUS 71 +#define CLK_MOUT_CMU_VRA_BUS 72 +#define CLK_DOUT_CMU_APM_BUS 73 +#define CLK_DOUT_CMU_AUD_CPU 74 +#define CLK_DOUT_CMU_BUS0_BUS 75 +#define CLK_DOUT_CMU_BUS1_BUS 76 +#define CLK_DOUT_CMU_BUS1_SSS 77 +#define CLK_DOUT_CMU_CIS_CLK0 78 +#define CLK_DOUT_CMU_CIS_CLK1 79 +#define CLK_DOUT_CMU_CIS_CLK2 80 +#define CLK_DOUT_CMU_CIS_CLK3 81 +#define CLK_DOUT_CMU_CIS_CLK4 82 +#define CLK_DOUT_CMU_CIS_CLK5 83 +#define CLK_DOUT_CMU_CMU_BOOST 84 +#define CLK_DOUT_CMU_CORE_BUS 85 +#define CLK_DOUT_CMU_CPUCL0_DBG_BUS 86 +#define CLK_DOUT_CMU_CPUCL0_SWITCH 87 +#define CLK_DOUT_CMU_CPUCL1_SWITCH 88 +#define CLK_DOUT_CMU_CPUCL2_BUSP 89 +#define CLK_DOUT_CMU_CPUCL2_SWITCH 90 +#define CLK_DOUT_CMU_CSIS_BUS 91 +#define CLK_DOUT_CMU_CSIS_OIS_MCU 92 +#define CLK_DOUT_CMU_DNC_BUS 93 +#define CLK_DOUT_CMU_DNC_BUSM 94 +#define CLK_DOUT_CMU_DNS_BUS 95 +#define CLK_DOUT_CMU_DSP_BUS 96 +#define CLK_DOUT_CMU_G2D_G2D 97 +#define CLK_DOUT_CMU_G2D_MSCL 98 +#define CLK_DOUT_CMU_G3D_SWITCH 99 +#define CLK_DOUT_CMU_HPM 100 +#define 
CLK_DOUT_CMU_HSI0_BUS 101 +#define CLK_DOUT_CMU_HSI0_DPGTC 102 +#define CLK_DOUT_CMU_HSI0_USB31DRD 103 +#define CLK_DOUT_CMU_HSI0_USBDP_DEBUG 104 +#define CLK_DOUT_CMU_HSI1_BUS 105 +#define CLK_DOUT_CMU_HSI1_MMC_CARD 106 +#define CLK_DOUT_CMU_HSI1_PCIE 107 +#define CLK_DOUT_CMU_HSI1_UFS_CARD 108 +#define CLK_DOUT_CMU_HSI1_UFS_EMBD 109 +#define CLK_DOUT_CMU_HSI2_BUS 110 +#define CLK_DOUT_CMU_HSI2_PCIE 111 +#define CLK_DOUT_CMU_IPP_BUS 112 +#define CLK_DOUT_CMU_ITP_BUS 113 +#define CLK_DOUT_CMU_MCSC_BUS 114 +#define CLK_DOUT_CMU_MCSC_GDC 115 +#define CLK_DOUT_CMU_CMU_BOOST_CPU 116 +#define CLK_DOUT_CMU_MFC0_MFC0 117 +#define CLK_DOUT_CMU_MFC0_WFD 118 +#define CLK_DOUT_CMU_MIF_BUSP 119 +#define CLK_DOUT_CMU_NPU_BUS 120 +#define CLK_DOUT_CMU_OTP 121 +#define CLK_DOUT_CMU_PERIC0_BUS 122 +#define CLK_DOUT_CMU_PERIC0_IP 123 +#define CLK_DOUT_CMU_PERIC1_BUS 124 +#define CLK_DOUT_CMU_PERIC1_IP 125 +#define CLK_DOUT_CMU_PERIS_BUS 126 +#define CLK_DOUT_CMU_SSP_BUS 127 +#define CLK_DOUT_CMU_TNR_BUS 128 +#define CLK_DOUT_CMU_VRA_BUS 129 +#define CLK_DOUT_CMU_DPU 130 +#define CLK_DOUT_CMU_DPU_ALT 131 +#define CLK_DOUT_CMU_SHARED0_DIV2 132 +#define CLK_DOUT_CMU_SHARED0_DIV3 133 +#define CLK_DOUT_CMU_SHARED0_DIV4 134 +#define CLK_DOUT_CMU_SHARED1_DIV2 135 +#define CLK_DOUT_CMU_SHARED1_DIV3 136 +#define CLK_DOUT_CMU_SHARED1_DIV4 137 +#define CLK_DOUT_CMU_SHARED2_DIV2 138 +#define CLK_DOUT_CMU_SHARED4_DIV2 139 +#define CLK_DOUT_CMU_SHARED4_DIV3 140 +#define CLK_DOUT_CMU_SHARED4_DIV4 141 +#define CLK_GOUT_CMU_G3D_BUS 142 +#define CLK_GOUT_CMU_MIF_SWITCH 143 +#define CLK_GOUT_CMU_APM_BUS 144 +#define CLK_GOUT_CMU_AUD_CPU 145 +#define CLK_GOUT_CMU_BUS0_BUS 146 +#define CLK_GOUT_CMU_BUS1_BUS 147 +#define CLK_GOUT_CMU_BUS1_SSS 148 +#define CLK_GOUT_CMU_CIS_CLK0 149 +#define CLK_GOUT_CMU_CIS_CLK1 150 +#define CLK_GOUT_CMU_CIS_CLK2 151 +#define CLK_GOUT_CMU_CIS_CLK3 152 +#define CLK_GOUT_CMU_CIS_CLK4 153 +#define CLK_GOUT_CMU_CIS_CLK5 154 +#define CLK_GOUT_CMU_CORE_BUS 155 +#define CLK_GOUT_CMU_CPUCL0_DBG_BUS 156 +#define CLK_GOUT_CMU_CPUCL0_SWITCH 157 +#define CLK_GOUT_CMU_CPUCL1_SWITCH 158 +#define CLK_GOUT_CMU_CPUCL2_BUSP 159 +#define CLK_GOUT_CMU_CPUCL2_SWITCH 160 +#define CLK_GOUT_CMU_CSIS_BUS 161 +#define CLK_GOUT_CMU_CSIS_OIS_MCU 162 +#define CLK_GOUT_CMU_DNC_BUS 163 +#define CLK_GOUT_CMU_DNC_BUSM 164 +#define CLK_GOUT_CMU_DNS_BUS 165 +#define CLK_GOUT_CMU_DPU 166 +#define CLK_GOUT_CMU_DPU_BUS 167 +#define CLK_GOUT_CMU_DSP_BUS 168 +#define CLK_GOUT_CMU_G2D_G2D 169 +#define CLK_GOUT_CMU_G2D_MSCL 170 +#define CLK_GOUT_CMU_G3D_SWITCH 171 +#define CLK_GOUT_CMU_HPM 172 +#define CLK_GOUT_CMU_HSI0_BUS 173 +#define CLK_GOUT_CMU_HSI0_DPGTC 174 +#define CLK_GOUT_CMU_HSI0_USB31DRD 175 +#define CLK_GOUT_CMU_HSI0_USBDP_DEBUG 176 +#define CLK_GOUT_CMU_HSI1_BUS 177 +#define CLK_GOUT_CMU_HSI1_MMC_CARD 178 +#define CLK_GOUT_CMU_HSI1_PCIE 179 +#define CLK_GOUT_CMU_HSI1_UFS_CARD 180 +#define CLK_GOUT_CMU_HSI1_UFS_EMBD 181 +#define CLK_GOUT_CMU_HSI2_BUS 182 +#define CLK_GOUT_CMU_HSI2_PCIE 183 +#define CLK_GOUT_CMU_IPP_BUS 184 +#define CLK_GOUT_CMU_ITP_BUS 185 +#define CLK_GOUT_CMU_MCSC_BUS 186 +#define CLK_GOUT_CMU_MCSC_GDC 187 +#define CLK_GOUT_CMU_MFC0_MFC0 188 +#define CLK_GOUT_CMU_MFC0_WFD 189 +#define CLK_GOUT_CMU_MIF_BUSP 190 +#define CLK_GOUT_CMU_NPU_BUS 191 +#define CLK_GOUT_CMU_PERIC0_BUS 192 +#define CLK_GOUT_CMU_PERIC0_IP 193 +#define CLK_GOUT_CMU_PERIC1_BUS 194 +#define CLK_GOUT_CMU_PERIC1_IP 195 +#define CLK_GOUT_CMU_PERIS_BUS 196 +#define CLK_GOUT_CMU_SSP_BUS 197 +#define CLK_GOUT_CMU_TNR_BUS 198 +#define 
CLK_GOUT_CMU_VRA_BUS 199 + +/* CMU_HSI0 */ +#define CLK_MOUT_HSI0_BUS_USER 1 +#define CLK_MOUT_HSI0_USB31DRD_USER 2 +#define CLK_MOUT_HSI0_USBDP_DEBUG_USER 3 +#define CLK_MOUT_HSI0_DPGTC_USER 4 +#define CLK_GOUT_HSI0_DP_LINK_DP_GTC_CLK 5 +#define CLK_GOUT_HSI0_DP_LINK_PCLK 6 +#define CLK_GOUT_HSI0_D_TZPC_HSI0_PCLK 7 +#define CLK_GOUT_HSI0_LHM_AXI_P_HSI0_CLK 8 +#define CLK_GOUT_HSI0_PPMU_HSI0_BUS1_ACLK 9 +#define CLK_GOUT_HSI0_PPMU_HSI0_BUS1_PCLK 10 +#define CLK_GOUT_HSI0_CLK_HSI0_BUS_CLK 11 +#define CLK_GOUT_HSI0_SYSMMU_USB_CLK_S2 12 +#define CLK_GOUT_HSI0_SYSREG_HSI0_PCLK 13 +#define CLK_GOUT_HSI0_USB31DRD_ACLK_PHYCTRL 14 +#define CLK_GOUT_HSI0_USB31DRD_BUS_CLK_EARLY 15 +#define CLK_GOUT_HSI0_USB31DRD_USB31DRD_REF_CLK_40 16 +#define CLK_GOUT_HSI0_USB31DRD_USBDPPHY_REF_SOC_PLL 17 +#define CLK_GOUT_HSI0_USB31DRD_USBDPPHY_SCL_APB 18 +#define CLK_GOUT_HSI0_USB31DRD_USBPCS_APB_CLK 19 +#define CLK_GOUT_HSI0_VGEN_LITE_HSI0_CLK 20 +#define CLK_GOUT_HSI0_CMU_HSI0_PCLK 21 +#define CLK_GOUT_HSI0_XIU_D_HSI0_ACLK 22 + +#endif -- cgit v1.2.3 From 5119e6b44f8ada5f5cea19935a7f005fee062aef Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 11 Dec 2024 21:42:27 +0000 Subject: memory: omap-gpmc: deadcode a pair of functions gpmc_get_client_irq() last use was removed by commit ac28e47ccc3f ("ARM: OMAP2+: Remove legacy gpmc-nand.c") gpmc_ticks_to_ns() last use was removed by commit 2514830b8b8c ("ARM: OMAP2+: Remove gpmc-onenand") Remove them. gpmc_clk_ticks_to_ns() is now only used in some DEBUG code; move inside the ifdef to avoid unused warnings. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Roger Quadros Acked-by: Kevin Hilman Link: https://lore.kernel.org/r/20241211214227.107980-1-linux@treblig.org Signed-off-by: Krzysztof Kozlowski --- include/linux/omap-gpmc.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index c9e3843d2dd5..263b915df1fb 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -66,10 +66,6 @@ extern int gpmc_calc_timings(struct gpmc_timings *gpmc_t, struct device_node; -extern int gpmc_get_client_irq(unsigned irq_config); - -extern unsigned int gpmc_ticks_to_ns(unsigned int ticks); - extern void gpmc_cs_write_reg(int cs, int idx, u32 val); extern int gpmc_calc_divider(unsigned int sync_clk); extern int gpmc_cs_set_timings(int cs, const struct gpmc_timings *t, -- cgit v1.2.3 From 9698d5a4836549d394e6efd858b5200878c9f255 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:02:23 +0100 Subject: pidfs: rework inode number allocation Recently we received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucial step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak, it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to look up a struct pid based on the inode number allocated for that struct pid.
The other part is to remove the custom inode number allocation on 32-bit systems, which is also an ugly wart that should go away. So, a new scheme is used that I was discussing with Tejun some time back. A cyclic ida is used for the lower 32 bits and the high 32 bits are used for the generation number. This gives a 64-bit inode number that is unique on both 32-bit and 64-bit systems. The lower 32-bit number is recycled slowly and can be used to look up struct pids. Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-1-61043d66fbce@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..2958652bb108 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +int pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From d2ab36bb115b720c9c738184d4007e1ca01c53da Mon Sep 17 00:00:00 2001 From: Erin Shepherd Date: Fri, 29 Nov 2024 14:38:00 +0100 Subject: pseudofs: add support for export_ops Pseudo-filesystems might reasonably wish to implement the export ops (particularly for name_to_handle_at/open_by_handle_at); plumb this through pseudo_fs_context. Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Erin Shepherd Link: https://lore.kernel.org/r/20241113-pidfs_fh-v2-1-9a4d28155a37@e43.eu Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-1-87d803a42495@kernel.org Signed-off-by: Christian Brauner --- include/linux/pseudo_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 730f77381d55..2503f7625d65 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -5,6 +5,7 @@ struct pseudo_fs_context { const struct super_operations *ops; + const struct export_operations *eops; const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; -- cgit v1.2.3 From 50166d57ea8c5042ecba0ee22532617d72ed085a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:02 +0100 Subject: exportfs: add open method This allows filesystems such as pidfs to provide their custom open. Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-3-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 4cc8801e50e3..c69b79b64466 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -10,6 +10,7 @@ struct inode; struct iomap; struct super_block; struct vfsmount; +struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -225,6 +226,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * open: + * Allow filesystems to specify a custom open function. + * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage.
* @@ -251,6 +255,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ -- cgit v1.2.3 From 0203b485d26d5b403ff4ed21e4cc85ba9ec0fe67 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Oct 2024 16:42:37 -0700 Subject: torture: Add dowarn argument to torture_sched_setaffinity() Current use cases of torture_sched_setaffinity() are well served by its unconditional warning on error. However, an upcoming use case for a preemption kthread needs to avoid warnings that might otherwise arise when that kthread attempted to bind itself to a CPU on its way offline. This commit therefore adds a dowarn argument that, when false, suppresses the warning. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/torture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/torture.h b/include/linux/torture.h index c2e979f82f8d..0134e7221cae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -130,7 +130,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) -long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn); #endif #endif /* __LINUX_TORTURE_H */ -- cgit v1.2.3 From 868dc3cd1105bd328be864bf2c409891438df44a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 18 Nov 2024 07:15:58 +0100 Subject: thermal: core: Add stub for thermal_zone_device_update() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To simplify the !CONFIG_THERMAL case in the hwmon core, add a !CONFIG_THERMAL stub for thermal_zone_device_update(). Signed-off-by: Thomas Weißschuh Reviewed-by: Guenter Roeck Acked-by: Rafael J. Wysocki Signed-off-by: Guenter Roeck --- include/linux/thermal.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 754802478b96..69f9bedd0ee8 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -295,6 +295,10 @@ static inline struct thermal_zone_device *thermal_tripless_zone_device_register( static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz) { } +static inline void thermal_zone_device_update(struct thermal_zone_device *tz, + enum thermal_notify_event event) +{ } + static inline struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) -- cgit v1.2.3 From f40452577557caf0e5d0ff182da8479c3d492ac5 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 2 Dec 2024 11:28:00 +0100 Subject: hwmon: (pmbus/core) improve handling of write protected regulators Writing PMBus protected registers does succeed from the smbus perspective, even if the write is ignored by the device and a communication fault is raised. 
This fault will silently be caught and cleared by the pmbus irq if one has been registered. This means that the regulator call may return success although the operation was ignored. With this change, the operations which are not supported will be properly flagged as such and the regulator framework won't even try to execute them. Signed-off-by: Jerome Brunet [groeck: Adjust to EXPORT_SYMBOL_NS_GPL API change] Signed-off-by: Guenter Roeck --- include/linux/pmbus.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/pmbus.h b/include/linux/pmbus.h index fa9f08164c36..884040e1383b 100644 --- a/include/linux/pmbus.h +++ b/include/linux/pmbus.h @@ -73,6 +73,20 @@ */ #define PMBUS_USE_COEFFICIENTS_CMD BIT(5) +/* + * PMBUS_OP_PROTECTED + * Set if the chip OPERATION command is protected and protection is not + * determined by the standard WRITE_PROTECT command. + */ +#define PMBUS_OP_PROTECTED BIT(6) + +/* + * PMBUS_VOUT_PROTECTED + * Set if the chip VOUT_COMMAND command is protected and protection is not + * determined by the standard WRITE_PROTECT command. + */ +#define PMBUS_VOUT_PROTECTED BIT(7) + struct pmbus_platform_data { u32 flags; /* Device specific flags */ -- cgit v1.2.3 From 0f38c06cab7712fc82c314fe4264a8897f3e6365 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 28 Oct 2024 13:07:11 -0700 Subject: rcutorture: Check preemption for failing reader This commit checks to see if the RCU reader has been preempted within its read-side critical section for RCU flavors supporting this notion (currently only preemptible RCU). If such a preemption occurred, then this is printed at the end of the "Failure/close-call rcutorture reader segments" list at the end of the rcutorture run. [ paulmck: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Tested-by: kernel test robot Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/rcupdate_wait.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 303ab9bee155..f9bed3d3f78d 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -65,4 +65,15 @@ static inline void cond_resched_rcu(void) #endif } +// Has the current task blocked within its current RCU read-side +// critical section? +static inline bool has_rcu_reader_blocked(void) +{ +#ifdef CONFIG_PREEMPT_RCU + return !list_empty(&current->rcu_node_entry); +#else + return false; +#endif +} + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ -- cgit v1.2.3 From 0fef924e3918e72768357a220c84e6b4dd2b6180 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Nov 2024 11:11:18 -0800 Subject: rcutorture: Use symbols for SRCU reader flavors This commit converts rcutorture.c values for the reader_flavor module parameter from hexadecimal to the SRCU_READ_FLAVOR_* C-preprocessor macros. The actual modprobe or kernel-boot-parameter values for read_flavor must still be entered in hexadecimal. Link: https://lore.kernel.org/all/c48c9dca-fe07-4833-acaa-28c827e5a79e@amd.com/ Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E.
McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcu.h | 6 ++++++ include/linux/srcutree.h | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 08339eb8a01c..da8224d0f71c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,6 +43,12 @@ int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). +#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above. + #ifdef CONFIG_TINY_SRCU #include #elif defined(CONFIG_TREE_SRCU) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 490aeecc6bb4..80016bbed672 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -26,6 +26,7 @@ struct srcu_data { atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */ atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */ int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */ + /* Values: SRCU_READ_FLAVOR_.* */ /* Update-side state. */ spinlock_t __private lock ____cacheline_internodealigned_in_smp; @@ -43,11 +44,6 @@ struct srcu_data { struct srcu_struct *ssp; }; -/* Values for ->srcu_reader_flavor. */ -#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). -#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite(). - /* * Node in SRCU combining tree, similar in function to rcu_data. */ -- cgit v1.2.3 From d465492a224b2409508224cf6970d7b97e2285cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Oct 2024 15:09:39 -0700 Subject: srcu: Guarantee non-negative return value from srcu_read_lock() For almost 20 years, the int return value from srcu_read_lock() has been always either zero or one. This commit therefore documents the fact that it will be non-negative, and does the same for the underlying __srcu_read_lock(). [ paulmck: Apply Andrii Nakryiko feedback. ] Signed-off-by: Paul E. McKenney Acked-by: Andrii Nakryiko Acked-by: Peter Zijlstra (Intel) Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcu.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 08339eb8a01c..abaddd7e6ddf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -232,13 +232,14 @@ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * - * The return value from srcu_read_lock() must be passed unaltered - * to the matching srcu_read_unlock(). Note that srcu_read_lock() and - * the matching srcu_read_unlock() must occur in the same context, for - * example, it is illegal to invoke srcu_read_unlock() in an irq handler - * if the matching srcu_read_lock() was invoked in process context. Or, - * for that matter to invoke srcu_read_unlock() from one task and the - * matching srcu_read_lock() from another. + * The return value from srcu_read_lock() is guaranteed to be + * non-negative. This value must be passed unaltered to the matching + * srcu_read_unlock(). 
Note that srcu_read_lock() and the matching + * srcu_read_unlock() must occur in the same context, for example, it is + * illegal to invoke srcu_read_unlock() in an irq handler if the matching + * srcu_read_lock() was invoked in process context. Or, for that matter to + * invoke srcu_read_unlock() from one task and the matching srcu_read_lock() + * from another. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { -- cgit v1.2.3 From cfb07b07dda2a17feed96c80c5af85937fcd2e9c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Nov 2024 16:53:53 -0800 Subject: srcu: Fix typo s/srcu_check_read_flavor()/__srcu_check_read_flavor()/ This commit fixes a typo in which a comment needed to have been updated from srcu_check_read_flavor() to __srcu_check_read_flavor(). Reported-by: Neeraj Upadhyay Closes: https://lore.kernel.org/all/b75d1fcd-6fcd-4619-bb5c-507fa599ee28@amd.com/ Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/srcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 490aeecc6bb4..4e69f88bcab9 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -258,7 +258,7 @@ static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp) if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)) return; - // Note that the cmpxchg() in srcu_check_read_flavor() is fully ordered. + // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE); } -- cgit v1.2.3 From 288a2cabcf6bb35532e8b2708829bdc2b85bc690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 11 Dec 2024 20:57:58 +0100 Subject: power: supply: core: add UAPI to discover currently used extensions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace wants to know about the used power supply extensions, for example to handle a device extended by a certain extension differently or to discover information about the extending device. Add a sysfs directory to the power supply device. This directory contains links which are named after the used extension and point to the device implementing that extension.
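Extending the earlier sketch, a hypothetical extension would now also carry a name and be registered together with the device that implements it (the exact signature is in the diff below; my_ext_props, my_ext_get_property and the function name are illustrative placeholders):

        static const struct power_supply_ext my_ext = {
                .name           = "my_vendor_ext",
                .properties     = my_ext_props,
                .num_properties = ARRAY_SIZE(my_ext_props),
                .get_property   = my_ext_get_property,
        };

        static int my_register_ext(struct power_supply *psy, struct device *dev, void *drvdata)
        {
                /* the new sysfs link is named after my_ext.name and points at dev */
                return power_supply_register_extension(psy, &my_ext, dev, drvdata);
        }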
Signed-off-by: Thomas Weißschuh Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241211-power-supply-extensions-v6-4-9d9dc3f3d387@weissschuh.net Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index a877518cd963..c3ce9f2b17d4 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -286,6 +286,7 @@ struct power_supply_desc { }; struct power_supply_ext { + const char *const name; u8 charge_behaviours; const enum power_supply_property *properties; size_t num_properties; @@ -911,6 +912,7 @@ extern int power_supply_powers(struct power_supply *psy, struct device *dev); extern int __must_check power_supply_register_extension(struct power_supply *psy, const struct power_supply_ext *ext, + struct device *dev, void *data); extern void power_supply_unregister_extension(struct power_supply *psy, const struct power_supply_ext *ext); -- cgit v1.2.3 From 2c2b61d2138f472e50b5531ec0cb4a1485837e21 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Wed, 11 Dec 2024 17:22:41 +0900 Subject: netlink: add IGMP/MLD join/leave notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces netlink notifications for multicast address changes. The following features are included: * Addition and deletion of multicast addresses are reported using RTM_NEWMULTICAST and RTM_DELMULTICAST messages with AF_INET and AF_INET6. * Two new notification groups: RTNLGRP_IPV4_MCADDR and RTNLGRP_IPV6_MCADDR are introduced for receiving these events. This change allows user space applications (e.g., ip monitor) to efficiently track multicast group memberships by listening for netlink events. Previously, applications relied on inefficient polling of procfs, introducing delays. With netlink notifications, applications receive realtime updates on multicast group membership changes, enabling more precise metrics collection and system monitoring.  This change also unlocks the potential for implementing a wide range of sophisticated multicast related features in user space by allowing applications to combine kernel provided multicast address information with user space data and communicate decisions back to the kernel for more fine grained control. This mechanism can be used for various purposes, including multicast filtering, IGMP/MLD offload, and IGMP/MLD snooping. Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Co-developed-by: Patrick Ruddy Signed-off-by: Patrick Ruddy Link: https://lore.kernel.org/r/20180906091056.21109-1-pruddy@vyatta.att-mail.com Signed-off-by: Yuyang Huang Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ include/net/addrconf.h | 21 +++++++++++++++++++++ include/uapi/linux/rtnetlink.h | 10 +++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 5171231f70a8..073b30a9b850 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -87,6 +87,8 @@ struct ip_mc_list { char loaded; unsigned char gsquery; /* check source marks? 
*/ unsigned char crcount; + unsigned long mca_cstamp; + unsigned long mca_tstamp; struct rcu_head rcu; }; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 363dd63babe7..58337898fa21 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -88,6 +88,23 @@ struct ifa6_config { u16 scope; }; +enum addr_type_t { + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +struct inet6_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; + enum addr_type_t type; + bool force_rt_scope_universe; +}; + int addrconf_init(void); void addrconf_cleanup(void); @@ -525,4 +542,8 @@ int if6_proc_init(void); void if6_proc_exit(void); #endif +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args); + #endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index db7254d52d93..eccc0e7dcb7d 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -93,7 +93,11 @@ enum { RTM_NEWPREFIX = 52, #define RTM_NEWPREFIX RTM_NEWPREFIX - RTM_GETMULTICAST = 58, + RTM_NEWMULTICAST = 56, +#define RTM_NEWMULTICAST RTM_NEWMULTICAST + RTM_DELMULTICAST, +#define RTM_DELMULTICAST RTM_DELMULTICAST + RTM_GETMULTICAST, #define RTM_GETMULTICAST RTM_GETMULTICAST RTM_GETANYCAST = 62, @@ -774,6 +778,10 @@ enum rtnetlink_groups { #define RTNLGRP_TUNNEL RTNLGRP_TUNNEL RTNLGRP_STATS, #define RTNLGRP_STATS RTNLGRP_STATS + RTNLGRP_IPV4_MCADDR, +#define RTNLGRP_IPV4_MCADDR RTNLGRP_IPV4_MCADDR + RTNLGRP_IPV6_MCADDR, +#define RTNLGRP_IPV6_MCADDR RTNLGRP_IPV6_MCADDR __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) -- cgit v1.2.3 From 2ef6fc99e0d922a54073e7b6d6465c62f4d3b62b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 5 Dec 2024 12:21:00 +1100 Subject: selinux: add netlink nlmsg_type audit message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new audit message type to capture nlmsg-related information. This is similar to LSM_AUDIT_DATA_IOCTL_OP which was added for the other SELinux extended permission (ioctl). Adding a new type is preferred to adding to the existing lsm_network_audit structure which contains irrelevant information for the netlink sockets (i.e., dport, sport). Signed-off-by: Thiébaud Weksteen [PM: change "nlnk-msgtype" to "nl-msgtype" as discussed] Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 97a8b21eb033..69d2b7bc00ed 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -77,6 +77,7 @@ struct common_audit_data { #define LSM_AUDIT_DATA_LOCKDOWN 15 #define LSM_AUDIT_DATA_NOTIFICATION 16 #define LSM_AUDIT_DATA_ANONINODE 17 +#define LSM_AUDIT_DATA_NLMSGTYPE 18 union { struct path path; struct dentry *dentry; @@ -98,6 +99,7 @@ struct common_audit_data { struct lsm_ibendport_audit *ibendport; int reason; const char *anonclass; + u16 nlmsg_type; } u; /* this union contains LSM specific data */ union { -- cgit v1.2.3 From aeb3ec99026979287266e4b5a1194789c1488c1a Mon Sep 17 00:00:00 2001 From: Rongwei Liu Date: Fri, 13 Dec 2024 00:13:20 +0200 Subject: net/mlx5: Add device cap abs_native_port_num When the abs_native_port_num is set, the native_port_num reported by the device may not be continuous and bigger than the num_lag_ports. 
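In other words, when this capability bit is set the reported native_port_num is not guaranteed to be contiguous and may exceed num_lag_ports, so it must not be used as a dense array index. A rough, hypothetical helper (illustrative only, not part of this patch):

        static bool my_port_num_is_absolute(struct mlx5_core_dev *dev)
        {
                /* native_port_num may be sparse and larger than num_lag_ports */
                return MLX5_CAP_GEN(dev, abs_native_port_num);
        }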
Signed-off-by: Rongwei Liu Reviewed-by: Shay Drory Reviewed-by: Saeed Mahameed Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241212221329.961628-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5451ff1d4356..43b3cb4bf8d1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1599,7 +1599,8 @@ enum { struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x6]; u8 page_request_disable[0x1]; - u8 reserved_at_7[0x9]; + u8 abs_native_port_num[0x1]; + u8 reserved_at_8[0x8]; u8 shared_object_to_user_object_allowed[0x1]; u8 reserved_at_13[0xe]; u8 vhca_resource_manager[0x1]; -- cgit v1.2.3 From 0471b1093e3a5d702ba2bf5987c35ee0e2336855 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:04 +0100 Subject: tls: block decryption when a rekey is pending When a TLS handshake record carrying a KeyUpdate message is received, all subsequent records will be encrypted with a new key. We need to stop decrypting incoming records with the old key, and wait until userspace provides a new key. Make a note of this in the RX context just after decrypting that record, and stop recvmsg/splice calls with EKEYEXPIRED until the new key is available. key_update_pending can't be combined with the existing bitfield, because we will read it locklessly in ->poll. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/tls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 61fef2880114..857340338b69 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -59,6 +59,8 @@ struct tls_rec; #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) +#define TLS_HANDSHAKE_KEYUPDATE 24 /* rfc8446 B.3: Key update */ + #define TLS_AAD_SPACE_SIZE 13 #define TLS_MAX_IV_SIZE 16 @@ -130,6 +132,7 @@ struct tls_sw_context_rx { u8 async_capable:1; u8 zc_capable:1; u8 reader_contended:1; + bool key_update_pending; struct tls_strparser strp; -- cgit v1.2.3 From 510128b30f2db1600172e9aaec44f66db3c16e15 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 12 Dec 2024 16:36:06 +0100 Subject: tls: add counters for rekey This introduces 5 counters to keep track of key updates: Tls{Rx,Tx}Rekey{Ok,Error} and TlsRxRekeyReceived. Suggested-by: Jakub Kicinski Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index adf5fd78dd50..51da2e00112d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -358,6 +358,11 @@ enum LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ LINUX_MIB_TLSDECRYPTRETRY, /* TlsDecryptRetry */ LINUX_MIB_TLSRXNOPADVIOL, /* TlsRxNoPadViolation */ + LINUX_MIB_TLSRXREKEYOK, /* TlsRxRekeyOk */ + LINUX_MIB_TLSRXREKEYERROR, /* TlsRxRekeyError */ + LINUX_MIB_TLSTXREKEYOK, /* TlsTxRekeyOk */ + LINUX_MIB_TLSTXREKEYERROR, /* TlsTxRekeyError */ + LINUX_MIB_TLSRXREKEYRECEIVED, /* TlsRxRekeyReceived */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 35f7cad1743e04bf2944a2aadb6b6a42adc57bca Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:43 +0100 Subject: net: Add the possibility to support a selected hwtstamp in netdevice Introduce the description of a hwtstamp provider, mainly defined with the hwtstamp source and the phydev pointer. Add a hwtstamp provider description within the netdev structure to allow saving the hwtstamp we want to use. This prepares for future support of an ethtool netlink command to select the desired hwtstamp provider. By default, the old API that does not support hwtstamp selectability is used, meaning the hwtstamp provider pointer is unset. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/net_tstamp.h | 29 +++++++++++++++++++++++++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/net_tstamp.h | 11 +++++++++++ 3 files changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index 662074b08c94..ff0758e88ea1 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -19,6 +19,33 @@ enum hwtstamp_source { HWTSTAMP_SOURCE_PHYLIB, }; +/** + * struct hwtstamp_provider_desc - hwtstamp provider description + * + * @index: index of the hwtstamp provider. + * @qualifier: hwtstamp provider qualifier. + */ +struct hwtstamp_provider_desc { + int index; + enum hwtstamp_provider_qualifier qualifier; +}; + +/** + * struct hwtstamp_provider - hwtstamp provider object + * + * @rcu_head: RCU callback used to free the struct. + * @source: source of the hwtstamp provider. + * @phydev: pointer of the phydev source in case a PTP coming from phylib + * @desc: hwtstamp provider description.
+ */ + +struct hwtstamp_provider { + struct rcu_head rcu_head; + enum hwtstamp_source source; + struct phy_device *phydev; + struct hwtstamp_provider_desc desc; +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -31,6 +58,7 @@ enum hwtstamp_source { * copied the ioctl request back to user space * @source: indication whether timestamps should come from the netdev or from * an attached phylib PHY + * @qualifier: qualifier of the hwtstamp provider * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -43,6 +71,7 @@ struct kernel_hwtstamp_config { struct ifreq *ifr; bool copied_to_user; enum hwtstamp_source source; + enum hwtstamp_provider_qualifier qualifier; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d917949bba03..2593019ad5b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -82,6 +82,7 @@ struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; struct phy_link_topology; +struct hwtstamp_provider; typedef u32 xdp_features_t; @@ -2045,6 +2046,7 @@ enum netdev_reg_state { * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. + * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2457,6 +2459,8 @@ struct net_device { struct hlist_head neighbours[NEIGH_NR_TABLES]; + struct hwtstamp_provider __rcu *hwprov; + u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 858339d1c1c4..55b0ab51096c 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,6 +13,17 @@ #include #include /* for SO_TIMESTAMPING */ +/* + * Possible type of hwtstamp provider. Mainly "precise" the default one + * is for IEEE 1588 quality and "approx" is for NICs DMA point. + */ +enum hwtstamp_provider_qualifier { + HWTSTAMP_PROVIDER_QUALIFIER_PRECISE, + HWTSTAMP_PROVIDER_QUALIFIER_APPROX, + + HWTSTAMP_PROVIDER_QUALIFIER_CNT, +}; + /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), -- cgit v1.2.3 From b9e3f7dc9ed95daeb83cfa45b821cacaa01aa906 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:44 +0100 Subject: net: ethtool: tsinfo: Enhance tsinfo to support several hwtstamp by net topology Either the MAC or the PHY can provide hwtstamp, so we should be able to read the tsinfo for any hwtstamp provider. Enhance 'get' command to retrieve tsinfo of hwtstamp providers within a network topology. Add support for a specific dump command to retrieve all hwtstamp providers within the network topology, with added functionality for filtered dump to target a single interface. Signed-off-by: Kory Maincent Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool_netlink_generated.h | 10 ++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e217c6321ed0..f711bfd75c4d 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -711,6 +711,7 @@ struct ethtool_rxfh_param { * @cmd: command number = %ETHTOOL_GET_TS_INFO * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags * @phc_index: device index of the associated PHC, or -1 if there is none + * @phc_qualifier: qualifier of the associated PHC * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values */ @@ -718,6 +719,7 @@ struct kernel_ethtool_ts_info { u32 cmd; u32 so_timestamping; int phc_index; + enum hwtstamp_provider_qualifier phc_qualifier; enum hwtstamp_tx_types tx_types; enum hwtstamp_rx_filters rx_filters; }; @@ -749,6 +751,7 @@ struct kernel_ethtool_ts_info { * @rss_context argument to @create_rxfh_context and friends. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. + * @supported_hwtstamp_qualifiers: bitfield of supported hwtstamp qualifier. * @get_drvinfo: Report driver/device information. Modern drivers no * longer have to implement this callback. Most fields are * correctly filled in by the core using system information, or @@ -966,6 +969,7 @@ struct ethtool_ops { u32 rxfh_max_num_contexts; u32 supported_coalesce_params; u32 supported_ring_params; + u32 supported_hwtstamp_qualifiers; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index b58f352fe4f2..df289dde0f61 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -385,6 +385,15 @@ enum { ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) }; +enum { + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_UNSPEC, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + + __ETHTOOL_A_TS_HWTSTAMP_PROVIDER_CNT, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX = (__ETHTOOL_A_TS_HWTSTAMP_PROVIDER_CNT - 1) +}; + enum { ETHTOOL_A_TSINFO_UNSPEC, ETHTOOL_A_TSINFO_HEADER, @@ -393,6 +402,7 @@ enum { ETHTOOL_A_TSINFO_RX_FILTERS, ETHTOOL_A_TSINFO_PHC_INDEX, ETHTOOL_A_TSINFO_STATS, + ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER, __ETHTOOL_A_TSINFO_CNT, ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) -- cgit v1.2.3 From 6e9e2eed4f39d52edf5fd006409d211facf49f6b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Thu, 12 Dec 2024 18:06:45 +0100 Subject: net: ethtool: Add support for tsconfig command to get/set hwtstamp config Introduce support for ETHTOOL_MSG_TSCONFIG_GET/SET ethtool netlink socket to read and configure hwtstamp configuration of a PHC provider. Note that simultaneous hwtstamp isn't supported; configuring a new one disables the previous setting. Signed-off-by: Kory Maincent Signed-off-by: David S. 
Miller --- include/uapi/linux/ethtool_netlink_generated.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index df289dde0f61..43993a2d68e5 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -694,6 +694,18 @@ enum { ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) }; +enum { + ETHTOOL_A_TSCONFIG_UNSPEC, + ETHTOOL_A_TSCONFIG_HEADER, + ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER, + ETHTOOL_A_TSCONFIG_TX_TYPES, + ETHTOOL_A_TSCONFIG_RX_FILTERS, + ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + + __ETHTOOL_A_TSCONFIG_CNT, + ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1) +}; + enum { ETHTOOL_MSG_USER_NONE = 0, ETHTOOL_MSG_STRSET_GET = 1, @@ -741,6 +753,8 @@ enum { ETHTOOL_MSG_MM_SET, ETHTOOL_MSG_MODULE_FW_FLASH_ACT, ETHTOOL_MSG_PHY_GET, + ETHTOOL_MSG_TSCONFIG_GET, + ETHTOOL_MSG_TSCONFIG_SET, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) @@ -794,6 +808,8 @@ enum { ETHTOOL_MSG_MODULE_FW_FLASH_NTF, ETHTOOL_MSG_PHY_GET_REPLY, ETHTOOL_MSG_PHY_NTF, + ETHTOOL_MSG_TSCONFIG_GET_REPLY, + ETHTOOL_MSG_TSCONFIG_SET_REPLY, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) -- cgit v1.2.3 From 87e2a15bc00840762b082399493597e8d3c1e42c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:20 +0000 Subject: f2fs: Convert submit tracepoints to take a folio Remove accesses to page->index and page->mapping as well as unnecessary calls to page_file_mapping(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 2851c823095b..3253f45508e8 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1119,11 +1119,11 @@ TRACE_EVENT(f2fs_reserve_new_blocks, (unsigned long long)__entry->count) ); -DECLARE_EVENT_CLASS(f2fs__submit_page_bio, +DECLARE_EVENT_CLASS(f2fs__submit_folio_bio, - TP_PROTO(struct page *page, struct f2fs_io_info *fio), + TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), - TP_ARGS(page, fio), + TP_ARGS(folio, fio), TP_STRUCT__entry( __field(dev_t, dev) @@ -1138,9 +1138,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, ), TP_fast_assign( - __entry->dev = page_file_mapping(page)->host->i_sb->s_dev; - __entry->ino = page_file_mapping(page)->host->i_ino; - __entry->index = page->index; + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->ino = folio->mapping->host->i_ino; + __entry->index = folio->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; @@ -1149,7 +1149,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->type = fio->type; ), - TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " + TP_printk("dev = (%d,%d), ino = %lu, folio_index = 0x%lx, " "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, @@ -1160,22 +1160,22 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, show_block_type(__entry->type)) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, +DEFINE_EVENT_CONDITION(f2fs__submit_folio_bio, f2fs_submit_folio_bio, - TP_PROTO(struct page *page, struct f2fs_io_info *fio), + TP_PROTO(struct 
folio *folio, struct f2fs_io_info *fio), - TP_ARGS(page, fio), + TP_ARGS(folio, fio), - TP_CONDITION(page->mapping) + TP_CONDITION(folio->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, +DEFINE_EVENT_CONDITION(f2fs__submit_folio_bio, f2fs_submit_folio_write, - TP_PROTO(struct page *page, struct f2fs_io_info *fio), + TP_PROTO(struct folio *folio, struct f2fs_io_info *fio), - TP_ARGS(page, fio), + TP_ARGS(folio, fio), - TP_CONDITION(page->mapping) + TP_CONDITION(folio->mapping) ); DECLARE_EVENT_CLASS(f2fs__bio, -- cgit v1.2.3 From c910a64bc4e21782959221b6ea2d6c4cce0506c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Nov 2024 04:58:26 +0000 Subject: f2fs: Remove calls to folio_file_mapping() All folios that f2fs sees belong to f2fs and not to the swapcache so it can dereference folio->mapping directly like all other filesystems do. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3253f45508e8..eb3b2f1326b1 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1322,12 +1322,11 @@ DECLARE_EVENT_CLASS(f2fs__folio, ), TP_fast_assign( - __entry->dev = folio_file_mapping(folio)->host->i_sb->s_dev; - __entry->ino = folio_file_mapping(folio)->host->i_ino; + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->ino = folio->mapping->host->i_ino; __entry->type = type; - __entry->dir = - S_ISDIR(folio_file_mapping(folio)->host->i_mode); - __entry->index = folio_index(folio); + __entry->dir = S_ISDIR(folio->mapping->host->i_mode); + __entry->index = folio->index; __entry->dirty = folio_test_dirty(folio); __entry->uptodate = folio_test_uptodate(folio); ), -- cgit v1.2.3 From 828fd3f1d611cff5accb9fbff75f6bb011d8c9e2 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Dec 2024 09:32:17 +0100 Subject: Input: davinci-keyscan - remove leftover header The corresponding driver was removed two years ago but the platform data header was left behind. Remove it now. 
Fixes: 3c9cb34939fb ("input: remove davinci keyboard driver") Signed-off-by: Bartosz Golaszewski Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241216083218.22926-1-brgl@bgdev.pl Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/keyscan-davinci.h | 29 --------------------------- 1 file changed, 29 deletions(-) delete mode 100644 include/linux/platform_data/keyscan-davinci.h (limited to 'include') diff --git a/include/linux/platform_data/keyscan-davinci.h b/include/linux/platform_data/keyscan-davinci.h deleted file mode 100644 index 260d596ba0af..000000000000 --- a/include/linux/platform_data/keyscan-davinci.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2009 Texas Instruments, Inc - * - * Author: Miguel Aguilar - */ - -#ifndef DAVINCI_KEYSCAN_H -#define DAVINCI_KEYSCAN_H - -#include - -enum davinci_matrix_types { - DAVINCI_KEYSCAN_MATRIX_4X4, - DAVINCI_KEYSCAN_MATRIX_5X3, -}; - -struct davinci_ks_platform_data { - int (*device_enable)(struct device *dev); - unsigned short *keymap; - u32 keymapsize; - u8 rep:1; - u8 strobe; - u8 interval; - u8 matrix_type; -}; - -#endif - -- cgit v1.2.3 From 65c8c78cc74d5bcbc43f1f785a004796a2d78360 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 12 Dec 2024 21:13:10 +0100 Subject: thermal/thresholds: Fix uapi header macros leading to a compilation error The macros giving the direction of the crossing thresholds use the BIT macro which is not exported to userspace. Consequently, when a userspace program includes the header, it fails to compile. Replace the macros with their literal values to allow the compilation of userspace programs using this header. Fixes: 445936f9e258 ("thermal: core: Add user thresholds support") Signed-off-by: Daniel Lezcano Link: https://patch.msgid.link/20241212201311.4143196-1-daniel.lezcano@linaro.org [ rjw: Add Fixes: ] Signed-off-by: Rafael J. Wysocki --- include/uapi/linux/thermal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index ba8604bdf206..349718c271eb 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -3,8 +3,8 @@ #define _UAPI_LINUX_THERMAL_H #define THERMAL_NAME_LENGTH 20 -#define THERMAL_THRESHOLD_WAY_UP BIT(0) -#define THERMAL_THRESHOLD_WAY_DOWN BIT(1) +#define THERMAL_THRESHOLD_WAY_UP 0x1 +#define THERMAL_THRESHOLD_WAY_DOWN 0x2 enum thermal_device_mode { THERMAL_DEVICE_DISABLED = 0, -- cgit v1.2.3 From 1e7381f3617d14b3c11da80ff5f8a93ab14cfc46 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Oct 2024 08:04:50 -0700 Subject: KVM: Explicitly verify target vCPU is online in kvm_get_vcpu() Explicitly verify the target vCPU is fully online _prior_ to clamping the index in kvm_get_vcpu(). If the index is "bad", the nospec clamping will generate '0', i.e. KVM will return vCPU0 instead of NULL. In practice, the bug is unlikely to cause problems, as it will only come into play if userspace or the guest is buggy or misbehaving, e.g. KVM may send interrupts to vCPU0 instead of dropping them on the floor. However, returning vCPU0 when it shouldn't exist per online_vcpus is problematic now that KVM uses an xarray for the vCPUs array, as KVM needs to insert into the xarray before publishing the vCPU to userspace (see commit c5b077549136 ("KVM: Convert the kvm->vcpus array to a xarray")), i.e. before vCPU creation is guaranteed to succeed.
As a result, incorrectly providing access to vCPU0 will trigger a use-after-free if vCPU0 is dereferenced and kvm_vm_ioctl_create_vcpu() bails out of vCPU creation due to an error and frees vCPU0. Commit afb2acb2e3a3 ("KVM: Fix vcpu_array[0] races") papered over that issue, but in doing so introduced an unsolvable teardown conundrum. Preventing accesses to vCPU0 before it's fully online will allow reverting commit afb2acb2e3a3, without re-introducing the vcpu_array[0] UAF race. Fixes: 1d487e9bf8ba ("KVM: fix spectrev1 gadgets") Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Michal Luczaj Reviewed-by: Pankaj Gupta Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241009150455.1057573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 401439bb21e3..b0b38744c4b0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -963,6 +963,15 @@ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { int num_vcpus = atomic_read(&kvm->online_vcpus); + + /* + * Explicitly verify the target vCPU is online, as the anti-speculation + * logic only limits the CPU's ability to speculate, e.g. given a "bad" + * index, clamping the index to 0 would return vCPU0, not NULL. + */ + if (i >= num_vcpus) + return NULL; + i = array_index_nospec(i, num_vcpus); /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu. */ -- cgit v1.2.3 From 0664dc74e9d004c36b4400081811df795169809a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 9 Oct 2024 08:04:51 -0700 Subject: KVM: Verify there's at least one online vCPU when iterating over all vCPUs Explicitly check that there is at least online vCPU before iterating over all vCPUs. Because the max index is an unsigned long, passing "0 - 1" in the online_vcpus==0 case results in xa_for_each_range() using an unlimited max, i.e. allows it to access vCPU0 when it shouldn't. This will allow KVM to safely _erase_ from vcpu_array if the last stages of vCPU creation fail, i.e. without generating a use-after-free if a different task happens to be concurrently iterating over all vCPUs. Note, because xa_for_each_range() is a macro, kvm_for_each_vcpu() subtly reloads online_vcpus after each iteration, i.e. adding an extra load doesn't meaningfully impact the total cost of iterating over all vCPUs. And because online_vcpus is never decremented, there is no risk of a reload triggering a walk of the entire xarray. 
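As a plain illustration of the wraparound the new check guards against (ordinary userspace C, purely for demonstration and not part of the patch):

    #include <stdio.h>

    int main(void)
    {
            int online_vcpus = 0;                   /* no vCPUs created yet */
            unsigned long max = online_vcpus - 1;   /* wraps to ULONG_MAX */

            /* prints a huge number, i.e. "walk the whole xarray" */
            printf("upper bound = %lu\n", max);
            return 0;
    }
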
Cc: Will Deacon Cc: Michal Luczaj Acked-by: Will Deacon Link: https://lore.kernel.org/r/20241009150455.1057573-3-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b0b38744c4b0..92f192bec07b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -979,9 +979,10 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) return xa_load(&kvm->vcpu_array, i); } -#define kvm_for_each_vcpu(idx, vcpup, kvm) \ - xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ - (atomic_read(&kvm->online_vcpus) - 1)) +#define kvm_for_each_vcpu(idx, vcpup, kvm) \ + if (atomic_read(&kvm->online_vcpus)) \ + xa_for_each_range(&kvm->vcpu_array, idx, vcpup, 0, \ + (atomic_read(&kvm->online_vcpus) - 1)) static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) { -- cgit v1.2.3 From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reading memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy(). 
We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZE_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260..e4ce1cae03bf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. 
+ */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy. */ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* -- cgit v1.2.3 From 3a3f61ce5e0b4bcf730acc09c1af91012d241f85 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Nov 2024 20:06:55 -0800 Subject: exec: Make sure task->comm is always NUL-terminated Using strscpy() meant that the final character in task->comm may be non-NUL for a moment before the "string too long" truncation happens. Instead of adding a new use of the ambiguous strncpy(), we'd want to use memtostr_pad() which enforces being able to check at compile time that sizes are sensible, but this requires being able to see string buffer lengths. Instead of trying to inline __set_task_comm() (which needs to call trace and perf functions), just open-code it. But to make sure we're always safe, add compile-time checking like we already do for get_task_comm(). Suggested-by: Linus Torvalds Suggested-by: "Eric W. Biederman" Signed-off-by: Kees Cook --- include/linux/sched.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index e6ee4258169a..ac9f429ddc17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1932,11 +1932,10 @@ static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); - -static inline void set_task_comm(struct task_struct *tsk, const char *from) -{ - __set_task_comm(tsk, from, false); -} +#define set_task_comm(tsk, from) ({ \ + BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ + __set_task_comm(tsk, from, false); \ +}) extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ -- cgit v1.2.3 From 543841d1806029889c2f69f040e88b247aba8e22 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 21 Nov 2024 07:07:05 -0800 Subject: exec: fix up /proc/pid/comm in the execveat(AT_EMPTY_PATH) case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zbigniew mentioned at Linux Plumber's that systemd is interested in switching to execveat() for service execution, but can't, because the contents of /proc/pid/comm are the file descriptor which was used, instead of the path to the binary[1]. This makes the output of tools like top and ps useless, especially in a world where most fds are opened CLOEXEC so the number is truly meaningless. When the filename passed in is empty (e.g. 
with AT_EMPTY_PATH), use the dentry's filename for "comm" instead of using the useless numeral from the synthetic fdpath construction. This way the actual exec machinery is unchanged, but cosmetically the comm looks reasonable to admins investigating things. Instead of adding TASK_COMM_LEN more bytes to bprm, use one of the unused flag bits to indicate that we need to set "comm" from the dentry. Suggested-by: Zbigniew Jędrzejewski-Szmek Suggested-by: Tycho Andersen Suggested-by: Al Viro Suggested-by: Linus Torvalds Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec [1] Reviewed-by: Aleksa Sarai Tested-by: Zbigniew Jędrzejewski-Szmek Signed-off-by: Kees Cook --- include/linux/binfmts.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..3305c849abd6 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,9 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* Set when "comm" must come from the dentry. */ + comm_from_dentry:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; -- cgit v1.2.3 From 5637797add2af632a5d037044ab1b0b35643902e Mon Sep 17 00:00:00 2001 From: Ashutosh Dixit Date: Thu, 12 Dec 2024 14:49:03 -0800 Subject: drm/xe/oa/uapi: Expose an unblock after N reports OA property Expose an "unblock after N reports" OA property, to allow userspace threads to be woken up less frequently. Co-developed-by: Umesh Nerlige Ramappa Signed-off-by: Umesh Nerlige Ramappa Signed-off-by: Ashutosh Dixit Reviewed-by: Jonathan Cavitt Reviewed-by: Umesh Nerlige Ramappa Link: https://patchwork.freedesktop.org/patch/msgid/20241212224903.1853862-1-ashutosh.dixit@intel.com --- include/uapi/drm/xe_drm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h index 0383b52cbd86..f62689ca861a 100644 --- a/include/uapi/drm/xe_drm.h +++ b/include/uapi/drm/xe_drm.h @@ -1487,6 +1487,7 @@ struct drm_xe_oa_unit { #define DRM_XE_OA_CAPS_BASE (1 << 0) #define DRM_XE_OA_CAPS_SYNCS (1 << 1) #define DRM_XE_OA_CAPS_OA_BUFFER_SIZE (1 << 2) +#define DRM_XE_OA_CAPS_WAIT_NUM_REPORTS (1 << 3) /** @oa_timestamp_freq: OA timestamp freq */ __u64 oa_timestamp_freq; @@ -1660,6 +1661,12 @@ enum drm_xe_oa_property_id { * buffer is allocated by default. */ DRM_XE_OA_PROPERTY_OA_BUFFER_SIZE, + + /** + * @DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS: Number of reports to wait + * for before unblocking poll or read + */ + DRM_XE_OA_PROPERTY_WAIT_NUM_REPORTS, }; /** -- cgit v1.2.3 From a32f3e9d1ed146f81162702605d65447a319eb76 Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:55 +0100 Subject: sock: support SO_PRIORITY cmsg The Linux socket API currently allows setting SO_PRIORITY at the socket level, applying a uniform priority to all packets sent through that socket. The exception to this is IP_TOS, when the priority value is calculated during the handling of ancillary data, as implemented in commit f02db315b8d8 ("ipv4: IP_TOS and IP_TTL can be specified as ancillary data"). However, this is a computed value, and there is currently no mechanism to set a custom priority via control messages prior to this patch. 
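With this change the priority can instead be attached per sendmsg() call. A minimal userspace sketch of the intended usage (illustration only, not part of the patch; error handling omitted, the address setup and priority value are made up):

    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>

    /* Send one datagram with a per-message priority via a SO_PRIORITY cmsg. */
    static ssize_t send_with_priority(int fd, const void *buf, size_t len,
                                      const struct sockaddr_in *dst, uint32_t prio)
    {
            char cbuf[CMSG_SPACE(sizeof(prio))] = { 0 };
            struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
            struct msghdr msg = {
                    .msg_name       = (void *)dst,
                    .msg_namelen    = sizeof(*dst),
                    .msg_iov        = &iov,
                    .msg_iovlen     = 1,
                    .msg_control    = cbuf,
                    .msg_controllen = sizeof(cbuf),
            };
            struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

            cmsg->cmsg_level = SOL_SOCKET;
            cmsg->cmsg_type  = SO_PRIORITY;
            cmsg->cmsg_len   = CMSG_LEN(sizeof(prio));
            memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));

            return sendmsg(fd, &msg, 0);
    }
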
According to this patch, if SO_PRIORITY is specified as ancillary data, the packet is sent with the priority value set through sockc->priority, overriding the socket-level values set via the traditional setsockopt() method. This is analogous to the existing support for SO_MARK, as implemented in commit c6af0c227a22 ("ip: support SO_MARK cmsg"). If both cmsg SO_PRIORITY and IP_TOS are passed, then the one that takes precedence is the last one in the cmsg list. This patch has the side effect that raw_send_hdrinc now interprets cmsg IP_TOS. Reviewed-by: Willem de Bruijn Suggested-by: Ferenc Fejes Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-3-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 2 +- include/net/ip.h | 2 +- include/net/sock.h | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 56d8bc5593d3..3ccbad881d74 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -172,7 +172,7 @@ struct inet_cork { u8 tx_flags; __u8 ttl; __s16 tos; - char priority; + u32 priority; __u16 gso_size; u32 ts_opt_id; u64 transmit_time; diff --git a/include/net/ip.h b/include/net/ip.h index 0e548c1f2a0e..9f5e33e371fc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -81,7 +81,6 @@ struct ipcm_cookie { __u8 protocol; __u8 ttl; __s16 tos; - char priority; __u16 gso_size; }; @@ -96,6 +95,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, ipcm_init(ipcm); ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); + ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..316a34d6c48b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1814,13 +1814,15 @@ struct sockcm_cookie { u32 mark; u32 tsflags; u32 ts_opt_id; + u32 priority; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { - .tsflags = READ_ONCE(sk->sk_tsflags) + .tsflags = READ_ONCE(sk->sk_tsflags), + .priority = READ_ONCE(sk->sk_priority), }; } -- cgit v1.2.3 From e45469e594b255ef8d750ed5576698743450d2ac Mon Sep 17 00:00:00 2001 From: Anna Emese Nyiri Date: Fri, 13 Dec 2024 09:44:57 +0100 Subject: sock: Introduce SO_RCVPRIORITY socket option Add new socket option, SO_RCVPRIORITY, to include SO_PRIORITY in the ancillary data returned by recvmsg(). This is analogous to the existing support for SO_RCVMARK, as implemented in commit 6fd1d51cfa253 ("net: SO_RCVMARK socket option for SO_MARK with recvmsg()"). 
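A hedged sketch of the receive side follows (illustration only: SO_RCVPRIORITY and its value 82 come from this patch's uapi change; the assumption that the data itself arrives as a SOL_SOCKET/SO_PRIORITY cmsg mirrors how SO_RCVMARK delivers SO_MARK and should be checked against the final implementation):

    #include <stdint.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    #ifndef SO_RCVPRIORITY
    #define SO_RCVPRIORITY 82       /* value added by this patch */
    #endif

    /* Opt in, then pull the sender's priority out of the ancillary data. */
    static int recv_with_priority(int fd, void *buf, size_t len, uint32_t *prio)
    {
            int one = 1;
            char cbuf[CMSG_SPACE(sizeof(*prio))];
            struct iovec iov = { .iov_base = buf, .iov_len = len };
            struct msghdr msg = {
                    .msg_iov        = &iov,
                    .msg_iovlen     = 1,
                    .msg_control    = cbuf,
                    .msg_controllen = sizeof(cbuf),
            };
            struct cmsghdr *cmsg;
            ssize_t n;

            setsockopt(fd, SOL_SOCKET, SO_RCVPRIORITY, &one, sizeof(one));

            n = recvmsg(fd, &msg, 0);
            if (n < 0)
                    return -1;

            for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
                    if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SO_PRIORITY)
                            memcpy(prio, CMSG_DATA(cmsg), sizeof(*prio));

            return (int)n;
    }
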
Reviewed-by: Willem de Bruijn Suggested-by: Ferenc Fejes Signed-off-by: Anna Emese Nyiri Link: https://patch.msgid.link/20241213084457.45120-5-annaemesenyiri@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 +++- include/uapi/asm-generic/socket.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 316a34d6c48b..d4bdd3286e03 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -953,6 +953,7 @@ enum sock_flags { SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ + SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -2660,7 +2661,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK)) + (1UL << SOCK_RCVMARK) |\ + (1UL << SOCK_RCVPRIORITY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index deacfd6dd197..aa5016ff3d91 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -143,6 +143,8 @@ #define SCM_TS_OPT_ID 81 +#define SO_RCVPRIORITY 82 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) -- cgit v1.2.3 From 1c9eb9e684c606982c5dba019c58eaed5474c3c0 Mon Sep 17 00:00:00 2001 From: Dharma Balasubiramani Date: Fri, 6 Dec 2024 12:59:52 -0700 Subject: dt-bindings: clock: Add SAMA7D65 PMC compatible string Add the `microchip,sama7d65-pmc` compatible string to the existing binding, since the SAMA7D65 PMC shares the same properties and clock requirements as the SAMA7G5. Export MCK3 and MCK5 to be accessed and referenced in DT to assign to the clocks property for sama7d65 SoC. Signed-off-by: Dharma Balasubiramani Signed-off-by: Ryan Wanner Reviewed-by: Claudiu Beznea Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/5252a28531deaee67af1edd8e72d45ca57783464.1733505542.git.Ryan.Wanner@microchip.com [claudiu.beznea: use tabs instead of spaces in include/dt-bindings/clock/at91.h] Signed-off-by: Claudiu Beznea --- include/dt-bindings/clock/at91.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/at91.h b/include/dt-bindings/clock/at91.h index 99f4767ff6bb..f2a7b7d39c0d 100644 --- a/include/dt-bindings/clock/at91.h +++ b/include/dt-bindings/clock/at91.h @@ -42,6 +42,10 @@ #define PMC_PLLADIV2 (PMC_MAIN + 11) #define PMC_LVDSPLL (PMC_MAIN + 12) +/* SAMA7D65 */ +#define PMC_MCK3 (PMC_MAIN + 13) +#define PMC_MCK5 (PMC_MAIN + 14) + #ifndef AT91_PMC_MOSCS #define AT91_PMC_MOSCS 0 /* MOSCS Flag */ #define AT91_PMC_LOCKA 1 /* PLLA Lock */ -- cgit v1.2.3 From c220e216d6bcd52cc7333e38edf43dc66ba0dd13 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:04 +0100 Subject: exportfs: add permission method This allows filesystems such as pidfs to provide their custom permission checks. 
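A hedged sketch of how a filesystem might wire this up (the "examplefs" name, its policy, and the 0 / -EPERM return convention are assumptions for illustration, not taken from the patch):

    #include <linux/exportfs.h>
    #include <linux/fs.h>

    static int examplefs_handle_permission(struct handle_to_path_ctx *ctx,
                                           unsigned int oflags)
    {
            /* refuse subtree-checked decoding and any request to open for write */
            if (ctx->flags & HANDLE_CHECK_SUBTREE)
                    return -EPERM;
            if ((oflags & O_ACCMODE) != O_RDONLY)
                    return -EPERM;
            return 0;
    }

    static const struct export_operations examplefs_export_ops = {
            .permission     = examplefs_handle_permission,
            /* fh_to_dentry and the other methods omitted */
    };
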
Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-5-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index c69b79b64466..a087606ace19 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -3,6 +3,7 @@ #define LINUX_EXPORTFS_H 1 #include +#include struct dentry; struct iattr; @@ -10,7 +11,6 @@ struct inode; struct iomap; struct super_block; struct vfsmount; -struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -157,6 +157,17 @@ struct fid { }; }; +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ @@ -226,6 +237,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * permission: + * Allow filesystems to specify a custom permission function. + * * open: * Allow filesystems to specify a custom open function. * @@ -255,6 +269,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ -- cgit v1.2.3 From 16ecd47cb0cd895c7c2f5dd5db50f6c005c51639 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 14 Dec 2024 22:01:28 +0100 Subject: pidfs: lookup pid through rbtree The new pid inode number allocation scheme is neat but I overlooked a possible, even though unlikely, attack that can be used to trigger an overflow on both 32bit and 64bit. An unique 64 bit identifier was constructed for each struct pid by two combining a 32 bit idr with a 32 bit generation number. A 32bit number was allocated using the idr_alloc_cyclic() infrastructure. When the idr wrapped around a 32 bit wraparound counter was incremented. The 32 bit wraparound counter served as the upper 32 bits and the allocated idr number as the lower 32 bits. Since the idr can only allocate up to INT_MAX entries everytime a wraparound happens INT_MAX - 1 entries are lost (Ignoring that numbering always starts at 2 to avoid theoretical collisions with the root inode number.). If userspace fully populates the idr such that and puts itself into control of two entries such that one entry is somewhere in the middle and the other entry is the INT_MAX entry then it is possible to overflow the wraparound counter. That is probably difficult to pull off but the mere possibility is annoying. The problem could be contained to 32 bit by switching to a data structure such as the maple tree that allows allocating 64 bit numbers on 64 bit machines. That would leave 32 bit in a lurch but that probably doesn't matter that much. 
The other problem is that removing entries form the maple tree is somewhat non-trivial because the removal code can be called under the irq write lock of tasklist_lock and irq{save,restore} code. Instead, allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles avoiding to leak actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to lookup struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit sserspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. Link: https://lore.kernel.org/r/20241214-gekoppelt-erdarbeiten-a1f9a982a5a6@brauner Signed-off-by: Christian Brauner --- include/linux/pid.h | 2 ++ include/linux/pidfs.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..fe575fcdb4af 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -59,6 +59,7 @@ struct pid spinlock_t lock; struct dentry *stashed; u64 ino; + struct rb_node pidfs_node; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -68,6 +69,7 @@ struct pid struct upid numbers[]; }; +extern seqcount_spinlock_t pidmap_lock_seq; extern struct pid init_struct_pid; struct file; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 2958652bb108..df574d6708d4 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,7 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); -int pidfs_add_pid(struct pid *pid); +void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From 2102773c60787d97ce93e8aa6890b74f166edd35 Mon Sep 17 00:00:00 2001 From: Raviteja Laggyshetty Date: Wed, 4 Dec 2024 13:26:05 -0800 Subject: dt-bindings: interconnect: add interconnect bindings for SM8750 Add interconnect device bindings. These devices can be used to describe any RPMh and NoC based interconnect devices. 
Signed-off-by: Raviteja Laggyshetty Signed-off-by: Melody Olvera Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241204-sm8750_master_interconnects-v3-1-3d9aad4200e9@quicinc.com Signed-off-by: Georgi Djakov --- .../dt-bindings/interconnect/qcom,sm8750-rpmh.h | 143 +++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 include/dt-bindings/interconnect/qcom,sm8750-rpmh.h (limited to 'include') diff --git a/include/dt-bindings/interconnect/qcom,sm8750-rpmh.h b/include/dt-bindings/interconnect/qcom,sm8750-rpmh.h new file mode 100644 index 000000000000..30563952a646 --- /dev/null +++ b/include/dt-bindings/interconnect/qcom,sm8750-rpmh.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_SM8750_H +#define __DT_BINDINGS_INTERCONNECT_QCOM_SM8750_H + +#define MASTER_QSPI_0 0 +#define MASTER_QUP_1 1 +#define MASTER_QUP_3 2 +#define MASTER_SDCC_4 3 +#define MASTER_UFS_MEM 4 +#define MASTER_USB3_0 5 +#define SLAVE_A1NOC_SNOC 6 + +#define MASTER_QDSS_BAM 0 +#define MASTER_QUP_2 1 +#define MASTER_CRYPTO 2 +#define MASTER_IPA 3 +#define MASTER_SOCCP_AGGR_NOC 4 +#define MASTER_SP 5 +#define MASTER_QDSS_ETR 6 +#define MASTER_QDSS_ETR_1 7 +#define MASTER_SDCC_2 8 +#define SLAVE_A2NOC_SNOC 9 + +#define MASTER_QUP_CORE_0 0 +#define MASTER_QUP_CORE_1 1 +#define MASTER_QUP_CORE_2 2 +#define SLAVE_QUP_CORE_0 3 +#define SLAVE_QUP_CORE_1 4 +#define SLAVE_QUP_CORE_2 5 + +#define MASTER_CNOC_CFG 0 +#define SLAVE_AHB2PHY_SOUTH 1 +#define SLAVE_AHB2PHY_NORTH 2 +#define SLAVE_CAMERA_CFG 3 +#define SLAVE_CLK_CTL 4 +#define SLAVE_CRYPTO_0_CFG 5 +#define SLAVE_DISPLAY_CFG 6 +#define SLAVE_EVA_CFG 7 +#define SLAVE_GFX3D_CFG 8 +#define SLAVE_I2C 9 +#define SLAVE_I3C_IBI0_CFG 10 +#define SLAVE_I3C_IBI1_CFG 11 +#define SLAVE_IMEM_CFG 12 +#define SLAVE_CNOC_MSS 13 +#define SLAVE_PCIE_CFG 14 +#define SLAVE_PRNG 15 +#define SLAVE_QDSS_CFG 16 +#define SLAVE_QSPI_0 17 +#define SLAVE_QUP_3 18 +#define SLAVE_QUP_1 19 +#define SLAVE_QUP_2 20 +#define SLAVE_SDCC_2 21 +#define SLAVE_SDCC_4 22 +#define SLAVE_SPSS_CFG 23 +#define SLAVE_TCSR 24 +#define SLAVE_TLMM 25 +#define SLAVE_UFS_MEM_CFG 26 +#define SLAVE_USB3_0 27 +#define SLAVE_VENUS_CFG 28 +#define SLAVE_VSENSE_CTRL_CFG 29 +#define SLAVE_CNOC_MNOC_CFG 30 +#define SLAVE_PCIE_ANOC_CFG 31 +#define SLAVE_QDSS_STM 32 +#define SLAVE_TCU 33 + +#define MASTER_GEM_NOC_CNOC 0 +#define MASTER_GEM_NOC_PCIE_SNOC 1 +#define SLAVE_AOSS 2 +#define SLAVE_IPA_CFG 3 +#define SLAVE_IPC_ROUTER_CFG 4 +#define SLAVE_SOCCP 5 +#define SLAVE_TME_CFG 6 +#define SLAVE_APPSS 7 +#define SLAVE_CNOC_CFG 8 +#define SLAVE_DDRSS_CFG 9 +#define SLAVE_BOOT_IMEM 10 +#define SLAVE_IMEM 11 +#define SLAVE_BOOT_IMEM_2 12 +#define SLAVE_SERVICE_CNOC 13 +#define SLAVE_PCIE_0 14 + +#define MASTER_GPU_TCU 0 +#define MASTER_SYS_TCU 1 +#define MASTER_APPSS_PROC 2 +#define MASTER_GFX3D 3 +#define MASTER_LPASS_GEM_NOC 4 +#define MASTER_MSS_PROC 5 +#define MASTER_MNOC_HF_MEM_NOC 6 +#define MASTER_MNOC_SF_MEM_NOC 7 +#define MASTER_COMPUTE_NOC 8 +#define MASTER_ANOC_PCIE_GEM_NOC 9 +#define MASTER_SNOC_SF_MEM_NOC 10 +#define MASTER_UBWC_P 11 +#define MASTER_GIC 12 +#define SLAVE_UBWC_P 13 +#define SLAVE_GEM_NOC_CNOC 14 +#define SLAVE_LLCC 15 +#define SLAVE_MEM_NOC_PCIE_SNOC 16 + +#define MASTER_LPIAON_NOC 0 +#define SLAVE_LPASS_GEM_NOC 1 + +#define MASTER_LPASS_LPINOC 0 +#define SLAVE_LPIAON_NOC_LPASS_AG_NOC 1 + +#define 
MASTER_LPASS_PROC 0 +#define SLAVE_LPICX_NOC_LPIAON_NOC 1 + +#define MASTER_LLCC 0 +#define SLAVE_EBI1 1 + +#define MASTER_CAMNOC_HF 0 +#define MASTER_CAMNOC_NRT_ICP_SF 1 +#define MASTER_CAMNOC_RT_CDM_SF 2 +#define MASTER_CAMNOC_SF 3 +#define MASTER_MDP 4 +#define MASTER_CDSP_HCP 5 +#define MASTER_VIDEO_CV_PROC 6 +#define MASTER_VIDEO_EVA 7 +#define MASTER_VIDEO_MVP 8 +#define MASTER_VIDEO_V_PROC 9 +#define MASTER_CNOC_MNOC_CFG 10 +#define SLAVE_MNOC_HF_MEM_NOC 11 +#define SLAVE_MNOC_SF_MEM_NOC 12 +#define SLAVE_SERVICE_MNOC 13 + +#define MASTER_CDSP_PROC 0 +#define SLAVE_CDSP_MEM_NOC 1 + +#define MASTER_PCIE_ANOC_CFG 0 +#define MASTER_PCIE_0 1 +#define SLAVE_ANOC_PCIE_GEM_NOC 2 +#define SLAVE_SERVICE_PCIE_ANOC 3 + +#define MASTER_A1NOC_SNOC 0 +#define MASTER_A2NOC_SNOC 1 +#define SLAVE_SNOC_GEM_NOC_SF 2 + +#endif -- cgit v1.2.3 From fa52c04daec9ff9820260901a8b1d271bb532d12 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 7 Nov 2024 12:47:05 +0100 Subject: mfd: core: Make platform_data pointer const in struct mfd_cell The content of the platform_data of a struct mfd_cell is simply passed on to the platform_device_add_data() call in mfd_add_device() . platform_device_add_data() already handles the data behind that pointer as const and also uses kmemdup to create a copy of the data before handing that copy over to the newly created platform-device, so there is no reason to not extend this to struct mfd_cell, as the old copy in the mfd_cell will be stale anyway. This allows to pass structs gathered from of_device_get_match_data() as platform-data to sub-devices - which is retrieved as const already. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241107114712.538976-3-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mfd/core.h b/include/linux/mfd/core.h index e8bcad641d8c..faeea7abd688 100644 --- a/include/linux/mfd/core.h +++ b/include/linux/mfd/core.h @@ -72,7 +72,7 @@ struct mfd_cell { int (*resume)(struct platform_device *dev); /* platform data passed to the sub devices drivers */ - void *platform_data; + const void *platform_data; size_t pdata_size; /* Matches ACPI */ -- cgit v1.2.3 From 998f70d1806bb718a7565f350283e4a79c8cbb4b Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 7 Nov 2024 12:47:07 +0100 Subject: mfd: Add base driver for qnap-mcu devices These microcontroller units are used in network-attached-storage devices made by QNAP and provide additional functionality to the system. This adds the base driver that implements the serial protocol via serdev and additionally hooks into the poweroff handlers to turn off the parts of the system not supplied by the general PMIC. Turning off (at least the TSx33 devices using Rockchip SoCs) consists of two separate actions. Turning off the MCU alone does not turn off the main SoC and turning off only the SoC/PMIC does not turn off the hard-drives. Also if the MCU is not turned off, the system also won't start again until it is unplugged from power. So on shutdown the MCU needs to be turned off separately before the main PMIC. The protocol spoken by the MCU is sadly not documented, but was obtained by listening to the chatter on the serial port, as thankfully the "hal_app" program from QNAPs firmware allows triggering all/most MCU actions from the command line. The implementation of how to talk to the serial device got some inspiration from the rave-sp servdev driver. 
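A hedged sketch of how a sub-device (cell) driver might use the exported helpers declared below (the command byte is purely hypothetical, since the real MCU protocol is undocumented):

    #include <linux/types.h>
    #include <linux/mfd/qnap-mcu.h>

    /* Hypothetical example: ask the MCU for a short status reply. */
    static int example_read_status(struct qnap_mcu *mcu, u8 *reply, size_t len)
    {
            const u8 cmd[] = { 0x56 };      /* made-up command byte */

            return qnap_mcu_exec(mcu, cmd, sizeof(cmd), reply, len);
    }
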
Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20241107114712.538976-5-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/qnap-mcu.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/mfd/qnap-mcu.h (limited to 'include') diff --git a/include/linux/mfd/qnap-mcu.h b/include/linux/mfd/qnap-mcu.h new file mode 100644 index 000000000000..8d48c212fd44 --- /dev/null +++ b/include/linux/mfd/qnap-mcu.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Core definitions for QNAP MCU MFD driver. + * Copyright (C) 2024 Heiko Stuebner + */ + +#ifndef _LINUX_QNAP_MCU_H_ +#define _LINUX_QNAP_MCU_H_ + +struct qnap_mcu; + +struct qnap_mcu_variant { + u32 baud_rate; + int num_drives; + int fan_pwm_min; + int fan_pwm_max; + bool usb_led; +}; + +int qnap_mcu_exec(struct qnap_mcu *mcu, + const u8 *cmd_data, size_t cmd_data_size, + u8 *reply_data, size_t reply_data_size); +int qnap_mcu_exec_with_ack(struct qnap_mcu *mcu, + const u8 *cmd_data, size_t cmd_data_size); + +#endif /* _LINUX_QNAP_MCU_H_ */ -- cgit v1.2.3 From c925bb8853dae5cb25e7108298e905b55301bbff Mon Sep 17 00:00:00 2001 From: Marcus Folkesson Date: Tue, 10 Dec 2024 16:24:40 +0100 Subject: mfd: da9052: Store result from fault_log Other sub-components (da9052-wdt) could use the result to determine reboot cause. Expose the result by make it part of the da9052 structure. Signed-off-by: Marcus Folkesson Link: https://lore.kernel.org/r/20241210-da9052-wdt-v2-1-95a5756e9ac8@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/da9052/da9052.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/da9052/da9052.h b/include/linux/mfd/da9052/da9052.h index 76feb3a7066d..9cb2fc2938ce 100644 --- a/include/linux/mfd/da9052/da9052.h +++ b/include/linux/mfd/da9052/da9052.h @@ -93,6 +93,8 @@ struct da9052 { int chip_irq; + int fault_log; + /* SOC I/O transfer related fixes for DA9052/53 */ int (*fix_io) (struct da9052 *da9052, unsigned char reg); }; -- cgit v1.2.3 From c2b148f3bc94b61e885dc8529d6b6136576bd865 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Wed, 11 Dec 2024 17:27:16 +0100 Subject: mfd: Add support for AAEON UP board FPGA The UP boards implement some features (pin controller, LEDs) through an on-board FPGA. This MFD driver implements the line protocol to communicate with the FPGA through regmap, and registers pin controller and led cells. This commit adds support for UP and UP Squared boards. Based on the work done by Gary Wang . Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20241211-aaeon-up-board-pinctrl-support-v1-1-24719be27631@bootlin.com Signed-off-by: Lee Jones --- include/linux/mfd/upboard-fpga.h | 55 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 include/linux/mfd/upboard-fpga.h (limited to 'include') diff --git a/include/linux/mfd/upboard-fpga.h b/include/linux/mfd/upboard-fpga.h new file mode 100644 index 000000000000..12231e40f5da --- /dev/null +++ b/include/linux/mfd/upboard-fpga.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * UP Board CPLD/FPGA driver + * + * Copyright (c) AAEON. All rights reserved. 
+ * Copyright (C) 2024 Bootlin + * + * Author: Gary Wang + * Author: Thomas Richard + * + */ + +#ifndef __LINUX_MFD_UPBOARD_FPGA_H +#define __LINUX_MFD_UPBOARD_FPGA_H + +#define UPBOARD_REGISTER_SIZE 16 + +enum upboard_fpgareg { + UPBOARD_REG_PLATFORM_ID = 0x10, + UPBOARD_REG_FIRMWARE_ID = 0x11, + UPBOARD_REG_FUNC_EN0 = 0x20, + UPBOARD_REG_FUNC_EN1 = 0x21, + UPBOARD_REG_GPIO_EN0 = 0x30, + UPBOARD_REG_GPIO_EN1 = 0x31, + UPBOARD_REG_GPIO_EN2 = 0x32, + UPBOARD_REG_GPIO_DIR0 = 0x40, + UPBOARD_REG_GPIO_DIR1 = 0x41, + UPBOARD_REG_GPIO_DIR2 = 0x42, + UPBOARD_REG_MAX, +}; + +enum upboard_fpga_type { + UPBOARD_UP_FPGA, + UPBOARD_UP2_FPGA, +}; + +struct upboard_fpga_data { + enum upboard_fpga_type type; + const struct regmap_config *regmap_config; +}; + +struct upboard_fpga { + struct device *dev; + struct regmap *regmap; + struct gpio_desc *enable_gpio; + struct gpio_desc *reset_gpio; + struct gpio_desc *clear_gpio; + struct gpio_desc *strobe_gpio; + struct gpio_desc *datain_gpio; + struct gpio_desc *dataout_gpio; + unsigned int firmware_version; + const struct upboard_fpga_data *fpga_data; +}; + +#endif /* __LINUX_MFD_UPBOARD_FPGA_H */ -- cgit v1.2.3 From 1c896113f04e34d0036ef506532d2e6cf77dd1e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Sun, 15 Dec 2024 22:13:23 +0100 Subject: turris-omnia-mcu-interface.h: Move macro definitions outside of enums MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the definitions of enumerator related macros outside of the enumerator definitions. Suggested-by: Lee Jones Link: https://lore.kernel.org/linux-leds/20241212183357.GK7139@google.com/ Signed-off-by: Marek Behún Link: https://lore.kernel.org/r/20241215211323.23364-1-kabel@kernel.org Signed-off-by: Lee Jones --- include/linux/turris-omnia-mcu-interface.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/turris-omnia-mcu-interface.h b/include/linux/turris-omnia-mcu-interface.h index 06c94e032c6f..38b45ab00053 100644 --- a/include/linux/turris-omnia-mcu-interface.h +++ b/include/linux/turris-omnia-mcu-interface.h @@ -241,16 +241,18 @@ enum omnia_int_e { enum omnia_cmd_led_mode_e { OMNIA_CMD_LED_MODE_LED_MASK = GENMASK(3, 0), -#define OMNIA_CMD_LED_MODE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_MODE_LED_MASK, _l) OMNIA_CMD_LED_MODE_USER = BIT(4), }; +#define OMNIA_CMD_LED_MODE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_MODE_LED_MASK, _l) + enum omnia_cmd_led_state_e { OMNIA_CMD_LED_STATE_LED_MASK = GENMASK(3, 0), -#define OMNIA_CMD_LED_STATE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_STATE_LED_MASK, _l) OMNIA_CMD_LED_STATE_ON = BIT(4), }; +#define OMNIA_CMD_LED_STATE_LED(_l) FIELD_PREP(OMNIA_CMD_LED_STATE_LED_MASK, _l) + enum omnia_cmd_poweroff_e { OMNIA_CMD_POWER_OFF_POWERON_BUTTON = BIT(0), OMNIA_CMD_POWER_OFF_MAGIC = 0xdead, -- cgit v1.2.3 From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complaints about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358 
(&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268..fd4cdb0860a2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; -- cgit v1.2.3 From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time a event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. 
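As a concrete illustration (event and field names are made up), this is the risky shape being described: only the pointer lands in the ring buffer, and the "%s" dereference happens much later, when the trace file is read:

    TRACE_EVENT(example_name_lookup,
            TP_PROTO(const char *name),
            TP_ARGS(name),
            TP_STRUCT__entry(
                    __field(const char *, name)     /* only the pointer is saved */
            ),
            TP_fast_assign(
                    __entry->name = name;
            ),
            TP_printk("name=%s", __entry->name)     /* read-time dereference */
    );
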
To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But to know if it is a pointer to a static string or to something on the heap can not be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produces a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the events fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. 
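For contrast with the risky example above, the long-standing safe pattern copies the string into the event itself, so nothing has to be dereferenced at read time (hedged sketch with made-up names; __assign_str() takes a single argument on current kernels):

    TRACE_EVENT(example_name_lookup_safe,
            TP_PROTO(const char *name),
            TP_ARGS(name),
            TP_STRUCT__entry(
                    __string(name, name)            /* reserves space in the event */
            ),
            TP_fast_assign(
                    __assign_str(name);             /* copies the string now */
            ),
            TP_printk("name=%s", __get_str(name))
    );
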
Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc..91b8ffbdfa8c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. + * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) -- cgit v1.2.3 From 915d2f0718a42ee0b334be34cc53664a865a5928 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 16:55:32 -0800 Subject: KVM: Move KVM_REG_SIZE() definition to common uAPI header Define KVM_REG_SIZE() in the common kvm.h header, and delete the arm64 and RISC-V versions. As evidenced by the surrounding definitions, all aspects of the register size encoding are generic, i.e. RISC-V should have moved arm64's definition to common code instead of copy+pasting. Acked-by: Anup Patel Reviewed-by: Andrew Jones Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20241128005547.4077116-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 502ea63b5d2e..343de0a51797 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1070,6 +1070,10 @@ struct kvm_dirty_tlb { #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL + +#define KVM_REG_SIZE(id) \ + (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) + #define KVM_REG_SIZE_U8 0x0000000000000000ULL #define KVM_REG_SIZE_U16 0x0010000000000000ULL #define KVM_REG_SIZE_U32 0x0020000000000000ULL -- cgit v1.2.3 From 346947223bacf96155f603528823a60b18b92d9a Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 16 Dec 2024 15:15:31 +0100 Subject: devlink: add devlink_fmsg_put() macro Add devlink_fmsg_put() that dispatches based on the type of the value to put, example: bool -> devlink_fmsg_bool_pair_put(). 
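A short usage sketch of the new macro (names and values are illustrative; in practice the fmsg would come from a devlink health reporter diagnose/dump callback):

    #include <net/devlink.h>

    static void example_fill_fmsg(struct devlink_fmsg *fmsg, bool link_up,
                                  u32 queue, const char *reason)
    {
            devlink_fmsg_put(fmsg, "link-up", link_up); /* -> devlink_fmsg_bool_pair_put() */
            devlink_fmsg_put(fmsg, "queue", queue);     /* -> devlink_fmsg_u32_pair_put() */
            devlink_fmsg_put(fmsg, "reason", reason);   /* -> devlink_fmsg_string_pair_put() */
    }
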
Reviewed-by: Wojciech Drewek Reviewed-by: Simon Horman Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- include/net/devlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index fbb9a2668e24..b5e1427ea4d7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1261,6 +1261,17 @@ enum devlink_trap_group_generic_id { .min_burst = _min_burst, \ } +#define devlink_fmsg_put(fmsg, name, value) ( \ + _Generic((value), \ + bool : devlink_fmsg_bool_pair_put, \ + u8 : devlink_fmsg_u8_pair_put, \ + u16 : devlink_fmsg_u32_pair_put, \ + u32 : devlink_fmsg_u32_pair_put, \ + u64 : devlink_fmsg_u64_pair_put, \ + char * : devlink_fmsg_string_pair_put, \ + const char * : devlink_fmsg_string_pair_put) \ + (fmsg, name, (value))) + enum { /* device supports reload operations */ DEVLINK_F_RELOAD = 1UL << 0, -- cgit v1.2.3 From 3dbfde7f6bc7b8efff26e3e98fdd8cba20287da7 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Mon, 16 Dec 2024 15:15:32 +0100 Subject: devlink: add devlink_fmsg_dump_skb() function Add devlink_fmsg_dump_skb() function that adds some diagnostic information about skb (like length, pkt type, MAC, etc) to devlink fmsg mechanism using bunch of devlink_fmsg_put() function calls. Signed-off-by: Mateusz Polchlopek Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Przemek Kitszel Signed-off-by: Tony Nguyen --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index b5e1427ea4d7..58e33959c852 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1268,6 +1268,7 @@ enum devlink_trap_group_generic_id { u16 : devlink_fmsg_u32_pair_put, \ u32 : devlink_fmsg_u32_pair_put, \ u64 : devlink_fmsg_u64_pair_put, \ + int : devlink_fmsg_u32_pair_put, \ char * : devlink_fmsg_string_pair_put, \ const char * : devlink_fmsg_string_pair_put) \ (fmsg, name, (value))) @@ -2005,6 +2006,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb); #else -- cgit v1.2.3 From 7c4b497fd4032935676b9024396f187fee005739 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 17 Dec 2024 18:41:54 +0100 Subject: clk: davinci: remove platform data struct There are no board files using struct davinci_pll_platform_data anymore. The structure itself is currently used to store a single pointer. Let's remove the struct definition, the header and rework the driver to not require the syscon regmap to be stored in probe(). 
Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20241217174154.84441-1-brgl@bgdev.pl Reviewed-by: David Lechner Signed-off-by: Stephen Boyd --- include/linux/platform_data/clk-davinci-pll.h | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 include/linux/platform_data/clk-davinci-pll.h (limited to 'include') diff --git a/include/linux/platform_data/clk-davinci-pll.h b/include/linux/platform_data/clk-davinci-pll.h deleted file mode 100644 index e55dab1d578b..000000000000 --- a/include/linux/platform_data/clk-davinci-pll.h +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * PLL clock driver for TI Davinci SoCs - * - * Copyright (C) 2018 David Lechner - */ - -#ifndef __LINUX_PLATFORM_DATA_CLK_DAVINCI_PLL_H__ -#define __LINUX_PLATFORM_DATA_CLK_DAVINCI_PLL_H__ - -#include - -/** - * davinci_pll_platform_data - * @cfgchip: CFGCHIP syscon regmap - */ -struct davinci_pll_platform_data { - struct regmap *cfgchip; -}; - -#endif /* __LINUX_PLATFORM_DATA_CLK_DAVINCI_PLL_H__ */ -- cgit v1.2.3 From d5af79c05e9382d38b8546dc5362381ce07ba3d1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 9 Dec 2024 16:00:41 -0800 Subject: Documentation: move dev-tools debugging files to process/debugging/ Move gdb and kgdb debugging documentation to the dedicated debugging directory (Documentation/process/debugging/). Adjust the index.rst files to follow the file movement. Adjust files that refer to these moved files to follow the file movement. Update location of kgdb.rst in MAINTAINERS file. Add a link from dev-tools/index to process/debugging/index. Note: translations are not updated. Signed-off-by: Randy Dunlap Cc: Sebastian Fricke Cc: Jonathan Corbet Cc: workflows@vger.kernel.org Cc: Jason Wessel Cc: Daniel Thompson Cc: Douglas Anderson Cc: linux-debuggers@vger.kernel.org Cc: kgdb-bugreport@lists.sourceforge.net Cc: Doug Anderson Cc: Alex Shi Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: linux-serial@vger.kernel.org Acked-by: Greg Kroah-Hartman Acked-by: Daniel Thompson Reviewed-by: Douglas Anderson Signed-off-by: Jonathan Corbet Link: https://lore.kernel.org/r/20241210000041.305477-1-rdunlap@infradead.org --- include/linux/tty_driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index dd4b31ce6d5d..d4cdc089f6c3 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -320,7 +320,7 @@ struct serial_struct; * * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` * - * kgdboc support (Documentation/dev-tools/kgdb.rst). This routine is + * kgdboc support (Documentation/process/debugging/kgdb.rst). This routine is * called to initialize the HW for later use by calling @poll_get_char or * @poll_put_char. * -- cgit v1.2.3 From 85b08180df07b9a5984b15ae31d76b904d42a115 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 Dec 2024 10:51:29 -0800 Subject: x86/cpu: Expose only stepping min/max interface The x86_match_cpu() infrastructure can match CPU steppings. Since there are only 16 possible steppings, the matching infrastructure goes all out and stores the stepping match as a bitmap. That means it can match any possible steppings in a single list entry. Fun. But it exposes this bitmap to each of the X86_MATCH_*() helpers when none of them really need a bitmap. 
It makes up for this by exporting a helper (X86_STEPPINGS()) which converts a contiguous stepping range into the bitmap which every single user leverages. Instead of a bitmap, have the main helper for this sort of thing (X86_MATCH_VFM_STEPS()) just take a stepping range. This ends up actually being even more compact than before. Leave the helper in place (renamed to __X86_STEPPINGS()) to make it more clear what is going on instead of just having a random GENMASK() in the middle of an already complicated macro. One oddity that I hit was this macro: X86_MATCH_VFM_STEPS(vfm, X86_STEPPING_MIN, max_stepping, issues) It *could* have been converted over to take a min/max stepping value for each entry. But that would have been a bit too verbose and would prevent the one oddball in the list (INTEL_COMETLAKE_L stepping 0) from sticking out. Instead, just have it take a *maximum* stepping and imply that the match is from 0=>max_stepping. This is functional for all the cases now and also retains the nice property of having INTEL_COMETLAKE_L stepping 0 stick out like a sore thumb. skx_cpuids[] is goofy. It uses the stepping match but encodes all possible steppings. Just use a normal, non-stepping match helper. Suggested-by: Ingo Molnar Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241213185129.65527B2A%40davehans-spike.ostc.intel.com --- include/linux/mod_devicetable.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4338b1b4ac44..d67614f7b7f1 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -700,6 +700,8 @@ struct x86_cpu_id { #define X86_FAMILY_ANY 0 #define X86_MODEL_ANY 0 #define X86_STEPPING_ANY 0 +#define X86_STEP_MIN 0 +#define X86_STEP_MAX 0xf #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ /* -- cgit v1.2.3 From d3c9510dc900e9ff3ea330189c0465c9f00fba18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 15 Dec 2024 13:29:38 -0800 Subject: net: page_pool: rename page_pool_is_last_ref() page_pool_is_last_ref() releases a reference while the name, to me at least, suggests it just checks if the refcount is 1. The semantics of the function are the same as those of atomic_dec_and_test() and refcount_dec_and_test(), so just use the _and_test() suffix. Reviewed-by: Alexander Lobakin Reviewed-by: Ilias Apalodimas Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20241215212938.99210-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index e555921e5233..776a3008ac28 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -306,7 +306,7 @@ static inline void page_pool_ref_page(struct page *page) page_pool_ref_netmem(page_to_netmem(page)); } -static inline bool page_pool_is_last_ref(netmem_ref netmem) +static inline bool page_pool_unref_and_test(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; @@ -321,7 +321,7 @@ static inline void page_pool_put_netmem(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. 
*/ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_ref(netmem)) + if (!page_pool_unref_and_test(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); -- cgit v1.2.3 From 661cd8fc8e9039819ca0c22e0add52b632240a9e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Dec 2024 17:56:26 +0000 Subject: inetpeer: remove create argument of inet_getpeer_v[46]() All callers of inet_getpeer_v4() and inet_getpeer_v6() want to create an inetpeer. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241215175629.1248773-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inetpeer.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 74ff688568a0..6f51f81d6cb1 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -101,25 +101,24 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, - int vif, int create) + int vif) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr, 1); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, - const struct in6_addr *v6daddr, - int create) + const struct in6_addr *v6daddr) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(base, &daddr, create); + return inet_getpeer(base, &daddr, 1); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, -- cgit v1.2.3 From 7a596a50c4a4eab946aec149171c72321b4934aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Dec 2024 17:56:27 +0000 Subject: inetpeer: remove create argument of inet_getpeer() All callers of inet_getpeer() want to create an inetpeer. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241215175629.1248773-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inetpeer.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 6f51f81d6cb1..f475757daafb 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -96,8 +96,7 @@ static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create); + const struct inetpeer_addr *daddr); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, @@ -108,7 +107,7 @@ static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; - return inet_getpeer(base, &daddr, 1); + return inet_getpeer(base, &daddr); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, @@ -118,7 +117,7 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, daddr.a6 = *v6daddr; daddr.family = AF_INET6; - return inet_getpeer(base, &daddr, 1); + return inet_getpeer(base, &daddr); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, -- cgit v1.2.3 From 647b7aad19490a7b90c52c883bda7df299457491 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 4 Dec 2024 04:29:28 -0800 Subject: iommu: Remove the remove_dev_pasid op The iommu drivers that supports PASID have supported attaching pasid to the blocked_domain, hence remove the remove_dev_pasid op from the iommu_ops. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Link: https://lore.kernel.org/r/20241204122928.11987-8-yi.l.liu@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 318d27841130..38c65e92ecd0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -587,9 +587,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains - * @remove_dev_pasid: Remove any translation configurations of a specific - * pasid, so that any DMA transactions with this pasid - * will be blocked by the hardware. * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind * the @dev, as the set of virtualization resources shared/passed * to user space IOMMU instance. And associate it with a nesting @@ -647,8 +644,6 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); - void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain); struct iommufd_viommu *(*viommu_alloc)( struct device *dev, struct iommu_domain *parent_domain, -- cgit v1.2.3 From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. 
Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af8972..78a77a4ae0ea 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) -- cgit v1.2.3 From ebeeee390b6a341770789a50d81e677da9a103d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Dec 2024 13:01:02 +0100 Subject: PM: EM: Move sched domains rebuild function from schedutil to EM Function sugov_eas_rebuild_sd() defined in the schedutil cpufreq governor implements generic functionality that may be useful in other places. In particular, there is a plan to use it in the intel_pstate driver in the future. For this reason, move it from schedutil to the energy model code and rename it to em_rebuild_sched_domains(). This also helps to get rid of some #ifdeffery in schedutil which is a plus. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle --- include/linux/energy_model.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 752e0b297582..78318d49276d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -179,6 +179,7 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int em_dev_update_chip_binning(struct device *dev); int em_update_performance_limits(struct em_perf_domain *pd, unsigned long freq_min_khz, unsigned long freq_max_khz); +void em_rebuild_sched_domains(void); /** * em_pd_get_efficient_state() - Get an efficient performance state from the EM @@ -404,6 +405,7 @@ int em_update_performance_limits(struct em_perf_domain *pd, { return -EINVAL; } +static inline void em_rebuild_sched_domains(void) {} #endif #endif -- cgit v1.2.3 From b317268368546d6401af788648668f82e3ba1bd3 Mon Sep 17 00:00:00 2001 From: Joe Hattori Date: Wed, 18 Dec 2024 13:09:35 +0900 Subject: PM: wakeup: implement devm_device_init_wakeup() helper Some drivers that enable device wakeup fail to properly disable it during their cleanup, which results in a memory leak. To address this, introduce devm_device_init_wakeup(), a managed variant of device_init_wakeup(dev, true). With this managed helper, wakeup functionality will be automatically disabled when the device is released, ensuring a more reliable cleanup process. 
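A minimal usage sketch (hypothetical driver, not part of this patch):

  static int foo_probe(struct platform_device *pdev)
  {
          int ret;

          /* Wakeup is disabled automatically when the device is released. */
          ret = devm_device_init_wakeup(&pdev->dev);
          if (ret)
                  return dev_err_probe(&pdev->dev, ret, "init wakeup failed\n");

          return 0;
  }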
This need for this addition arose during a previous discussion [1]. Link: https://lore.kernel.org/linux-rtc/20241212100403.3799667-1-joe@pf.is.s.u-tokyo.ac.jp/ [1] Suggested-by: Alexandre Belloni Signed-off-by: Joe Hattori Link: https://patch.msgid.link/20241218040935.1921416-1-joe@pf.is.s.u-tokyo.ac.jp Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 222f7530806c..d501c09c60cd 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -240,4 +240,21 @@ static inline int device_init_wakeup(struct device *dev, bool enable) return 0; } +static void device_disable_wakeup(void *dev) +{ + device_init_wakeup(dev, false); +} + +/** + * devm_device_init_wakeup - Resource managed device wakeup initialization. + * @dev: Device to handle. + * + * This function is the devm managed version of device_init_wakeup(dev, true). + */ +static inline int devm_device_init_wakeup(struct device *dev) +{ + device_init_wakeup(dev, true); + return devm_add_action_or_reset(dev, device_disable_wakeup, dev); +} + #endif /* _LINUX_PM_WAKEUP_H */ -- cgit v1.2.3 From e22c369520d0a2a191820cc308f81a860b1b8d47 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 17 Dec 2024 09:55:13 -0800 Subject: KVM: arm64: Add unified helper for reprogramming counters by mask Having separate helpers for enabling/disabling counters provides the wrong abstraction, as the state of each counter needs to be evaluated independently and, in some cases, use a different global enable bit. Collapse the enable/disable accessors into a single, common helper that reconfigures every counter set in @mask, leaving the complexity of determining if an event is actually enabled in kvm_pmu_counter_is_enabled(). 
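As a sketch of the intended caller pattern (the register handling below is assumed for illustration and not shown in this include-only hunk), a trap handler for the set/clear registers can update the shadow register and then hand the whole mask to the single helper:

  if (set)
          __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
  else
          __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;

  kvm_pmu_reprogram_counter_mask(vcpu, val);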
Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241217175513.3658056-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- include/kvm/arm_pmu.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index e61dd7dd2286..147bd3ee4f7b 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -53,8 +53,7 @@ u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); -void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val); -void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val); +void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); @@ -127,8 +126,7 @@ static inline u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) static inline void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_reset(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) {} -static inline void kvm_pmu_disable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} -static inline void kvm_pmu_enable_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} +static inline void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From af5366bea2cb9dfb5da2880e1dff544f87505300 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Nov 2024 17:33:42 -0800 Subject: KVM: x86: Drop the now unused KVM_X86_DISABLE_VALID_EXITS Drop the KVM_X86_DISABLE_VALID_EXITS definition, as it is misleading, and unused in KVM *because* it is misleading. The set of exits that can be disabled is dynamic, i.e. userspace (and KVM) must check KVM's actual capabilities. Suggested-by: Xiaoyao Li Link: https://lore.kernel.org/r/20241128013424.4096668-16-seanjc@google.com Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 343de0a51797..45e6d8fca9b9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -617,10 +617,6 @@ struct kvm_ioeventfd { #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) #define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3) -#define KVM_X86_DISABLE_VALID_EXITS (KVM_X86_DISABLE_EXITS_MWAIT | \ - KVM_X86_DISABLE_EXITS_HLT | \ - KVM_X86_DISABLE_EXITS_PAUSE | \ - KVM_X86_DISABLE_EXITS_CSTATE) /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { -- cgit v1.2.3 From 525f6a2c63e0958c25080e108a0cb7f8a3a23719 Mon Sep 17 00:00:00 2001 From: "Sicelo A. Mhlongo" Date: Mon, 25 Nov 2024 17:12:58 +0200 Subject: bq27xxx: add voltage min design for bq27000 and bq27200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bq27x00 gauges have an EEPROM register which contains the value of the voltage that should be considered to be zero battery capacity. Expose this to userspace using the VOLTAGE_MIN_DESIGN property. Tested on Nokia N900 with bq27200. Signed-off-by: Sicelo A. 
Mhlongo Acked-by: Pali Rohár Link: https://lore.kernel.org/r/20241125151321.45440-1-absicsz@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 5180dc9f1706..6b190639b08e 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -61,6 +61,7 @@ struct bq27xxx_device_info { struct bq27xxx_access_methods bus; struct bq27xxx_reg_cache cache; int charge_design_full; + int voltage_min_design; bool removed; unsigned long last_update; union power_supply_propval last_status; -- cgit v1.2.3 From e10c5cbd1c912c06d887c8a1980ac57e21d87b6f Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 16 Dec 2024 20:18:41 +0100 Subject: PCI: Update code comment on PCI_EXP_LNKCAP_SLS for PCIe r3.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Niklas notes that the code comment on the PCI_EXP_LNKCAP_SLS macro is outdated as it reflects the meaning of the field prior to PCIe r3.0. Update it to avoid confusion. Closes: https://lore.kernel.org/r/70829798889c6d779ca0f6cd3260a765780d1369.camel@kernel.org Link: https://lore.kernel.org/r/6152bd17cbe0876365d5f4624fc317529f4bbc85.1734376438.git.lukas@wunner.de Reported-by: Niklas Schnelle Signed-off-by: Lukas Wunner Signed-off-by: Krzysztof Wilczyński Reviewed-by: Ilpo Järvinen Reviewed-by: Niklas Schnelle --- include/uapi/linux/pci_regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 1601c7ed5fab..02d0ba2999e0 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -533,7 +533,7 @@ #define PCI_EXP_DEVSTA_TRPND 0x0020 /* Transactions Pending */ #define PCI_CAP_EXP_RC_ENDPOINT_SIZEOF_V1 12 /* v1 endpoints without link end here */ #define PCI_EXP_LNKCAP 0x0c /* Link Capabilities */ -#define PCI_EXP_LNKCAP_SLS 0x0000000f /* Supported Link Speeds */ +#define PCI_EXP_LNKCAP_SLS 0x0000000f /* Max Link Speed (prior to PCIe r3.0: Supported Link Speeds) */ #define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001 /* LNKCAP2 SLS Vector bit 0 */ #define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ #define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ -- cgit v1.2.3 From a5874fde3c0884a33ed4145101052318c5e17c74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:16 +0100 Subject: exec: Add a new AT_EXECVE_CHECK flag to execveat(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new AT_EXECVE_CHECK flag to execveat(2) to check if a file would be allowed for execution. The main use case is for script interpreters and dynamic linkers to check execution permission according to the kernel's security policy. Another use case is to add context to access logs e.g., which script (instead of interpreter) accessed a file. As any executable code, scripts could also use this check [1]. This is different from faccessat(2) + X_OK which only checks a subset of access rights (i.e. inode permission and mount options for regular files), but not the full context (e.g. all LSM access checks). The main use case for access(2) is for SUID processes to (partially) check access on behalf of their caller. 
The main use case for execveat(2) + AT_EXECVE_CHECK is to check if a script execution would be allowed, according to all the different restrictions in place. Because the use of AT_EXECVE_CHECK follows the exact kernel semantic as for a real execution, user space gets the same error codes. An interesting point of using execveat(2) instead of openat2(2) is that it decouples the check from the enforcement. Indeed, the security check can be logged (e.g. with audit) without blocking an execution environment not yet ready to enforce a strict security policy. LSMs can control or log execution requests with security_bprm_creds_for_exec(). However, to enforce a consistent and complete access control (e.g. on binary's dependencies) LSMs should restrict file executability, or measure executed files, with security_file_open() by checking file->f_flags & __FMODE_EXEC. Because AT_EXECVE_CHECK is dedicated to user space interpreters, it doesn't make sense for the kernel to parse the checked files, look for interpreters known to the kernel (e.g. ELF, shebang), and return ENOEXEC if the format is unknown. Because of that, security_bprm_check() is never called when AT_EXECVE_CHECK is used. It should be noted that script interpreters cannot directly use execveat(2) (without this new AT_EXECVE_CHECK flag) because this could lead to unexpected behaviors e.g., `python script.sh` could lead to Bash being executed to interpret the script. Unlike the kernel, script interpreters may just interpret the shebang as a simple comment, which should not change for backward compatibility reasons. Because scripts or libraries files might not currently have the executable permission set, or because we might want specific users to be allowed to run arbitrary scripts, the following patch provides a dynamic configuration mechanism with the SECBIT_EXEC_RESTRICT_FILE and SECBIT_EXEC_DENY_INTERACTIVE securebits. This is a redesign of the CLIP OS 4's O_MAYEXEC: https://github.com/clipos-archive/src_platform_clip-patches/blob/f5cb330d6b684752e403b4e41b39f7004d88e561/1901_open_mayexec.patch This patch has been used for more than a decade with customized script interpreters. Some examples can be found here: https://github.com/clipos-archive/clipos4_portage-overlay/search?q=O_MAYEXEC Cc: Al Viro Cc: Christian Brauner Cc: Kees Cook Acked-by: Paul Moore Reviewed-by: Serge Hallyn Reviewed-by: Jeff Xu Tested-by: Jeff Xu Link: https://docs.python.org/3/library/io.html#io.open_code [1] Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-2-mic@digikod.net Signed-off-by: Kees Cook --- include/linux/binfmts.h | 7 ++++++- include/uapi/linux/fcntl.h | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951..8ff0eb3644a1 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,12 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* + * Set by user space to check executability according to the + * caller's environment. 
+ */ + is_check:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6e6907e63bfc..a15ac2fa4b20 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -155,4 +155,8 @@ #define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */ #define AT_HANDLE_CONNECTABLE 0x002 /* Request a connectable file handle */ +/* Flags for execveat2(2). */ +#define AT_EXECVE_CHECK 0x10000 /* Only perform a check if execution + would be allowed. */ + #endif /* _UAPI_LINUX_FCNTL_H */ -- cgit v1.2.3 From a0623b2a1d595341971c189b90a6b06f42cd209d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Thu, 12 Dec 2024 18:42:17 +0100 Subject: security: Add EXEC_RESTRICT_FILE and EXEC_DENY_INTERACTIVE securebits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new SECBIT_EXEC_RESTRICT_FILE, SECBIT_EXEC_DENY_INTERACTIVE, and their *_LOCKED counterparts are designed to be set by processes setting up an execution environment, such as a user session, a container, or a security sandbox. Unlike other securebits, these ones can be set by unprivileged processes. Like seccomp filters or Landlock domains, the securebits are inherited across processes. When SECBIT_EXEC_RESTRICT_FILE is set, programs interpreting code should control executable resources according to execveat(2) + AT_EXECVE_CHECK (see previous commit). When SECBIT_EXEC_DENY_INTERACTIVE is set, a process should deny execution of user interactive commands (which excludes executable regular files). Being able to configure each of these securebits enables system administrators or owner of image containers to gradually validate the related changes and to identify potential issues (e.g. with interpreter or audit logs). It should be noted that unlike other security bits, the SECBIT_EXEC_RESTRICT_FILE and SECBIT_EXEC_DENY_INTERACTIVE bits are dedicated to user space willing to restrict itself. Because of that, they only make sense in the context of a trusted environment (e.g. sandbox, container, user session, full system) where the process changing its behavior (according to these bits) and all its parent processes are trusted. Otherwise, any parent process could just execute its own malicious code (interpreting a script or not), or even enforce a seccomp filter to mask these bits. Such a secure environment can be achieved with an appropriate access control (e.g. mount's noexec option, file access rights, LSM policy) and an enlighten ld.so checking that libraries are allowed for execution e.g., to protect against illegitimate use of LD_PRELOAD. Ptrace restrictions according to these securebits would not make sense because of the processes' trust assumption. Scripts may need some changes to deal with untrusted data (e.g. stdin, environment variables), but that is outside the scope of the kernel. 
See chromeOS's documentation about script execution control and the related threat model: https://www.chromium.org/chromium-os/developer-library/guides/security/noexec-shell-scripts/ Cc: Al Viro Cc: Andy Lutomirski Cc: Christian Brauner Cc: Kees Cook Cc: Paul Moore Reviewed-by: Serge Hallyn Reviewed-by: Jeff Xu Tested-by: Jeff Xu Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-3-mic@digikod.net Signed-off-by: Kees Cook --- include/uapi/linux/securebits.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h index d6d98877ff1a..3fba30dbd68b 100644 --- a/include/uapi/linux/securebits.h +++ b/include/uapi/linux/securebits.h @@ -52,10 +52,32 @@ #define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \ (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED)) +/* See Documentation/userspace-api/check_exec.rst */ +#define SECURE_EXEC_RESTRICT_FILE 8 +#define SECURE_EXEC_RESTRICT_FILE_LOCKED 9 /* make bit-8 immutable */ + +#define SECBIT_EXEC_RESTRICT_FILE (issecure_mask(SECURE_EXEC_RESTRICT_FILE)) +#define SECBIT_EXEC_RESTRICT_FILE_LOCKED \ + (issecure_mask(SECURE_EXEC_RESTRICT_FILE_LOCKED)) + +/* See Documentation/userspace-api/check_exec.rst */ +#define SECURE_EXEC_DENY_INTERACTIVE 10 +#define SECURE_EXEC_DENY_INTERACTIVE_LOCKED 11 /* make bit-10 immutable */ + +#define SECBIT_EXEC_DENY_INTERACTIVE \ + (issecure_mask(SECURE_EXEC_DENY_INTERACTIVE)) +#define SECBIT_EXEC_DENY_INTERACTIVE_LOCKED \ + (issecure_mask(SECURE_EXEC_DENY_INTERACTIVE_LOCKED)) + #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ issecure_mask(SECURE_KEEP_CAPS) | \ - issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) + issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE) | \ + issecure_mask(SECURE_EXEC_RESTRICT_FILE) | \ + issecure_mask(SECURE_EXEC_DENY_INTERACTIVE)) #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) +#define SECURE_ALL_UNPRIVILEGED (issecure_mask(SECURE_EXEC_RESTRICT_FILE) | \ + issecure_mask(SECURE_EXEC_DENY_INTERACTIVE)) + #endif /* _UAPI_LINUX_SECUREBITS_H */ -- cgit v1.2.3 From 95b3cdafd7cb74414070893445a9b731793f7b55 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Thu, 12 Dec 2024 18:42:23 +0100 Subject: ima: instantiate the bprm_creds_for_exec() hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Like direct file execution (e.g. ./script.sh), indirect file execution (e.g. sh script.sh) needs to be measured and appraised. Instantiate the new security_bprm_creds_for_exec() hook to measure and verify the indirect file's integrity. Unlike direct file execution, indirect file execution is optionally enforced by the interpreter. Differentiate kernel and userspace enforced integrity audit messages. 
Co-developed-by: Roberto Sassu Signed-off-by: Roberto Sassu Signed-off-by: Mimi Zohar Tested-by: Stefan Berger Reviewed-by: Mickaël Salaün Signed-off-by: Mickaël Salaün Link: https://lore.kernel.org/r/20241212174223.389435-9-mic@digikod.net Signed-off-by: Kees Cook --- include/uapi/linux/audit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 75e21a135483..d9a069b4a775 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -161,6 +161,7 @@ #define AUDIT_INTEGRITY_RULE 1805 /* policy rule */ #define AUDIT_INTEGRITY_EVM_XATTR 1806 /* New EVM-covered xattr */ #define AUDIT_INTEGRITY_POLICY_RULE 1807 /* IMA policy rules */ +#define AUDIT_INTEGRITY_USERSPACE 1808 /* Userspace enforced data integrity */ #define AUDIT_KERNEL 2000 /* Asynchronous audit record. NOT A REQUEST. */ -- cgit v1.2.3 From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b52..85fe4e6b275c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { -- cgit v1.2.3 From a126061c80d5efb4baef4bcf346094139cd81df6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Dec 2024 13:51:21 +0000 Subject: ptr_ring: do not block hard interrupts in ptr_ring_resize_multiple() Jakub added a lockdep_assert_no_hardirq() check in __page_pool_put_page() to increase test coverage. syzbot found a splat caused by hard irq blocking in ptr_ring_resize_multiple() [1] As current users of ptr_ring_resize_multiple() do not require hard irqs being masked, replace it to only block BH. Rename helpers to better reflect they are safe against BH only. 
- ptr_ring_resize_multiple() to ptr_ring_resize_multiple_bh() - skb_array_resize_multiple() to skb_array_resize_multiple_bh() [1] WARNING: CPU: 1 PID: 9150 at net/core/page_pool.c:709 __page_pool_put_page net/core/page_pool.c:709 [inline] WARNING: CPU: 1 PID: 9150 at net/core/page_pool.c:709 page_pool_put_unrefed_netmem+0x157/0xa40 net/core/page_pool.c:780 Modules linked in: CPU: 1 UID: 0 PID: 9150 Comm: syz.1.1052 Not tainted 6.11.0-rc3-syzkaller-00202-gf8669d7b5f5d #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 RIP: 0010:__page_pool_put_page net/core/page_pool.c:709 [inline] RIP: 0010:page_pool_put_unrefed_netmem+0x157/0xa40 net/core/page_pool.c:780 Code: 74 0e e8 7c aa fb f7 eb 43 e8 75 aa fb f7 eb 3c 65 8b 1d 38 a8 6a 76 31 ff 89 de e8 a3 ae fb f7 85 db 74 0b e8 5a aa fb f7 90 <0f> 0b 90 eb 1d 65 8b 1d 15 a8 6a 76 31 ff 89 de e8 84 ae fb f7 85 RSP: 0018:ffffc9000bda6b58 EFLAGS: 00010083 RAX: ffffffff8997e523 RBX: 0000000000000000 RCX: 0000000000040000 RDX: ffffc9000fbd0000 RSI: 0000000000001842 RDI: 0000000000001843 RBP: 0000000000000000 R08: ffffffff8997df2c R09: 1ffffd40003a000d R10: dffffc0000000000 R11: fffff940003a000e R12: ffffea0001d00040 R13: ffff88802e8a4000 R14: dffffc0000000000 R15: 00000000ffffffff FS: 00007fb7aaf716c0(0000) GS:ffff8880b9300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa15a0d4b72 CR3: 00000000561b0000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tun_ptr_free drivers/net/tun.c:617 [inline] __ptr_ring_swap_queue include/linux/ptr_ring.h:571 [inline] ptr_ring_resize_multiple_noprof include/linux/ptr_ring.h:643 [inline] tun_queue_resize drivers/net/tun.c:3694 [inline] tun_device_event+0xaaf/0x1080 drivers/net/tun.c:3714 notifier_call_chain+0x19f/0x3e0 kernel/notifier.c:93 call_netdevice_notifiers_extack net/core/dev.c:2032 [inline] call_netdevice_notifiers net/core/dev.c:2046 [inline] dev_change_tx_queue_len+0x158/0x2a0 net/core/dev.c:9024 do_setlink+0xff6/0x41f0 net/core/rtnetlink.c:2923 rtnl_setlink+0x40d/0x5a0 net/core/rtnetlink.c:3201 rtnetlink_rcv_msg+0x73f/0xcf0 net/core/rtnetlink.c:6647 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2550 Fixes: ff4e538c8c3e ("page_pool: add a lockdep check for recycling in hardirq") Reported-by: syzbot+f56a5c5eac2b28439810@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/671e10df.050a0220.2b8c0f.01cf.GAE@google.com/T/ Signed-off-by: Eric Dumazet Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/20241217135121.326370-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 21 ++++++++++----------- include/linux/skb_array.h | 17 +++++++++-------- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index fd037c127bb0..551329220e4f 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -615,15 +615,14 @@ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. - * In particular if you consume ring in interrupt or BH context, you must - * disable interrupts/BH when doing so. + * In particular if you consume ring in BH context, you must + * disable BH when doing so. 
*/ -static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, - unsigned int nrings, - int size, - gfp_t gfp, void (*destroy)(void *)) +static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, + unsigned int nrings, + int size, gfp_t gfp, + void (*destroy)(void *)) { - unsigned long flags; void ***queues; int i; @@ -638,12 +637,12 @@ static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, } for (i = 0; i < nrings; ++i) { - spin_lock_irqsave(&(rings[i])->consumer_lock, flags); + spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); - spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags); + spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) @@ -662,8 +661,8 @@ nomem: noqueues: return -ENOMEM; } -#define ptr_ring_resize_multiple(...) \ - alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__)) +#define ptr_ring_resize_multiple_bh(...) \ + alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 926496c9cc9c..bf178238a308 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -199,17 +199,18 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } -static inline int skb_array_resize_multiple_noprof(struct skb_array **rings, - int nrings, unsigned int size, - gfp_t gfp) +static inline int skb_array_resize_multiple_bh_noprof(struct skb_array **rings, + int nrings, + unsigned int size, + gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); - return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings, - nrings, size, gfp, - __skb_array_destroy_skb); + return ptr_ring_resize_multiple_bh_noprof((struct ptr_ring **)rings, + nrings, size, gfp, + __skb_array_destroy_skb); } -#define skb_array_resize_multiple(...) \ - alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__)) +#define skb_array_resize_multiple_bh(...) \ + alloc_hooks(skb_array_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void skb_array_cleanup(struct skb_array *a) { -- cgit v1.2.3 From 31c5629920b82ddf66059f20f79be2bc00c4197b Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 10 Dec 2024 01:06:04 +0100 Subject: mm: add RCU annotation to pte_offset_map(_lock) RCU lock is taken by ___pte_offset_map() unless it returns NULL. Add this information to its inline callers to avoid sparse warning about context imbalance in pte_unmap(). 
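The pairing the annotation documents looks like this (sketch of existing usage, nothing new):

  pte_t *pte = pte_offset_map(pmd, addr);

  if (pte) {
          /* Success: we are inside an RCU read-side section here, so
           * the page table page cannot be freed under us.
           */
          pte_unmap(pte);         /* leaves the RCU read-side section */
  }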
Link: https://lkml.kernel.org/r/20241210000604.700710-1-oss@malat.biz Signed-off-by: Petr Malat Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..3a6ee6a05aa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3010,7 +3010,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -3023,7 +3031,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } -- cgit v1.2.3 From 5c0541e11c16bd2f162e23a22d07c09d58017e5a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:25 -0500 Subject: mm: introduce cpu_icache_is_aliasing() across all architectures In commit eacd0e950dc2 ("ARC: [mm] Lazy D-cache flush (non aliasing VIPT)"), arc adds the need to flush dcache to make icache see the code page change. This also requires special handling for clear_user_(high)page(). Introduce cpu_icache_is_aliasing() to make MM code query special clear_user_(high)page() easier. This will be used by the following commit. Link: https://lkml.kernel.org/r/20241209182326.2955963-1-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Suggested-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Geert Uytterhoeven Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/cacheinfo.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..7ad736538649 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING #define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() #else #include + +#ifndef cpu_icache_is_aliasing +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() +#endif + #endif #endif /* _LINUX_CACHEINFO_H */ -- cgit v1.2.3 From c51a4f11e6d8246590b5e64908c1ed84b33e8ba2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:26 -0500 Subject: mm: use clear_user_(high)page() for arch with special user folio handling Some architectures have special handling after clearing user folios: architectures, which set cpu_dcache_is_aliasing() to true, require flushing dcache; arc, which sets cpu_icache_is_aliasing() to true, changes folio->flags to make icache coherent to dcache. So __GFP_ZERO using only clear_page() is not enough to zero user folios and clear_user_(high)page() must be used. 
Otherwise, user data will be corrupted. Fix it by always clearing user folios with clear_user_(high)page() when cpu_dcache_is_aliasing() is true or cpu_icache_is_aliasing() is true. Rename alloc_zeroed() to user_alloc_needs_zeroing() and invert the logic to clarify its intend. Link: https://lkml.kernel.org/r/20241209182326.2955963-2-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV1hRp_NtR5YnJo=HsfgKQeH91J537Gh4gKk3PFZhSkbA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++++++- include/linux/mm.h | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6e452bd8e7e3..5c6bea81a90e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,7 +224,13 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); + struct folio *folio; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); + if (folio && user_alloc_needs_zeroing()) + clear_user_highpage(&folio->page, vaddr); + + return folio; } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a6ee6a05aa0..338a76ce9083 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4184,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); -- cgit v1.2.3 From 42b2eb69835b0fda797f70eb5b4fc213dbe3a7ea Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Dec 2024 18:33:51 +0000 Subject: mm: convert partially_mapped set/clear operations to be atomic Other page flags in the 2nd page, like PG_hwpoison and PG_anon_exclusive can get modified concurrently. Changes to other page flags might be lost if they are happening at the same time as non-atomic partially_mapped operations. Hence, make partially_mapped operations atomic. 
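On the caller side (mm/huge_memory.c, not visible in this include-only view) the conversion is expected to read roughly:

  -       __folio_set_partially_mapped(folio);
  +       folio_set_partially_mapped(folio);

  -       __folio_clear_partially_mapped(folio);
  +       folio_clear_partially_mapped(folio);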
Link: https://lkml.kernel.org/r/20241212183351.1345389-1-usamaarif642@gmail.com Fixes: 8422acdc97ed ("mm: introduce a pageflag for partially mapped folios") Reported-by: David Hildenbrand Link: https://lore.kernel.org/all/e53b04ad-1827-43a2-a1ab-864c7efecf6e@redhat.com/ Signed-off-by: Usama Arif Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Barry Song Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cf46ac720802..691506bdf2c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -/* - * PG_partially_mapped is protected by deferred_split split_queue_lock, - * so its safe to use non-atomic set/clear. - */ -__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) -FOLIO_TEST_FLAG_FALSE(partially_mapped) -__FOLIO_SET_FLAG_NOOP(partially_mapped) -__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) +FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) -- cgit v1.2.3 From 30c2de0a267c04046d89e678cc0067a9cfb455df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:31:26 -0800 Subject: mm/vmstat: fix a W=1 clang compiler warning Fix the following clang compiler warning that is reported if the kernel is built with W=1: ./include/linux/vmstat.h:518:36: error: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Werror,-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ Link: https://lkml.kernel.org/r/20241212213126.1269116-1-bvanassche@acm.org Fixes: 9d7ea9a297e6 ("mm/vmstat: add helpers to get vmstat item names for each enum type") Signed-off-by: Bart Van Assche Cc: Konstantin Khlebnikov Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2761bf8ff32..9f3a04345b86 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) -- cgit v1.2.3 From 640a603943a7659340c10044c0a1c98ae4e13189 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 13 Dec 2024 09:33:32 +0800 Subject: mm/codetag: clear tags before swap When CONFIG_MEM_ALLOC_PROFILING_DEBUG is set, kernel WARN would be triggered when calling __alloc_tag_ref_set() during swap: alloc_tag was not cleared (got tag for mm/filemap.c:1951) WARNING: CPU: 0 PID: 816 at ./include/linux/alloc_tag.h... 
Clear code tags before swap can fix the warning. And this patch also fix a potential invalid address dereference in alloc_tag_add_check() when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set and ref->ct is CODETAG_EMPTY, which is defined as ((void *)1). Link: https://lkml.kernel.org/r/20241213013332.89910-1-00107082@163.com Fixes: 51f43d5d82ed ("mm/codetag: swap tags when migrate pages") Signed-off-by: David Wang <00107082@163.com> Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412112227.df61ebb-lkp@intel.com Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 7c0786bdf9af..cba024bf2db3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,7 +135,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) { - WARN_ONCE(ref && ref->ct, + WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref), "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); -- cgit v1.2.3 From 60da7445a142bd15e67f3cda915497781c3f781f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:23 -0800 Subject: alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG It was recently noticed that set_codetag_empty() might be used not only to mark NULL alloctag references as empty to avoid warnings but also to reset valid tags (in clear_page_tag_ref()). Since set_codetag_empty() is defined as NOOP for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n, such use of set_codetag_empty() leads to subtle bugs. Fix set_codetag_empty() for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n to reset the tag reference. 
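The kind of caller that relies on this (modelled on clear_page_tag_ref() from the Fixes commit; the surrounding helpers are assumptions for illustration):

  union codetag_ref ref;

  if (get_page_tag_ref(page, &ref, &handle)) {
          set_codetag_empty(&ref);   /* must actually clear the tag, not be a no-op */
          update_page_tag_ref(handle, &ref);
          put_page_tag_ref(handle);
  }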
Link: https://lkml.kernel.org/r/20241130001423.1114965-2-surenb@google.com Fixes: a8fc28dad6d5 ("alloc_tag: introduce clear_page_tag_ref() helper function") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index cba024bf2db3..0bbbe537c5f9 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -static inline void set_codetag_empty(union codetag_ref *ref) {} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = NULL; +} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -- cgit v1.2.3 From 91a7088096a49eb413ca11a9d80bc8ba60695c18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Thu, 21 Nov 2024 14:41:05 +0100 Subject: media: dt-bindings: Add property to describe CSI-2 C-PHY line orders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each data lane on a CSI-2 C-PHY bus uses three phase encoding and is constructed from three physical wires. The wires are referred to as A, B and C and their default order is ABC. However to ease hardware design the specification allows for the wires to be switched in any order. Add a vendor neutral property to describe the line order used. The property name 'line-orders', the possible values it can be assigned and there names are taken from the MIPI Discovery and Configuration (DisCo) Specification for Imaging. Signed-off-by: Niklas Söderlund Reviewed-by: Rob Herring (Arm) Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/dt-bindings/media/video-interfaces.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/media/video-interfaces.h b/include/dt-bindings/media/video-interfaces.h index 68ac4e05e37f..88b9d05d8075 100644 --- a/include/dt-bindings/media/video-interfaces.h +++ b/include/dt-bindings/media/video-interfaces.h @@ -13,4 +13,11 @@ #define MEDIA_BUS_TYPE_PARALLEL 5 #define MEDIA_BUS_TYPE_BT656 6 +#define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_ABC 0 +#define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_ACB 1 +#define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_BAC 2 +#define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_BCA 3 +#define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_CAB 4 +#define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_CBA 5 + #endif /* __DT_BINDINGS_MEDIA_VIDEO_INTERFACES_H__ */ -- cgit v1.2.3 From 573b4adddbd22baf14c5022b8e4b1dd93bac22ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Thu, 21 Nov 2024 14:41:06 +0100 Subject: media: v4l: fwnode: Parse MiPI DisCo for C-PHY line-orders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the fwnode parsing to validate and fill in the CSI-2 C-PHY line-orders order properties as defined in MIPI Discovery and Configuration (DisCo) Specification for Imaging. Signed-off-by: Niklas Söderlund [Sakari Ailus: Use ARRAY_SIZE() instead of an integer.] 
Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-mediabus.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/media/v4l2-mediabus.h b/include/media/v4l2-mediabus.h index 5bce6e423e94..e7f019f68c8d 100644 --- a/include/media/v4l2-mediabus.h +++ b/include/media/v4l2-mediabus.h @@ -73,6 +73,24 @@ #define V4L2_MBUS_CSI2_MAX_DATA_LANES 8 +/** + * enum v4l2_mbus_csi2_cphy_line_orders_type - CSI-2 C-PHY line order + * @V4L2_MBUS_CSI2_CPHY_LINE_ORDER_ABC: C-PHY line order ABC (default) + * @V4L2_MBUS_CSI2_CPHY_LINE_ORDER_ACB: C-PHY line order ACB + * @V4L2_MBUS_CSI2_CPHY_LINE_ORDER_BAC: C-PHY line order BAC + * @V4L2_MBUS_CSI2_CPHY_LINE_ORDER_BCA: C-PHY line order BCA + * @V4L2_MBUS_CSI2_CPHY_LINE_ORDER_CAB: C-PHY line order CAB + * @V4L2_MBUS_CSI2_CPHY_LINE_ORDER_CBA: C-PHY line order CBA + */ +enum v4l2_mbus_csi2_cphy_line_orders_type { + V4L2_MBUS_CSI2_CPHY_LINE_ORDER_ABC, + V4L2_MBUS_CSI2_CPHY_LINE_ORDER_ACB, + V4L2_MBUS_CSI2_CPHY_LINE_ORDER_BAC, + V4L2_MBUS_CSI2_CPHY_LINE_ORDER_BCA, + V4L2_MBUS_CSI2_CPHY_LINE_ORDER_CAB, + V4L2_MBUS_CSI2_CPHY_LINE_ORDER_CBA, +}; + /** * struct v4l2_mbus_config_mipi_csi2 - MIPI CSI-2 data bus configuration * @flags: media bus (V4L2_MBUS_*) flags @@ -81,6 +99,8 @@ * @num_data_lanes: number of data lanes * @lane_polarities: polarity of the lanes. The order is the same of * the physical lanes. + * @line_orders: line order of the data lanes. The order is the same of the + * physical lanes. */ struct v4l2_mbus_config_mipi_csi2 { unsigned int flags; @@ -88,6 +108,7 @@ struct v4l2_mbus_config_mipi_csi2 { unsigned char clock_lane; unsigned char num_data_lanes; bool lane_polarities[1 + V4L2_MBUS_CSI2_MAX_DATA_LANES]; + enum v4l2_mbus_csi2_cphy_line_orders_type line_orders[V4L2_MBUS_CSI2_MAX_DATA_LANES]; }; /** -- cgit v1.2.3 From 08a7ead3242f70f4415232800104ae458fd96d17 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:49 -0800 Subject: mmc: crypto: add mmc_from_crypto_profile() Add a helper function that encapsulates a container_of expression. For now there is just one user but soon there will be more. Signed-off-by: Eric Biggers Message-ID: <20241213041958.202565-7-ebiggers@kernel.org> Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index f166d6611ddb..68f09a955a90 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -590,6 +590,14 @@ static inline struct mmc_host *mmc_from_priv(void *priv) return container_of(priv, struct mmc_host, private); } +#ifdef CONFIG_MMC_CRYPTO +static inline struct mmc_host * +mmc_from_crypto_profile(struct blk_crypto_profile *profile) +{ + return container_of(profile, struct mmc_host, crypto_profile); +} +#endif + #define mmc_host_is_spi(host) ((host)->caps & MMC_CAP_SPI) #define mmc_dev(x) ((x)->parent) -- cgit v1.2.3 From d1d761b3012e99d55d288d435384be606302cb2c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:53 +0200 Subject: net: fib_rules: Add flow label selector attributes Add new FIB rule attributes which will allow user space to match on the IPv6 flow label with a mask. Temporarily set the type of the attributes to 'NLA_REJECT' while support is being added in the IPv6 code. 
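The corresponding policy entries on the net/ side (not visible in this include-only view, shown here as an assumption) would be along the lines of:

  [FRA_FLOWLABEL]         = { .type = NLA_REJECT },
  [FRA_FLOWLABEL_MASK]    = { .type = NLA_REJECT },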
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- include/uapi/linux/fib_rules.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index a6924dd3aff1..00e9890ca3c0 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -68,6 +68,8 @@ enum { FRA_SPORT_RANGE, /* sport */ FRA_DPORT_RANGE, /* dport */ FRA_DSCP, /* dscp */ + FRA_FLOWLABEL, /* flowlabel */ + FRA_FLOWLABEL_MASK, /* flowlabel mask */ __FRA_MAX }; -- cgit v1.2.3 From ba4138032ae3b5b8e2b68d2f2647cdc0817b05a6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:11:58 +0200 Subject: ipv6: Add flow label to route get requests The default IPv6 multipath hash policy takes the flow label into account when calculating a multipath hash and previous patches added a flow label selector to IPv6 FIB rules. Allow user space to specify a flow label in route get requests by adding a new netlink attribute and using its value to populate the "flowlabel" field in the IPv6 flow info structure prior to a route lookup. Deny the attribute in RTM_{NEW,DEL}ROUTE requests by checking for it in rtm_to_fib6_config() and returning an error if present. A subsequent patch will use this capability to test the new flow label selector in IPv6 FIB rules. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index eccc0e7dcb7d..5ee94c511a28 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -393,6 +393,7 @@ enum rtattr_type_t { RTA_SPORT, RTA_DPORT, RTA_NH_ID, + RTA_FLOWLABEL, __RTA_MAX }; -- cgit v1.2.3 From 002bf68a3b3e5f90ce61ea8fd11b8b62fd0765ce Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 16 Dec 2024 19:12:00 +0200 Subject: tracing: ipv6: Add flow label to fib6_table_lookup tracepoint The different parameters affecting the IPv6 route lookup are printed to the trace buffer by the fib6_table_lookup tracepoint. Add the IPv6 flow label for better observability as it can affect the route lookup both in terms of multipath hash calculation and policy based routing (FIB rules). Example: # echo 1 > /sys/kernel/tracing/events/fib6/fib6_table_lookup/enable # ip -6 route get ::1 flowlabel 0x12345 ipproto udp sport 12345 dport 54321 &> /dev/null # cat /sys/kernel/tracing/trace_pipe ip-358 [010] ..... 
44.897484: fib6_table_lookup: table 255 oif 0 iif 1 proto 17 ::/12345 -> ::1/54321 flowlabel 0x12345 tos 0 scope 0 flags 0 ==> dev lo gw :: err 0 Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Guillaume Nault Signed-off-by: Paolo Abeni --- include/trace/events/fib6.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 5d7ee2610728..8d22b2e98d48 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -22,6 +22,7 @@ TRACE_EVENT(fib6_table_lookup, __field( int, err ) __field( int, oif ) __field( int, iif ) + __field( u32, flowlabel ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) @@ -42,6 +43,7 @@ TRACE_EVENT(fib6_table_lookup, __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; + __entry->flowlabel = ntohl(flowi6_get_flowlabel(flp)); __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; @@ -76,11 +78,11 @@ TRACE_EVENT(fib6_table_lookup, } ), - TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", + TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u flowlabel %#x tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, - __entry->tos, __entry->scope, __entry->flags, - __entry->name, __entry->gw, __entry->err) + __entry->flowlabel, __entry->tos, __entry->scope, + __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ -- cgit v1.2.3 From fd265d9e0c3358e6b9fe244d8f5d2824fda1c0dc Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 19 Dec 2024 10:14:07 -0500 Subject: drm: add drm_memory_stats_is_zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper to check whether the memory stats are zero; this will be used to check for memory accounting errors. Signed-off-by: Yunxiang Li Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20241219151411.1150-2-Yunxiang.Li@amd.com Signed-off-by: Christian König --- include/drm/drm_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h index f0ef32e9fa5e..ef817926cddd 100644 --- a/include/drm/drm_file.h +++ b/include/drm/drm_file.h @@ -494,6 +494,7 @@ struct drm_memory_stats { enum drm_gem_object_status; +int drm_memory_stats_is_zero(const struct drm_memory_stats *stats); void drm_print_memory_stats(struct drm_printer *p, const struct drm_memory_stats *stats, enum drm_gem_object_status supported_status, -- cgit v1.2.3 From bebf2ebd70f210a6c8fe5f668dadefb083014217 Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 19 Dec 2024 10:14:08 -0500 Subject: drm: make drm-active- stats optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the memory stats are generated fresh every time by going through all the BOs, their active information is quite easy to get. But if the stats are tracked with the BO's state, this becomes harder since the job scheduling part doesn't really deal with individual buffers. Make drm-active- optional to enable amdgpu to switch to the second method.
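As a sketch of that second method, a driver which tracks buffer state could report the DRM_GEM_OBJECT_ACTIVE bit added below from its status hook; the foo_* names and fields are illustrative, not amdgpu code:

static enum drm_gem_object_status foo_gem_status(struct drm_gem_object *obj)
{
	struct foo_bo *bo = to_foo_bo(obj);
	enum drm_gem_object_status status = 0;

	if (bo->is_resident)
		status |= DRM_GEM_OBJECT_RESIDENT;
	if (bo->active_jobs)		/* maintained at job submit/retire time */
		status |= DRM_GEM_OBJECT_ACTIVE;

	return status;
}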
Signed-off-by: Yunxiang Li Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20241219151411.1150-3-Yunxiang.Li@amd.com Signed-off-by: Christian König --- include/drm/drm_gem.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 5b8b1b059d32..fdae947682cd 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -48,19 +48,21 @@ struct drm_gem_object; * enum drm_gem_object_status - bitmask of object state for fdinfo reporting * @DRM_GEM_OBJECT_RESIDENT: object is resident in memory (ie. not unpinned) * @DRM_GEM_OBJECT_PURGEABLE: object marked as purgeable by userspace + * @DRM_GEM_OBJECT_ACTIVE: object is currently used by an active submission * * Bitmask of status used for fdinfo memory stats, see &drm_gem_object_funcs.status - * and drm_show_fdinfo(). Note that an object can DRM_GEM_OBJECT_PURGEABLE if - * it still active or not resident, in which case drm_show_fdinfo() will not + * and drm_show_fdinfo(). Note that an object can report DRM_GEM_OBJECT_PURGEABLE + * and be active or not resident, in which case drm_show_fdinfo() will not * account for it as purgeable. So drivers do not need to check if the buffer - * is idle and resident to return this bit. (Ie. userspace can mark a buffer - * as purgeable even while it is still busy on the GPU.. it does not _actually_ - * become puregeable until it becomes idle. The status gem object func does - * not need to consider this.) + * is idle and resident to return this bit, i.e. userspace can mark a buffer as + * purgeable even while it is still busy on the GPU. It will not get reported in + * the puregeable stats until it becomes idle. The status gem object func does + * not need to consider this. */ enum drm_gem_object_status { DRM_GEM_OBJECT_RESIDENT = BIT(0), DRM_GEM_OBJECT_PURGEABLE = BIT(1), + DRM_GEM_OBJECT_ACTIVE = BIT(2), }; /** -- cgit v1.2.3 From 79f237bae910e1019f1f7617d7b0b900f717d209 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:15 +0000 Subject: ACPI: bus: change the prototype for acpi_get_physical_device_location It generally is not OK to use acpi_status and/or AE_ error codes without CONFIG_ACPI and they really only should be used in drivers/acpi/ (and not everywhere in there for that matter). So acpi_get_physical_device_location() needs to be redefined to return something different from acpi_status (preferably bool) in order to be used in !CONFIG_ACPI code. Suggested-by: Rafael J. Wysocki Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-1-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpi_bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b2e377b7f337..19f92852e127 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -43,7 +43,7 @@ acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); -acpi_status +bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); bool acpi_has_method(acpi_handle handle, char *name); -- cgit v1.2.3 From e04efd34d9fe36e934e86e61f568c427a999aa5c Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:16 +0000 Subject: ACPI: bus: implement for_each_acpi_dev_match when !ACPI Provide an implementation of for_each_acpi_dev_match that can be used when CONFIG_ACPI is not set. The condition `false && hid && uid && hrv` is used to avoid "variable not used" warnings. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-2-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 19f92852e127..a9b5a5204781 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -1003,6 +1003,9 @@ static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } +#define for_each_acpi_dev_match(adev, hid, uid, hrv) \ + for (adev = NULL; false && (hid) && (uid) && (hrv); ) + #endif /* CONFIG_ACPI */ #endif /*__ACPI_BUS_H__*/ -- cgit v1.2.3 From 34df77363dff6df1e95ee4c1e035de464da4ee4b Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:17 +0000 Subject: ACPI: bus: implement acpi_get_physical_device_location when !ACPI Provide an implementation of acpi_get_physical_device_location that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-3-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpi_bus.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index a9b5a5204781..f38e8b1c8e1f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -43,9 +43,6 @@ acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); -bool -acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); - bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); @@ -60,6 +57,9 @@ bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); #ifdef CONFIG_ACPI +bool +acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); + static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, @@ -1003,6 +1003,12 @@ static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } +static inline bool +acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) +{ + return false; +} + #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = NULL; false && (hid) && (uid) && (hrv); ) -- cgit v1.2.3 From 7d3707bbbbb1dc8b1802886e9ff0506b7d8b323b Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:18 +0000 Subject: ACPI: header: implement acpi_device_handle when !ACPI Provide an implementation of acpi_device_handle that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-4-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6adcd1b92b20..4e495b29c640 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -854,6 +854,11 @@ static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) return NULL; } +static inline acpi_handle acpi_device_handle(struct acpi_device *adev) +{ + return NULL; +} + static inline bool has_acpi_companion(struct device *dev) { return false; -- cgit v1.2.3 From 46d10e5f9c346e6f3aa828895ed8b49cc6d9c0f6 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:19 +0000 Subject: ACPI: bus: implement for_each_acpi_consumer_dev when !ACPI Provide an implementation of for_each_acpi_consumer_dev that can be use used when CONFIG_ACPI is not set. The expression `false && supplier` is used to avoid "variable not used" warnings. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-5-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpi_bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index f38e8b1c8e1f..68c0e2fb029f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -1009,6 +1009,9 @@ acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld return false; } +#define for_each_acpi_consumer_dev(supplier, consumer) \ + for (consumer = NULL; false && (supplier);) + #define for_each_acpi_dev_match(adev, hid, uid, hrv) \ for (adev = NULL; false && (hid) && (uid) && (hrv); ) -- cgit v1.2.3 From 78c3227c5e268d40d0a08fea8b244afeb182610c Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 16 Dec 2024 21:17:20 +0000 Subject: ACPI: bus: implement acpi_device_hid when !ACPI Provide an implementation of acpi_device_hid that can be used when CONFIG_ACPI is not set. Reviewed-by: Sakari Ailus Acked-by: Mauro Carvalho Chehab Signed-off-by: Ricardo Ribalda Link: https://patch.msgid.link/20241216-fix-ipu-v5-6-3d6b35ddce7b@chromium.org Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 68c0e2fb029f..aad1a95e6863 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -1003,6 +1003,11 @@ static inline int unregister_acpi_bus_type(void *bus) { return 0; } static inline int acpi_wait_for_acpi_ipmi(void) { return 0; } +static inline const char *acpi_device_hid(struct acpi_device *device) +{ + return ""; +} + static inline bool acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld) { -- cgit v1.2.3 From b55498ff14bd14860d48dc8d2a0b6889b218c408 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 16 Dec 2024 22:31:18 +0100 Subject: net: phy: add phy_disable_eee If a MAC driver doesn't support EEE, then the PHY shouldn't advertise it. Add phy_disable_eee() for this purpose. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/fd51738c-dcd6-4d61-b8c5-faa6ac0f1026@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index e597a32cc787..5bc71d59910c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2071,6 +2071,7 @@ void phy_advertise_eee_all(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_support_eee(struct phy_device *phydev); +void phy_disable_eee(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); -- cgit v1.2.3 From 29b540795b42a3e610c0d5e9d908a8d6c1333676 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 18 Dec 2024 14:17:16 +0100 Subject: gre: Drop ip_route_output_gre(). We already have enough variants of ip_route_output*() functions. We don't need a GRE specific one in the generic route.h header file. Furthermore, ip_route_output_gre() is only used once, in ipgre_open(), where it can be easily replaced by a simple call to ip_route_output_key(). While there, and for clarity, explicitly set .flowi4_scope to RT_SCOPE_UNIVERSE instead of relying on the implicit zero initialisation. 
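The resulting ipgre_open() path is then just a filled-in flowi4 plus ip_route_output_key(); roughly like the sketch below, where the tunnel parameter names are illustrative rather than the exact hunk:

	struct flowi4 fl4 = {
		.flowi4_oif   = t->parms.link,
		.flowi4_tos   = t->parms.iph.tos,
		.flowi4_scope = RT_SCOPE_UNIVERSE,
		.flowi4_proto = IPPROTO_GRE,
		.daddr        = t->parms.iph.daddr,
		.saddr        = t->parms.iph.saddr,
		.fl4_gre_key  = t->parms.o_key,
	};
	struct rtable *rt = ip_route_output_key(t->net, &fl4);

	if (IS_ERR(rt))
		return -EADDRNOTAVAIL;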
Signed-off-by: Guillaume Nault Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/ab7cba47b8558cd4bfe2dc843c38b622a95ee48e.1734527729.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 84cb1e04f5cd..6947a155d501 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -185,20 +185,6 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi return ip_route_output_flow(net, fl4, sk); } -static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 *fl4, - __be32 daddr, __be32 saddr, - __be32 gre_key, __u8 tos, int oif) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = IPPROTO_GRE; - fl4->fl4_gre_key = gre_key; - return ip_route_output_key(net, fl4); -} - enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, -- cgit v1.2.3 From a19d0236f466f1ce8f44a04a96c302d3023eebf4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:29 +0100 Subject: page_pool: add page_pool_dev_alloc_netmem() Similarly to other _dev shorthands, add one for page_pool_alloc_netmem() to allocate a netmem using the default Rx GFP flags (ATOMIC | NOWARN) to make the page -> netmem transition of drivers easier. Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 776a3008ac28..543f54fa3020 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -144,6 +144,15 @@ static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, return netmem; } +static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmem(pool, offset, size, gfp); +} + static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) -- cgit v1.2.3 From 68ddc8ae17685a8c4ac78260bde8fe4a79511aef Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:30 +0100 Subject: xdp: add generic xdp_buff_add_frag() The code piece which would attach a frag to &xdp_buff is almost identical across the drivers supporting XDP multi-buffer on Rx. Make it a generic elegant "oneliner". Also, I see lots of drivers calculating frags_truesize as `xdp->frame_sz * nr_frags`. I can't say this is fully correct, since frags might be backed by chunks of different sizes, especially with stuff like the header split. Even page_pool_alloc() can give you two different truesizes on two subsequent requests to allocate the same buffer size. Add a field to &skb_shared_info (unionized as there's no free slot currently on x86_64) to track the "true" truesize. It can be used later when updating the skb. 
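With the helper added below in place, a driver Rx loop can attach each fragment with one call; schematically (the rx_buf fields are placeholders for whatever the driver tracks per buffer):

	if (!xdp_buff_add_frag(xdp, rx_buf->netmem, rx_buf->offset,
			       frag_len, rx_buf->truesize)) {
		/* shared info is full: drop the whole multi-buffer frame */
		xdp_return_buff(xdp);
		return NULL;
	}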
Reviewed-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-3-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 ++++++--- include/net/xdp.h | 96 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b2509cd0b930..bb2b751d274a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -608,11 +608,19 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; - unsigned int xdp_frags_size; - /* Intermediate layers must ensure that destructor_arg - * remains valid until skb destructor */ - void * destructor_arg; + union { + struct { + u32 xdp_frags_size; + u32 xdp_frags_truesize; + }; + + /* + * Intermediate layers must ensure that destructor_arg + * remains valid until skb destructor. + */ + void *destructor_arg; + }; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; diff --git a/include/net/xdp.h b/include/net/xdp.h index d2089cfecefd..11139c210b49 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -167,6 +167,93 @@ out: return len; } +void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); + +/** + * __xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * @try_coalesce: whether to try coalescing the frags (not valid for XSk) + * + * Attach frag to the XDP buffer. If it currently has no frags attached, + * initialize the related fields, otherwise check that the frag number + * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing + * the frag with the previous one. + * The function doesn't check/update the pfmemalloc bit. Please use the + * non-underscored wrapper in drivers. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. 
+ */ +static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize, + bool try_coalesce) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *prev; + u32 nr_frags; + + if (!xdp_buff_has_frags(xdp)) { + xdp_buff_set_frags_flag(xdp); + + nr_frags = 0; + sinfo->xdp_frags_size = 0; + sinfo->xdp_frags_truesize = 0; + + goto fill; + } + + nr_frags = sinfo->nr_frags; + prev = &sinfo->frags[nr_frags - 1]; + + if (try_coalesce && netmem == skb_frag_netmem(prev) && + offset == skb_frag_off(prev) + skb_frag_size(prev)) { + skb_frag_size_add(prev, size); + /* Guaranteed to only decrement the refcount */ + xdp_return_frag(netmem, xdp); + } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { + return false; + } else { +fill: + __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, + offset, size); + } + + sinfo->nr_frags = nr_frags; + sinfo->xdp_frags_size += size; + sinfo->xdp_frags_truesize += truesize; + + return true; +} + +/** + * xdp_buff_add_frag - attach frag to &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @netmem: network memory containing the frag + * @offset: offset at which the frag starts + * @size: size of the frag + * @truesize: total memory size occupied by the frag + * + * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, + u32 offset, u32 size, u32 truesize) +{ + if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) + return false; + + if (unlikely(netmem_is_pfmemalloc(netmem))) + xdp_buff_set_frag_pfmemalloc(xdp); + + return true; +} + struct xdp_frame { void *data; u32 len; @@ -230,7 +317,14 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { - skb_shinfo(skb)->nr_frags = nr_frags; + struct skb_shared_info *sinfo = skb_shinfo(skb); + + sinfo->nr_frags = nr_frags; + /* + * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, + * reset it after that these fields aren't used anymore. + */ + sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; -- cgit v1.2.3 From 539c1fba1ac77184215d892eda0857f5687b7366 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:31 +0100 Subject: xdp: add generic xdp_build_skb_from_buff() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code which builds an skb from an &xdp_buff keeps multiplying itself around the drivers with almost no changes. Let's try to stop that by adding a generic function. Unlike __xdp_build_skb_from_frame(), always allocate an skbuff head using napi_build_skb() and make use of the available xdp_rxq pointer to assign the Rx queue index. In case of PP-backed buffer, mark the skb to be recycled, as every PP user's been switched to recycle skbs. 
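A converted driver's XDP_PASS path then reduces to roughly the following sketch (variable names and error handling are illustrative):

	skb = xdp_build_skb_from_buff(&xdp);
	if (unlikely(!skb)) {
		/* allocation failed: drop and recycle the buffer */
		xdp_return_buff(&xdp);
		break;
	}
	napi_gro_receive(napi, skb);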
Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-4-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 11139c210b49..aa24fa78cbe6 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -336,6 +336,7 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, -- cgit v1.2.3 From 51205f841a495c78aa59d0e41683463dac23eb27 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:32 +0100 Subject: xsk: make xsk_buff_add_frag() really add the frag via __xdp_buff_add_frag() Currently, xsk_buff_add_frag() only adds the frag to pool's linked list, not doing anything with the &xdp_buff. The drivers do that manually and the logic is the same. Make it really add an skb frag, just like xdp_buff_add_frag() does that, and freeing frags on error if needed. This allows to remove repeating code from i40e and ice and not add the same code again and again. Acked-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-5-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index f3175a5d28f7..86620c818965 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -136,11 +136,21 @@ out: xp_free(xskb); } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { - struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + const void *data = xdp->data; + struct xdp_buff_xsk *frag; + + if (!__xdp_buff_add_frag(head, virt_to_netmem(data), + offset_in_page(data), xdp->data_end - data, + xdp->frame_sz, false)) + return false; + frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->list_node, &frag->pool->xskb_list); + + return true; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) @@ -357,8 +367,10 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { + return false; } static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) -- cgit v1.2.3 From 560d958c6c68fa62ddb4bd6f890c363598d184b0 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2024 18:44:33 +0100 Subject: xsk: add generic XSk &xdp_buff -> skb conversion Same as with converting &xdp_buff to skb on Rx, the code which allocates a new skb and copies the XSk frame there is identical across the drivers, so make it generic. This includes copying all the frags if they are present in the original buff. 
System percpu page_pools greatly improve XDP_PASS performance on XSk: instead of page_alloc() + page_free(), the net core recycles the same pages, so the only overhead left is memcpy()s. When the Page Pool is not compiled in, the whole function is a return-NULL (but it always gets selected when eBPF is enabled). Note that the passed buff gets freed if the conversion is done w/o any error, assuming you don't need this buffer after you convert it to an skb. Reviewed-by: Maciej Fijalkowski Signed-off-by: Alexander Lobakin Link: https://patch.msgid.link/20241218174435.1445282-6-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index aa24fa78cbe6..6da0e746cf75 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -337,6 +337,7 @@ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, -- cgit v1.2.3 From a430d99e349026d53e2557b7b22bd2ebd61fe12a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Dec 2024 06:32:19 +0000 Subject: sched/fair: Fix value reported by hot tasks pulled in /proc/schedstat In /proc/schedstat, lb_hot_gained reports the number hot tasks pulled during load balance. This value is incremented in can_migrate_task() if the task is migratable and hot. After incrementing the value, load balancer can still decide not to migrate this task leading to wrong accounting. Fix this by incrementing stats when hot tasks are detached. This issue only exists in detach_tasks() where we can decide to not migrate hot task even if it is migratable. However, in detach_one_task(), we migrate it unconditionally. [Swapnil: Handled the case where nr_failed_migrations_hot was not accounted properly and wrote commit log] Fixes: d31980846f96 ("sched: Move up affinity check to mitigate useless redoing overhead") Signed-off-by: Peter Zijlstra (Intel) Reported-by: "Gautham R. Shenoy" Not-yet-signed-off-by: Peter Zijlstra Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-2-swapnil.sapkal@amd.com --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b5916be49f62..8c6a2ed9f80e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -937,6 +937,7 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; + unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; -- cgit v1.2.3 From 3b2a793ea70fd14136b442df31e53935e8095034 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:21 +0000 Subject: sched: Report the different kinds of imbalances in /proc/schedstat In /proc/schedstat, lb_imbalance reports the sum of imbalances discovered in sched domains with each call to sched_balance_rq(), which is not very useful because lb_imbalance does not mention whether the imbalance is due to load, utilization, nr_tasks or misfit_tasks. Remove this field from /proc/schedstat. 
Currently, there is no field in /proc/schedstat to report different types of imbalances. Introduce new fields in /proc/schedstat to report the total imbalances in load, utilization, nr_tasks or misfit_tasks. Added fields to /proc/schedstat: - lb_imbalance_load: Total imbalance due to load. - lb_imbalance_util: Total imbalance due to utilization. - lb_imbalance_task: Total imbalance due to number of tasks. - lb_imbalance_misfit: Total imbalance due to misfit tasks. Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Link: https://lore.kernel.org/r/20241220063224.17767-4-swapnil.sapkal@amd.com --- include/linux/sched/topology.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4237daa5ac7a..76a662e1ec24 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -114,7 +114,10 @@ struct sched_domain { unsigned int lb_count[CPU_MAX_IDLE_TYPES]; unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_load[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; -- cgit v1.2.3 From 1c055a0f5d3bafaca5d218bbb3e4e63d6307be45 Mon Sep 17 00:00:00 2001 From: Swapnil Sapkal Date: Fri, 20 Dec 2024 06:32:22 +0000 Subject: sched: Move sched domain name out of CONFIG_SCHED_DEBUG The /proc/schedstat file shows CPU and sched domain level scheduler statistics. It does not show the domain name; instead it shows the domain level. It will be very useful for tools like `perf sched stats`[1] to aggregate domain level stats if domain names are shown in /proc/schedstat. But the sched domain name is guarded by CONFIG_SCHED_DEBUG. As per the discussion[2], move the sched domain name out of CONFIG_SCHED_DEBUG. [1] https://lore.kernel.org/lkml/20241122084452.1064968-1-swapnil.sapkal@amd.com/ [2] https://lore.kernel.org/lkml/fcefeb4d-3acb-462d-9c9b-3df8d927e522@amd.com/ Suggested-by: "Gautham R.
Shenoy" Signed-off-by: Swapnil Sapkal Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241220063224.17767-5-swapnil.sapkal@amd.com --- include/linux/sched/topology.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 76a662e1ec24..7f3dbafe1817 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -143,9 +143,7 @@ struct sched_domain { unsigned int ttwu_move_affine; unsigned int ttwu_move_balance; #endif -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif union { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ @@ -201,18 +199,12 @@ struct sched_domain_topology_level { int flags; int numa_level; struct sd_data data; -#ifdef CONFIG_SCHED_DEBUG char *name; -#endif }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); -#ifdef CONFIG_SCHED_DEBUG # define SD_INIT_NAME(type) .name = #type -#else -# define SD_INIT_NAME(type) -#endif #else /* CONFIG_SMP */ -- cgit v1.2.3 From abfdccd6af2b071951633e57d6322c46a1ea791f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 16 Dec 2024 20:07:35 -0800 Subject: sched/wake_q: Add helper to call wake_up_q after unlock with preemption disabled A common pattern seen when wake_qs are used to defer a wakeup until after a lock is released is something like: preempt_disable(); raw_spin_unlock(lock); wake_up_q(wake_q); preempt_enable(); So create some raw_spin_unlock*_wake() helper functions to clean this up. Applies on top of the fix I submitted here: https://lore.kernel.org/lkml/20241212222138.2400498-1-jstultz@google.com/ NOTE: I recognise the unlock()/unlock_irq()/unlock_irqrestore() variants creates its own duplication, which we could use a macro to generate the similar functions, but I often dislike how those generation macros making finding the actual implementation harder, so I left the three functions as is. If folks would prefer otherwise, let me know and I'll switch it. 
Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20241217040803.243420-1-jstultz@google.com --- include/linux/sched/wake_q.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 06cd8fb2f409..0f28b4623ad4 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -63,4 +63,38 @@ extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); +/* Spin unlock helpers to unlock and call wake_up_q with preempt disabled */ +static inline +void raw_spin_unlock_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irq_wake(raw_spinlock_t *lock, struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irq(lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} + +static inline +void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock, unsigned long flags, + struct wake_q_head *wake_q) +{ + guard(preempt)(); + raw_spin_unlock_irqrestore(lock, flags); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } +} #endif /* _LINUX_SCHED_WAKE_Q_H */ -- cgit v1.2.3 From 54f89b3178d5448dd4457afbb98fc1ab99090a65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 10 Dec 2024 01:20:38 +0000 Subject: tcp_bpf: Charge receive socket buffer in bpf_tcp_ingress() When bpf_tcp_ingress() is called, the skmsg is being redirected to the ingress of the destination socket. Therefore, we should charge its receive socket buffer, instead of sending socket buffer. Because sk_rmem_schedule() tests pfmemalloc of skb, we need to introduce a wrapper and call it for skmsg. 
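Conceptually, the ingress path then charges the receiver rather than the sender; a simplified sketch (not the exact tcp_bpf.c hunk) of the per-chunk check:

	/* charge the destination socket's receive buffer; there is no skb
	 * backing this skmsg, so pass pfmemalloc == false */
	if (!__sk_rmem_schedule(sk, size, false)) {
		ret = -ENOMEM;
		break;
	}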
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-2-zijianzhang@bytedance.com --- include/net/sock.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..c383126f691d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1527,7 +1527,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1535,7 +1535,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool +sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) -- cgit v1.2.3 From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. 
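Callers that charge sk_rmem_alloc before queueing can then rely on the new bool return value to unwind the charge when the message is rejected; this is only a sketch of the idea, not the exact in-tree accounting:

	u32 len = msg->sg.size;

	atomic_add(len, &sk->sk_rmem_alloc);
	if (!sk_psock_queue_msg(psock, msg))
		/* the helper already freed @msg; just undo the charge */
		atomic_sub(len, &sk->sk_rmem_alloc);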
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) -- cgit v1.2.3 From 724c6ce38bbaeb4b3f109b0e066d6c0ecd15446c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 19 Dec 2024 14:57:34 +0100 Subject: stddef: make __struct_group() UAPI C++-friendly For the most part of the C++ history, it couldn't have type declarations inside anonymous unions for different reasons. At the same time, __struct_group() relies on the latters, so when the @TAG argument is not empty, C++ code doesn't want to build (even under `extern "C"`): ../linux/include/uapi/linux/pkt_cls.h:25:24: error: 'struct tc_u32_sel::::tc_u32_sel_hdr,' invalid; an anonymous union may only have public non-static data members [-fpermissive] The safest way to fix this without trying to switch standards (which is impossible in UAPI anyway) etc., is to disable tag declaration for that language. This won't break anything since for now it's not buildable at all. Use a separate definition for __struct_group() when __cplusplus is defined to mitigate the error, including the version from tools/. Fixes: 50d7bd38c3aa ("stddef: Introduce struct_group() helper macro") Reported-by: Christopher Ferris Closes: https://lore.kernel.org/linux-hardening/Z1HZpe3WE5As8UAz@google.com Suggested-by: Kees Cook # __struct_group_tag() Signed-off-by: Alexander Lobakin Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20241219135734.2130002-1-aleksander.lobakin@intel.com Signed-off-by: Kees Cook --- include/uapi/linux/stddef.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 58154117d9b0..a6fce46aeb37 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -8,6 +8,13 @@ #define __always_inline inline #endif +/* Not all C++ standards support type declarations inside an anonymous union */ +#ifndef __cplusplus +#define __struct_group_tag(TAG) TAG +#else +#define __struct_group_tag(TAG) +#endif + /** * __struct_group() - Create a mirrored named and anonyomous struct * @@ -20,13 +27,13 @@ * and size: one anonymous and one named. The former's members can be used * normally without sub-struct naming, and the latter can be used to * reason about the start, end, and size of the group of struct members. 
- * The named struct can also be explicitly tagged for layer reuse, as well - * as both having struct attributes appended. + * The named struct can also be explicitly tagged for layer reuse (C only), + * as well as both having struct attributes appended. */ #define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ union { \ struct { MEMBERS } ATTRS; \ - struct TAG { MEMBERS } ATTRS NAME; \ + struct __struct_group_tag(TAG) { MEMBERS } ATTRS NAME; \ } ATTRS #ifdef __cplusplus -- cgit v1.2.3 From 208fff3f567e2a3c3e7e4788845e90245c3891b4 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Wed, 18 Dec 2024 14:12:37 +0100 Subject: PCI: Add PCI_VDEVICE_SUB helper macro PCI_VDEVICE_SUB generates the pci_device_id struct layout for the specific PCI device/subdevice. Private data may follow the output. Reviewed-by: Przemek Kitszel Signed-off-by: Piotr Kwapulinski Acked-by: Bjorn Helgaas Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- include/linux/pci.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..414ee5fff66b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1046,6 +1046,20 @@ struct pci_driver { .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, 0, 0 +/** + * PCI_VDEVICE_SUB - describe a specific PCI device/subdevice in a short form + * @vend: the vendor name + * @dev: the 16 bit PCI Device ID + * @subvend: the 16 bit PCI Subvendor ID + * @subdev: the 16 bit PCI Subdevice ID + * + * Generate the pci_device_id struct layout for the specific PCI + * device/subdevice. Private data may follow the output. + */ +#define PCI_VDEVICE_SUB(vend, dev, subvend, subdev) \ + .vendor = PCI_VENDOR_ID_##vend, .device = (dev), \ + .subvendor = (subvend), .subdevice = (subdev), 0, 0 + /** * PCI_DEVICE_DATA - macro used to describe a specific PCI device in very short form * @vend: the vendor name (without PCI_VENDOR_ID_ prefix) -- cgit v1.2.3 From 4acb665cf4f3e5436844f17ece0a8a55ce688c7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:08 +0000 Subject: netfs: Work around recursion by abandoning retry if nothing read syzkaller reported recursion with a loop of three calls (netfs_rreq_assess, netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack during an unbuffered or direct I/O read. There are a number of issues: (1) There is no limit on the number of retries. (2) A subrequest is supposed to be abandoned if it does not transfer anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all circumstances. (3) The actual root cause, which is this: if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_terminated(rreq, ...); When we do a retry, we bump the rreq->nr_outstanding counter to prevent the final cleanup phase running before we've finished dispatching the retries. The problem is if we hit 0, we have to do the cleanup phase - but we're in the cleanup phase and end up repeating the retry cycle, hence the recursion. Work around the problem by limiting the number of retries. This is based on Lizhi Xu's patch[1], and makes the following changes: (1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make the filesystem set it if it managed to read or write at least one byte of data. Clear this bit before issuing a subrequest. (2) Add a ->retry_count member to the subrequest and increment it any time we do a retry. 
(3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with ->retry_count. If the latter is non-zero, we're doing a retry. (4) Abandon a subrequest if retry_count is non-zero and we made no progress. (5) Use ->retry_count in both the write-side and the read-size. [?] Question: Should I set a hard limit on retry_count in both read and write? Say it hits 50, we always abandon it. The problem is that these changes only mitigate the issue. As long as it made at least one byte of progress, the recursion is still an issue. This patch mitigates the problem, but does not fix the underlying cause. I have patches that will do that, but it's an intrusive fix that's currently pending for the next merge window. The oops generated by KASAN looks something like: BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000) Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI ... RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686 ... mark_usage kernel/locking/lockdep.c:4646 [inline] __lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825 local_lock_acquire include/linux/local_lock_internal.h:29 [inline] ___slab_alloc+0x123/0x1880 mm/slub.c:3695 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141 radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253 idr_get_free+0x528/0xa40 lib/radix-tree.c:1506 idr_alloc_u32+0x191/0x2f0 lib/idr.c:46 idr_alloc+0xc1/0x130 lib/idr.c:87 p9_tag_alloc+0x394/0x870 net/9p/client.c:321 p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644 p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793 p9_client_read_once+0x443/0x820 net/9p/client.c:1570 p9_client_read+0x13f/0x1b0 net/9p/client.c:1534 v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74 netfs_retry_read_subrequests fs/netfs/read_retry.c:60 [inline] netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 ... 
netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline] netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline] netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221 netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256 v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361 do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832 vfs_readv+0x4cf/0x890 fs/read_write.c:1025 do_preadv fs/read_write.c:1142 [inline] __do_sys_preadv fs/read_write.c:1192 [inline] __se_sys_preadv fs/read_write.c:1187 [inline] __x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6 Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com Tested-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Suggested-by: Lizhi Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/netfs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6c..4083d77e3f39 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. 
ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { -- cgit v1.2.3 From d4e338de17cb6532bf805fae00db8b41e914009b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:34:45 +0000 Subject: netfs: Fix is-caching check in read-retry netfs: Fix is-caching check in read-retry The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine if there might be failed reads from the cache that need turning into reads from the server, with the intention of skipping the complicated part if it can. The code that set the flag, however, got lost during the read-side rewrite. Fix the check to see if the cache_resources are valid instead. The flag can then be removed. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4083d77e3f39..ecdd5ced16a8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ -- cgit v1.2.3 From d3d3ec86568090cc5a4501b745274114de6881e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:51 +0000 Subject: netfs: Clean up some whitespace in trace header Clean up some whitespace in the netfs trace header. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-2-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 130 +++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index bf511bca896e..c3c309f8fbe1 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -250,13 +250,13 @@ TRACE_EVENT(netfs_read, TP_ARGS(rreq, start, len, what), TP_STRUCT__entry( - __field(unsigned int, rreq ) - __field(unsigned int, cookie ) - __field(loff_t, i_size ) - __field(loff_t, start ) - __field(size_t, len ) - __field(enum netfs_read_trace, what ) - __field(unsigned int, netfs_inode ) + __field(unsigned int, rreq) + __field(unsigned int, cookie) + __field(loff_t, i_size) + __field(loff_t, start) + __field(size_t, len) + __field(enum netfs_read_trace, what) + __field(unsigned int, netfs_inode) ), TP_fast_assign( @@ -284,10 +284,10 @@ TRACE_EVENT(netfs_rreq, TP_ARGS(rreq, what), TP_STRUCT__entry( - __field(unsigned int, rreq ) - __field(unsigned int, flags ) - __field(enum netfs_io_origin, origin ) - __field(enum netfs_rreq_trace, what ) + __field(unsigned int, rreq) + __field(unsigned int, flags) + __field(enum netfs_io_origin, origin) + __field(enum netfs_rreq_trace, what) ), TP_fast_assign( @@ -311,15 +311,15 @@ TRACE_EVENT(netfs_sreq, TP_ARGS(sreq, what), TP_STRUCT__entry( - __field(unsigned int, rreq ) - __field(unsigned short, index ) - __field(short, error ) - __field(unsigned short, flags ) - __field(enum netfs_io_source, source ) - __field(enum netfs_sreq_trace, what ) - __field(size_t, len ) - __field(size_t, transferred ) - __field(loff_t, start ) + __field(unsigned int, rreq) + __field(unsigned short, index) + __field(short, error) + __field(unsigned short, flags) + __field(enum netfs_io_source, source) + __field(enum netfs_sreq_trace, what) + __field(size_t, len) + __field(size_t, transferred) + __field(loff_t, start) ), TP_fast_assign( @@ -351,15 +351,15 @@ TRACE_EVENT(netfs_failure, TP_ARGS(rreq, sreq, error, what), TP_STRUCT__entry( - __field(unsigned int, rreq ) - __field(short, index ) - __field(short, error ) - __field(unsigned short, flags ) - __field(enum netfs_io_source, source ) - __field(enum netfs_failure, what ) - __field(size_t, len ) - __field(size_t, transferred ) - __field(loff_t, start ) + __field(unsigned int, rreq) + __field(short, index) + __field(short, error) + __field(unsigned short, flags) + __field(enum netfs_io_source, source) + __field(enum netfs_failure, what) + __field(size_t, len) + __field(size_t, transferred) + __field(loff_t, start) ), TP_fast_assign( @@ -390,9 +390,9 @@ TRACE_EVENT(netfs_rreq_ref, TP_ARGS(rreq_debug_id, ref, what), TP_STRUCT__entry( - __field(unsigned int, rreq ) - __field(int, ref ) - __field(enum netfs_rreq_ref_trace, what ) + __field(unsigned int, rreq) + __field(int, ref) + __field(enum netfs_rreq_ref_trace, what) ), TP_fast_assign( @@ -414,10 +414,10 @@ TRACE_EVENT(netfs_sreq_ref, TP_ARGS(rreq_debug_id, subreq_debug_index, ref, what), TP_STRUCT__entry( - __field(unsigned int, rreq ) - __field(unsigned int, subreq ) - __field(int, ref ) - __field(enum netfs_sreq_ref_trace, what ) + __field(unsigned int, rreq) + __field(unsigned int, subreq) + __field(int, ref) + __field(enum netfs_sreq_ref_trace, what) ), TP_fast_assign( @@ -465,10 +465,10 @@ 
TRACE_EVENT(netfs_write_iter, TP_ARGS(iocb, from), TP_STRUCT__entry( - __field(unsigned long long, start ) - __field(size_t, len ) - __field(unsigned int, flags ) - __field(unsigned int, ino ) + __field(unsigned long long, start) + __field(size_t, len) + __field(unsigned int, flags) + __field(unsigned int, ino) ), TP_fast_assign( @@ -489,12 +489,12 @@ TRACE_EVENT(netfs_write, TP_ARGS(wreq, what), TP_STRUCT__entry( - __field(unsigned int, wreq ) - __field(unsigned int, cookie ) - __field(unsigned int, ino ) - __field(enum netfs_write_trace, what ) - __field(unsigned long long, start ) - __field(unsigned long long, len ) + __field(unsigned int, wreq) + __field(unsigned int, cookie) + __field(unsigned int, ino) + __field(enum netfs_write_trace, what) + __field(unsigned long long, start) + __field(unsigned long long, len) ), TP_fast_assign( @@ -522,10 +522,10 @@ TRACE_EVENT(netfs_collect, TP_ARGS(wreq), TP_STRUCT__entry( - __field(unsigned int, wreq ) - __field(unsigned int, len ) - __field(unsigned long long, transferred ) - __field(unsigned long long, start ) + __field(unsigned int, wreq) + __field(unsigned int, len) + __field(unsigned long long, transferred) + __field(unsigned long long, start) ), TP_fast_assign( @@ -548,12 +548,12 @@ TRACE_EVENT(netfs_collect_sreq, TP_ARGS(wreq, subreq), TP_STRUCT__entry( - __field(unsigned int, wreq ) - __field(unsigned int, subreq ) - __field(unsigned int, stream ) - __field(unsigned int, len ) - __field(unsigned int, transferred ) - __field(unsigned long long, start ) + __field(unsigned int, wreq) + __field(unsigned int, subreq) + __field(unsigned int, stream) + __field(unsigned int, len) + __field(unsigned int, transferred) + __field(unsigned long long, start) ), TP_fast_assign( @@ -579,11 +579,11 @@ TRACE_EVENT(netfs_collect_folio, TP_ARGS(wreq, folio, fend, collected_to), TP_STRUCT__entry( - __field(unsigned int, wreq ) - __field(unsigned long, index ) - __field(unsigned long long, fend ) - __field(unsigned long long, cleaned_to ) - __field(unsigned long long, collected_to ) + __field(unsigned int, wreq) + __field(unsigned long, index) + __field(unsigned long long, fend) + __field(unsigned long long, cleaned_to) + __field(unsigned long long, collected_to) ), TP_fast_assign( @@ -608,10 +608,10 @@ TRACE_EVENT(netfs_collect_state, TP_ARGS(wreq, collected_to, notes), TP_STRUCT__entry( - __field(unsigned int, wreq ) - __field(unsigned int, notes ) - __field(unsigned long long, collected_to ) - __field(unsigned long long, cleaned_to ) + __field(unsigned int, wreq) + __field(unsigned int, notes) + __field(unsigned long long, collected_to) + __field(unsigned long long, cleaned_to) ), TP_fast_assign( -- cgit v1.2.3 From 2a8a384621c35849bbc068e25e41cbe23243fa40 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:52 +0000 Subject: cachefiles: Clean up some whitespace in trace header Clean up some whitespace in the cachefiles trace header. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-3-dhowells@redhat.com cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/cachefiles.h | 172 +++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 7d931db02b93..74114c261bcd 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -223,10 +223,10 @@ TRACE_EVENT(cachefiles_ref, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, cookie ) - __field(enum cachefiles_obj_ref_trace, why ) - __field(int, usage ) + __field(unsigned int, obj) + __field(unsigned int, cookie) + __field(enum cachefiles_obj_ref_trace, why) + __field(int, usage) ), TP_fast_assign( @@ -249,10 +249,10 @@ TRACE_EVENT(cachefiles_lookup, TP_ARGS(obj, dir, de), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(short, error ) - __field(unsigned long, dino ) - __field(unsigned long, ino ) + __field(unsigned int, obj) + __field(short, error) + __field(unsigned long, dino) + __field(unsigned long, ino) ), TP_fast_assign( @@ -273,8 +273,8 @@ TRACE_EVENT(cachefiles_mkdir, TP_ARGS(dir, subdir), TP_STRUCT__entry( - __field(unsigned int, dir ) - __field(unsigned int, subdir ) + __field(unsigned int, dir) + __field(unsigned int, subdir) ), TP_fast_assign( @@ -293,8 +293,8 @@ TRACE_EVENT(cachefiles_tmpfile, TP_ARGS(obj, backer), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) + __field(unsigned int, obj) + __field(unsigned int, backer) ), TP_fast_assign( @@ -313,8 +313,8 @@ TRACE_EVENT(cachefiles_link, TP_ARGS(obj, backer), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) + __field(unsigned int, obj) + __field(unsigned int, backer) ), TP_fast_assign( @@ -336,9 +336,9 @@ TRACE_EVENT(cachefiles_unlink, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, ino ) - __field(enum fscache_why_object_killed, why ) + __field(unsigned int, obj) + __field(unsigned int, ino) + __field(enum fscache_why_object_killed, why) ), TP_fast_assign( @@ -361,9 +361,9 @@ TRACE_EVENT(cachefiles_rename, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, ino ) - __field(enum fscache_why_object_killed, why ) + __field(unsigned int, obj) + __field(unsigned int, ino) + __field(enum fscache_why_object_killed, why) ), TP_fast_assign( @@ -387,10 +387,10 @@ TRACE_EVENT(cachefiles_coherency, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(enum cachefiles_coherency_trace, why ) - __field(enum cachefiles_content, content ) - __field(u64, ino ) + __field(unsigned int, obj) + __field(enum cachefiles_coherency_trace, why) + __field(enum cachefiles_content, content) + __field(u64, ino) ), TP_fast_assign( @@ -416,9 +416,9 @@ TRACE_EVENT(cachefiles_vol_coherency, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, vol ) - __field(enum cachefiles_coherency_trace, why ) - __field(u64, ino ) + __field(unsigned int, vol) + __field(enum cachefiles_coherency_trace, why) + __field(u64, ino) ), TP_fast_assign( @@ -445,14 +445,14 @@ TRACE_EVENT(cachefiles_prep_read, TP_ARGS(obj, start, len, flags, source, why, cache_inode, netfs_inode), TP_STRUCT__entry( - __field(unsigned int, 
obj ) - __field(unsigned short, flags ) - __field(enum netfs_io_source, source ) - __field(enum cachefiles_prepare_read_trace, why ) - __field(size_t, len ) - __field(loff_t, start ) - __field(unsigned int, netfs_inode ) - __field(unsigned int, cache_inode ) + __field(unsigned int, obj) + __field(unsigned short, flags) + __field(enum netfs_io_source, source) + __field(enum cachefiles_prepare_read_trace, why) + __field(size_t, len) + __field(loff_t, start) + __field(unsigned int, netfs_inode) + __field(unsigned int, cache_inode) ), TP_fast_assign( @@ -484,10 +484,10 @@ TRACE_EVENT(cachefiles_read, TP_ARGS(obj, backer, start, len), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) - __field(size_t, len ) - __field(loff_t, start ) + __field(unsigned int, obj) + __field(unsigned int, backer) + __field(size_t, len) + __field(loff_t, start) ), TP_fast_assign( @@ -513,10 +513,10 @@ TRACE_EVENT(cachefiles_write, TP_ARGS(obj, backer, start, len), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) - __field(size_t, len ) - __field(loff_t, start ) + __field(unsigned int, obj) + __field(unsigned int, backer) + __field(size_t, len) + __field(loff_t, start) ), TP_fast_assign( @@ -540,11 +540,11 @@ TRACE_EVENT(cachefiles_trunc, TP_ARGS(obj, backer, from, to, why), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) - __field(enum cachefiles_trunc_trace, why ) - __field(loff_t, from ) - __field(loff_t, to ) + __field(unsigned int, obj) + __field(unsigned int, backer) + __field(enum cachefiles_trunc_trace, why) + __field(loff_t, from) + __field(loff_t, to) ), TP_fast_assign( @@ -571,8 +571,8 @@ TRACE_EVENT(cachefiles_mark_active, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(ino_t, inode ) + __field(unsigned int, obj) + __field(ino_t, inode) ), TP_fast_assign( @@ -592,8 +592,8 @@ TRACE_EVENT(cachefiles_mark_failed, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(ino_t, inode ) + __field(unsigned int, obj) + __field(ino_t, inode) ), TP_fast_assign( @@ -613,8 +613,8 @@ TRACE_EVENT(cachefiles_mark_inactive, /* Note that obj may be NULL */ TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(ino_t, inode ) + __field(unsigned int, obj) + __field(ino_t, inode) ), TP_fast_assign( @@ -633,10 +633,10 @@ TRACE_EVENT(cachefiles_vfs_error, TP_ARGS(obj, backer, error, where), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) - __field(enum cachefiles_error_trace, where ) - __field(short, error ) + __field(unsigned int, obj) + __field(unsigned int, backer) + __field(enum cachefiles_error_trace, where) + __field(short, error) ), TP_fast_assign( @@ -660,10 +660,10 @@ TRACE_EVENT(cachefiles_io_error, TP_ARGS(obj, backer, error, where), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) - __field(enum cachefiles_error_trace, where ) - __field(short, error ) + __field(unsigned int, obj) + __field(unsigned int, backer) + __field(enum cachefiles_error_trace, where) + __field(short, error) ), TP_fast_assign( @@ -687,11 +687,11 @@ TRACE_EVENT(cachefiles_ondemand_open, TP_ARGS(obj, msg, load), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, msg_id ) - __field(unsigned int, object_id ) - __field(unsigned int, fd ) - __field(unsigned int, flags ) + __field(unsigned int, obj) + __field(unsigned int, msg_id) + __field(unsigned int, object_id) + __field(unsigned int, fd) + 
__field(unsigned int, flags) ), TP_fast_assign( @@ -717,9 +717,9 @@ TRACE_EVENT(cachefiles_ondemand_copen, TP_ARGS(obj, msg_id, len), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, msg_id ) - __field(long, len ) + __field(unsigned int, obj) + __field(unsigned int, msg_id) + __field(long, len) ), TP_fast_assign( @@ -740,9 +740,9 @@ TRACE_EVENT(cachefiles_ondemand_close, TP_ARGS(obj, msg), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, msg_id ) - __field(unsigned int, object_id ) + __field(unsigned int, obj) + __field(unsigned int, msg_id) + __field(unsigned int, object_id) ), TP_fast_assign( @@ -764,11 +764,11 @@ TRACE_EVENT(cachefiles_ondemand_read, TP_ARGS(obj, msg, load), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, msg_id ) - __field(unsigned int, object_id ) - __field(loff_t, start ) - __field(size_t, len ) + __field(unsigned int, obj) + __field(unsigned int, msg_id) + __field(unsigned int, object_id) + __field(loff_t, start) + __field(size_t, len) ), TP_fast_assign( @@ -793,8 +793,8 @@ TRACE_EVENT(cachefiles_ondemand_cread, TP_ARGS(obj, msg_id), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, msg_id ) + __field(unsigned int, obj) + __field(unsigned int, msg_id) ), TP_fast_assign( @@ -814,10 +814,10 @@ TRACE_EVENT(cachefiles_ondemand_fd_write, TP_ARGS(obj, backer, start, len), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, backer ) - __field(loff_t, start ) - __field(size_t, len ) + __field(unsigned int, obj) + __field(unsigned int, backer) + __field(loff_t, start) + __field(size_t, len) ), TP_fast_assign( @@ -840,8 +840,8 @@ TRACE_EVENT(cachefiles_ondemand_fd_release, TP_ARGS(obj, object_id), TP_STRUCT__entry( - __field(unsigned int, obj ) - __field(unsigned int, object_id ) + __field(unsigned int, obj) + __field(unsigned int, object_id) ), TP_fast_assign( -- cgit v1.2.3 From eb1181594417dafad0f75808ead71f6d5170c1ea Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:53 +0000 Subject: netfs: Use a folio_queue allocation and free functions Provide and use folio_queue allocation and free functions to combine the allocation, initialisation and stat (un)accounting steps that are repeated in several places. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-4-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ecdd5ced16a8..c69e0f02c30f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,6 +21,7 @@ enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; +struct folio_queue; /** * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] @@ -453,6 +454,10 @@ void netfs_end_io_write(struct inode *inode); int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); +/* Miscellaneous APIs. 
*/ +struct folio_queue *netfs_folioq_alloc(gfp_t gfp); +void netfs_folioq_free(struct folio_queue *folioq); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query -- cgit v1.2.3 From aabcabf2746062253565b33aa3f8d25999a5ac01 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:54 +0000 Subject: netfs: Add a tracepoint to log the lifespan of folio_queue structs Add a tracepoint to log the lifespan of folio_queue structs. For tracing illustrative purposes, folio_queues are tagged with the debug ID of whatever they're related to (typically a netfs_io_request) and a debug ID of their own. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-5-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 12 +++++++++--- include/linux/netfs.h | 6 ++++-- include/trace/events/netfs.h | 41 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 52 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 3abe614ef5f0..4d3f8074c137 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -37,16 +37,20 @@ struct folio_queue { #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif + unsigned int rreq_id; + unsigned int debug_id; }; /** * folioq_init - Initialise a folio queue segment * @folioq: The segment to initialise + * @rreq_id: The request identifier to use in tracelines. * - * Initialise a folio queue segment. Note that the folio pointers are - * left uninitialised. + * Initialise a folio queue segment and set an identifier to be used in traces. + * + * Note that the folio pointers are left uninitialised. */ -static inline void folioq_init(struct folio_queue *folioq) +static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id) { folio_batch_init(&folioq->vec); folioq->next = NULL; @@ -54,6 +58,8 @@ static inline void folioq_init(struct folio_queue *folioq) folioq->marks = 0; folioq->marks2 = 0; folioq->marks3 = 0; + folioq->rreq_id = rreq_id; + folioq->debug_id = 0; } /** diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c69e0f02c30f..5b2f427f8e3e 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -455,8 +455,10 @@ int netfs_start_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode); /* Miscellaneous APIs. 
*/ -struct folio_queue *netfs_folioq_alloc(gfp_t gfp); -void netfs_folioq_free(struct folio_queue *folioq); +struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, + unsigned int trace /*enum netfs_folioq_trace*/); +void netfs_folioq_free(struct folio_queue *folioq, + unsigned int trace /*enum netfs_trace_folioq*/); /** * netfs_inode - Get the netfs inode context from the inode diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index c3c309f8fbe1..50aa6745df95 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -191,6 +191,16 @@ EM(netfs_trace_donate_to_next, "to-next") \ E_(netfs_trace_donate_to_deferred_next, "defer-next") +#define netfs_folioq_traces \ + EM(netfs_trace_folioq_alloc_append_folio, "alloc-apf") \ + EM(netfs_trace_folioq_alloc_read_prep, "alloc-r-prep") \ + EM(netfs_trace_folioq_alloc_read_prime, "alloc-r-prime") \ + EM(netfs_trace_folioq_alloc_read_sing, "alloc-r-sing") \ + EM(netfs_trace_folioq_clear, "clear") \ + EM(netfs_trace_folioq_delete, "delete") \ + EM(netfs_trace_folioq_prep_write, "prep-wr") \ + E_(netfs_trace_folioq_read_progress, "r-progress") + #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY @@ -209,6 +219,7 @@ enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte); enum netfs_folio_trace { netfs_folio_traces } __mode(byte); enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); enum netfs_donate_trace { netfs_donate_traces } __mode(byte); +enum netfs_folioq_trace { netfs_folioq_traces } __mode(byte); #endif @@ -232,6 +243,7 @@ netfs_sreq_ref_traces; netfs_folio_traces; netfs_collect_contig_traces; netfs_donate_traces; +netfs_folioq_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -317,6 +329,7 @@ TRACE_EVENT(netfs_sreq, __field(unsigned short, flags) __field(enum netfs_io_source, source) __field(enum netfs_sreq_trace, what) + __field(u8, slot) __field(size_t, len) __field(size_t, transferred) __field(loff_t, start) @@ -332,15 +345,16 @@ TRACE_EVENT(netfs_sreq, __entry->len = sreq->len; __entry->transferred = sreq->transferred; __entry->start = sreq->start; + __entry->slot = sreq->curr_folioq_slot; ), - TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx e=%d", + TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx s=%u e=%d", __entry->rreq, __entry->index, __print_symbolic(__entry->source, netfs_sreq_sources), __print_symbolic(__entry->what, netfs_sreq_traces), __entry->flags, __entry->start, __entry->transferred, __entry->len, - __entry->error) + __entry->slot, __entry->error) ); TRACE_EVENT(netfs_failure, @@ -745,6 +759,29 @@ TRACE_EVENT(netfs_donate, __entry->amount) ); +TRACE_EVENT(netfs_folioq, + TP_PROTO(const struct folio_queue *fq, + enum netfs_folioq_trace trace), + + TP_ARGS(fq, trace), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, id) + __field(enum netfs_folioq_trace, trace) + ), + + TP_fast_assign( + __entry->rreq = fq ? fq->rreq_id : 0; + __entry->id = fq ? 
fq->debug_id : 0; + __entry->trace = trace; + ), + + TP_printk("R=%08x fq=%x %s", + __entry->rreq, __entry->id, + __print_symbolic(__entry->trace, netfs_folioq_traces)) + ); + #undef EM #undef E_ #endif /* _TRACE_NETFS_H */ -- cgit v1.2.3 From 06fa229ceb36898e68022b5654c017d2c6582d7d Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:55 +0000 Subject: netfs: Abstract out a rolling folio buffer implementation A rolling buffer is a series of folios held in a list of folio_queues. New folios and folio_queue structs may be inserted at the head simultaneously with spent ones being removed from the tail without the need for locking. The rolling buffer includes an iov_iter and it has to be careful managing this as the list of folio_queues is extended such that an oops doesn't incurred because the iterator was pointing to the end of a folio_queue segment that got appended to and then removed. We need to use the mechanism twice, once for read and once for write, and, in future patches, we will use a second rolling buffer to handle bounce buffering for content encryption. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-6-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 10 +++---- include/linux/rolling_buffer.h | 61 ++++++++++++++++++++++++++++++++++++++++++ include/trace/events/netfs.h | 2 ++ 3 files changed, 67 insertions(+), 6 deletions(-) create mode 100644 include/linux/rolling_buffer.h (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5b2f427f8e3e..bd922f0936e3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -18,6 +18,7 @@ #include #include #include +#include enum netfs_sreq_ref_trace; typedef struct mempool_s mempool_t; @@ -238,10 +239,9 @@ struct netfs_io_request { struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ - struct folio_queue *buffer; /* Head of I/O buffer */ - struct folio_queue *buffer_tail; /* Tail of I/O buffer */ - struct iov_iter iter; /* Unencrypted-side iterator */ - struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ + struct rolling_buffer buffer; /* Unencrypted buffer */ +#define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 +#define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 void *netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ @@ -259,8 +259,6 @@ struct netfs_io_request { long error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ - u8 buffer_head_slot; /* First slot in ->buffer */ - u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ diff --git a/include/linux/rolling_buffer.h b/include/linux/rolling_buffer.h new file mode 100644 index 000000000000..ac15b1ffdd83 --- /dev/null +++ b/include/linux/rolling_buffer.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Rolling buffer of folios + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _ROLLING_BUFFER_H +#define _ROLLING_BUFFER_H + +#include +#include + +/* + * Rolling buffer. Whilst the buffer is live and in use, folios and folio + * queue segments can be added to one end by one thread and removed from the + * other end by another thread. The buffer isn't allowed to be empty; it must + * always have at least one folio_queue in it so that neither side has to + * modify both queue pointers. + * + * The iterator in the buffer is extended as buffers are inserted. It can be + * snapshotted to use a segment of the buffer. + */ +struct rolling_buffer { + struct folio_queue *head; /* Producer's insertion point */ + struct folio_queue *tail; /* Consumer's removal point */ + struct iov_iter iter; /* Iterator tracking what's left in the buffer */ + u8 next_head_slot; /* Next slot in ->head */ + u8 first_tail_slot; /* First slot in ->tail */ +}; + +/* + * Snapshot of a rolling buffer. + */ +struct rolling_buffer_snapshot { + struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ + unsigned char curr_slot; /* Folio currently being read */ + unsigned char curr_order; /* Order of folio */ +}; + +/* Marks to store per-folio in the internal folio_queue structs. */ +#define ROLLBUF_MARK_1 BIT(0) +#define ROLLBUF_MARK_2 BIT(1) + +int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id, + unsigned int direction); +int rolling_buffer_make_space(struct rolling_buffer *roll); +ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll, + struct readahead_control *ractl, + struct folio_batch *put_batch); +ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio, + unsigned int flags); +struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll); +void rolling_buffer_clear(struct rolling_buffer *roll); + +static inline void rolling_buffer_advance(struct rolling_buffer *roll, size_t amount) +{ + iov_iter_advance(&roll->iter, amount); +} + +#endif /* _ROLLING_BUFFER_H */ diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 50aa6745df95..2dfc9f716e3b 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -198,7 +198,9 @@ EM(netfs_trace_folioq_alloc_read_sing, "alloc-r-sing") \ EM(netfs_trace_folioq_clear, "clear") \ EM(netfs_trace_folioq_delete, "delete") \ + EM(netfs_trace_folioq_make_space, "make-space") \ EM(netfs_trace_folioq_prep_write, "prep-wr") \ + EM(netfs_trace_folioq_rollbuf_init, "roll-init") \ E_(netfs_trace_folioq_read_progress, "r-progress") #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY -- cgit v1.2.3 From 360157829ee3dba848ffa817792d9a07969e0a95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:58 +0000 Subject: netfs: Drop the error arg from netfs_read_subreq_terminated() Drop the error argument from netfs_read_subreq_terminated() in favour of passing the value in subreq->error. 
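For illustration only (not taken from the patch): with this change a filesystem's read-completion path records the outcome in the subrequest itself before terminating it. The myfs_* name below is hypothetical, and the was_async argument is still present at this point in the series (it is removed by the next patch). Assumes <linux/netfs.h>.

	static void myfs_read_done(struct netfs_io_subrequest *subreq, ssize_t ret)
	{
		if (ret < 0)
			subreq->error = ret;		/* error now travels in subreq->error */
		else
			subreq->transferred += ret;	/* progress recorded as before */
		netfs_read_subreq_terminated(subreq, false);
	}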
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-9-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index bd922f0936e3..a882921460a9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -427,10 +427,9 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, - bool was_async); -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, - int error, bool was_async); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, -- cgit v1.2.3 From 31fc366aa7aa911ebc0744e99c82caee4e97315a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:40:59 +0000 Subject: netfs: Drop the was_async arg from netfs_read_subreq_terminated() Drop the was_async argument from netfs_read_subreq_terminated(). Almost every caller is either in process context and passes false. Some filesystems delegate the call to a workqueue to avoid doing the work in their network message queue parsing thread. The only exception is netfs_cache_read_terminated() which handles completion in the cache - which is usually a callback from the backing filesystem in softirq context, though it can be from process context if an error occurred. In this case, delegate to a workqueue. Suggested-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wiVC5Cgyz6QKXFu6fTaA6h4CjexDR-OV9kL6Vo5x9v8=A@mail.gmail.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-10-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index a882921460a9..374e54beacbe 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -427,8 +427,8 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, bool was_async); -void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, bool was_async); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); -- cgit v1.2.3 From bcb33f79e15d0e4dc4b86106ceb01d64bfab9e35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:05 +0000 Subject: cachefiles: Add some subrequest tracepoints Add some tracepoints into the cachefiles write paths. 
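As an illustrative sketch only (not copied from the patch), the new labels would be emitted through the existing netfs_sreq tracepoint from the cache write path; the surrounding flow and the write_to_cache condition are hypothetical.

	trace_netfs_sreq(subreq, netfs_sreq_trace_cache_prepare);	/* before setting up cache I/O */
	if (write_to_cache)
		trace_netfs_sreq(subreq, netfs_sreq_trace_cache_write);
	else
		trace_netfs_sreq(subreq, netfs_sreq_trace_cache_nowrite);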
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-16-dhowells@redhat.com cc: netfs@lists.linux.dev Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 2dfc9f716e3b..02f6e179b7bc 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -74,6 +74,9 @@ #define netfs_sreq_traces \ EM(netfs_sreq_trace_add_donations, "+DON ") \ EM(netfs_sreq_trace_added, "ADD ") \ + EM(netfs_sreq_trace_cache_nowrite, "CA-NW") \ + EM(netfs_sreq_trace_cache_prepare, "CA-PR") \ + EM(netfs_sreq_trace_cache_write, "CA-WR") \ EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_discard, "DSCRD") \ EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ -- cgit v1.2.3 From 229105e5cfd9832a9ef1368c96e0098ec3a5fbf0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:06 +0000 Subject: cachefiles: Add auxiliary data trace Add a display of the first 8 bytes of the downloaded auxiliary data and of the on-disk stored auxiliary data as these are used in coherency management. In the case of afs, this holds the data version number. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-17-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/cachefiles.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 74114c261bcd..a743b2a35ea7 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -380,10 +380,11 @@ TRACE_EVENT(cachefiles_rename, TRACE_EVENT(cachefiles_coherency, TP_PROTO(struct cachefiles_object *obj, ino_t ino, + u64 disk_aux, enum cachefiles_content content, enum cachefiles_coherency_trace why), - TP_ARGS(obj, ino, content, why), + TP_ARGS(obj, ino, disk_aux, content, why), /* Note that obj may be NULL */ TP_STRUCT__entry( @@ -391,6 +392,8 @@ TRACE_EVENT(cachefiles_coherency, __field(enum cachefiles_coherency_trace, why) __field(enum cachefiles_content, content) __field(u64, ino) + __field(u64, aux) + __field(u64, disk_aux) ), TP_fast_assign( @@ -398,13 +401,17 @@ TRACE_EVENT(cachefiles_coherency, __entry->why = why; __entry->content = content; __entry->ino = ino; + __entry->aux = be64_to_cpup((__be64 *)obj->cookie->inline_aux); + __entry->disk_aux = disk_aux; ), - TP_printk("o=%08x %s B=%llx c=%u", + TP_printk("o=%08x %s B=%llx c=%u aux=%llx dsk=%llx", __entry->obj, __print_symbolic(__entry->why, cachefiles_coherency_traces), __entry->ino, - __entry->content) + __entry->content, + __entry->aux, + __entry->disk_aux) ); TRACE_EVENT(cachefiles_vol_coherency, -- cgit v1.2.3 From 9e705016eb8f3d4a58f2000e560ea2c7517e081b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:07 +0000 Subject: afs: Add more tracepoints to do with tracking validity Add wrappers to set and clear the callback promise and to mark a directory as invalidated, and add tracepoints to track these events: (1) afs_cb_promise: Log when a callback promise is set on a vnode. (2) afs_vnode_invalid: Log when the server's callback promise for a vnode is no longer valid and we need to refetch the vnode metadata. 
(3) afs_dir_invalid: Log when the contents of a directory are marked invalid and requiring refetching from the server and the cache invalidating. and two tracepoints to record data version number management: (4) afs_set_dv: Log when the DV is recorded on a vnode. (5) afs_dv_mismatch: Log when the DV recorded on a vnode plus the expected delta for the operation does not match the DV we got back from the server. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-18-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 169 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 164 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index a0aed1a428a1..7cb5583efb91 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -323,6 +323,43 @@ enum yfs_cm_operation { EM(yfs_CB_TellMeAboutYourself, "YFSCB.TellMeAboutYourself") \ E_(yfs_CB_CallBack, "YFSCB.CallBack") +#define afs_cb_promise_traces \ + EM(afs_cb_promise_clear_cb_break, "CLEAR cb-break") \ + EM(afs_cb_promise_clear_rmdir, "CLEAR rmdir") \ + EM(afs_cb_promise_clear_rotate_server, "CLEAR rot-srv") \ + EM(afs_cb_promise_clear_server_change, "CLEAR srv-chg") \ + EM(afs_cb_promise_clear_vol_init_cb, "CLEAR vol-init-cb") \ + EM(afs_cb_promise_set_apply_cb, "SET apply-cb") \ + EM(afs_cb_promise_set_new_inode, "SET new-inode") \ + E_(afs_cb_promise_set_new_symlink, "SET new-symlink") + +#define afs_vnode_invalid_traces \ + EM(afs_vnode_invalid_trace_cb_ro_snapshot, "cb-ro-snapshot") \ + EM(afs_vnode_invalid_trace_cb_scrub, "cb-scrub") \ + EM(afs_vnode_invalid_trace_cb_v_break, "cb-v-break") \ + EM(afs_vnode_invalid_trace_expired, "expired") \ + EM(afs_vnode_invalid_trace_no_cb_promise, "no-cb-promise") \ + EM(afs_vnode_invalid_trace_vol_expired, "vol-expired") \ + EM(afs_vnode_invalid_trace_zap_data, "zap-data") \ + E_(afs_vnode_valid_trace, "valid") + +#define afs_dir_invalid_traces \ + EM(afs_dir_invalid_edit_add_bad_size, "edit-add-bad-size") \ + EM(afs_dir_invalid_edit_add_no_slots, "edit-add-no-slots") \ + EM(afs_dir_invalid_edit_add_too_many_blocks, "edit-add-too-many-blocks") \ + EM(afs_dir_invalid_edit_get_block, "edit-get-block") \ + EM(afs_dir_invalid_edit_rem_bad_size, "edit-rem-bad-size") \ + EM(afs_dir_invalid_edit_rem_wrong_name, "edit-rem-wrong_name") \ + EM(afs_dir_invalid_edit_upd_bad_size, "edit-upd-bad-size") \ + EM(afs_dir_invalid_edit_upd_no_dd, "edit-upd-no-dotdot") \ + EM(afs_dir_invalid_dv_mismatch, "dv-mismatch") \ + EM(afs_dir_invalid_inval_folio, "inv-folio") \ + EM(afs_dir_invalid_iter_stale, "iter-stale") \ + EM(afs_dir_invalid_reclaimed_folio, "reclaimed-folio") \ + EM(afs_dir_invalid_release_folio, "rel-folio") \ + EM(afs_dir_invalid_remote, "remote") \ + E_(afs_dir_invalid_subdir_removed, "subdir-removed") + #define afs_edit_dir_ops \ EM(afs_edit_dir_create, "create") \ EM(afs_edit_dir_create_error, "c_fail") \ @@ -487,7 +524,9 @@ enum yfs_cm_operation { enum afs_alist_trace { afs_alist_traces } __mode(byte); enum afs_call_trace { afs_call_traces } __mode(byte); enum afs_cb_break_reason { afs_cb_break_reasons } __mode(byte); +enum afs_cb_promise_trace { afs_cb_promise_traces } __mode(byte); enum afs_cell_trace { afs_cell_traces } __mode(byte); +enum afs_dir_invalid_trace { afs_dir_invalid_traces} __mode(byte); enum afs_edit_dir_op { afs_edit_dir_ops } __mode(byte); enum afs_edit_dir_reason { 
afs_edit_dir_reasons } __mode(byte); enum afs_eproto_cause { afs_eproto_causes } __mode(byte); @@ -498,6 +537,7 @@ enum afs_flock_operation { afs_flock_operations } __mode(byte); enum afs_io_error { afs_io_errors } __mode(byte); enum afs_rotate_trace { afs_rotate_traces } __mode(byte); enum afs_server_trace { afs_server_traces } __mode(byte); +enum afs_vnode_invalid_trace { afs_vnode_invalid_traces} __mode(byte); enum afs_volume_trace { afs_volume_traces } __mode(byte); #endif /* end __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY */ @@ -513,8 +553,10 @@ enum afs_volume_trace { afs_volume_traces } __mode(byte); afs_alist_traces; afs_call_traces; afs_cb_break_reasons; +afs_cb_promise_traces; afs_cell_traces; afs_cm_operations; +afs_dir_invalid_traces; afs_edit_dir_ops; afs_edit_dir_reasons; afs_eproto_causes; @@ -526,6 +568,7 @@ afs_fs_operations; afs_io_errors; afs_rotate_traces; afs_server_traces; +afs_vnode_invalid_traces; afs_vl_operations; yfs_cm_operations; @@ -670,7 +713,7 @@ TRACE_EVENT(afs_make_fs_call, } ), - TP_printk("c=%08x %06llx:%06llx:%06x %s", + TP_printk("c=%08x V=%llx i=%llx:%x %s", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -704,7 +747,7 @@ TRACE_EVENT(afs_make_fs_calli, } ), - TP_printk("c=%08x %06llx:%06llx:%06x %s i=%u", + TP_printk("c=%08x V=%llx i=%llx:%x %s i=%u", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -741,7 +784,7 @@ TRACE_EVENT(afs_make_fs_call1, __entry->name[__len] = 0; ), - TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\"", + TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\"", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -782,7 +825,7 @@ TRACE_EVENT(afs_make_fs_call2, __entry->name2[__len2] = 0; ), - TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\" \"%s\"", + TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\" \"%s\"", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -1002,7 +1045,7 @@ TRACE_EVENT(afs_edit_dir, __entry->name[__len] = 0; ), - TP_printk("d=%x:%x %s %s %u[%u] f=%x:%x \"%s\"", + TP_printk("di=%x:%x %s %s %u[%u] fi=%x:%x \"%s\"", __entry->vnode, __entry->unique, __print_symbolic(__entry->why, afs_edit_dir_reasons), __print_symbolic(__entry->op, afs_edit_dir_ops), @@ -1011,6 +1054,122 @@ TRACE_EVENT(afs_edit_dir, __entry->name) ); +TRACE_EVENT(afs_dir_invalid, + TP_PROTO(const struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace), + + TP_ARGS(dvnode, trace), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_dir_invalid_trace, trace) + ), + + TP_fast_assign( + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->trace = trace; + ), + + TP_printk("di=%x:%x %s", + __entry->vnode, __entry->unique, + __print_symbolic(__entry->trace, afs_dir_invalid_traces)) + ); + +TRACE_EVENT(afs_cb_promise, + TP_PROTO(const struct afs_vnode *vnode, enum afs_cb_promise_trace trace), + + TP_ARGS(vnode, trace), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_cb_promise_trace, trace) + ), + + TP_fast_assign( + __entry->vnode = vnode->fid.vnode; + __entry->unique = vnode->fid.unique; + __entry->trace = trace; + ), + + TP_printk("di=%x:%x %s", + __entry->vnode, __entry->unique, + __print_symbolic(__entry->trace, afs_cb_promise_traces)) + ); + +TRACE_EVENT(afs_vnode_invalid, + TP_PROTO(const struct afs_vnode *vnode, enum afs_vnode_invalid_trace trace), + + TP_ARGS(vnode, trace), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(enum afs_vnode_invalid_trace, 
trace) + ), + + TP_fast_assign( + __entry->vnode = vnode->fid.vnode; + __entry->unique = vnode->fid.unique; + __entry->trace = trace; + ), + + TP_printk("di=%x:%x %s", + __entry->vnode, __entry->unique, + __print_symbolic(__entry->trace, afs_vnode_invalid_traces)) + ); + +TRACE_EVENT(afs_set_dv, + TP_PROTO(const struct afs_vnode *dvnode, u64 new_dv), + + TP_ARGS(dvnode, new_dv), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(u64, old_dv) + __field(u64, new_dv) + ), + + TP_fast_assign( + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->old_dv = dvnode->status.data_version; + __entry->new_dv = new_dv; + ), + + TP_printk("di=%x:%x dv=%llx -> dv=%llx", + __entry->vnode, __entry->unique, + __entry->old_dv, __entry->new_dv) + ); + +TRACE_EVENT(afs_dv_mismatch, + TP_PROTO(const struct afs_vnode *dvnode, u64 before_dv, int delta, u64 new_dv), + + TP_ARGS(dvnode, before_dv, delta, new_dv), + + TP_STRUCT__entry( + __field(unsigned int, vnode) + __field(unsigned int, unique) + __field(int, delta) + __field(u64, before_dv) + __field(u64, new_dv) + ), + + TP_fast_assign( + __entry->vnode = dvnode->fid.vnode; + __entry->unique = dvnode->fid.unique; + __entry->delta = delta; + __entry->before_dv = before_dv; + __entry->new_dv = new_dv; + ), + + TP_printk("di=%x:%x xdv=%llx+%d dv=%llx", + __entry->vnode, __entry->unique, + __entry->before_dv, __entry->delta, __entry->new_dv) + ); + TRACE_EVENT(afs_protocol_error, TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause), -- cgit v1.2.3 From e61bfaad8fd86ac84eac633e0bbaac47a5dfd358 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:08 +0000 Subject: netfs: Add functions to build/clean a buffer in a folio_queue Add two netfslib functions to build up or clean up a buffer in a folio_queue. The first, netfs_alloc_folioq_buffer() will add folios to a buffer, extending up at least to the given size. If it can, it will add multipage folios. The folios are optionally have the mapping set and will have the index set according to the distance from the front of the folio queue. The second function will free up a folio queue and put any folios in the queue that have the first mark set. The netfs_folio tracepoint is also altered to cope with folios that have a NULL mapping, and the folios being added/put will have trace lines emitted and will be accounted in the stats. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-19-dhowells@redhat.com cc: Jeff Layton cc: Marc Dionne cc: netfs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 6 ++++++ include/trace/events/netfs.h | 6 ++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 374e54beacbe..dd737344cff3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -457,6 +457,12 @@ struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp, void netfs_folioq_free(struct folio_queue *folioq, unsigned int trace /*enum netfs_trace_folioq*/); +/* Buffer wrangling helpers API. 
*/ +int netfs_alloc_folioq_buffer(struct address_space *mapping, + struct folio_queue **_buffer, + size_t *_cur_size, ssize_t size, gfp_t gfp); +void netfs_free_folioq_buffer(struct folio_queue *fq); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 02f6e179b7bc..fc237ff23a33 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -155,6 +155,7 @@ EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ EM(netfs_folio_trace_abandon, "abandon") \ + EM(netfs_folio_trace_alloc_buffer, "alloc-buf") \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ EM(netfs_folio_trace_cancel_store, "cancel-store") \ EM(netfs_folio_trace_clear, "clear") \ @@ -195,10 +196,7 @@ E_(netfs_trace_donate_to_deferred_next, "defer-next") #define netfs_folioq_traces \ - EM(netfs_trace_folioq_alloc_append_folio, "alloc-apf") \ - EM(netfs_trace_folioq_alloc_read_prep, "alloc-r-prep") \ - EM(netfs_trace_folioq_alloc_read_prime, "alloc-r-prime") \ - EM(netfs_trace_folioq_alloc_read_sing, "alloc-r-sing") \ + EM(netfs_trace_folioq_alloc_buffer, "alloc-buf") \ EM(netfs_trace_folioq_clear, "clear") \ EM(netfs_trace_folioq_delete, "delete") \ EM(netfs_trace_folioq_make_space, "make-space") \ -- cgit v1.2.3 From 49866ce7ea8d41a3dc198f519cc9caa2d6be1891 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:09 +0000 Subject: netfs: Add support for caching single monolithic objects such as AFS dirs Add support for caching the content of a file that contains a single monolithic object that must be read/written with a single I/O operation, such as an AFS directory. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-20-dhowells@redhat.com cc: Jeff Layton cc: Marc Dionne cc: netfs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 10 ++++++++++ include/trace/events/netfs.h | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index dd737344cff3..27e62f7d2940 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,6 +73,7 @@ struct netfs_inode { #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ +#define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */ }; /* @@ -210,9 +211,11 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ + NETFS_READ_SINGLE, /* This read should be treated as a single object */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_WRITE, /* This is a direct I/O write */ @@ -408,6 +411,13 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * struct netfs_group *netfs_group); ssize_t 
netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); +/* Single, monolithic object read/write API. */ +void netfs_single_mark_inode_dirty(struct inode *inode); +ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter); +int netfs_writeback_single(struct address_space *mapping, + struct writeback_control *wbc, + struct iov_iter *iter); + /* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index fc237ff23a33..6df2e7313371 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -21,6 +21,7 @@ EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readpage, "READPAGE ") \ EM(netfs_read_trace_read_gaps, "READ-GAPS") \ + EM(netfs_read_trace_read_single, "READ-SNGL") \ EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ E_(netfs_read_trace_write_begin, "WRITEBEGN") @@ -35,9 +36,11 @@ EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READPAGE, "RP") \ EM(NETFS_READ_GAPS, "RG") \ + EM(NETFS_READ_SINGLE, "R1") \ EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_DIO_READ, "DR") \ EM(NETFS_WRITEBACK, "WB") \ + EM(NETFS_WRITEBACK_SINGLE, "W1") \ EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \ EM(NETFS_DIO_WRITE, "DW") \ @@ -47,6 +50,7 @@ EM(netfs_rreq_trace_assess, "ASSESS ") \ EM(netfs_rreq_trace_copy, "COPY ") \ EM(netfs_rreq_trace_collect, "COLLECT") \ + EM(netfs_rreq_trace_dirty, "DIRTY ") \ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ -- cgit v1.2.3 From 6dd80936618c4ff852d4db73aca400351d9bd9f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:11 +0000 Subject: afs: Use netfslib for directories In the AFS ecosystem, directories are just a special type of file that is downloaded and parsed locally. Download is done by the same mechanism as ordinary files and the data can be cached. There is one important semantic restriction on directories over files: the client must download the entire directory in one go because, for example, the server could fabricate the contents of the blob on the fly with each download and give a different image each time. So that we can cache the directory download, switch AFS directory support over to using the netfslib single-object API, thereby allowing directory content to be stored in the local cache. To make this work, the following changes are made: (1) A directory's contents are now stored in a folio_queue chain attached to the afs_vnode (inode) struct rather than its associated pagecache, though multipage folios are still used to hold the data. The folio queue is discarded when the directory inode is evicted. This also helps with the phasing out of ITER_XARRAY. (2) Various directory operations are made to use and unuse the cache cookie. (3) The content checking, content dumping and content iteration are now performed with a standard iov_iter iterator over the contents of the folio queue. (4) Iteration and modification must be done with the vnode's validate_lock held. In conjunction with (1), this means that the iteration can be done without the need to lock pages or take extra refs on them, unlike when accessing ->i_pages. (5) Convert to using netfs_read_single() to read data. (6) Provide a ->writepages() to call netfs_writeback_single() to save the data to the cache according to the VM's scheduling whilst holding the validate_lock read-locked as (4). 
(7) Change local directory image editing functions: (a) Provide a function to get a specific block by number from the folio_queue as we can no longer use the i_pages xarray to locate folios by index. This uses a cursor to remember the current position as we need to iterate through the directory contents. The block is kmapped before being returned. (b) Make the function in (a) extend the directory by an extra folio if we run out of space. (c) Raise the check of the block free space counter, for those blocks that have one, higher in the function to eliminate a call to get a block. (d) Remove the page unlocking and putting done during the editing loops. This is no longer necessary as the folio_queue holds the references and the pages are no longer in the pagecache. (e) Mark the inode dirty and pin the cache usage till writeback at the end of a successful edit. (8) Don't set the large_folios flag on the inode as we do the allocation ourselves rather than the VM doing it automatically. (9) Mark the inode as being a single object that isn't uploaded to the server. (10) Enable caching on directories. (11) Only set the upload key for writeback for regular files. Notes: (*) We keep the ->release_folio(), ->invalidate_folio() and ->migrate_folio() ops as we set the mapping pointer on the folio. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-22-dhowells@redhat.com cc: Marc Dionne cc: Jeff Layton cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 7cb5583efb91..d05f2c09efe3 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -930,9 +930,9 @@ TRACE_EVENT(afs_sent_data, ); TRACE_EVENT(afs_dir_check_failed, - TP_PROTO(struct afs_vnode *vnode, loff_t off, loff_t i_size), + TP_PROTO(struct afs_vnode *vnode, loff_t off), - TP_ARGS(vnode, off, i_size), + TP_ARGS(vnode, off), TP_STRUCT__entry( __field(struct afs_vnode *, vnode) @@ -943,7 +943,7 @@ TRACE_EVENT(afs_dir_check_failed, TP_fast_assign( __entry->vnode = vnode; __entry->off = off; - __entry->i_size = i_size; + __entry->i_size = i_size_read(&vnode->netfs.inode); ), TP_printk("vn=%p %llx/%llx", -- cgit v1.2.3 From eae9e78951bb02a7b94a9adef6e981413d13c564 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:12 +0000 Subject: afs: Use netfslib for symlinks, allowing them to be cached Use netfslib to read symlinks, thereby allowing them to be cached by fscache and cachefiles. 
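For illustration only (not part of the patch): pulling a symlink body through the single-object read API introduced earlier in this series might look roughly like the sketch below. The myfs_* helper is hypothetical and passing a NULL file to netfs_read_single() is an assumption here.

	/* Hypothetical sketch: read a symlink's content into a kernel buffer so
	 * that netfslib can also store it in fscache/cachefiles.
	 */
	static int myfs_read_symlink(struct inode *inode, char *buf, size_t size)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = size - 1 };
		struct iov_iter iter;
		ssize_t ret;

		iov_iter_kvec(&iter, ITER_DEST, &kv, 1, size - 1);
		ret = netfs_read_single(inode, NULL, &iter);
		if (ret < 0)
			return ret;
		buf[ret] = 0;	/* NUL-terminate the target path */
		return 0;
	}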
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-23-dhowells@redhat.com cc: Marc Dionne cc: Jeff Layton cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index d05f2c09efe3..49a749672e38 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -422,6 +422,7 @@ enum yfs_cm_operation { EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \ EM(afs_file_error_dir_small, "DIR_SMALL") \ EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \ + EM(afs_file_error_symlink_big, "SYM_BIG") \ EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \ E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED") -- cgit v1.2.3 From 9750be93b2be12b6d92323b97d7c055099d279e6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:14 +0000 Subject: afs: Fix cleanup of immediately failed async calls If we manage to begin an async call, but fail to transmit any data on it due to a signal, we then abort it which causes a race between the notification of call completion from rxrpc and our attempt to cancel the notification. The notification will be necessary, however, for async FetchData to terminate the netfs subrequest. However, since we get a notification from rxrpc upon completion of a call (aborted or otherwise), we can just leave it to that. This leads to calls not getting cleaned up, but appearing in /proc/net/rxrpc/calls as being aborted with code 6. Fix this by making the "error_do_abort:" case of afs_make_call() abort the call and then abandon it to the notification handler. Fixes: 34fa47612bfe ("afs: Fix race in async call refcounting") Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-25-dhowells@redhat.com cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 49a749672e38..cdb5f2af7799 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -118,6 +118,8 @@ enum yfs_cm_operation { */ #define afs_call_traces \ EM(afs_call_trace_alloc, "ALLOC") \ + EM(afs_call_trace_async_abort, "ASYAB") \ + EM(afs_call_trace_async_kill, "ASYKL") \ EM(afs_call_trace_free, "FREE ") \ EM(afs_call_trace_get, "GET ") \ EM(afs_call_trace_put, "PUT ") \ -- cgit v1.2.3 From e2d46f2ec332533816417b60933954173f602121 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:17 +0000 Subject: netfs: Change the read result collector to only use one work item Change the way netfslib collects read results to do all the collection for a particular read request using a single work item that walks along the subrequest queue as subrequests make progress or complete, unlocking folios progressively rather than doing the unlock in parallel as parallel requests come in. The code is remodelled to be more like the write-side code, though only using a single stream. This makes it more directly comparable and thus easier to duplicate fixes between the two sides. This has a number of advantages: (1) It's simpler. There doesn't need to be a complex donation mechanism to handle mismatches between the size and alignment of subrequests and folios. 
The collector unlocks folios as the subrequests covering each complete. (2) It should cause less scheduler overhead as there's a single work item in play unlocking pages in parallel when a read gets split up into a lot of subrequests instead of one per subrequest. Whilst the parallellism is nice in theory, in practice, the vast majority of loads are sequential reads of the whole file, so committing a bunch of threads to unlocking folios out of order doesn't help in those cases. (3) It should make it easier to implement content decryption. A folio cannot be decrypted until all the requests that contribute to it have completed - and, again, most loads are sequential and so, most of the time, we want to begin decryption sequentially (though it's great if the decryption can happen in parallel). There is a disadvantage in that we're losing the ability to decrypt and unlock things on an as-things-arrive basis which may affect some applications. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-28-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 16 ++++----- include/trace/events/netfs.h | 79 +++++++------------------------------------- 2 files changed, 18 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 27e62f7d2940..071d05d81d38 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -181,9 +181,6 @@ struct netfs_io_subrequest { unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ - size_t consumed; /* Amount of read data consumed */ - size_t prev_donated; /* Amount of data donated from previous subreq */ - size_t next_donated; /* Amount of data donated from next subreq */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ @@ -191,9 +188,6 @@ struct netfs_io_subrequest { u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ - unsigned char curr_folioq_slot; /* Folio currently being read */ - unsigned char curr_folio_order; /* Order of folio */ - struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ @@ -236,15 +230,16 @@ struct netfs_io_request { struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; + struct netfs_io_request *copy_to_cache; /* Request to write just-read data to the cache */ struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ - struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ struct rolling_buffer buffer; /* Unencrypted buffer */ #define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1 #define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2 + wait_queue_head_t waitq; /* Processor waiter */ void 
*netfs_priv; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ @@ -255,7 +250,6 @@ struct netfs_io_request { atomic_t subreq_counter; /* Next subreq->debug_index */ unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ - atomic_t nr_outstanding; /* Number of ops in progress */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ @@ -267,14 +261,17 @@ struct netfs_io_request { atomic64_t issued_to; /* Write issuer folio cursor */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ + unsigned long long abandon_to; /* Position to abandon folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ - size_t prev_donated; /* Fallback for subreq->prev_donated */ + unsigned char front_folio_order; /* Order (size) of front folio */ refcount_t ref; unsigned long flags; +#define NETFS_RREQ_OFFLOAD_COLLECTION 0 /* Offload collection to workqueue */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ +#define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */ @@ -439,7 +436,6 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr /* (Sub)request management API. 
*/ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq); -void netfs_read_subreq_termination_worker(struct work_struct *work); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6df2e7313371..6e699cadcb29 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -50,18 +50,23 @@ EM(netfs_rreq_trace_assess, "ASSESS ") \ EM(netfs_rreq_trace_copy, "COPY ") \ EM(netfs_rreq_trace_collect, "COLLECT") \ + EM(netfs_rreq_trace_complete, "COMPLET") \ EM(netfs_rreq_trace_dirty, "DIRTY ") \ EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \ EM(netfs_rreq_trace_resubmit, "RESUBMT") \ + EM(netfs_rreq_trace_set_abandon, "S-ABNDN") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \ EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \ + EM(netfs_rreq_trace_wait_queue, "WAIT-Q ") \ EM(netfs_rreq_trace_wake_ip, "WAKE-IP") \ + EM(netfs_rreq_trace_wake_queue, "WAKE-Q ") \ + EM(netfs_rreq_trace_woke_queue, "WOKE-Q ") \ EM(netfs_rreq_trace_unpause, "UNPAUSE") \ E_(netfs_rreq_trace_write_done, "WR-DONE") @@ -81,6 +86,7 @@ EM(netfs_sreq_trace_cache_nowrite, "CA-NW") \ EM(netfs_sreq_trace_cache_prepare, "CA-PR") \ EM(netfs_sreq_trace_cache_write, "CA-WR") \ + EM(netfs_sreq_trace_cancel, "CANCL") \ EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_discard, "DSCRD") \ EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ @@ -91,6 +97,9 @@ EM(netfs_sreq_trace_hit_eof, "EOF ") \ EM(netfs_sreq_trace_io_progress, "IO ") \ EM(netfs_sreq_trace_limited, "LIMIT") \ + EM(netfs_sreq_trace_need_clear, "N-CLR") \ + EM(netfs_sreq_trace_partial_read, "PARTR") \ + EM(netfs_sreq_trace_need_retry, "NRTRY") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_progress, "PRGRS") \ @@ -136,6 +145,7 @@ EM(netfs_sreq_trace_get_submit, "GET SUBMIT") \ EM(netfs_sreq_trace_get_short_read, "GET SHORTRD") \ EM(netfs_sreq_trace_new, "NEW ") \ + EM(netfs_sreq_trace_put_abandon, "PUT ABANDON") \ EM(netfs_sreq_trace_put_cancel, "PUT CANCEL ") \ EM(netfs_sreq_trace_put_clear, "PUT CLEAR ") \ EM(netfs_sreq_trace_put_consumed, "PUT CONSUME") \ @@ -176,6 +186,7 @@ EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_not_under_wback, "!wback") \ + EM(netfs_folio_trace_not_locked, "!locked") \ EM(netfs_folio_trace_put, "put") \ EM(netfs_folio_trace_read, "read") \ EM(netfs_folio_trace_read_done, "read-done") \ @@ -204,7 +215,6 @@ EM(netfs_trace_folioq_clear, "clear") \ EM(netfs_trace_folioq_delete, "delete") \ EM(netfs_trace_folioq_make_space, "make-space") \ - EM(netfs_trace_folioq_prep_write, "prep-wr") \ EM(netfs_trace_folioq_rollbuf_init, "roll-init") \ E_(netfs_trace_folioq_read_progress, "r-progress") @@ -352,7 +362,7 @@ TRACE_EVENT(netfs_sreq, __entry->len = sreq->len; __entry->transferred = sreq->transferred; __entry->start = sreq->start; - __entry->slot = sreq->curr_folioq_slot; + __entry->slot = sreq->io_iter.folioq_slot; ), TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx s=%u e=%d", @@ -701,71 +711,6 @@ 
TRACE_EVENT(netfs_collect_stream, __entry->collected_to, __entry->front) ); -TRACE_EVENT(netfs_progress, - TP_PROTO(const struct netfs_io_subrequest *subreq, - unsigned long long start, size_t avail, size_t part), - - TP_ARGS(subreq, start, avail, part), - - TP_STRUCT__entry( - __field(unsigned int, rreq) - __field(unsigned int, subreq) - __field(unsigned int, consumed) - __field(unsigned int, transferred) - __field(unsigned long long, f_start) - __field(unsigned int, f_avail) - __field(unsigned int, f_part) - __field(unsigned char, slot) - ), - - TP_fast_assign( - __entry->rreq = subreq->rreq->debug_id; - __entry->subreq = subreq->debug_index; - __entry->consumed = subreq->consumed; - __entry->transferred = subreq->transferred; - __entry->f_start = start; - __entry->f_avail = avail; - __entry->f_part = part; - __entry->slot = subreq->curr_folioq_slot; - ), - - TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x", - __entry->rreq, __entry->subreq, __entry->f_start, - __entry->consumed, __entry->transferred, - __entry->f_part, __entry->f_avail, __entry->slot) - ); - -TRACE_EVENT(netfs_donate, - TP_PROTO(const struct netfs_io_request *rreq, - const struct netfs_io_subrequest *from, - const struct netfs_io_subrequest *to, - size_t amount, - enum netfs_donate_trace trace), - - TP_ARGS(rreq, from, to, amount, trace), - - TP_STRUCT__entry( - __field(unsigned int, rreq) - __field(unsigned int, from) - __field(unsigned int, to) - __field(unsigned int, amount) - __field(enum netfs_donate_trace, trace) - ), - - TP_fast_assign( - __entry->rreq = rreq->debug_id; - __entry->from = from->debug_index; - __entry->to = to ? to->debug_index : -1; - __entry->amount = amount; - __entry->trace = trace; - ), - - TP_printk("R=%08x[%02x] -> [%02x] %s am=%x", - __entry->rreq, __entry->from, __entry->to, - __print_symbolic(__entry->trace, netfs_donate_traces), - __entry->amount) - ); - TRACE_EVENT(netfs_folioq, TP_PROTO(const struct folio_queue *fq, enum netfs_folioq_trace trace), -- cgit v1.2.3 From 836bb70bde6a24a0069866b69d23eb61a00c422a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:18 +0000 Subject: afs: Make afs_mkdir() locally initialise a new directory's content Initialise a new directory's content when it is created by mkdir locally rather than downloading the content from the server as we can predict what it's going to look like. 
Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-29-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index cdb5f2af7799..c52fd83ca9b7 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -350,6 +350,7 @@ enum yfs_cm_operation { EM(afs_dir_invalid_edit_add_no_slots, "edit-add-no-slots") \ EM(afs_dir_invalid_edit_add_too_many_blocks, "edit-add-too-many-blocks") \ EM(afs_dir_invalid_edit_get_block, "edit-get-block") \ + EM(afs_dir_invalid_edit_mkdir, "edit-mkdir") \ EM(afs_dir_invalid_edit_rem_bad_size, "edit-rem-bad-size") \ EM(afs_dir_invalid_edit_rem_wrong_name, "edit-rem-wrong_name") \ EM(afs_dir_invalid_edit_upd_bad_size, "edit-upd-bad-size") \ @@ -371,6 +372,7 @@ enum yfs_cm_operation { EM(afs_edit_dir_delete_error, "d_err ") \ EM(afs_edit_dir_delete_inval, "d_invl") \ EM(afs_edit_dir_delete_noent, "d_nent") \ + EM(afs_edit_dir_mkdir, "mk_ent") \ EM(afs_edit_dir_update_dd, "u_ddot") \ EM(afs_edit_dir_update_error, "u_fail") \ EM(afs_edit_dir_update_inval, "u_invl") \ -- cgit v1.2.3 From 3c49e529e1c6aa71cb9b874fd60b72c97dae7ede Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:41:21 +0000 Subject: afs: Add a tracepoint for afs_read_receive() Add a tracepoint for afs_read_receive() to allow potential missed wakeups to be debugged. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241216204124.3752367-32-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index c52fd83ca9b7..2e92487f3f34 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -1775,6 +1775,36 @@ TRACE_EVENT(afs_make_call, __entry->fid.unique) ); +TRACE_EVENT(afs_read_recv, + TP_PROTO(const struct afs_operation *op, const struct afs_call *call), + + TP_ARGS(op, call), + + TP_STRUCT__entry( + __field(unsigned int, rreq) + __field(unsigned int, sreq) + __field(unsigned int, op) + __field(unsigned int, op_flags) + __field(unsigned int, call) + __field(enum afs_call_state, call_state) + ), + + TP_fast_assign( + __entry->op = op->debug_id; + __entry->sreq = op->fetch.subreq->debug_index; + __entry->rreq = op->fetch.subreq->rreq->debug_id; + __entry->op_flags = op->flags; + __entry->call = call->debug_id; + __entry->call_state = call->state; + ), + + TP_printk("R=%08x[%x] OP=%08x c=%08x cs=%x of=%x", + __entry->rreq, __entry->sreq, + __entry->op, + __entry->call, __entry->call_state, + __entry->op_flags) + ); + #endif /* _TRACE_AFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 1dbdce30f040a87f5aa6a9dbe43be398737f090f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 16 Dec 2024 18:21:44 +0100 Subject: ipv4: Define inet_sk_init_flowi4() and use it in inet_sk_rebuild_header(). IPv4 code commonly has to initialise a flowi4 structure from an IPv4 socket. This requires looking at potential IPv4 options to set the proper destination address, call flowi4_init_output() with the correct set of parameters and run the sk_classify_flow security hook. 
Instead of reimplementing these operations in different parts of the stack, let's define inet_sk_init_flowi4() which does all these operations. The first user is inet_sk_rebuild_header(), where inet_sk_init_flowi4() replaces ip_route_output_ports(). Unlike ip_route_output_ports(), which sets the flowi4 structure and performs the route lookup in one go, inet_sk_init_flowi4() only initialises the flow. The route lookup is then done by ip_route_output_flow(). Decoupling flow initialisation from route lookup makes this new interface applicable more broadly as it will allow some users to overwrite specific struct flowi4 members before the route lookup. Signed-off-by: Guillaume Nault Link: https://patch.msgid.link/fd416275262b1f518d5abfcef740ce4f4a1a6522.1734357769.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 6947a155d501..f86775be3e29 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -129,6 +130,33 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); + +static inline void inet_sk_init_flowi4(const struct inet_sock *inet, + struct flowi4 *fl4) +{ + const struct ip_options_rcu *ip4_opt; + const struct sock *sk; + __be32 daddr; + + rcu_read_lock(); + ip4_opt = rcu_dereference(inet->inet_opt); + + /* Source routing option overrides the socket destination address */ + if (ip4_opt && ip4_opt->opt.srr) + daddr = ip4_opt->opt.faddr; + else + daddr = inet->inet_daddr; + rcu_read_unlock(); + + sk = &inet->sk; + flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), + sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, + inet->inet_saddr, inet->inet_dport, + inet->inet_sport, sk->sk_uid); + security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); +} + struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *flp, const struct sk_buff *skb); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *flp, -- cgit v1.2.3 From 394033dcc976d1f83f0fc6e7d4dd041ce376d245 Mon Sep 17 00:00:00 2001 From: Integral Date: Wed, 23 Oct 2024 18:00:33 +0800 Subject: bcachefs: add support for true/false & yes/no in bool-type options Here is the patch which uses existing constant table: Currently, when using bcachefs-tools to set options, bool-type options can only accept 1 or 0. Add support for accepting true/false and yes/no for these options. 
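A minimal sketch of how a filesystem's option parser might use the newly exported table (the wrapper name opt_parse_bool is hypothetical; only bool_names and lookup_constant() come from linux/fs_parser.h):

static int opt_parse_bool(const char *val, bool *res)
{
	/* Accepts "0"/"1", "true"/"false" and "yes"/"no"; -1 marks "not recognised". */
	int v = lookup_constant(bool_names, val, -1);

	if (v < 0)
		return -EINVAL;
	*res = v;
	return 0;
}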
Signed-off-by: Integral Signed-off-by: Kent Overstreet Acked-by: David Howells --- include/linux/fs_parser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 3cef566088fc..53e566efd5fd 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -84,6 +84,8 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); +extern const struct constant_table bool_names[]; + #ifdef CONFIG_VALIDATE_FS_PARSER extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special); -- cgit v1.2.3 From dec6c0aac4fc5e4266cea18e9e6e47eecb2333e1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:16:02 -0500 Subject: lib min_heap: Switch to size_t size_t is the correct type for a count of objects that can fit in memory: this also means heaps now have the same memory layout as darrays (fs/bcachefs/darray.h), and darrays can be used as heaps. Cc: Kuan-Wei Chiu Cc: Ian Rogers Cc: Andrew Morton Cc: Coly Li Cc: Peter Zijlstra Signed-off-by: Kent Overstreet --- include/linux/min_heap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index e781727c8916..6325f6ffb895 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -15,8 +15,8 @@ */ #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ struct _name { \ - int nr; \ - int size; \ + size_t nr; \ + size_t size; \ _type *data; \ _type preallocated[_nr]; \ } -- cgit v1.2.3 From b9b894642fede191d50230d08608bd4f4f49f73d Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 11 Dec 2024 22:02:18 +0000 Subject: crypto: lib/gf128mul - Remove some bbe deadcode gf128mul_4k_bbe(), gf128mul_bbe() and gf128mul_init_4k_bbe() are part of the library originally added in 2006 by commit c494e0705d67 ("[CRYPTO] lib: table driven multiplications in GF(2^128)") but have never been used. Remove them. (BBE is Big endian Byte/Big endian bits Note the 64k table version is used and I've left that in) Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Herbert Xu --- include/crypto/gf128mul.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 81330c6446f6..b0853f7cada0 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -158,12 +158,10 @@ 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127 */ -/* A slow generic version of gf_mul, implemented for lle and bbe +/* A slow generic version of gf_mul, implemented for lle * It multiplies a and b and puts the result in a */ void gf128mul_lle(be128 *a, const be128 *b); -void gf128mul_bbe(be128 *a, const be128 *b); - /* * The following functions multiply a field element by x in * the polynomial field representation. They use 64-bit word operations @@ -224,9 +222,7 @@ struct gf128mul_4k { }; struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); -struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t); -void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t); void gf128mul_x8_ble(le128 *r, const le128 *x); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { -- cgit v1.2.3 From 4e39aded665f9c8966d0fd487d37fa3f30b94ba4 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Tue, 17 Dec 2024 01:38:59 +0000 Subject: video: hdmi: Remove unused hdmi_infoframe_check hdmi_infoframe_check() has been unused since it was added in commit c5e69ab35c0d ("video/hdmi: Constify infoframe passed to the pack functions") Remove it. Note that the individual check functions for each type are actually used, so they're staying. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Helge Deller --- include/linux/hdmi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 455f855bc084..96bda41d9148 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -445,7 +445,6 @@ ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, size_t size); ssize_t hdmi_infoframe_pack_only(const union hdmi_infoframe *frame, void *buffer, size_t size); -int hdmi_infoframe_check(union hdmi_infoframe *frame); int hdmi_infoframe_unpack(union hdmi_infoframe *frame, const void *buffer, size_t size); void hdmi_infoframe_log(const char *level, struct device *dev, -- cgit v1.2.3 From ef4144ac2dec35d47de666f35cd873eb1be4172e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Dec 2024 18:01:32 +0100 Subject: pidfs: allow bind-mounts Allow bind-mounting pidfds. Similar to nsfs let's allow bind-mounts for pidfds. This allows pidfds to be safely recovered and checked for process recycling. Link: https://lore.kernel.org/r/20241219-work-pidfs-mount-v1-1-dbc56198b839@kernel.org Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index df574d6708d4..7c830d0dec9a 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,5 +6,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From c7175957b28a69947dd1d36e8b19ac0d3c1a5d7d Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 27 Jul 2023 20:03:55 +0200 Subject: seqlock: annotate spinning as unlikely() in __read_seqcount_begin Annotation already used to be there, but got lost in 52ac39e5db5148f7 ("seqlock: seqcount_t: Implement all read APIs as statement expressions"). Does not look like it was intentional. Without it gcc 12 decides to compile the following in path_init: nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); into 2 cases of conditional jumps forward if the value is even, aka branch prediction miss by default in the common case on x86-64. With the patch jumps are only for odd values. 
before: [snip] mov 0x104fe96(%rip),%eax # 0xffffffff82409680 test $0x1,%al je 0xffffffff813b97fa pause mov 0x104fe8a(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b97ee mov %eax,0x48(%rbx) mov 0x104fdfd(%rip),%eax # 0xffffffff82409600 test $0x1,%al je 0xffffffff813b9813 pause mov 0x104fdf1(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b9807 [/snip] after: [snip] mov 0x104fec6(%rip),%eax # 0xffffffff82409680 test $0x1,%al jne 0xffffffff813b99af mov %eax,0x48(%rbx) mov 0x104fe35(%rip),%eax # 0xffffffff82409600 test $0x1,%al jne 0xffffffff813b999d [/snip] Interestingly .text gets slightly smaller (as reported by size(1)): before: 20702563 after: 20702429 Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20230727180355.813995-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/seqlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..eb20dcaa51b5 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) ({ \ unsigned __seq; \ \ - while ((__seq = seqprop_sequence(s)) & 1) \ + while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ -- cgit v1.2.3 From 135ec43eb29c68ed26e2d10f221d43f7d9139a8f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 20 Nov 2024 17:13:52 -0800 Subject: fiemap: use kernel-doc includes in fiemap docbook Add some kernel-doc notation to structs in fiemap header files then pull that into Documentation/filesystems/fiemap.rst instead of duplicating the header file structs in fiemap.rst. This helps to future-proof fiemap.rst against struct changes. Add missing flags documentation from header files into fiemap.rst for FIEMAP_FLAG_CACHE and FIEMAP_EXTENT_SHARED. 
Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20241121011352.201907-1-rdunlap@infradead.org Cc: Christoph Hellwig Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: Matthew Wilcox Signed-off-by: Christian Brauner --- include/linux/fiemap.h | 16 ++++++++++----- include/uapi/linux/fiemap.h | 47 +++++++++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h index c50882f19235..966092ffa89a 100644 --- a/include/linux/fiemap.h +++ b/include/linux/fiemap.h @@ -5,12 +5,18 @@ #include #include +/** + * struct fiemap_extent_info - fiemap request to a filesystem + * @fi_flags: Flags as passed from user + * @fi_extents_mapped: Number of mapped extents + * @fi_extents_max: Size of fiemap_extent array + * @fi_extents_start: Start of fiemap_extent array + */ struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ + unsigned int fi_flags; + unsigned int fi_extents_mapped; + unsigned int fi_extents_max; + struct fiemap_extent __user *fi_extents_start; }; int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 24ca0c00cae3..9d9e8ae32b41 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -14,37 +14,56 @@ #include +/** + * struct fiemap_extent - description of one fiemap extent + * @fe_logical: byte offset of the extent in the file + * @fe_physical: byte offset of extent on disk + * @fe_length: length in bytes for this extent + * @fe_flags: FIEMAP_EXTENT_* flags for this extent + */ struct fiemap_extent { - __u64 fe_logical; /* logical offset in bytes for the start of - * the extent from the beginning of the file */ - __u64 fe_physical; /* physical offset in bytes for the start - * of the extent from the beginning of the disk */ - __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_logical; + __u64 fe_physical; + __u64 fe_length; + /* private: */ __u64 fe_reserved64[2]; - __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + /* public: */ + __u32 fe_flags; + /* private: */ __u32 fe_reserved[3]; }; +/** + * struct fiemap - file extent mappings + * @fm_start: byte offset (inclusive) at which to start mapping (in) + * @fm_length: logical length of mapping which userspace wants (in) + * @fm_flags: FIEMAP_FLAG_* flags for request (in/out) + * @fm_mapped_extents: number of extents that were mapped (out) + * @fm_extent_count: size of fm_extents array (in) + * @fm_extents: array of mapped extents (out) + */ struct fiemap { - __u64 fm_start; /* logical offset (inclusive) at - * which to start mapping (in) */ - __u64 fm_length; /* logical length of mapping which - * userspace wants (in) */ - __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ - __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ - __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u64 fm_start; + __u64 fm_length; + __u32 fm_flags; + __u32 fm_mapped_extents; + __u32 fm_extent_count; + /* private: */ __u32 fm_reserved; - struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ + /* public: */ + struct fiemap_extent fm_extents[]; }; 
#define FIEMAP_MAX_OFFSET (~0ULL) +/* flags used in fm_flags: */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) +/* flags used in fe_flags: */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. -- cgit v1.2.3 From ea382199071931d19aac5f688b543e07360e2b64 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 20 Nov 2024 12:20:34 +0100 Subject: vfs: support caching symlink lengths in inodes When utilized it dodges strlen() in vfs_readlink(), giving about 1.5% speed up when issuing readlink on /initrd.img on ext4. Filesystems opt in by calling inode_set_cached_link() when creating an inode. The size is stored in a new union utilizing the same space as i_devices, thus avoiding growing the struct or taking up any more space. Churn-wise the current readlink_copy() helper is patched to accept the size instead of calculating it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241120112037.822078-2-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc98de5af43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for @@ -723,7 +724,10 @@ struct inode { }; struct file_lock_context *i_flctx; struct address_space i_data; - struct list_head i_devices; + union { + struct list_head i_devices; + int i_linklen; + }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; @@ -749,6 +753,13 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) +{ + inode->i_link = link; + inode->i_linklen = linklen; + inode->i_opflags |= IOP_CACHED_LINK; +} + /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. @@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int readlink_copy(char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); -- cgit v1.2.3 From 3212a8f34021a16d13ace91d3ac5f451ef8d0103 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 30 Nov 2024 06:17:11 +0100 Subject: fs: use a consume fence in mnt_idmap() The routine is used in link_path_walk() for every path component. To my reading the entire point of the fence was to grab a fully populated mnt_idmap, but that's already going to happen with mere consume fence. Eliminates an actual fence on arm64. 
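A minimal sketch of the reader pattern this relies on (illustrative only, the function name is made up and this is not one of the patched call sites): every use of the result is a dereference of the returned pointer, so the address dependency alone orders those accesses after the pointer load, which is all the smp_store_release() publisher needs.

static inline vfsuid_t sketch_owner(const struct path *path, const struct inode *inode)
{
	struct mnt_idmap *idmap = mnt_idmap(path->mnt);	/* plain READ_ONCE() load */

	/* The lookup below dereferences 'idmap', so it is dependency-ordered after the load. */
	return i_uid_into_vfsuid(idmap, inode);
}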
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241130051712.1036527-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..33f17b6e8732 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ struct vfsmount { static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ - return smp_load_acquire(&mnt->mnt_idmap); + return READ_ONCE(mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); -- cgit v1.2.3 From c38904ebb74b455a44e3b9a679aef320361654ae Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 8 Nov 2024 12:34:24 +0100 Subject: tracing: Add task_prctl_unknown tracepoint prctl() is a complex syscall which multiplexes its functionality based on a large set of PR_* options. Currently we count 64 such options. The return value of unknown options is -EINVAL, and doesn't distinguish from known options that were passed invalid args that also return -EINVAL. To understand if programs are attempting to use prctl() options not yet available on the running kernel, provide the task_prctl_unknown tracepoint. Note, this tracepoint is in an unlikely cold path, and would therefore be suitable for continuous monitoring (e.g. via perf_event_open). While the above is likely the simplest usecase, additionally this tracepoint can help unlock some testing scenarios (where probing sys_enter or sys_exit causes undesirable performance overheads): a. unprivileged triggering of a test module: test modules may register a probe to be called back on task_prctl_unknown, and pick a very large unknown prctl() option upon which they perform a test function for an unprivileged user; b. unprivileged triggering of an eBPF program function: similar as idea (a). Example trace_pipe output: test-380 [001] ..... 78.142904: task_prctl_unknown: option=1234 arg2=101 arg3=102 arg4=103 arg5=104 Signed-off-by: Marco Elver Reviewed-by: Alexander Potapenko Link: https://lore.kernel.org/r/20241108113455.2924361-1-elver@google.com Signed-off-by: Kees Cook --- include/trace/events/task.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/trace/events/task.h b/include/trace/events/task.h index 47b527464d1a..209d315852fb 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -56,6 +56,43 @@ TRACE_EVENT(task_rename, __entry->newcomm, __entry->oom_score_adj) ); +/** + * task_prctl_unknown - called on unknown prctl() option + * @option: option passed + * @arg2: arg2 passed + * @arg3: arg3 passed + * @arg4: arg4 passed + * @arg5: arg5 passed + * + * Called on an unknown prctl() option. 
+ */ +TRACE_EVENT(task_prctl_unknown, + + TP_PROTO(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5), + + TP_ARGS(option, arg2, arg3, arg4, arg5), + + TP_STRUCT__entry( + __field( int, option) + __field( unsigned long, arg2) + __field( unsigned long, arg3) + __field( unsigned long, arg4) + __field( unsigned long, arg5) + ), + + TP_fast_assign( + __entry->option = option; + __entry->arg2 = arg2; + __entry->arg3 = arg3; + __entry->arg4 = arg4; + __entry->arg5 = arg5; + ), + + TP_printk("option=%d arg2=%ld arg3=%ld arg4=%ld arg5=%ld", + __entry->option, __entry->arg2, __entry->arg3, __entry->arg4, __entry->arg5) +); + #endif /* This part must be outside protection */ -- cgit v1.2.3 From e3f6a42272e028c46695acc83fc7d7c42f2750ad Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 8 Nov 2024 12:34:25 +0100 Subject: tracing: Remove pid in task_rename tracing output Remove pid in task_rename tracepoint output, since that tracepoint only deals with the current task, and is printed by default. This also saves some space in the entry and avoids wasted padding. Link: https://lkml.kernel.org/r/20241105120247.596a0dc9@gandalf.local.home Suggested-by: Steven Rostedt Signed-off-by: Marco Elver Link: https://lore.kernel.org/r/20241108113455.2924361-2-elver@google.com Signed-off-by: Kees Cook --- include/trace/events/task.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/task.h b/include/trace/events/task.h index 209d315852fb..af535b053033 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -38,22 +38,19 @@ TRACE_EVENT(task_rename, TP_ARGS(task, comm), TP_STRUCT__entry( - __field( pid_t, pid) __array( char, oldcomm, TASK_COMM_LEN) __array( char, newcomm, TASK_COMM_LEN) __field( short, oom_score_adj) ), TP_fast_assign( - __entry->pid = task->pid; memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); strscpy(entry->newcomm, comm, TASK_COMM_LEN); __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd", - __entry->pid, __entry->oldcomm, - __entry->newcomm, __entry->oom_score_adj) + TP_printk("oldcomm=%s newcomm=%s oom_score_adj=%hd", + __entry->oldcomm, __entry->newcomm, __entry->oom_score_adj) ); /** -- cgit v1.2.3 From 7533d0df69452c3e7b69c727c1e8e1a7e1afc83c Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:43 +0800 Subject: soundwire: mipi_disco: read lane mapping properties from ACPI The DisCo for SoundWire 2.0 added support for the 'mipi-sdw-lane--mapping' property. 
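As a rough illustration of what the new property enables (the helper below is hypothetical, assuming lane_maps[] records which manager lane each peripheral lane is wired to):

static int sketch_find_slave_lane(struct sdw_slave *slave, int m_lane)
{
	int l;

	for (l = 1; l < SDW_MAX_LANES; l++)
		if (slave->prop.lane_maps[l] == m_lane)
			return l;	/* peripheral lane wired to manager lane m_lane */
	return 0;			/* fall back to the default data lane */
}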
Co-developed-by: Chao Song Signed-off-by: Chao Song Signed-off-by: Bard Liao Link: https://lore.kernel.org/r/20241218080155.102405-3-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index bd9836690da6..bb4e33a4db17 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -54,6 +54,8 @@ struct sdw_slave; #define SDW_MAX_PORTS 15 #define SDW_VALID_PORT_RANGE(n) ((n) < SDW_MAX_PORTS && (n) >= 1) +#define SDW_MAX_LANES 8 + enum { SDW_PORT_DIRN_SINK = 0, SDW_PORT_DIRN_SOURCE, @@ -356,6 +358,7 @@ struct sdw_dpn_prop { * and masks are supported * @commit_register_supported: is PCP_Commit register supported * @scp_int1_mask: SCP_INT1_MASK desired settings + * @lane_maps: Lane mapping for the slave, only valid if lane_control_support is set * @clock_reg_supported: the Peripheral implements the clock base and scale * registers introduced with the SoundWire 1.2 specification. SDCA devices * do not need to set this boolean property as the registers are required. @@ -385,6 +388,7 @@ struct sdw_slave_prop { u32 sdca_interrupt_register_list; u8 commit_register_supported; u8 scp_int1_mask; + u8 lane_maps[SDW_MAX_LANES]; bool clock_reg_supported; bool use_domain_irq; }; @@ -450,6 +454,7 @@ struct sdw_master_prop { int sdw_master_read_prop(struct sdw_bus *bus); int sdw_slave_read_prop(struct sdw_slave *slave); +int sdw_slave_read_lane_mapping(struct sdw_slave *slave); /* * SDW Slave Structures and APIs -- cgit v1.2.3 From b6a2e1be7d9303d07eff72a13132a37e035fbcfa Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:44 +0800 Subject: soundwire: add lane_used_bandwidth in struct sdw_bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To support multi-lane, we need to know how much bandwidth is used on each lane. And to use the lane that has enough bandwidth. Signed-off-by: Bard Liao Reviewed-by: Péter Ujfalusi Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20241218080155.102405-4-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index bb4e33a4db17..ae38ac848d38 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -893,6 +893,7 @@ struct sdw_master_ops { * @multi_link: Store bus property that indicates if multi links * are supported. This flag is populated by drivers after reading * appropriate firmware (ACPI/DT). + * @lane_used_bandwidth: how much bandwidth in bits per second is used by each lane */ struct sdw_bus { struct device *dev; @@ -924,6 +925,7 @@ struct sdw_bus { struct dentry *debugfs; #endif bool multi_link; + unsigned int lane_used_bandwidth[SDW_MAX_LANES]; }; int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, -- cgit v1.2.3 From 8f4e3343eda8cdedaf711bf3d8ef2d6ed571f420 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:47 +0800 Subject: Soundwire: add sdw_slave_get_scale_index helper Currently, we only set peripheral frequency when the peripheral is initialized. However, curr_dr_freq may change to get required bandwidth. For example, curr_dr_freq may increase from 4.8MHz to 9.6MHz when the 4th stream is opened. 
Add a helper to get the scale index so that we can get the scale index and program it. Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20241218080155.102405-7-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index ae38ac848d38..05a85e2bd96d 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1052,6 +1052,8 @@ int sdw_stream_add_slave(struct sdw_slave *slave, int sdw_stream_remove_slave(struct sdw_slave *slave, struct sdw_stream_runtime *stream); +int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); + /* messaging and data APIs */ int sdw_read(struct sdw_slave *slave, u32 addr); int sdw_write(struct sdw_slave *slave, u32 addr, u8 value); -- cgit v1.2.3 From 645291cfe5e52cce9571d73542476bec1d79ce26 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:48 +0800 Subject: Soundwire: stream: program BUSCLOCK_SCALE We need to program bus clock scale to adjust the bus clock if current bus clock doesn't fit the bandwidth. Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Link: https://lore.kernel.org/r/20241218080155.102405-8-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 05a85e2bd96d..fc0a203c3ae0 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1041,6 +1041,7 @@ int sdw_bus_exit_clk_stop(struct sdw_bus *bus); int sdw_compare_devid(struct sdw_slave *slave, struct sdw_slave_id id); void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id); +bool is_clock_scaling_supported_by_slave(struct sdw_slave *slave); #if IS_ENABLED(CONFIG_SOUNDWIRE) -- cgit v1.2.3 From 168cdf9cdef232225f6b6c617fd347b4d1c4a7d7 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Wed, 18 Dec 2024 16:01:54 +0800 Subject: SoundWire: pass stream to compute_params() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The stream parameter will be used in the follow up commit. No function change. Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Reviewed-by: Péter Ujfalusi Link: https://lore.kernel.org/r/20241218080155.102405-14-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 148 +++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index fc0a203c3ae0..2d6c30317792 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -855,79 +855,6 @@ struct sdw_master_ops { int dev_num); }; -/** - * struct sdw_bus - SoundWire bus - * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. - * @md: Master device - * @bus_lock_key: bus lock key associated to @bus_lock - * @bus_lock: bus lock - * @slaves: list of Slaves on this bus - * @msg_lock_key: message lock key associated to @msg_lock - * @msg_lock: message lock - * @m_rt_list: List of Master instance of all stream(s) running on Bus. 
This - * is used to compute and program bus bandwidth, clock, frame shape, - * transport and port parameters - * @defer_msg: Defer message - * @params: Current bus parameters - * @stream_refcount: number of streams currently using this bus - * @ops: Master callback ops - * @port_ops: Master port callback ops - * @prop: Master properties - * @vendor_specific_prop: pointer to non-standard properties - * @hw_sync_min_links: Number of links used by a stream above which - * hardware-based synchronization is required. This value is only - * meaningful if multi_link is set. If set to 1, hardware-based - * synchronization will be used even if a stream only uses a single - * SoundWire segment. - * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. - * @link_id: Link id number, can be 0 to N, unique for each Controller - * @id: bus system-wide unique id - * @compute_params: points to Bus resource management implementation - * @assigned: Bitmap for Slave device numbers. - * Bit set implies used number, bit clear implies unused number. - * @clk_stop_timeout: Clock stop timeout computed - * @bank_switch_timeout: Bank switch timeout computed - * @domain: IRQ domain - * @irq_chip: IRQ chip - * @debugfs: Bus debugfs (optional) - * @multi_link: Store bus property that indicates if multi links - * are supported. This flag is populated by drivers after reading - * appropriate firmware (ACPI/DT). - * @lane_used_bandwidth: how much bandwidth in bits per second is used by each lane - */ -struct sdw_bus { - struct device *dev; - struct sdw_master_device *md; - struct lock_class_key bus_lock_key; - struct mutex bus_lock; - struct list_head slaves; - struct lock_class_key msg_lock_key; - struct mutex msg_lock; - struct list_head m_rt_list; - struct sdw_defer defer_msg; - struct sdw_bus_params params; - int stream_refcount; - const struct sdw_master_ops *ops; - const struct sdw_master_port_ops *port_ops; - struct sdw_master_prop prop; - void *vendor_specific_prop; - int hw_sync_min_links; - int controller_id; - unsigned int link_id; - int id; - int (*compute_params)(struct sdw_bus *bus); - DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); - unsigned int clk_stop_timeout; - u32 bank_switch_timeout; - struct irq_chip irq_chip; - struct irq_domain *domain; -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs; -#endif - bool multi_link; - unsigned int lane_used_bandwidth[SDW_MAX_LANES]; -}; - int sdw_bus_master_add(struct sdw_bus *bus, struct device *parent, struct fwnode_handle *fwnode); void sdw_bus_master_delete(struct sdw_bus *bus); @@ -1017,10 +944,83 @@ struct sdw_stream_runtime { struct list_head master_list; }; +/** + * struct sdw_bus - SoundWire bus + * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. + * @md: Master device + * @bus_lock_key: bus lock key associated to @bus_lock + * @bus_lock: bus lock + * @slaves: list of Slaves on this bus + * @msg_lock_key: message lock key associated to @msg_lock + * @msg_lock: message lock + * @m_rt_list: List of Master instance of all stream(s) running on Bus. 
This + * is used to compute and program bus bandwidth, clock, frame shape, + * transport and port parameters + * @defer_msg: Defer message + * @params: Current bus parameters + * @stream_refcount: number of streams currently using this bus + * @ops: Master callback ops + * @port_ops: Master port callback ops + * @prop: Master properties + * @vendor_specific_prop: pointer to non-standard properties + * @hw_sync_min_links: Number of links used by a stream above which + * hardware-based synchronization is required. This value is only + * meaningful if multi_link is set. If set to 1, hardware-based + * synchronization will be used even if a stream only uses a single + * SoundWire segment. + * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. + * @link_id: Link id number, can be 0 to N, unique for each Controller + * @id: bus system-wide unique id + * @compute_params: points to Bus resource management implementation + * @assigned: Bitmap for Slave device numbers. + * Bit set implies used number, bit clear implies unused number. + * @clk_stop_timeout: Clock stop timeout computed + * @bank_switch_timeout: Bank switch timeout computed + * @domain: IRQ domain + * @irq_chip: IRQ chip + * @debugfs: Bus debugfs (optional) + * @multi_link: Store bus property that indicates if multi links + * are supported. This flag is populated by drivers after reading + * appropriate firmware (ACPI/DT). + * @lane_used_bandwidth: how much bandwidth in bits per second is used by each lane + */ +struct sdw_bus { + struct device *dev; + struct sdw_master_device *md; + struct lock_class_key bus_lock_key; + struct mutex bus_lock; + struct list_head slaves; + struct lock_class_key msg_lock_key; + struct mutex msg_lock; + struct list_head m_rt_list; + struct sdw_defer defer_msg; + struct sdw_bus_params params; + int stream_refcount; + const struct sdw_master_ops *ops; + const struct sdw_master_port_ops *port_ops; + struct sdw_master_prop prop; + void *vendor_specific_prop; + int hw_sync_min_links; + int controller_id; + unsigned int link_id; + int id; + int (*compute_params)(struct sdw_bus *bus, struct sdw_stream_runtime *stream); + DECLARE_BITMAP(assigned, SDW_MAX_DEVICES); + unsigned int clk_stop_timeout; + u32 bank_switch_timeout; + struct irq_chip irq_chip; + struct irq_domain *domain; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs; +#endif + bool multi_link; + unsigned int lane_used_bandwidth[SDW_MAX_LANES]; +}; + struct sdw_stream_runtime *sdw_alloc_stream(const char *stream_name); void sdw_release_stream(struct sdw_stream_runtime *stream); -int sdw_compute_params(struct sdw_bus *bus); +int sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream); int sdw_stream_add_master(struct sdw_bus *bus, struct sdw_stream_config *stream_config, -- cgit v1.2.3 From 0b7a66a2c864859fbf9bb16229c03172eef02c05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 5 Dec 2024 17:06:02 +0100 Subject: preempt: Move PREEMPT_RT before PREEMPT in vermagic. Since the dynamic preemption has been enabled for PREEMPT_RT we have now CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. This affects the vermagic strings which comes now PREEMPT with PREEMPT_RT enabled. The PREEMPT_RT module usually can not be loaded on a PREEMPT kernel because some symbols are missing. However if the symbols are fine then it continues and it crashes later. 
The problem is that the struct module has a different layout and the num_exentries or init members are at a different position leading to a crash later on. This is not necessary caught by the size check in elf_validity_cache_index_mod() because the mem member has an alignment requirement of __module_memory_align which is big enough keep the total size unchanged. Therefore we should keep the string accurate instead of removing it. Move the PREEMPT_RT check before the PREEMPT so that it takes precedence if both symbols are enabled. Fixes: 35772d627b55c ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241205160602.3lIAsJRT@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/vermagic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a54046bf37e5..939ceabcaf06 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,10 +15,10 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT_BUILD -#define MODULE_VERMAGIC_PREEMPT "preempt " -#elif defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_PREEMPT_RT #define MODULE_VERMAGIC_PREEMPT "preempt_rt " +#elif defined(CONFIG_PREEMPT_BUILD) +#define MODULE_VERMAGIC_PREEMPT "preempt " #else #define MODULE_VERMAGIC_PREEMPT "" #endif -- cgit v1.2.3 From b89c0ed09e1189217cd9d516b739627c523d53a4 Mon Sep 17 00:00:00 2001 From: Neil Armstrong Date: Tue, 19 Nov 2024 18:56:36 +0100 Subject: opp: core: implement dev_pm_opp_get_bw Add and implement dev_pm_opp_get_bw() to retrieve the OPP's bandwidth in the same way as the dev_pm_opp_get_voltage() helper. Retrieving bandwidth is required in the case of the Adreno GPU where the GPU Management Unit can handle the Bandwidth scaling. The helper can get the peak or average bandwidth for any of the interconnect path. Signed-off-by: Neil Armstrong [ Viresh: Fixed commit log and a comment in code ] Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 568183e3e641..414146abfe81 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -102,6 +102,8 @@ struct dev_pm_opp_data { struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); +unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index); + unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp); int dev_pm_opp_get_supplies(struct dev_pm_opp *opp, struct dev_pm_opp_supply *supplies); @@ -205,6 +207,11 @@ static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device * static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} +static inline unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index) +{ + return 0; +} + static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) { return 0; -- cgit v1.2.3 From 67b43038ce14d6b0673bdffb2052d879065c94ae Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 4 Nov 2024 16:43:03 +0800 Subject: KVM: guest_memfd: Remove RCU-protected attribute from slot->gmem.file Remove the RCU-protected attribute from slot->gmem.file. No need to use RCU primitives rcu_assign_pointer()/synchronize_rcu() to update this pointer. - slot->gmem.file is updated in 3 places: kvm_gmem_bind(), kvm_gmem_unbind(), kvm_gmem_release(). 
All of them are protected by kvm->slots_lock. - slot->gmem.file is read in 2 paths: (1) kvm_gmem_populate kvm_gmem_get_file __kvm_gmem_get_pfn (2) kvm_gmem_get_pfn kvm_gmem_get_file __kvm_gmem_get_pfn Path (1) kvm_gmem_populate() requires holding kvm->slots_lock, so slot->gmem.file is protected by the kvm->slots_lock in this path. Path (2) kvm_gmem_get_pfn() does not require holding kvm->slots_lock. However, it's also not guarded by rcu_read_lock() and rcu_read_unlock(). So synchronize_rcu() in kvm_gmem_unbind()/kvm_gmem_release() actually will not wait for the readers in kvm_gmem_get_pfn() due to lack of RCU read-side critical section. The path (2) kvm_gmem_get_pfn() is safe without RCU protection because: a) kvm_gmem_bind() is called on a new memslot, before the memslot is visible to kvm_gmem_get_pfn(). b) kvm->srcu ensures that kvm_gmem_unbind() and freeing of a memslot occur after the memslot is no longer visible to kvm_gmem_get_pfn(). c) get_file_active() ensures that kvm_gmem_get_pfn() will not access the stale file if kvm_gmem_release() sets it to NULL. This is because if kvm_gmem_release() occurs before kvm_gmem_get_pfn(), get_file_active() will return NULL; if get_file_active() does not return NULL, kvm_gmem_release() should not occur until after kvm_gmem_get_pfn() releases the file reference. Signed-off-by: Yan Zhao Message-ID: <20241104084303.29909-1-yan.y.zhao@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 401439bb21e3..9df216943eb4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -596,7 +596,12 @@ struct kvm_memory_slot { #ifdef CONFIG_KVM_PRIVATE_MEM struct { - struct file __rcu *file; + /* + * Writes protected by kvm->slots_lock. Acquiring a + * reference via kvm_gmem_get_file() is protected by + * either kvm->slots_lock or kvm->srcu. + */ + struct file *file; pgoff_t pgoff; } gmem; #endif -- cgit v1.2.3 From dca6c88532322830d5d92486467fcc91b67a9ad8 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 18 Jul 2024 14:12:14 -0700 Subject: KVM: Add member to struct kvm_gfn_range to indicate private/shared Add new members to strut kvm_gfn_range to indicate which mapping (private-vs-shared) to operate on: enum kvm_gfn_range_filter attr_filter. Update the core zapping operations to set them appropriately. TDX utilizes two GPA aliases for the same memslots, one for memory that is for private memory and one that is for shared. For private memory, KVM cannot always perform the same operations it does on memory for default VMs, such as zapping pages and having them be faulted back in, as this requires guest coordination. However, some operations such as guest driven conversion of memory between private and shared should zap private memory. Internally to the MMU, private and shared mappings are tracked on separate roots. Mapping and zapping operations will operate on the respective GFN alias for each root (private or shared). So zapping operations will by default zap both aliases. Add fields in struct kvm_gfn_range to allow callers to specify which aliases so they can only target the aliases appropriate for their specific operation. There was feedback that target aliases should be specified such that the default value (0) is to operate on both aliases. Several options were considered. 
Several variations of having separate bools defined such that the default behavior was to process both aliases. They either allowed nonsensical configurations, or were confusing for the caller. A simple enum was also explored and was close, but was hard to process in the caller. Instead, use an enum with the default value (0) reserved as a disallowed value. Catch ranges that didn't have the target aliases specified by looking for that specific value. Set target alias with enum appropriately for these MMU operations: - For KVM's mmu notifier callbacks, zap shared pages only because private pages won't have a userspace mapping - For setting memory attributes, kvm_arch_pre_set_memory_attributes() chooses the aliases based on the attribute. - For guest_memfd invalidations, zap private only. Link: https://lore.kernel.org/kvm/ZivIF9vjKcuGie3s@google.com/ Signed-off-by: Isaku Yamahata Co-developed-by: Rick Edgecombe Signed-off-by: Rick Edgecombe Message-ID: <20240718211230.1492011-3-rick.p.edgecombe@intel.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9df216943eb4..c788d0bd952a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -255,11 +255,17 @@ union kvm_mmu_notifier_arg { unsigned long attributes; }; +enum kvm_gfn_range_filter { + KVM_FILTER_SHARED = BIT(0), + KVM_FILTER_PRIVATE = BIT(1), +}; + struct kvm_gfn_range { struct kvm_memory_slot *slot; gfn_t start; gfn_t end; union kvm_mmu_notifier_arg arg; + enum kvm_gfn_range_filter attr_filter; bool may_block; }; bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); -- cgit v1.2.3 From 943d0609d0571af092dc13456cbca70351e4d20e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:22 +0000 Subject: io_uring: rename ->resize_lock ->resize_lock is used for resizing rings, but it's a good idea to reuse it in other cases as well. Rename it into mmap_lock as it's protects from races with mmap. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/68f705306f3ac4d2fb999eb80ea1615015ce9f7f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fd4cdb0860a2..fafc1d779eb1 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,7 +424,7 @@ struct io_ring_ctx { * side will need to grab this lock, to prevent either side from * being run concurrently with the other. */ - struct mutex resize_lock; + struct mutex mmap_lock; /* * If IORING_SETUP_NO_MMAP is used, then the below holds -- cgit v1.2.3 From a730d2047d4ef822262c37f51ff7267f2f0e7167 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:24 +0000 Subject: io_uring/memmap: flag vmap'ed regions Add internal flags for struct io_mapped_region. The first flag we need is IO_REGION_F_VMAPPED, that indicates that the pointer has to be unmapped on region destruction. For now all regions are vmap'ed, so it's set unconditionally. 
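A minimal sketch of how region teardown can key off the new flag (assumed shape only; the flag name is taken from the text above and io_free_region_sketch is hypothetical, not the patch itself):

static void io_free_region_sketch(struct io_mapped_region *mr)
{
	if (mr->flags & IO_REGION_F_VMAPPED)	/* currently set for every region */
		vunmap(mr->ptr);
	/* ... drop the page references and clear *mr ... */
}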
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5a3d8046a038da97c0f8a8c8f1733fa3fc689d31.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fafc1d779eb1..0793a91b66a5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -78,8 +78,9 @@ struct io_hash_table { struct io_mapped_region { struct page **pages; - void *vmap_ptr; - size_t nr_pages; + void *ptr; + unsigned nr_pages; + unsigned flags; }; /* -- cgit v1.2.3 From 8078486e1d53591ed946c943177339e59e3089e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:34 +0000 Subject: io_uring: use region api for SQ Convert internal parts of the SQ managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1fb73ced6b835cb319ab0fe1dc0b2e982a9a5650.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0793a91b66a5..808accf1776b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -432,10 +432,9 @@ struct io_ring_ctx { * the gup'ed pages for the two rings, and the sqes. */ unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; + struct io_mapped_region sq_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; -- cgit v1.2.3 From 81a4058e0cd0f07139f088fbeb65bc488f687829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:35 +0000 Subject: io_uring: use region api for CQ Convert internal parts of the CQ/SQ array managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46fc3c801290d6b1ac16023d78f6b8e685c87fd6.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 808accf1776b..b63f44220e8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -427,14 +427,8 @@ struct io_ring_ctx { */ struct mutex mmap_lock; - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. - */ - unsigned short n_ring_pages; - struct page **ring_pages; - struct io_mapped_region sq_region; + struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; -- cgit v1.2.3 From 78fda3d056417ccb9921663383b12f771aa0dd43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:36 +0000 Subject: io_uring/kbuf: use mmap_lock to sync with mmap A preparation / cleanup patch simplifying the buf ring - mmap synchronisation. Instead of relying on RCU, which is trickier, do it by grabbing the mmap_lock when when anyone tries to publish or remove a registered buffer to / from ->io_bl_xa. Modifications of the xarray should always be protected by both ->uring_lock and ->mmap_lock, while lookups should hold either of them. 
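As an illustration of that discipline (not the exact io_uring code), the publish side looks roughly like:

    mutex_lock(&ctx->uring_lock);
    mutex_lock(&ctx->mmap_lock);
    xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL);   /* publish */
    mutex_unlock(&ctx->mmap_lock);
    mutex_unlock(&ctx->uring_lock);

while the mmap path only needs ->mmap_lock around its xa_load() lookup.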
While a struct io_buffer_list is in the xarray, the mmap related fields like ->flags and ->buf_pages should stay stable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/af13bde56ee1a26bcaefaa9aad37a9ea318a590e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b63f44220e8b..73575d545d3c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,6 +294,11 @@ struct io_ring_ctx { struct io_submit_state submit_state; + /* + * Modifications are protected by ->uring_lock and ->mmap_lock. + * The flags, buf_pages and buf_nr_pages fields should be stable + * once published. + */ struct xarray io_bl_xa; struct io_hash_table cancel_table; -- cgit v1.2.3 From 5dbb3cbd060aa86a722d7d44278e537ae3f63081 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:31 +0530 Subject: block: define set of integrity flags to be inherited by cloned bip Introduce BIP_CLONE_FLAGS describing integrity flags that should be inherited in the cloned bip from the parent. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dbf0f74c1529..0f0cf10222e8 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -30,6 +30,9 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ -- cgit v1.2.3 From fe8f4ca7107e968b0eb7328155c8811f2a19424a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:33 +0530 Subject: block: modify bio_integrity_map_user to accept iov_iter as argument This patch refactors bio_integrity_map_user to accept iov_iter as argument. This is a prep patch. 
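A caller that has a plain user buffer is now expected to wrap it in an iov_iter first, roughly like this (sketch; ubuf/len are whatever was received from userspace):

    struct iov_iter iter;
    int ret;

    iov_iter_ubuf(&iter, ITER_SOURCE, ubuf, len);   /* ITER_DEST for reads */
    ret = bio_integrity_map_user(bio, &iter);
    if (ret)
        return ret;

A multi-segment buffer would be imported with import_iovec() instead.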
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0f0cf10222e8..58ff9988433a 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -75,7 +75,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -101,8 +101,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len) +static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } -- cgit v1.2.3 From 10783d0ba0d7731ec81d88c54f83cf0ff89d1c2a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:34 +0530 Subject: fs, iov_iter: define meta io descriptor Add flags to describe checks for integrity meta buffer. Also, introduce a new 'uio_meta' structure that upper layer can use to pass the meta/integrity information. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-5-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 9 +++++++++ include/uapi/linux/fs.h | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 853f9de5aa05..8ada84e85447 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -82,6 +82,15 @@ struct iov_iter { }; }; +typedef __u16 uio_meta_flags_t; + +struct uio_meta { + uio_meta_flags_t flags; + u16 app_tag; + u64 seed; + struct iov_iter iter; +}; + static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..9070ef19f0a3 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -40,6 +40,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< Date: Thu, 28 Nov 2024 16:52:35 +0530 Subject: fs: introduce IOCB_HAS_METADATA for metadata Introduce an IOCB_HAS_METADATA flag for the kiocb struct, for handling requests containing meta payload. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-6-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc3d45da7b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -348,6 +348,7 @@ struct readahead_control; #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. 
*/ #define IOCB_AIO_RW (1 << 23) +#define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ -- cgit v1.2.3 From 59a7d12a7fb5ab24efa893e6980a00ffc090c777 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:36 +0530 Subject: io_uring: introduce attributes for read/write and PI support Add the ability to pass additional attributes along with read/write. Application can prepare attibute specific information and pass its address using the SQE field: __u64 attr_ptr; Along with setting a mask indicating attributes being passed: __u64 attr_type_mask; Overall 64 attributes are allowed and currently one attribute 'IORING_RW_ATTR_FLAG_PI' is supported. With PI attribute, userspace can pass following information: - flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG} - len: length of PI/metadata buffer - addr: address of metadata buffer - seed: seed value for reftag remapping - app_tag: application defined 16b value Process this information to prepare uio_meta_descriptor and pass it down using kiocb->private. PI attribute is supported only for direct IO. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20241128112240.8867-7-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index aac9a4f8fa9a..38f0d6b10eaf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -98,6 +98,10 @@ struct io_uring_sqe { __u64 addr3; __u64 __pad2[1]; }; + struct { + __u64 attr_ptr; /* pointer to attribute information */ + __u64 attr_type_mask; /* bit mask of attributes */ + }; __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then @@ -107,6 +111,18 @@ struct io_uring_sqe { }; }; +/* sqe->attr_type_mask flags */ +#define IORING_RW_ATTR_FLAG_PI (1U << 0) +/* PI attribute information */ +struct io_uring_attr_pi { + __u16 flags; + __u16 app_tag; + __u32 len; + __u64 addr; + __u64 seed; + __u64 rsvd; +}; + /* * If sqe->file_index is set to this for opcodes that instantiate a new * direct descriptor (like openat/openat2/accept), then io_uring will allocate -- cgit v1.2.3 From 2c0487d8b1f1351d48a13b77b254a2bb6de49eb3 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:37 +0530 Subject: block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags This patch introduces BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags which indicate how the hardware should check the integrity payload. BIP_CHECK_GUARD/REFTAG are conversion of existing semantics, while BIP_CHECK_APPTAG is a new flag. The driver can now just rely on block layer flags, and doesn't need to know the integrity source. Submitter of PI decides which tags to check. This would also give us a unified interface for user and kernel generated integrity. 
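A submitter translating the user-visible IO_INTEGRITY_CHK_* flags would then do something along these lines (sketch only; 'flags' stands for the user-supplied PI flags):

    unsigned short bip_flags = 0;

    if (flags & IO_INTEGRITY_CHK_GUARD)
        bip_flags |= BIP_CHECK_GUARD;
    if (flags & IO_INTEGRITY_CHK_REFTAG)
        bip_flags |= BIP_CHECK_REFTAG;
    if (flags & IO_INTEGRITY_CHK_APPTAG)
        bip_flags |= BIP_CHECK_APPTAG;

and the hardware checks exactly the tags that were requested, regardless of whether the PI buffer came from user space or the kernel.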
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-8-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 58ff9988433a..fe2bfe122db2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -11,6 +11,9 @@ enum bip_flags { BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 6, /* guard check */ + BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ }; struct bio_integrity_payload { @@ -31,7 +34,8 @@ struct bio_integrity_payload { }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ + BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY -- cgit v1.2.3 From 18623503a3a514780214850bf8ba8b03ea0f3a4b Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:39 +0530 Subject: scsi: add support for user-meta interface Add support for sending user-meta buffer. Set tags to be checked using flags specified by user/block-layer. With this change, BIP_CTRL_NOCHECK becomes unused. Remove it. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index fe2bfe122db2..2195bc06dcde 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -7,13 +7,12 @@ enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ - BIP_CHECK_GUARD = 1 << 6, /* guard check */ - BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ - BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ + BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ + BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 5, /* guard check */ + BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ }; struct bio_integrity_payload { @@ -33,8 +32,7 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; -#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY -- cgit v1.2.3 From 3d8b5a22d40435b4a7e58f06ae2cd3506b222898 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:40 +0530 Subject: block: add support to pass user meta buffer If an iocb contains metadata, extract that and prepare the bip. 
Based on flags specified by the user, set corresponding guard/app/ref tags to be checked in bip. Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 2195bc06dcde..de0a6c9de4d1 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -23,6 +23,7 @@ struct bio_integrity_payload { unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + u16 app_tag; /* application tag value */ struct bvec_iter bio_iter; /* for rewinding parent bio */ @@ -78,6 +79,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -108,6 +110,11 @@ static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) return -EINVAL; } +static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + return -EINVAL; +} + static inline void bio_integrity_unmap_user(struct bio *bio) { } -- cgit v1.2.3 From 6f491a8d4b92d1a840fd9209cba783c84437d0b7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:28 +0800 Subject: block: track disk DEAD state automatically for modeling queue freeze lockdep Now we only verify the outmost freeze & unfreeze in current context in case that !q->mq_freeze_depth, so it is reliable to save disk DEAD state when we want to lock the freeze queue since the state is one per-task variable now. Doing this way can kill lots of false positive when freeze queue is called before adding disk[1]. [1] https://lore.kernel.org/linux-block/6741f6b2.050a0220.1cc393.0017.GAE@google.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 378d3a1a22fc..522cf8eef66c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -581,6 +581,8 @@ struct request_queue { #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; + /* Records disk state in current context, used in unfreeze queue */ + bool mq_freeze_disk_dead; #endif wait_queue_head_t mq_freeze_wq; /* -- cgit v1.2.3 From f6661b1d0525f3764596a1b65eeed9e75aecafa7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:30 +0800 Subject: block: track queue dying state automatically for modeling queue freeze lockdep Now we only verify the outmost freeze & unfreeze in current context in case that !q->mq_freeze_depth, so it is reliable to save queue lying state when we want to lock the freeze queue since the state is one per-task variable now. 
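The kind of sequence being modeled is, roughly (illustrative only):

    blk_mq_freeze_queue(q);
    /* the queue may become dying here, e.g. via del_gendisk() */
    blk_mq_unfreeze_queue(q);

Snapshotting the dying state at freeze time lets the matching unfreeze make the same lockdep decision that the freeze did, instead of re-evaluating a state that may have changed in between.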
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 522cf8eef66c..5d40af2ef971 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -581,8 +581,12 @@ struct request_queue { #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; - /* Records disk state in current context, used in unfreeze queue */ + /* + * Records disk & queue state in current context, used in unfreeze + * queue + */ bool mq_freeze_disk_dead; + bool mq_freeze_queue_dying; #endif wait_queue_head_t mq_freeze_wq; /* -- cgit v1.2.3 From 5c292ac6e69f390179b93dc104b40903cddce636 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:56 +0000 Subject: block: Delete bio_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_prio() does nothing but return the value in bio->bi_ioprio. Most other places just read bio->bi_ioprio directly, so replace bi_ioprio() callsites with reading bio->bi_ioprio directly and delete that macro. Signed-off-by: John Garry Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7a1b3b1a8fed..99676916f3db 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,7 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) #define bio_iter_iovec(bio, iter) \ -- cgit v1.2.3 From 19206d3f5ef7f051056d2fb49203a347e4844e6e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:57 +0000 Subject: block: Delete bio_set_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_set_prio() does nothing but set bio->bi_ioprio. All other places just set bio->bi_ioprio directly, so replace bio_set_prio() remaining callsites with setting bio->bi_ioprio directly and delete that macro. Signed-off-by: John Garry Acked-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 99676916f3db..1eec59699100 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,8 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) - #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) -- cgit v1.2.3 From fea4952df0eeec4e1a295ebaac9f61c0065fae87 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:09 +0100 Subject: driver core: bus: add irq_get_affinity callback to bus_type Introducing a callback in struct bus_type so that a subsystem can hook up the getters directly. This approach avoids exposing random getters in any subsystems APIs. 
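As a sketch, the PCI side of such a callback could be as small as (function name illustrative):

    static const struct cpumask *
    pci_dev_irq_get_affinity(struct device *dev, unsigned int irq_vec)
    {
        return pci_irq_get_affinity(to_pci_dev(dev), irq_vec);
    }

with pci_bus_type.irq_get_affinity pointed at it, so generic code can query a vector's affinity through the struct device alone.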
Acked-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Acked-by: Greg Kroah-Hartman Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-1-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/device/bus.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index cdc4757217f9..b18658bce2c3 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -48,6 +48,7 @@ struct fwnode_handle; * will never get called until they do. * @remove: Called when a device removed from this bus. * @shutdown: Called at shut-down time to quiesce the device. + * @irq_get_affinity: Get IRQ affinity mask for the device on this bus. * * @online: Called to put the device back online (after offlining it). * @offline: Called to put the device offline for hot-removal. May fail. @@ -87,6 +88,8 @@ struct bus_type { void (*sync_state)(struct device *dev); void (*remove)(struct device *dev); void (*shutdown)(struct device *dev); + const struct cpumask *(*irq_get_affinity)(struct device *dev, + unsigned int irq_vec); int (*online)(struct device *dev); int (*offline)(struct device *dev); -- cgit v1.2.3 From 1452e9b470c903fc4137a448e9f5767e92d68229 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:12 +0100 Subject: blk-mq: introduce blk_mq_map_hw_queues blk_mq_pci_map_queues and blk_mq_virtio_map_queues will create a CPU to hardware queue mapping based on affinity information. These two function share common code and only differ on how the affinity information is retrieved. Also, those functions are located in the block subsystem where it doesn't really fit in. They are virtio and pci subsystem specific. Thus introduce provide a generic mapping function which uses the irq_get_affinity callback from bus_type. Originally idea from Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-4-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c596e0e4cb75..769eab6247d4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -921,6 +921,8 @@ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); void blk_freeze_queue_start_non_owner(struct request_queue *q); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); -- cgit v1.2.3 From 9bc1e897a821f19ba3775bb013a8a6fb121c3ca1 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:16 +0100 Subject: blk-mq: remove unused queue mapping helpers There are no users left of the pci and virtio queue mapping helpers. Thus remove them. 
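The driver-side conversion is mechanical; for a PCI driver's ->map_queues() it is roughly:

    -	blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT], pdev, 0);
    +	blk_mq_map_hw_queues(&set->map[HCTX_TYPE_DEFAULT], &pdev->dev, 0);

(virtio drivers pass &vdev->dev instead), which is why nothing is left referencing the old helpers.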
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-8-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk-mq-pci.h | 11 ----------- include/linux/blk-mq-virtio.h | 11 ----------- 2 files changed, 22 deletions(-) delete mode 100644 include/linux/blk-mq-pci.h delete mode 100644 include/linux/blk-mq-virtio.h (limited to 'include') diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h deleted file mode 100644 index ca544e1d3508..000000000000 --- a/include/linux/blk-mq-pci.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BLK_MQ_PCI_H -#define _LINUX_BLK_MQ_PCI_H - -struct blk_mq_queue_map; -struct pci_dev; - -void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); - -#endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h deleted file mode 100644 index 13226e9b22dd..000000000000 --- a/include/linux/blk-mq-virtio.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BLK_MQ_VIRTIO_H -#define _LINUX_BLK_MQ_VIRTIO_H - -struct blk_mq_queue_map; -struct virtio_device; - -void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, - struct virtio_device *vdev, int first_vec); - -#endif /* _LINUX_BLK_MQ_VIRTIO_H */ -- cgit v1.2.3 From cc76ace465d6977b47daa427379b7be1e0976f12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Dec 2024 07:01:59 +0100 Subject: block: remove BLK_MQ_F_SHOULD_MERGE BLK_MQ_F_SHOULD_MERGE is set for all tag_sets except those that purely process passthrough commands (bsg-lib, ufs tmf, various nvme admin queues) and thus don't even check the flag. Remove it to simplify the driver interface. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241219060214.1928848-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 769eab6247d4..7f6c482ebf54 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -668,7 +668,6 @@ struct blk_mq_ops { /* Keep hctx_flag_name[] in sync with the definitions below */ enum { - BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for -- cgit v1.2.3 From 546d191427cf5cf3215529744c2ea8558f0279db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Nov 2024 15:53:58 -0700 Subject: block: make bio_integrity_map_user() static inline If CONFIG_BLK_DEV_INTEGRITY isn't set, then the dummy helper must be static inline to avoid complaints about the function being unused. 
Fixes: fe8f4ca7107e ("block: modify bio_integrity_map_user to accept iov_iter as argument") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411300229.y7h60mDg-lkp@intel.com/ Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index de0a6c9de4d1..802f52e38efd 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -105,7 +105,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) +static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } -- cgit v1.2.3 From 46e0ccfb88f02ab2eb20a41d519d6e4c028652f2 Mon Sep 17 00:00:00 2001 From: Radu Rendec Date: Thu, 19 Dec 2024 11:36:05 -0500 Subject: net: vxlan: rename SKB_DROP_REASON_VXLAN_NO_REMOTE The SKB_DROP_REASON_VXLAN_NO_REMOTE skb drop reason was introduced in the specific context of vxlan. As it turns out, there are similar cases when a packet needs to be dropped in other parts of the network stack, such as the bridge module. Rename SKB_DROP_REASON_VXLAN_NO_REMOTE and give it a more generic name, so that it can be used in other parts of the network stack. This is not a functional change, and the numeric value of the drop reason even remains unchanged. Signed-off-by: Radu Rendec Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241219163606.717758-2-rrendec@redhat.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index ead4170a1d0a..6e32106d7229 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -106,7 +106,7 @@ FN(VXLAN_VNI_NOT_FOUND) \ FN(MAC_INVALID_SOURCE) \ FN(VXLAN_ENTRY_EXISTS) \ - FN(VXLAN_NO_REMOTE) \ + FN(NO_TX_TARGET) \ FN(IP_TUNNEL_ECN) \ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ @@ -497,8 +497,8 @@ enum skb_drop_reason { * entry or an entry pointing to a nexthop. */ SKB_DROP_REASON_VXLAN_ENTRY_EXISTS, - /** @SKB_DROP_REASON_VXLAN_NO_REMOTE: no remote found for xmit */ - SKB_DROP_REASON_VXLAN_NO_REMOTE, + /** @SKB_DROP_REASON_NO_TX_TARGET: no target found for xmit */ + SKB_DROP_REASON_NO_TX_TARGET, /** * @SKB_DROP_REASON_IP_TUNNEL_ECN: skb is dropped according to * RFC 6040 4.2, see __INET_ECN_decapsulate() for detail. -- cgit v1.2.3 From 623e43c2f5023853cbf71d6a60898d448a06416a Mon Sep 17 00:00:00 2001 From: Radu Rendec Date: Thu, 19 Dec 2024 11:36:06 -0500 Subject: net: bridge: add skb drop reasons to the most common drop points The bridge input code may drop frames for various reasons and at various points in the ingress handling logic. Currently kfree_skb() is used everywhere, and therefore no drop reason is specified. Add drop reasons to the most common drop points. Drop reasons are not added exhaustively to the entire bridge code. The intention is to incrementally add drop reasons to the rest of the bridge code in follow up patches. 
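The pattern at each converted drop point is simply to replace kfree_skb() with a reasoned drop, e.g. (sketch, the exact call sites differ):

    if (p->state == BR_STATE_DISABLED) {
        kfree_skb_reason(skb, SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE);
        goto out;
    }

so that tooling attached to the kfree_skb tracepoint can see why the bridge dropped the frame.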
Signed-off-by: Radu Rendec Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241219163606.717758-3-rrendec@redhat.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 6e32106d7229..3a6602f37978 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -111,6 +111,8 @@ FN(TUNNEL_TXINFO) \ FN(LOCAL_MAC) \ FN(ARP_PVLAN_DISABLE) \ + FN(MAC_IEEE_MAC_CONTROL) \ + FN(BRIDGE_INGRESS_STP_STATE) \ FNe(MAX) /** @@ -520,6 +522,16 @@ enum skb_drop_reason { * enabled. */ SKB_DROP_REASON_ARP_PVLAN_DISABLE, + /** + * @SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL: the destination MAC address + * is an IEEE MAC Control address. + */ + SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL, + /** + * @SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE: the STP state of the + * ingress bridge port does not allow frames to be forwarded. + */ + SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit v1.2.3 From 94ddd8bf98d76f03297a2b33a951711b31f7bc38 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 23 Dec 2024 15:18:37 +0000 Subject: misc: trivial: Remove undesired double space from struct definition When one is too lazy to use an LSP to conduct look-ups on struct definitions, one might use the ever useful `struct {` search string. However this doesn't work with `struct miscdevice {` because of a stray double space. Assuming that this wasn't intentional, let's simply remove it. Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20241223151843.472645-1-lee@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/miscdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index c0fea6ca5076..69e110c2b86a 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -76,7 +76,7 @@ struct device; struct attribute_group; -struct miscdevice { +struct miscdevice { int minor; const char *name; const struct file_operations *fops; -- cgit v1.2.3 From 95f68e06b41b9e88291796efa3969409d13fdd4c Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 19 Dec 2024 19:58:33 +0200 Subject: net/mlx5: fs, add counter object to flow destination Currently mlx5_flow_destination includes counter_id which is assigned in case we use flow counter on the flow steering rule. However, counter_id is not enough data in case of using HW Steering. Thus, have mlx5_fc object as part of mlx5_flow_destination instead of counter_id and assign it where needed. In case counter_id is received from user space, create a local counter object to represent it. 
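On the rule-creation side, usage changes roughly like this (sketch):

    struct mlx5_flow_destination dest = {};
    struct mlx5_fc *counter;

    counter = mlx5_fc_create(dev, true);
    if (IS_ERR(counter))
        return PTR_ERR(counter);

    dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
    dest.counter = counter;   /* previously: dest.counter_id = mlx5_fc_id(counter) */

so that HW Steering can reach the whole mlx5_fc object rather than just its id.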
Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241219175841.1094544-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 438db888bde0..2a69d9d71276 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -163,7 +163,7 @@ struct mlx5_flow_destination { u32 tir_num; u32 ft_num; struct mlx5_flow_table *ft; - u32 counter_id; + struct mlx5_fc *counter; struct { u16 num; u16 vhca_id; @@ -299,6 +299,8 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler, struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); +struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); +void mlx5_fc_local_destroy(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); -- cgit v1.2.3 From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. The same process applies to all the caes when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f5991168ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. 
*/ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { -- cgit v1.2.3 From ef94ea4fc18ff3fa5034d0da4c1a52dba0b23f8c Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 17 Dec 2024 23:41:53 +0200 Subject: clk: Drop obsolete devm_clk_bulk_get_all_enable() helper Commit 265b07df758a ("clk: Provide managed helper to get and enable bulk clocks") added devm_clk_bulk_get_all_enable() function, but missed to return the number of clocks stored in the clk_bulk_data table referenced by the clks argument. Without knowing the number, it's not possible to iterate these clocks when needed, hence the argument is useless and could have been simply removed. A new helper devm_clk_bulk_get_all_enabled() has been introduced, which is consistent with devm_clk_bulk_get_all() in terms of the returned value. Drop the obsolete function since all users switched to the new helper. Reviewed-by: AngeloGioacchino Del Regno Reviewed-by: Manivannan Sadhasivam Signed-off-by: Cristian Ciocaltea Link: https://lore.kernel.org/r/20241217-clk_bulk_ena_fix-v5-3-aafbbb245155@collabora.com Signed-off-by: Stephen Boyd --- include/linux/clk.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/clk.h b/include/linux/clk.h index 1dcee6d701e4..b607482ca77e 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -1138,15 +1138,6 @@ static inline void clk_restore_context(void) {} #endif -/* Deprecated. Use devm_clk_bulk_get_all_enabled() */ -static inline int __must_check -devm_clk_bulk_get_all_enable(struct device *dev, struct clk_bulk_data **clks) -{ - int ret = devm_clk_bulk_get_all_enabled(dev, clks); - - return ret > 0 ? 0 : ret; -} - /* clk_prepare_enable helps cases using clk_enable in non-atomic context. */ static inline int clk_prepare_enable(struct clk *clk) { -- cgit v1.2.3 From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) 
SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..58ad4ead33fc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; -- cgit v1.2.3 From 8cda395b79d90709fde3a9963c667d849cc5718f Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Tue, 10 Dec 2024 19:07:09 -0800 Subject: usb: typec: tcpm: Add new AMS for Get_Revision response This commit adds a new AMS for responding to a "Get_Revision" request. Revision message consists of the following fields: +----------------------------------------------------+ | Header | RMDO | | No. of data objects = 1 | | +----------------------------------------------------+ While RMDO consists of: * B31..28 Revision Major * B27..24 Revision Minor * B23..20 Version Major * B19..16 Version Minor * B15..0 Reserved, shall be set to zero. As per the PD spec ("8.3.3.16.2.1 PR_Give_Revision State"), a request is only expected when an explicit contract is established and the port is in ready state. This AMS is only supported for PD >= 3.0. 
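With the RMDO() helper added by this change, building the single data object for the response is a one-liner, e.g. for a stack reporting PD Revision 3.1, Version 1.8 (numbers purely illustrative):

    u32 rmdo = RMDO(3, 1, 1, 8);   /* Revision 3.1, Version 1.8 */

which is then carried in a PD_DATA_REVISION message with a single data object.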
Signed-off-by: Amit Sunil Dhamne Reviewed-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20241210-get_rev_upstream-v2-3-d0094e52d48f@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index d50098fb16b5..3068c3084eb6 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -33,7 +33,9 @@ enum pd_ctrl_msg_type { PD_CTRL_FR_SWAP = 19, PD_CTRL_GET_PPS_STATUS = 20, PD_CTRL_GET_COUNTRY_CODES = 21, - /* 22-31 Reserved */ + /* 22-23 Reserved */ + PD_CTRL_GET_REVISION = 24, + /* 25-31 Reserved */ }; enum pd_data_msg_type { @@ -46,7 +48,9 @@ enum pd_data_msg_type { PD_DATA_ALERT = 6, PD_DATA_GET_COUNTRY_INFO = 7, PD_DATA_ENTER_USB = 8, - /* 9-14 Reserved */ + /* 9-11 Reserved */ + PD_DATA_REVISION = 12, + /* 13-14 Reserved */ PD_DATA_VENDOR_DEF = 15, /* 16-31 Reserved */ }; @@ -453,6 +457,20 @@ static inline unsigned int rdo_max_power(u32 rdo) #define EUDO_TBT_SUPPORT BIT(14) #define EUDO_HOST_PRESENT BIT(13) +/* + * Request Message Data Object (PD Revision 3.1+ only) + * -------- + * <31:28> :: Revision Major + * <27:24> :: Revision Minor + * <23:20> :: Version Major + * <19:16> :: Version Minor + * <15:0> :: Reserved, Shall be set to zero + */ + +#define RMDO(rev_maj, rev_min, ver_maj, ver_min) \ + (((rev_maj) & 0xf) << 28 | ((rev_min) & 0xf) << 24 | \ + ((ver_maj) & 0xf) << 20 | ((ver_min) & 0xf) << 16) + /* USB PD timers and counters */ #define PD_T_NO_RESPONSE 5000 /* 4.5 - 5.5 seconds */ #define PD_T_DB_DETECT 10000 /* 10 - 15 seconds */ -- cgit v1.2.3 From 100e257386595b3f1865ca8a991e2ba74f9701ff Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Fri, 13 Dec 2024 15:35:43 -0800 Subject: usb: typec: Add driver for Thunderbolt 3 Alternate Mode Thunderbolt 3 Alternate Mode entry flow is described in USB Type-C Specification Release 2.0. Signed-off-by: Heikki Krogerus Co-developed-by: Abhishek Pandit-Subedi Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20241213153543.v5.2.I3080b036e8de0b9957c57c1c3059db7149c5e549@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_tbt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index fa97d7e00f5c..55dcea12082c 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -44,6 +44,7 @@ struct typec_thunderbolt_data { #define TBT_GEN3_NON_ROUNDED 0 #define TBT_GEN3_GEN4_ROUNDED_NON_ROUNDED 1 +#define TBT_CABLE_ROUNDED BIT(19) #define TBT_CABLE_OPTICAL BIT(21) #define TBT_CABLE_RETIMER BIT(22) #define TBT_CABLE_LINK_TRAINING BIT(23) -- cgit v1.2.3 From 183b194d8fb62694e81c18e1faec9ad418f952e3 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Fri, 13 Dec 2024 15:35:44 -0800 Subject: usb: typec: Make active on port altmode writable The active property of port altmode should be writable (to prevent or allow partner altmodes from entering) and needs to be part of typec_altmode_desc so we can initialize the port to an inactive state if desired. 
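A port driver that wants a mode registered but not enterable yet can now do roughly (SVID/mode/vdo values are illustrative):

    struct typec_altmode *alt;
    struct typec_altmode_desc desc = {
        .svid     = USB_TYPEC_DP_SID,
        .mode     = 1,
        .vdo      = vdo,
        .roles    = TYPEC_PORT_DRD,
        .inactive = true,   /* start disabled; flip 'active' later */
    };

    alt = typec_port_register_altmode(port, &desc);

and later permit partner entry by writing the altmode's now-writable 'active' attribute.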
Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20241213153543.v5.3.I794566684ab2965e209f326b08232006eff333f8@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index d616b8807000..252af3f77039 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -140,6 +140,7 @@ int typec_cable_set_identity(struct typec_cable *cable); * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @roles: Only for ports. DRP if the mode is available in both roles + * @inactive: Only for ports. Make this port inactive (default is active). * * Description of an Alternate Mode which a connector, cable plug or partner * supports. @@ -150,6 +151,7 @@ struct typec_altmode_desc { u32 vdo; /* Only used with ports */ enum typec_port_data roles; + bool inactive; }; void typec_partner_set_pd_revision(struct typec_partner *partner, u16 pd_revision); -- cgit v1.2.3 From 30dd62fa3954cb7697dbae9c33b2a5c50d8b5c6a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:18 +0000 Subject: RDMA/core: Remove unused ib_ud_header_unpack ib_ud_header_unpack() is unused, and I can't see any sign of it ever having been used in git. The only reference I can find is from December 2004 BKrev: 41d30034XNbBUl0XnyC6ig9V61Nf-A when it looks like it was added. Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-2-linux@treblig.org Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/rdma/ib_pack.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index b8c56d7dc35d..8266fab826a7 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -283,7 +283,4 @@ int ib_ud_header_init(int payload_bytes, int ib_ud_header_pack(struct ib_ud_header *header, void *buf); -int ib_ud_header_unpack(void *buf, - struct ib_ud_header *header); - #endif /* IB_PACK_H */ -- cgit v1.2.3 From ddc8fab40b9ae309052d37179a705430fc15db97 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:19 +0000 Subject: RDMA/core: Remove unused ib_find_exact_cached_pkey The last use of ib_find_exact_cached_pkey() was removed in 2012 by commit 2c75d2ccb6e5 ("IB/mlx4: Fix QP1 P_Key processing in the Primary Physical Function (PPF)") Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-3-linux@treblig.org Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/rdma/ib_cache.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 226ae3702d8a..2bf09b594d10 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -63,22 +63,6 @@ int ib_find_cached_pkey(struct ib_device *device, u16 pkey, u16 *index); -/** - * ib_find_exact_cached_pkey - Returns the PKey table index where a specified - * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) - * @device: The device to query. - * @port_num: The port number of the device to search for the PKey. - * @pkey: The PKey value to search for. - * @index: The index into the cached PKey table where the PKey was found. 
- * - * ib_find_exact_cached_pkey() searches the specified PKey table in - * the local software cache. - */ -int ib_find_exact_cached_pkey(struct ib_device *device, - u32 port_num, - u16 pkey, - u16 *index); - /** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. -- cgit v1.2.3 From 750efbb9c307f7d9ff43d38f58d3fca087dc041f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:20 +0000 Subject: RDMA/core: Remove unused ibdev_printk The last use of ibdev_printk() was removed in 2019 by commit b2299e83815c ("RDMA: Delete DEBUG code") Remove it. Note: The __ibdev_printk() is still used in the idev_err etc functions so leave that. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-4-linux@treblig.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6ddd5e3bb884..77472e19ec0c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -59,9 +59,6 @@ extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; -__printf(3, 4) __cold -void ibdev_printk(const char *level, const struct ib_device *ibdev, - const char *format, ...); __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold -- cgit v1.2.3 From 2028c2958775c4861756ace010b433cc1c81f516 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:21 +0000 Subject: RDMA/core: Remove unused ib_copy_path_rec_from_user ib_copy_path_rec_from_user() has been unused since 2019's commit a1a8e4a85cf7 ("rdma: Delete the ib_ucm module") Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-5-linux@treblig.org Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/rdma/ib_marshall.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 1838869aad28..b179e464e3d1 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -22,7 +22,4 @@ void ib_copy_ah_attr_to_user(struct ib_device *device, void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, struct sa_path_rec *src); -void ib_copy_path_rec_from_user(struct sa_path_rec *dst, - struct ib_user_path_rec *src); - #endif /* IB_USER_MARSHALL_H */ -- cgit v1.2.3 From 0c039a57b68dfb1dd49dfc16240791086d8e57ad Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:52:57 +0800 Subject: RDMA/core: Add ib_query_netdev_port() to query netdev port by IB device. Query the port number of a netdev associated with an ibdev. 
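A caller uses it along these lines (sketch):

    u32 port_num;
    int ret;

    ret = ib_query_netdev_port(ibdev, ndev, &port_num);
    if (ret)
        return ret;   /* ndev is not bound to any port of ibdev */

mapping a netdev back to the IB port it is associated with.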
Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 77472e19ec0c..c539a1706f66 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4468,6 +4468,8 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); +int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, + u32 *port); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); -- cgit v1.2.3 From 1fb0644c3899b2f857b11037b19ed362b67bfe91 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:52:58 +0800 Subject: RDMA/core: Support link status events dispatching Currently the dispatching of link status events is implemented by each RDMA driver independently, and most of them have very similar patterns. Add support for this in ib_core so that we can get rid of duplicate codes in each driver. A new last_port_state is added in ib_port_cache to cache the port state of the last link status events dispatching. The original port_state in ib_port_cache is not used here because it will be updated when ib_dispatch_event() is called, which means it may be changed between two link status events, and may lead to a loss of event dispatching. Some drivers currently have some private stuff in their link status events handler in addition to event dispatching, and cannot be perfectly integrated into the ib_core handling process. For these drivers, add a new ops report_port_event() so that they can keep their current processing. Finally, events of LAG devices are not supported yet in this patch as currently there is no way to obtain ibdev from upper netdev in ib_core. This can be a TODO work after the core have more support for LAG. Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c539a1706f66..0ad104dae253 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2174,6 +2174,7 @@ struct ib_port_cache { struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; + enum ib_port_state last_port_state; }; struct ib_port_immutable { @@ -2680,6 +2681,13 @@ struct ib_device_ops { */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); + /** + * report_port_event - Drivers need to implement this if they have + * some private stuff to handle when link status changes. + */ + void (*report_port_event)(struct ib_device *ibdev, + struct net_device *ndev, unsigned long event); + DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); @@ -4470,6 +4478,15 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port); + +static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev) +{ + return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? 
+ IB_PORT_ACTIVE : IB_PORT_DOWN; +} + +void ib_dispatch_port_state_event(struct ib_device *ibdev, + struct net_device *ndev); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); -- cgit v1.2.3 From fdf3ee5c6e5278dab4f60b998b47ed2d510bf80f Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Wed, 20 Nov 2024 14:45:02 +0530 Subject: mtd: nand: Add qpic_common API file Add qpic_common.c file which hold all the common qpic APIs which will be used by both qpic raw nand driver and qpic spi nand driver. Signed-off-by: Md Sadre Alam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 468 +++++++++++++++++++++++++++++++++++ 1 file changed, 468 insertions(+) create mode 100644 include/linux/mtd/nand-qpic-common.h (limited to 'include') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h new file mode 100644 index 000000000000..425994429387 --- /dev/null +++ b/include/linux/mtd/nand-qpic-common.h @@ -0,0 +1,468 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * QCOM QPIC common APIs header file + * + * Copyright (c) 2023 Qualcomm Inc. + * Authors: Md sadre Alam + * + */ +#ifndef __MTD_NAND_QPIC_COMMON_H__ +#define __MTD_NAND_QPIC_COMMON_H__ + +/* NANDc reg offsets */ +#define NAND_FLASH_CMD 0x00 +#define NAND_ADDR0 0x04 +#define NAND_ADDR1 0x08 +#define NAND_FLASH_CHIP_SELECT 0x0c +#define NAND_EXEC_CMD 0x10 +#define NAND_FLASH_STATUS 0x14 +#define NAND_BUFFER_STATUS 0x18 +#define NAND_DEV0_CFG0 0x20 +#define NAND_DEV0_CFG1 0x24 +#define NAND_DEV0_ECC_CFG 0x28 +#define NAND_AUTO_STATUS_EN 0x2c +#define NAND_DEV1_CFG0 0x30 +#define NAND_DEV1_CFG1 0x34 +#define NAND_READ_ID 0x40 +#define NAND_READ_STATUS 0x44 +#define NAND_DEV_CMD0 0xa0 +#define NAND_DEV_CMD1 0xa4 +#define NAND_DEV_CMD2 0xa8 +#define NAND_DEV_CMD_VLD 0xac +#define SFLASHC_BURST_CFG 0xe0 +#define NAND_ERASED_CW_DETECT_CFG 0xe8 +#define NAND_ERASED_CW_DETECT_STATUS 0xec +#define NAND_EBI2_ECC_BUF_CFG 0xf0 +#define FLASH_BUF_ACC 0x100 + +#define NAND_CTRL 0xf00 +#define NAND_VERSION 0xf08 +#define NAND_READ_LOCATION_0 0xf20 +#define NAND_READ_LOCATION_1 0xf24 +#define NAND_READ_LOCATION_2 0xf28 +#define NAND_READ_LOCATION_3 0xf2c +#define NAND_READ_LOCATION_LAST_CW_0 0xf40 +#define NAND_READ_LOCATION_LAST_CW_1 0xf44 +#define NAND_READ_LOCATION_LAST_CW_2 0xf48 +#define NAND_READ_LOCATION_LAST_CW_3 0xf4c + +/* dummy register offsets, used by qcom_write_reg_dma */ +#define NAND_DEV_CMD1_RESTORE 0xdead +#define NAND_DEV_CMD_VLD_RESTORE 0xbeef + +/* NAND_FLASH_CMD bits */ +#define PAGE_ACC BIT(4) +#define LAST_PAGE BIT(5) + +/* NAND_FLASH_CHIP_SELECT bits */ +#define NAND_DEV_SEL 0 +#define DM_EN BIT(2) + +/* NAND_FLASH_STATUS bits */ +#define FS_OP_ERR BIT(4) +#define FS_READY_BSY_N BIT(5) +#define FS_MPU_ERR BIT(8) +#define FS_DEVICE_STS_ERR BIT(16) +#define FS_DEVICE_WP BIT(23) + +/* NAND_BUFFER_STATUS bits */ +#define BS_UNCORRECTABLE_BIT BIT(8) +#define BS_CORRECTABLE_ERR_MSK 0x1f + +/* NAND_DEVn_CFG0 bits */ +#define DISABLE_STATUS_AFTER_WRITE 4 +#define CW_PER_PAGE 6 +#define UD_SIZE_BYTES 9 +#define UD_SIZE_BYTES_MASK GENMASK(18, 9) +#define ECC_PARITY_SIZE_BYTES_RS 19 +#define SPARE_SIZE_BYTES 23 +#define SPARE_SIZE_BYTES_MASK GENMASK(26, 23) +#define NUM_ADDR_CYCLES 27 +#define STATUS_BFR_READ 30 +#define SET_RD_MODE_AFTER_STATUS 31 + +/* NAND_DEVn_CFG0 bits */ +#define DEV0_CFG1_ECC_DISABLE 0 +#define WIDE_FLASH 1 +#define NAND_RECOVERY_CYCLES 2 +#define CS_ACTIVE_BSY 5 +#define 
BAD_BLOCK_BYTE_NUM 6 +#define BAD_BLOCK_IN_SPARE_AREA 16 +#define WR_RD_BSY_GAP 17 +#define ENABLE_BCH_ECC 27 + +/* NAND_DEV0_ECC_CFG bits */ +#define ECC_CFG_ECC_DISABLE 0 +#define ECC_SW_RESET 1 +#define ECC_MODE 4 +#define ECC_PARITY_SIZE_BYTES_BCH 8 +#define ECC_NUM_DATA_BYTES 16 +#define ECC_NUM_DATA_BYTES_MASK GENMASK(25, 16) +#define ECC_FORCE_CLK_OPEN 30 + +/* NAND_DEV_CMD1 bits */ +#define READ_ADDR 0 + +/* NAND_DEV_CMD_VLD bits */ +#define READ_START_VLD BIT(0) +#define READ_STOP_VLD BIT(1) +#define WRITE_START_VLD BIT(2) +#define ERASE_START_VLD BIT(3) +#define SEQ_READ_START_VLD BIT(4) + +/* NAND_EBI2_ECC_BUF_CFG bits */ +#define NUM_STEPS 0 + +/* NAND_ERASED_CW_DETECT_CFG bits */ +#define ERASED_CW_ECC_MASK 1 +#define AUTO_DETECT_RES 0 +#define MASK_ECC BIT(ERASED_CW_ECC_MASK) +#define RESET_ERASED_DET BIT(AUTO_DETECT_RES) +#define ACTIVE_ERASED_DET (0 << AUTO_DETECT_RES) +#define CLR_ERASED_PAGE_DET (RESET_ERASED_DET | MASK_ECC) +#define SET_ERASED_PAGE_DET (ACTIVE_ERASED_DET | MASK_ECC) + +/* NAND_ERASED_CW_DETECT_STATUS bits */ +#define PAGE_ALL_ERASED BIT(7) +#define CODEWORD_ALL_ERASED BIT(6) +#define PAGE_ERASED BIT(5) +#define CODEWORD_ERASED BIT(4) +#define ERASED_PAGE (PAGE_ALL_ERASED | PAGE_ERASED) +#define ERASED_CW (CODEWORD_ALL_ERASED | CODEWORD_ERASED) + +/* NAND_READ_LOCATION_n bits */ +#define READ_LOCATION_OFFSET 0 +#define READ_LOCATION_SIZE 16 +#define READ_LOCATION_LAST 31 + +/* Version Mask */ +#define NAND_VERSION_MAJOR_MASK 0xf0000000 +#define NAND_VERSION_MAJOR_SHIFT 28 +#define NAND_VERSION_MINOR_MASK 0x0fff0000 +#define NAND_VERSION_MINOR_SHIFT 16 + +/* NAND OP_CMDs */ +#define OP_PAGE_READ 0x2 +#define OP_PAGE_READ_WITH_ECC 0x3 +#define OP_PAGE_READ_WITH_ECC_SPARE 0x4 +#define OP_PAGE_READ_ONFI_READ 0x5 +#define OP_PROGRAM_PAGE 0x6 +#define OP_PAGE_PROGRAM_WITH_ECC 0x7 +#define OP_PROGRAM_PAGE_SPARE 0x9 +#define OP_BLOCK_ERASE 0xa +#define OP_CHECK_STATUS 0xc +#define OP_FETCH_ID 0xb +#define OP_RESET_DEVICE 0xd + +/* Default Value for NAND_DEV_CMD_VLD */ +#define NAND_DEV_CMD_VLD_VAL (READ_START_VLD | WRITE_START_VLD | \ + ERASE_START_VLD | SEQ_READ_START_VLD) + +/* NAND_CTRL bits */ +#define BAM_MODE_EN BIT(0) + +/* + * the NAND controller performs reads/writes with ECC in 516 byte chunks. + * the driver calls the chunks 'step' or 'codeword' interchangeably + */ +#define NANDC_STEP_SIZE 512 + +/* + * the largest page size we support is 8K, this will have 16 steps/codewords + * of 512 bytes each + */ +#define MAX_NUM_STEPS (SZ_8K / NANDC_STEP_SIZE) + +/* we read at most 3 registers per codeword scan */ +#define MAX_REG_RD (3 * MAX_NUM_STEPS) + +/* ECC modes supported by the controller */ +#define ECC_NONE BIT(0) +#define ECC_RS_4BIT BIT(1) +#define ECC_BCH_4BIT BIT(2) +#define ECC_BCH_8BIT BIT(3) + +/* + * Returns the actual register address for all NAND_DEV_ registers + * (i.e. 
NAND_DEV_CMD0, NAND_DEV_CMD1, NAND_DEV_CMD2 and NAND_DEV_CMD_VLD) + */ +#define dev_cmd_reg_addr(nandc, reg) ((nandc)->props->dev_cmd_reg_start + (reg)) + +/* Returns the NAND register physical address */ +#define nandc_reg_phys(chip, offset) ((chip)->base_phys + (offset)) + +/* Returns the dma address for reg read buffer */ +#define reg_buf_dma_addr(chip, vaddr) \ + ((chip)->reg_read_dma + \ + ((u8 *)(vaddr) - (u8 *)(chip)->reg_read_buf)) + +#define QPIC_PER_CW_CMD_ELEMENTS 32 +#define QPIC_PER_CW_CMD_SGL 32 +#define QPIC_PER_CW_DATA_SGL 8 + +#define QPIC_NAND_COMPLETION_TIMEOUT msecs_to_jiffies(2000) + +/* + * Flags used in DMA descriptor preparation helper functions + * (i.e. qcom_read_reg_dma/qcom_write_reg_dma/qcom_read_data_dma/qcom_write_data_dma) + */ +/* Don't set the EOT in current tx BAM sgl */ +#define NAND_BAM_NO_EOT BIT(0) +/* Set the NWD flag in current BAM sgl */ +#define NAND_BAM_NWD BIT(1) +/* Finish writing in the current BAM sgl and start writing in another BAM sgl */ +#define NAND_BAM_NEXT_SGL BIT(2) +/* + * Erased codeword status is being used two times in single transfer so this + * flag will determine the current value of erased codeword status register + */ +#define NAND_ERASED_CW_SET BIT(4) + +#define MAX_ADDRESS_CYCLE 5 + +/* + * This data type corresponds to the BAM transaction which will be used for all + * NAND transfers. + * @bam_ce - the array of BAM command elements + * @cmd_sgl - sgl for NAND BAM command pipe + * @data_sgl - sgl for NAND BAM consumer/producer pipe + * @last_data_desc - last DMA desc in data channel (tx/rx). + * @last_cmd_desc - last DMA desc in command channel. + * @txn_done - completion for NAND transfer. + * @bam_ce_pos - the index in bam_ce which is available for next sgl + * @bam_ce_start - the index in bam_ce which marks the start position ce + * for current sgl. It will be used for size calculation + * for current sgl + * @cmd_sgl_pos - current index in command sgl. + * @cmd_sgl_start - start index in command sgl. + * @tx_sgl_pos - current index in data sgl for tx. + * @tx_sgl_start - start index in data sgl for tx. + * @rx_sgl_pos - current index in data sgl for rx. + * @rx_sgl_start - start index in data sgl for rx. + */ +struct bam_transaction { + struct bam_cmd_element *bam_ce; + struct scatterlist *cmd_sgl; + struct scatterlist *data_sgl; + struct dma_async_tx_descriptor *last_data_desc; + struct dma_async_tx_descriptor *last_cmd_desc; + struct completion txn_done; + u32 bam_ce_pos; + u32 bam_ce_start; + u32 cmd_sgl_pos; + u32 cmd_sgl_start; + u32 tx_sgl_pos; + u32 tx_sgl_start; + u32 rx_sgl_pos; + u32 rx_sgl_start; +}; + +/* + * This data type corresponds to the nand dma descriptor + * @dma_desc - low level DMA engine descriptor + * @list - list for desc_info + * + * @adm_sgl - sgl which will be used for single sgl dma descriptor. Only used by + * ADM + * @bam_sgl - sgl which will be used for dma descriptor. Only used by BAM + * @sgl_cnt - number of SGL in bam_sgl. Only used by BAM + * @dir - DMA transfer direction + */ +struct desc_info { + struct dma_async_tx_descriptor *dma_desc; + struct list_head node; + + union { + struct scatterlist adm_sgl; + struct { + struct scatterlist *bam_sgl; + int sgl_cnt; + }; + }; + enum dma_data_direction dir; +}; + +/* + * holds the current register values that we want to write. acts as a contiguous + * chunk of memory which we use to write the controller registers through DMA. 
+ */ +struct nandc_regs { + __le32 cmd; + __le32 addr0; + __le32 addr1; + __le32 chip_sel; + __le32 exec; + + __le32 cfg0; + __le32 cfg1; + __le32 ecc_bch_cfg; + + __le32 clrflashstatus; + __le32 clrreadstatus; + + __le32 cmd1; + __le32 vld; + + __le32 orig_cmd1; + __le32 orig_vld; + + __le32 ecc_buf_cfg; + __le32 read_location0; + __le32 read_location1; + __le32 read_location2; + __le32 read_location3; + __le32 read_location_last0; + __le32 read_location_last1; + __le32 read_location_last2; + __le32 read_location_last3; + + __le32 erased_cw_detect_cfg_clr; + __le32 erased_cw_detect_cfg_set; +}; + +/* + * NAND controller data struct + * + * @dev: parent device + * + * @base: MMIO base + * + * @core_clk: controller clock + * @aon_clk: another controller clock + * + * @regs: a contiguous chunk of memory for DMA register + * writes. contains the register values to be + * written to controller + * + * @props: properties of current NAND controller, + * initialized via DT match data + * + * @controller: base controller structure + * @host_list: list containing all the chips attached to the + * controller + * + * @chan: dma channel + * @cmd_crci: ADM DMA CRCI for command flow control + * @data_crci: ADM DMA CRCI for data flow control + * + * @desc_list: DMA descriptor list (list of desc_infos) + * + * @data_buffer: our local DMA buffer for page read/writes, + * used when we can't use the buffer provided + * by upper layers directly + * @reg_read_buf: local buffer for reading back registers via DMA + * + * @base_phys: physical base address of controller registers + * @base_dma: dma base address of controller registers + * @reg_read_dma: contains dma address for register read buffer + * + * @buf_size/count/start: markers for chip->legacy.read_buf/write_buf + * functions + * @max_cwperpage: maximum QPIC codewords required. calculated + * from all connected NAND devices pagesize + * + * @reg_read_pos: marker for data read in reg_read_buf + * + * @cmd1/vld: some fixed controller register values + * + * @exec_opwrite: flag to select correct number of code word + * while reading status + */ +struct qcom_nand_controller { + struct device *dev; + + void __iomem *base; + + struct clk *core_clk; + struct clk *aon_clk; + + struct nandc_regs *regs; + struct bam_transaction *bam_txn; + + const struct qcom_nandc_props *props; + + struct nand_controller *controller; + struct list_head host_list; + + union { + /* will be used only by QPIC for BAM DMA */ + struct { + struct dma_chan *tx_chan; + struct dma_chan *rx_chan; + struct dma_chan *cmd_chan; + }; + + /* will be used only by EBI2 for ADM DMA */ + struct { + struct dma_chan *chan; + unsigned int cmd_crci; + unsigned int data_crci; + }; + }; + + struct list_head desc_list; + + u8 *data_buffer; + __le32 *reg_read_buf; + + phys_addr_t base_phys; + dma_addr_t base_dma; + dma_addr_t reg_read_dma; + + int buf_size; + int buf_count; + int buf_start; + unsigned int max_cwperpage; + + int reg_read_pos; + + u32 cmd1, vld; + bool exec_opwrite; +}; + +/* + * This data type corresponds to the NAND controller properties which varies + * among different NAND controllers. 
+ * @ecc_modes - ecc mode for NAND + * @dev_cmd_reg_start - NAND_DEV_CMD_* registers starting offset + * @supports_bam - whether NAND controller is using BAM + * @nandc_part_of_qpic - whether NAND controller is part of qpic IP + * @qpic_version2 - flag to indicate QPIC IP version 2 + * @use_codeword_fixup - whether NAND has different layout for boot partitions + */ +struct qcom_nandc_props { + u32 ecc_modes; + u32 dev_cmd_reg_start; + bool supports_bam; + bool nandc_part_of_qpic; + bool qpic_version2; + bool use_codeword_fixup; +}; + +void qcom_free_bam_transaction(struct qcom_nand_controller *nandc); +struct bam_transaction *qcom_alloc_bam_transaction(struct qcom_nand_controller *nandc); +void qcom_clear_bam_transaction(struct qcom_nand_controller *nandc); +void qcom_qpic_bam_dma_done(void *data); +void qcom_nandc_dev_to_mem(struct qcom_nand_controller *nandc, bool is_cpu); +int qcom_prepare_bam_async_desc(struct qcom_nand_controller *nandc, + struct dma_chan *chan, unsigned long flags); +int qcom_prep_bam_dma_desc_cmd(struct qcom_nand_controller *nandc, bool read, + int reg_off, const void *vaddr, int size, unsigned int flags); +int qcom_prep_bam_dma_desc_data(struct qcom_nand_controller *nandc, bool read, + const void *vaddr, int size, unsigned int flags); +int qcom_prep_adm_dma_desc(struct qcom_nand_controller *nandc, bool read, int reg_off, + const void *vaddr, int size, bool flow_control); +int qcom_read_reg_dma(struct qcom_nand_controller *nandc, int first, int num_regs, + unsigned int flags); +int qcom_write_reg_dma(struct qcom_nand_controller *nandc, __le32 *vaddr, int first, + int num_regs, unsigned int flags); +int qcom_read_data_dma(struct qcom_nand_controller *nandc, int reg_off, const u8 *vaddr, + int size, unsigned int flags); +int qcom_write_data_dma(struct qcom_nand_controller *nandc, int reg_off, const u8 *vaddr, + int size, unsigned int flags); +int qcom_submit_descs(struct qcom_nand_controller *nandc); +void qcom_clear_read_regs(struct qcom_nand_controller *nandc); +void qcom_nandc_unalloc(struct qcom_nand_controller *nandc); +int qcom_nandc_alloc(struct qcom_nand_controller *nandc); +#endif + -- cgit v1.2.3 From 0c08080fd71cd5dd59643104b39d3c89d793ab3c Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Wed, 20 Nov 2024 14:45:03 +0530 Subject: mtd: rawnand: qcom: use FIELD_PREP and GENMASK Use the bitfield macro FIELD_PREP, and GENMASK to do the shift and mask in one go. This makes the code more readable. 
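For illustration, a minimal sketch of what the conversion buys (the helper and the 'cycles' value are made up for the example; only the mask style mirrors the new NAND_RECOVERY_CYCLES_MASK definition in the hunk that follows): FIELD_PREP() derives the shift from a GENMASK()-based mask, so the mask and the shift no longer have to be kept in sync by hand.

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

/* Same style as the new NAND_RECOVERY_CYCLES_MASK below: bits [4:2]. */
#define EXAMPLE_RECOVERY_CYCLES_MASK	GENMASK(4, 2)

/* Hypothetical helper, not part of the driver. */
static inline u32 example_pack_recovery_cycles(u32 cfg1, u32 cycles)
{
	/* Old style: (cycles & 0x7) << NAND_RECOVERY_CYCLES, mask and shift kept separately. */
	/* New style: the mask alone describes both position and width. */
	cfg1 |= FIELD_PREP(EXAMPLE_RECOVERY_CYCLES_MASK, cycles);
	return cfg1;
}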
Reviewed-by: Konrad Dybcio Signed-off-by: Md Sadre Alam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index 425994429387..e79c79775eb8 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -70,35 +70,42 @@ #define BS_CORRECTABLE_ERR_MSK 0x1f /* NAND_DEVn_CFG0 bits */ -#define DISABLE_STATUS_AFTER_WRITE 4 +#define DISABLE_STATUS_AFTER_WRITE BIT(4) #define CW_PER_PAGE 6 +#define CW_PER_PAGE_MASK GENMASK(8, 6) #define UD_SIZE_BYTES 9 #define UD_SIZE_BYTES_MASK GENMASK(18, 9) -#define ECC_PARITY_SIZE_BYTES_RS 19 +#define ECC_PARITY_SIZE_BYTES_RS GENMASK(22, 19) #define SPARE_SIZE_BYTES 23 #define SPARE_SIZE_BYTES_MASK GENMASK(26, 23) #define NUM_ADDR_CYCLES 27 -#define STATUS_BFR_READ 30 -#define SET_RD_MODE_AFTER_STATUS 31 +#define NUM_ADDR_CYCLES_MASK GENMASK(29, 27) +#define STATUS_BFR_READ BIT(30) +#define SET_RD_MODE_AFTER_STATUS BIT(31) /* NAND_DEVn_CFG0 bits */ -#define DEV0_CFG1_ECC_DISABLE 0 -#define WIDE_FLASH 1 +#define DEV0_CFG1_ECC_DISABLE BIT(0) +#define WIDE_FLASH BIT(1) #define NAND_RECOVERY_CYCLES 2 -#define CS_ACTIVE_BSY 5 +#define NAND_RECOVERY_CYCLES_MASK GENMASK(4, 2) +#define CS_ACTIVE_BSY BIT(5) #define BAD_BLOCK_BYTE_NUM 6 -#define BAD_BLOCK_IN_SPARE_AREA 16 +#define BAD_BLOCK_BYTE_NUM_MASK GENMASK(15, 6) +#define BAD_BLOCK_IN_SPARE_AREA BIT(16) #define WR_RD_BSY_GAP 17 -#define ENABLE_BCH_ECC 27 +#define WR_RD_BSY_GAP_MASK GENMASK(22, 17) +#define ENABLE_BCH_ECC BIT(27) /* NAND_DEV0_ECC_CFG bits */ -#define ECC_CFG_ECC_DISABLE 0 -#define ECC_SW_RESET 1 +#define ECC_CFG_ECC_DISABLE BIT(0) +#define ECC_SW_RESET BIT(1) #define ECC_MODE 4 +#define ECC_MODE_MASK GENMASK(5, 4) #define ECC_PARITY_SIZE_BYTES_BCH 8 +#define ECC_PARITY_SIZE_BYTES_BCH_MASK GENMASK(12, 8) #define ECC_NUM_DATA_BYTES 16 #define ECC_NUM_DATA_BYTES_MASK GENMASK(25, 16) -#define ECC_FORCE_CLK_OPEN 30 +#define ECC_FORCE_CLK_OPEN BIT(30) /* NAND_DEV_CMD1 bits */ #define READ_ADDR 0 -- cgit v1.2.3 From deb015208f7be9a62cb68dd8337d075b1829ee1d Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 20 Dec 2024 17:35:12 +0000 Subject: ASoC: SDCA: Add missing header includes Several of the SDCA files don't include all the headers they use locally. These are included by the point of use through other headers, so it is not currently causing any issues. However, files should directly include things they directly use, so add the missing header includes. 
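As a generic illustration of the rule (the header and symbols below are invented and are not SDCA code): a header that directly uses BIT() and fixed-width types should include <linux/bits.h> and <linux/types.h> itself, rather than relying on whichever file includes it having already pulled them in.

/* example_widget.h - illustrative only */
#ifndef __EXAMPLE_WIDGET_H__
#define __EXAMPLE_WIDGET_H__

#include <linux/bits.h>		/* for BIT(), used directly below */
#include <linux/types.h>	/* for u8/u32, used directly below */

#define EXAMPLE_WIDGET_ENABLE	BIT(0)

struct example_widget {
	u32 flags;
	u8 id;
};

#endif /* __EXAMPLE_WIDGET_H__ */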
Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20241220173516.907406-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca.h | 3 +++ include/sound/sdca_function.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/sound/sdca.h b/include/sound/sdca.h index 7e138229e8f3..3eea1dfec16c 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -9,6 +9,9 @@ #ifndef __SDCA_H__ #define __SDCA_H__ +#include +#include + struct sdw_slave; #define SDCA_MAX_FUNCTION_COUNT 8 diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index a01eec86b9a6..6943df0851a9 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -9,6 +9,8 @@ #ifndef __SDCA_FUNCTION_H__ #define __SDCA_FUNCTION_H__ +#include + /* * SDCA Function Types from SDCA specification v1.0a Section 5.1.2 * all Function types not described are reserved -- cgit v1.2.3 From c36297b1bd6e52a75a8ed75eb5dbf35c50402398 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 20 Dec 2024 17:35:14 +0000 Subject: ASoC: SDCA: Add bounds check for function address SDCA only supports 3-bits for the function address, but the ACPI value is 64-bits. Update the code that parses this to do a bounds check and error out on invalid addresses. Currently, an invalid address would truncate to the bottom 3-bits when used and thus use a likely incorrect address. With the bounds check, it is also now safe to shrink the size of the adr member of sdca_function_desc to a u8 and rearrange the struct members to pack better with the new size of adr. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20241220173516.907406-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/sdca.h b/include/sound/sdca.h index 3eea1dfec16c..973252d0adac 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -23,9 +23,9 @@ struct sdw_slave; * @name: human-readable string */ struct sdca_function_desc { - u64 adr; - u32 type; const char *name; + u32 type; + u8 adr; }; /** -- cgit v1.2.3 From c1ed5eb13f39b0058670bc2b1e251a040c306868 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 20 Dec 2024 17:35:15 +0000 Subject: ASoC: SDCA: Add missing function type names It is not helpful to error out on some SDCA function types, we might as well report the correct name and let the driver core simply not bind a driver to those functions for which the code lacks support. Also given no functions currently have support, it seems odd to select some as unsupported. 
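For illustration, the kind of lookup this enables (the helper is made up, and the enum constant spellings are assumed from the *_NAME macros rather than quoted from the header): every known function type resolves to a printable name, and a type without driver support simply ends up with no driver bound instead of failing enumeration.

/* Hypothetical helper, not the actual SDCA parsing code. */
static const char *example_function_type_name(enum sdca_function_type type)
{
	switch (type) {
	case SDCA_FUNCTION_TYPE_HID:
		return SDCA_FUNCTION_TYPE_HID_NAME;
	case SDCA_FUNCTION_TYPE_IMP_DEF:
		return SDCA_FUNCTION_TYPE_IMP_DEF_NAME;
	/* ...remaining types elided for brevity... */
	default:
		return NULL;	/* reserved/unknown type */
	}
}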
Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20241220173516.907406-4-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 6943df0851a9..89e42db6d591 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -42,6 +42,7 @@ enum sdca_function_type { #define SDCA_FUNCTION_TYPE_RJ_NAME "RJ" #define SDCA_FUNCTION_TYPE_SIMPLE_NAME "SimpleJack" #define SDCA_FUNCTION_TYPE_HID_NAME "HID" +#define SDCA_FUNCTION_TYPE_IMP_DEF_NAME "ImplementationDefined" enum sdca_entity0_controls { SDCA_CONTROL_ENTITY_0_COMMIT_GROUP_MASK = 0x01, -- cgit v1.2.3 From 542ed8145e6f9392e3d0a86a0e9027d2ffd183e4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 21 Dec 2024 00:29:20 +0100 Subject: netfilter: nft_set_hash: unaligned atomic read on struct nft_set_ext Access to genmask field in struct nft_set_ext results in unaligned atomic read: [ 72.130109] Unable to handle kernel paging request at virtual address ffff0000c2bb708c [ 72.131036] Mem abort info: [ 72.131213] ESR = 0x0000000096000021 [ 72.131446] EC = 0x25: DABT (current EL), IL = 32 bits [ 72.132209] SET = 0, FnV = 0 [ 72.133216] EA = 0, S1PTW = 0 [ 72.134080] FSC = 0x21: alignment fault [ 72.135593] Data abort info: [ 72.137194] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [ 72.142351] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 72.145989] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 72.150115] swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000237d27000 [ 72.154893] [ffff0000c2bb708c] pgd=0000000000000000, p4d=180000023ffff403, pud=180000023f84b403, pmd=180000023f835403, +pte=0068000102bb7707 [ 72.163021] Internal error: Oops: 0000000096000021 [#1] SMP [...] 
[ 72.170041] CPU: 7 UID: 0 PID: 54 Comm: kworker/7:0 Tainted: G E 6.13.0-rc3+ #2 [ 72.170509] Tainted: [E]=UNSIGNED_MODULE [ 72.170720] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-stable202302-for-qemu 03/01/2023 [ 72.171192] Workqueue: events_power_efficient nft_rhash_gc [nf_tables] [ 72.171552] pstate: 21400005 (nzCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) [ 72.171915] pc : nft_rhash_gc+0x200/0x2d8 [nf_tables] [ 72.172166] lr : nft_rhash_gc+0x128/0x2d8 [nf_tables] [ 72.172546] sp : ffff800081f2bce0 [ 72.172724] x29: ffff800081f2bd40 x28: ffff0000c2bb708c x27: 0000000000000038 [ 72.173078] x26: ffff0000c6780ef0 x25: ffff0000c643df00 x24: ffff0000c6778f78 [ 72.173431] x23: 000000000000001a x22: ffff0000c4b1f000 x21: ffff0000c6780f78 [ 72.173782] x20: ffff0000c2bb70dc x19: ffff0000c2bb7080 x18: 0000000000000000 [ 72.174135] x17: ffff0000c0a4e1c0 x16: 0000000000003000 x15: 0000ac26d173b978 [ 72.174485] x14: ffffffffffffffff x13: 0000000000000030 x12: ffff0000c6780ef0 [ 72.174841] x11: 0000000000000000 x10: ffff800081f2bcf8 x9 : ffff0000c3000000 [ 72.175193] x8 : 00000000000004be x7 : 0000000000000000 x6 : 0000000000000000 [ 72.175544] x5 : 0000000000000040 x4 : ffff0000c3000010 x3 : 0000000000000000 [ 72.175871] x2 : 0000000000003a98 x1 : ffff0000c2bb708c x0 : 0000000000000004 [ 72.176207] Call trace: [ 72.176316] nft_rhash_gc+0x200/0x2d8 [nf_tables] (P) [ 72.176653] process_one_work+0x178/0x3d0 [ 72.176831] worker_thread+0x200/0x3f0 [ 72.176995] kthread+0xe8/0xf8 [ 72.177130] ret_from_fork+0x10/0x20 [ 72.177289] Code: 54fff984 d503201f d2800080 91003261 (f820303f) [ 72.177557] ---[ end trace 0000000000000000 ]--- Align struct nft_set_ext to word size to address this and documentation it. pahole reports that this increases the size of elements for rhash and pipapo in 8 bytes on x86_64. Fixes: 7ffc7481153b ("netfilter: nft_set_hash: skip duplicated elements pending gc run") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4afa64c81304..0027beca5cd5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -733,15 +733,18 @@ struct nft_set_ext_tmpl { /** * struct nft_set_ext - set extensions * - * @genmask: generation mask + * @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT) * @offset: offsets of individual extension types * @data: beginning of extension data + * + * This structure must be aligned to word size, otherwise atomic bitops + * on genmask field can cause alignment failure on some archs. */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; -}; +} __aligned(BITS_PER_LONG / 8); static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { -- cgit v1.2.3 From e8f81b561360d45832cfd546a1ce566745d184d9 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 21 Dec 2024 13:24:10 +0100 Subject: dt-bindings: clock: qcom,x1e80100-gpucc: Extend for X1P42100 To make it easier for X1P4 and X1E to share a common device tree base, extend the existing latter's GPUCC bindings and reuse them on the former platform. While not in the same file, it only makes sense to introduce the new compatible in this commit as well. 
Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20241221-topic-x1p4_clk-v1-2-dbaeccb74884@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,x1e80100-gpucc.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,x1e80100-gpucc.h b/include/dt-bindings/clock/qcom,x1e80100-gpucc.h index 61a3a8f3ac43..27b8f50541fd 100644 --- a/include/dt-bindings/clock/qcom,x1e80100-gpucc.h +++ b/include/dt-bindings/clock/qcom,x1e80100-gpucc.h @@ -33,9 +33,22 @@ #define GPU_CC_SLEEP_CLK 23 #define GPU_CC_XO_CLK_SRC 24 #define GPU_CC_XO_DIV_CLK_SRC 25 +#define GPU_CC_CX_ACCU_SHIFT_CLK 26 +#define GPU_CC_GX_ACCU_SHIFT_CLK 27 /* GDSCs */ #define GPU_CX_GDSC 0 #define GPU_GX_GDSC 1 +/* GPU_CC resets */ +#define GPU_CC_ACD_BCR 0 +#define GPU_CC_CB_BCR 1 +#define GPU_CC_CX_BCR 2 +#define GPU_CC_FAST_HUB_BCR 3 +#define GPU_CC_FF_BCR 4 +#define GPU_CC_GFX3D_AON_BCR 5 +#define GPU_CC_GMU_BCR 6 +#define GPU_CC_GX_BCR 7 +#define GPU_CC_XO_BCR 8 + #endif -- cgit v1.2.3 From 2705bce5b4c45e2a0a354ec4df937d2803241cd8 Mon Sep 17 00:00:00 2001 From: Eugen Hristev Date: Fri, 29 Nov 2024 16:24:46 +0200 Subject: soc: qcom: Rework BCM_TCS_CMD macro Reworked BCM_TCS_CMD macro in order to fix warnings from sparse: drivers/clk/qcom/clk-rpmh.c:270:28: warning: restricted __le32 degrades to integer drivers/clk/qcom/clk-rpmh.c:270:28: warning: restricted __le32 degrades to integer While at it, used u32_encode_bits which made the code easier to follow and removed unnecessary shift definitions. The use of cpu_to_le32 was wrong and thus removed. Signed-off-by: Eugen Hristev Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/20241129142446.407443-1-eugen.hristev@linaro.org Signed-off-by: Bjorn Andersson --- include/soc/qcom/tcs.h | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/soc/qcom/tcs.h b/include/soc/qcom/tcs.h index 3acca067c72b..cff67ce25488 100644 --- a/include/soc/qcom/tcs.h +++ b/include/soc/qcom/tcs.h @@ -6,6 +6,9 @@ #ifndef __SOC_QCOM_TCS_H__ #define __SOC_QCOM_TCS_H__ +#include +#include + #define MAX_RPMH_PAYLOAD 16 /** @@ -60,22 +63,17 @@ struct tcs_request { struct tcs_cmd *cmds; }; -#define BCM_TCS_CMD_COMMIT_SHFT 30 -#define BCM_TCS_CMD_COMMIT_MASK 0x40000000 -#define BCM_TCS_CMD_VALID_SHFT 29 -#define BCM_TCS_CMD_VALID_MASK 0x20000000 -#define BCM_TCS_CMD_VOTE_X_SHFT 14 -#define BCM_TCS_CMD_VOTE_MASK 0x3fff -#define BCM_TCS_CMD_VOTE_Y_SHFT 0 -#define BCM_TCS_CMD_VOTE_Y_MASK 0xfffc000 +#define BCM_TCS_CMD_COMMIT_MASK BIT(30) +#define BCM_TCS_CMD_VALID_MASK BIT(29) +#define BCM_TCS_CMD_VOTE_MASK GENMASK(13, 0) +#define BCM_TCS_CMD_VOTE_Y_MASK GENMASK(13, 0) +#define BCM_TCS_CMD_VOTE_X_MASK GENMASK(27, 14) /* Construct a Bus Clock Manager (BCM) specific TCS command */ #define BCM_TCS_CMD(commit, valid, vote_x, vote_y) \ - (((commit) << BCM_TCS_CMD_COMMIT_SHFT) | \ - ((valid) << BCM_TCS_CMD_VALID_SHFT) | \ - ((cpu_to_le32(vote_x) & \ - BCM_TCS_CMD_VOTE_MASK) << BCM_TCS_CMD_VOTE_X_SHFT) | \ - ((cpu_to_le32(vote_y) & \ - BCM_TCS_CMD_VOTE_MASK) << BCM_TCS_CMD_VOTE_Y_SHFT)) + (u32_encode_bits(commit, BCM_TCS_CMD_COMMIT_MASK) | \ + u32_encode_bits(valid, BCM_TCS_CMD_VALID_MASK) | \ + u32_encode_bits(vote_x, BCM_TCS_CMD_VOTE_X_MASK) | \ + u32_encode_bits(vote_y, BCM_TCS_CMD_VOTE_Y_MASK)) #endif /* __SOC_QCOM_TCS_H__ */ -- cgit v1.2.3 From 32e9dea2645fa10dfa08b4e333918affaf1e4de5 Mon Sep 17 00:00:00 2001 From: Shengjiu Wang Date: Tue, 19 Nov 
2024 09:58:03 +0800 Subject: dt-bindings: clock: imx93: Add SPDIF IPG clk Add SPDIF IPG clk. The SPDIF IPG clock and root clock share same clock gate. Fixes: 1c4a4f7362fd ("arm64: dts: imx93: Add audio device nodes") Signed-off-by: Shengjiu Wang Reviewed-by: Frank Li Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241119015805.3840606-2-shengjiu.wang@nxp.com Signed-off-by: Abel Vesa --- include/dt-bindings/clock/imx93-clock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/imx93-clock.h b/include/dt-bindings/clock/imx93-clock.h index 6c685067288b..c393fad3a346 100644 --- a/include/dt-bindings/clock/imx93-clock.h +++ b/include/dt-bindings/clock/imx93-clock.h @@ -209,5 +209,6 @@ #define IMX91_CLK_ENET2_REGULAR 204 #define IMX91_CLK_ENET2_REGULAR_GATE 205 #define IMX91_CLK_ENET1_QOS_TSN_GATE 206 +#define IMX93_CLK_SPDIF_IPG 207 #endif -- cgit v1.2.3 From 9e49ca756d207f4313fb7af48648a67da8e4e250 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Dec 2024 10:33:13 -0500 Subject: tracing/string: Create and use __free(argv_free) in trace_dynevent.c The function dyn_event_release() uses argv_split() which must be freed via argv_free(). It contains several error paths that do a goto out to call argv_free() for cleanup. This makes the code complex and error prone. Create a new __free() directive __free(argv_free) that will call argv_free() for data allocated with argv_split(), and use it in the dyn_event_release() function. Cc: Kees Cook Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Andy Shevchenko Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/20241220103313.4a74ec8e@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- include/linux/string.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 493ac4862c77..86d5d352068b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -4,6 +4,7 @@ #include #include +#include /* for DEFINE_FREE() */ #include /* for inline */ #include /* for size_t */ #include /* for NULL */ @@ -312,6 +313,8 @@ extern void *kmemdup_array(const void *src, size_t count, size_t element_size, g extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); +DEFINE_FREE(argv_free, char **, if (!IS_ERR_OR_NULL(_T)) argv_free(_T)) + /* lib/cmdline.c */ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); -- cgit v1.2.3 From cff6d93eab00bacf8b6bffdef775fc2de0273c96 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Thu, 12 Dec 2024 13:12:37 +0000 Subject: tracepoint: Reduce duplication of __DO_TRACE_CALL The logic for invoking __DO_TRACE_CALL was extracted to a static inline function called __rust_do_trace_##name so that Rust can call it directly. This logic does not include the static branch, to avoid a function call when the tracepoint is disabled. Since the C code needs to perform the same logic after checking the static key, this logic is currently duplicated. Thus, remove this duplication by having C call the static inline function too. 
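Expanded by hand for a single hypothetical tracepoint (names invented; the real code generates these per tracepoint from the __DECLARE_TRACE macros, and the per-tracepoint condition is omitted here), the resulting shape is roughly:

/* Shared body: preemption guard plus the probe call. */
static inline void __do_trace_example(int arg)
{
	guard(preempt_notrace)();
	__DO_TRACE_CALL(example, TP_ARGS(arg));
}

/* C callers still pay only the static branch when the tracepoint is off. */
static inline void trace_example(int arg)
{
	if (static_branch_unlikely(&__tracepoint_example.key))
		__do_trace_example(arg);
}

/* Rust reaches the same body through one out-of-line symbol. */
notrace void rust_do_trace_example(int arg)
{
	__do_trace_example(arg);
}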
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20241212131237.1988409-1-aliceryhl@google.com Signed-off-by: Alice Ryhl Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 76d9055b2cff..a351763e6965 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -218,7 +218,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DEFINE_RUST_DO_TRACE(name, proto, args) \ notrace void rust_do_trace_##name(proto) \ { \ - __rust_do_trace_##name(args); \ + __do_trace_##name(args); \ } /* @@ -268,7 +268,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ if (cond) { \ guard(preempt_notrace)(); \ @@ -277,12 +277,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ static inline void trace_##name(proto) \ { \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - if (cond) { \ - guard(preempt_notrace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ @@ -291,7 +287,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define __DECLARE_TRACE_SYSCALL(name, proto, args, data_proto) \ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ - static inline void __rust_do_trace_##name(proto) \ + static inline void __do_trace_##name(proto) \ { \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -299,10 +295,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) static inline void trace_##name(proto) \ { \ might_fault(); \ - if (static_branch_unlikely(&__tracepoint_##name.key)) { \ - guard(rcu_tasks_trace)(); \ - __DO_TRACE_CALL(name, TP_ARGS(args)); \ - } \ + if (static_branch_unlikely(&__tracepoint_##name.key)) \ + __do_trace_##name(args); \ if (IS_ENABLED(CONFIG_LOCKDEP)) { \ WARN_ONCE(!rcu_is_watching(), \ "RCU not watching for tracepoint"); \ -- cgit v1.2.3 From 41705c4262aaca49b8d9fe9b24fe048dc6c2b301 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:11:40 +0900 Subject: fgraph: Pass ftrace_regs to entryfunc Pass ftrace_regs to the fgraph_ops::entryfunc(). If ftrace_regs is not available, it passes a NULL instead. User callback function can access some registers (including return address) via this ftrace_regs. Note that the ftrace_regs can be NULL when the arch does NOT define: HAVE_DYNAMIC_FTRACE_WITH_ARGS or HAVE_DYNAMIC_FTRACE_WITH_REGS. More specifically, if HAVE_DYNAMIC_FTRACE_WITH_REGS is defined but not the HAVE_DYNAMIC_FTRACE_WITH_ARGS, and the ftrace ops used to register the function callback does not set FTRACE_OPS_FL_SAVE_REGS. In this case, ftrace_regs can be NULL in user callback. 
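For illustration, a minimal entry callback under the new signature (the callback name and the pr_debug() message are made up): it must tolerate a NULL ftrace_regs, and may read registers through the ftrace_regs accessors only when one is provided.

#include <linux/ftrace.h>
#include <linux/printk.h>

static int example_graph_entry(struct ftrace_graph_ent *trace,
			       struct fgraph_ops *gops,
			       struct ftrace_regs *fregs)
{
	/* fregs may be NULL without WITH_ARGS/WITH_REGS support. */
	if (fregs)
		pr_debug("entry ip=%lx\n",
			 ftrace_regs_get_instruction_pointer(fregs));

	return 1;	/* non-zero: also trace this function's return */
}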
Signed-off-by: Masami Hiramatsu (Google) Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173518990044.391279.17406984900626078579.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index aa9ddd1e4bb6..c86ac786da3d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1071,10 +1071,12 @@ struct fgraph_ops; typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, struct fgraph_ops *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, - struct fgraph_ops *); /* entry */ + struct fgraph_ops *, + struct ftrace_regs *); /* entry */ extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, - struct fgraph_ops *gops); + struct fgraph_ops *gops, + struct ftrace_regs *fregs); bool ftrace_pids_enabled(struct ftrace_ops *ops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -1114,8 +1116,15 @@ struct ftrace_ret_stack { extern void return_to_handler(void); extern int -function_graph_enter(unsigned long ret, unsigned long func, - unsigned long frame_pointer, unsigned long *retp); +function_graph_enter_regs(unsigned long ret, unsigned long func, + unsigned long frame_pointer, unsigned long *retp, + struct ftrace_regs *fregs); + +static inline int function_graph_enter(unsigned long ret, unsigned long func, + unsigned long fp, unsigned long *retp) +{ + return function_graph_enter_regs(ret, func, fp, retp, NULL); +} struct ftrace_ret_stack * ftrace_graph_get_ret_stack(struct task_struct *task, int skip); -- cgit v1.2.3 From a3ed4157b7d89800a0008de0c9e46a438a5c3745 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:11:55 +0900 Subject: fgraph: Replace fgraph_ret_regs with ftrace_regs Use ftrace_regs instead of fgraph_ret_regs for tracing return value on function_graph tracer because of simplifying the callback interface. The CONFIG_HAVE_FUNCTION_GRAPH_RETVAL is also replaced by CONFIG_HAVE_FUNCTION_GRAPH_FREGS. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens Acked-by: Will Deacon Cc: Catalin Marinas Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Huacai Chen Cc: WANG Xuerui Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. 
Peter Anvin" Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173518991508.391279.16635322774382197642.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 12 +++++++++--- include/linux/ftrace_regs.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c86ac786da3d..069f270bd7ae 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,9 +43,8 @@ struct dyn_ftrace; char *arch_ftrace_match_adjust(char *str, const char *search); -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL -struct fgraph_ret_regs; -unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs); +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS +unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs); #else unsigned long ftrace_return_to_handler(unsigned long frame_pointer); #endif @@ -134,6 +133,13 @@ extern int ftrace_enabled; * Also, architecture dependent fields can be used for internal process. * (e.g. orig_ax on x86_64) * + * Basically, ftrace_regs stores the registers related to the context. + * On function entry, registers for function parameters and hooking the + * function call are stored, and on function exit, registers for function + * return value and frame pointers are stored. + * + * And also, it dpends on the context that which registers are restored + * from the ftrace_regs. * On the function entry, those registers will be restored except for * the stack pointer, so that user can change the function parameters * and instruction pointer (e.g. live patching.) diff --git a/include/linux/ftrace_regs.h b/include/linux/ftrace_regs.h index be1ed0c891d0..bbc1873ca6b8 100644 --- a/include/linux/ftrace_regs.h +++ b/include/linux/ftrace_regs.h @@ -30,6 +30,8 @@ struct ftrace_regs; override_function_with_return(&arch_ftrace_regs(fregs)->regs) #define ftrace_regs_query_register_offset(name) \ regs_query_register_offset(name) +#define ftrace_regs_get_frame_pointer(fregs) \ + frame_pointer(&arch_ftrace_regs(fregs)->regs) #endif /* HAVE_ARCH_FTRACE_REGS */ -- cgit v1.2.3 From 2ca8c112c9676e2394d76760db78ffddf21d93b5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:09 +0900 Subject: fgraph: Pass ftrace_regs to retfunc Pass ftrace_regs to the fgraph_ops::retfunc(). If ftrace_regs is not available, it passes a NULL instead. User callback function can access some registers (including return address) via this ftrace_regs. 
Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173518992972.391279.14055405490327765506.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 069f270bd7ae..9a1e768e47da 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1075,7 +1075,8 @@ struct fgraph_ops; /* Type of the callback handlers for tracing function graph*/ typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *, - struct fgraph_ops *); /* return */ + struct fgraph_ops *, + struct ftrace_regs *); /* return */ typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *, struct fgraph_ops *, struct ftrace_regs *); /* entry */ -- cgit v1.2.3 From 46bc082388560a95e3649b698a4675e5ea3262e6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:20 +0900 Subject: fprobe: Use ftrace_regs in fprobe entry handler This allows fprobes to be available with CONFIG_DYNAMIC_FTRACE_WITH_ARGS instead of CONFIG_DYNAMIC_FTRACE_WITH_REGS, then we can enable fprobe on arm64. Cc: Alexei Starovoitov Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173518994037.391279.2786805566359674586.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Acked-by: Florent Revest Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index f39869588117..ca64ee5e45d2 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -10,7 +10,7 @@ struct fprobe; typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, -- cgit v1.2.3 From 762abbc0d09f7ae123c82d315eb1a961c1a2cf7b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:31 +0900 Subject: fprobe: Use ftrace_regs in fprobe exit handler Change the fprobe exit handler to use ftrace_regs structure instead of pt_regs. This also introduce HAVE_FTRACE_REGS_HAVING_PT_REGS which means the ftrace_regs is including the pt_regs so that ftrace_regs can provide pt_regs without memory allocation. Fprobe introduces a new dependency with that. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Cc: Huacai Chen Cc: Alexei Starovoitov Cc: Florent Revest Cc: bpf Cc: Alan Maguire Cc: Heiko Carstens Cc: WANG Xuerui Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. 
Peter Anvin" Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Song Liu Cc: Jiri Olsa Cc: KP Singh Cc: Matt Bobrowski Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Eduard Zingerman Cc: Yonghong Song Cc: John Fastabend Cc: Stanislav Fomichev Cc: Hao Luo Cc: Andrew Morton Link: https://lore.kernel.org/173518995092.391279.6765116450352977627.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 2 +- include/linux/ftrace.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index ca64ee5e45d2..ef609bcca0f9 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -14,7 +14,7 @@ typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, void *entry_data); typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, + unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); /** diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9a1e768e47da..bf8bb6c10553 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -176,6 +176,12 @@ static inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *fregs) #define ftrace_regs_set_instruction_pointer(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ +#ifdef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS + +static_assert(sizeof(struct pt_regs) == ftrace_regs_size()); + +#endif /* CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ + static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) { if (!fregs) -- cgit v1.2.3 From b9b55c8912ce1e5555715d126486bdd63ddfeaec Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:47 +0900 Subject: tracing: Add ftrace_partial_regs() for converting ftrace_regs to pt_regs Add ftrace_partial_regs() which converts the ftrace_regs to pt_regs. This is for the eBPF which needs this to keep the same pt_regs interface to access registers. Thus when replacing the pt_regs with ftrace_regs in fprobes (which is used by kprobe_multi eBPF event), this will be used. If the architecture defines its own ftrace_regs, this copies partial registers to pt_regs and returns it. If not, ftrace_regs is the same as pt_regs and ftrace_partial_regs() will return ftrace_regs::regs. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Florent Revest Cc: Alexei Starovoitov Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Link: https://lore.kernel.org/173518996761.391279.4987911298206448122.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index bf8bb6c10553..ad2b46e1d5b0 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -190,6 +190,23 @@ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs return arch_ftrace_get_regs(fregs); } +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) || \ + defined(CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS) + +static __always_inline struct pt_regs * +ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + /* + * If CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS=y, ftrace_regs memory + * layout is including pt_regs. 
So always returns that address. + * Since arch_ftrace_get_regs() will check some members and may return + * NULL, we can not use it. + */ + return &arch_ftrace_regs(fregs)->regs; +} + +#endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ + /* * When true, the ftrace_regs_{get,set}_*() functions may be used on fregs. * Note: this can be true even when ftrace_get_regs() cannot provide a pt_regs. -- cgit v1.2.3 From d5d01b71996ec03af51b3c0736c92d0fc89703b5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:12:59 +0900 Subject: tracing: Add ftrace_fill_perf_regs() for perf event Add ftrace_fill_perf_regs() which should be compatible with the perf_fetch_caller_regs(). In other words, the pt_regs returned from the ftrace_fill_perf_regs() must satisfy 'user_mode(regs) == false' and can be used for stack tracing. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Will Deacon Acked-by: Heiko Carstens # s390 Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Heiko Carstens Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Link: https://lore.kernel.org/173518997908.391279.15910334347345106424.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ad2b46e1d5b0..6d29c640697c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -207,6 +207,37 @@ ftrace_partial_regs(struct ftrace_regs *fregs, struct pt_regs *regs) #endif /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS || CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS */ +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + +/* + * Please define arch dependent pt_regs which compatible to the + * perf_arch_fetch_caller_regs() but based on ftrace_regs. + * This requires + * - user_mode(_regs) returns false (always kernel mode). + * - able to use the _regs for stack trace. + */ +#ifndef arch_ftrace_fill_perf_regs +/* As same as perf_arch_fetch_caller_regs(), do nothing by default */ +#define arch_ftrace_fill_perf_regs(fregs, _regs) do {} while (0) +#endif + +static __always_inline struct pt_regs * +ftrace_fill_perf_regs(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + arch_ftrace_fill_perf_regs(fregs, regs); + return regs; +} + +#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ + +static __always_inline struct pt_regs * +ftrace_fill_perf_regs(struct ftrace_regs *fregs, struct pt_regs *regs) +{ + return &arch_ftrace_regs(fregs)->regs; +} + +#endif + /* * When true, the ftrace_regs_{get,set}_*() functions may be used on fregs. * Note: this can be true even when ftrace_get_regs() cannot provide a pt_regs. -- cgit v1.2.3 From 0566cefe73b9a6ea38357b428d27460db032a03d Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:13:13 +0900 Subject: tracing/fprobe: Enable fprobe events with CONFIG_DYNAMIC_FTRACE_WITH_ARGS Allow fprobe events to be enabled with CONFIG_DYNAMIC_FTRACE_WITH_ARGS. With this change, fprobe events mostly use ftrace_regs instead of pt_regs. 
Note that if the arch doesn't enable HAVE_FTRACE_REGS_HAVING_PT_REGS, fprobe events will not be able to be used from perf. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173518999352.391279.13332699755290175168.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d29c640697c..4c553fe9c026 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,23 @@ static __always_inline bool ftrace_regs_has_args(struct ftrace_regs *fregs) return ftrace_get_regs(fregs) != NULL; } +#ifdef CONFIG_HAVE_REGS_AND_STACK_ACCESS_API +static __always_inline unsigned long +ftrace_regs_get_kernel_stack_nth(struct ftrace_regs *fregs, unsigned int nth) +{ + unsigned long *stackp; + + stackp = (unsigned long *)ftrace_regs_get_stack_pointer(fregs); + if (((unsigned long)(stackp + nth) & ~(THREAD_SIZE - 1)) == + ((unsigned long)stackp & ~(THREAD_SIZE - 1))) + return *(stackp + nth); + + return 0; +} +#else /* !CONFIG_HAVE_REGS_AND_STACK_ACCESS_API */ +#define ftrace_regs_get_kernel_stack_nth(fregs, nth) (0L) +#endif /* CONFIG_HAVE_REGS_AND_STACK_ACCESS_API */ + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); -- cgit v1.2.3 From 4346ba1604093305a287e08eb465a9c15ba05b80 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:13:59 +0900 Subject: fprobe: Rewrite fprobe on function-graph tracer Rewrite fprobe implementation on function-graph tracer. Major API changes are: - 'nr_maxactive' field is deprecated. - This depends on CONFIG_DYNAMIC_FTRACE_WITH_ARGS or !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS, and CONFIG_HAVE_FUNCTION_GRAPH_FREGS. So currently works only on x86_64. - Currently the entry size is limited in 15 * sizeof(long). - If there is too many fprobe exit handler set on the same function, it will fail to probe. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Heiko Carstens Cc: Mark Rutland Cc: Catalin Marinas Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Naveen N Rao Cc: Madhavan Srinivasan Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. 
Peter Anvin" Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/173519003970.391279.14406792285453830996.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 58 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index ef609bcca0f9..91337bcb452f 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -5,10 +5,11 @@ #include #include -#include +#include +#include +#include struct fprobe; - typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); @@ -17,35 +18,57 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, unsigned long ret_ip, struct ftrace_regs *regs, void *entry_data); +/** + * struct fprobe_hlist_node - address based hash list node for fprobe. + * + * @hlist: The hlist node for address search hash table. + * @addr: One of the probing address of @fp. + * @fp: The fprobe which owns this. + */ +struct fprobe_hlist_node { + struct hlist_node hlist; + unsigned long addr; + struct fprobe *fp; +}; + +/** + * struct fprobe_hlist - hash list nodes for fprobe. + * + * @hlist: The hlist node for existence checking hash table. + * @rcu: rcu_head for RCU deferred release. + * @fp: The fprobe which owns this fprobe_hlist. + * @size: The size of @array. + * @array: The fprobe_hlist_node for each address to probe. + */ +struct fprobe_hlist { + struct hlist_node hlist; + struct rcu_head rcu; + struct fprobe *fp; + int size; + struct fprobe_hlist_node array[] __counted_by(size); +}; + /** * struct fprobe - ftrace based probe. - * @ops: The ftrace_ops. + * * @nmissed: The counter for missing events. * @flags: The status flag. - * @rethook: The rethook data structure. (internal data) * @entry_data_size: The private data storage size. - * @nr_maxactive: The max number of active functions. + * @nr_maxactive: The max number of active functions. (*deprecated) * @entry_handler: The callback function for function entry. * @exit_handler: The callback function for function exit. + * @hlist_array: The fprobe_hlist for fprobe search from IP hash table. */ struct fprobe { -#ifdef CONFIG_FUNCTION_TRACER - /* - * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too. - * But user of fprobe may keep embedding the struct fprobe on their own - * code. To avoid build error, this will keep the fprobe data structure - * defined here, but remove ftrace_ops data structure. - */ - struct ftrace_ops ops; -#endif unsigned long nmissed; unsigned int flags; - struct rethook *rethook; size_t entry_data_size; int nr_maxactive; fprobe_entry_cb entry_handler; fprobe_exit_cb exit_handler; + + struct fprobe_hlist *hlist_array; }; /* This fprobe is soft-disabled. 
*/ @@ -121,4 +144,9 @@ static inline void enable_fprobe(struct fprobe *fp) fp->flags &= ~FPROBE_FL_DISABLED; } +/* The entry data size is 4 bits (=16) * sizeof(long) in maximum */ +#define FPROBE_DATA_SIZE_BITS 4 +#define MAX_FPROBE_DATA_SIZE_WORD ((1L << FPROBE_DATA_SIZE_BITS) - 1) +#define MAX_FPROBE_DATA_SIZE (MAX_FPROBE_DATA_SIZE_WORD * sizeof(long)) + #endif -- cgit v1.2.3 From b5fa903b7f7c7ffc07430d1380936f72aaf09068 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:14:17 +0900 Subject: fprobe: Add fprobe_header encoding feature Fprobe store its data structure address and size on the fgraph return stack by __fprobe_header. But most 64bit architecture can combine those to one unsigned long value because 4 MSB in the kernel address are the same. With this encoding, fprobe can consume less space on ret_stack. This introduces asm/fprobe.h to define arch dependent encode/decode macros. Note that since fprobe depends on CONFIG_HAVE_FUNCTION_GRAPH_FREGS, currently only arm64, loongarch, riscv, s390 and x86 are supported. Signed-off-by: Masami Hiramatsu (Google) Acked-by: Heiko Carstens # s390 Cc: Catalin Marinas Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Cc: Heiko Carstens Cc: Will Deacon Cc: Huacai Chen Cc: WANG Xuerui Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Arnd Bergmann Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173519005783.391279.5307910947400277525.stgit@devnote2 Signed-off-by: Steven Rostedt (Google) --- include/asm-generic/fprobe.h | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/asm-generic/fprobe.h (limited to 'include') diff --git a/include/asm-generic/fprobe.h b/include/asm-generic/fprobe.h new file mode 100644 index 000000000000..8659a4dc6eb6 --- /dev/null +++ b/include/asm-generic/fprobe.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generic arch dependent fprobe macros. + */ +#ifndef __ASM_GENERIC_FPROBE_H__ +#define __ASM_GENERIC_FPROBE_H__ + +#include + +#ifdef CONFIG_64BIT +/* + * Encoding the size and the address of fprobe into one 64bit entry. + * The 32bit architectures should use 2 entries to store those info. + */ + +#define ARCH_DEFINE_ENCODE_FPROBE_HEADER + +#define FPROBE_HEADER_MSB_SIZE_SHIFT (BITS_PER_LONG - FPROBE_DATA_SIZE_BITS) +#define FPROBE_HEADER_MSB_MASK \ + GENMASK(FPROBE_HEADER_MSB_SIZE_SHIFT - 1, 0) + +/* + * By default, this expects the MSBs in the address of kprobe is 0xf. + * If any arch needs another fixed pattern (e.g. s390 is zero filled), + * override this. 
+ */ +#define FPROBE_HEADER_MSB_PATTERN \ + GENMASK(BITS_PER_LONG - 1, FPROBE_HEADER_MSB_SIZE_SHIFT) + +#define arch_fprobe_header_encodable(fp) \ + (((unsigned long)(fp) & ~FPROBE_HEADER_MSB_MASK) == \ + FPROBE_HEADER_MSB_PATTERN) + +#define arch_encode_fprobe_header(fp, size) \ + (((unsigned long)(fp) & FPROBE_HEADER_MSB_MASK) | \ + ((unsigned long)(size) << FPROBE_HEADER_MSB_SIZE_SHIFT)) + +#define arch_decode_fprobe_header_size(val) \ + ((unsigned long)(val) >> FPROBE_HEADER_MSB_SIZE_SHIFT) + +#define arch_decode_fprobe_header_fp(val) \ + ((struct fprobe *)(((unsigned long)(val) & FPROBE_HEADER_MSB_MASK) | \ + FPROBE_HEADER_MSB_PATTERN)) +#endif /* CONFIG_64BIT */ + +#endif /* __ASM_GENERIC_FPROBE_H__ */ -- cgit v1.2.3 From a2224559cbba1db3a998dd100c60c85a1d078ad6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:14:32 +0900 Subject: tracing/fprobe: Remove nr_maxactive from fprobe Remove depercated fprobe::nr_maxactive. This involves fprobe events to rejects the maxactive number. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173519007257.391279.946804046982289337.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/fprobe.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 91337bcb452f..702099f08929 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -54,7 +54,6 @@ struct fprobe_hlist { * @nmissed: The counter for missing events. * @flags: The status flag. * @entry_data_size: The private data storage size. - * @nr_maxactive: The max number of active functions. (*deprecated) * @entry_handler: The callback function for function entry. * @exit_handler: The callback function for function exit. * @hlist_array: The fprobe_hlist for fprobe search from IP hash table. @@ -63,7 +62,6 @@ struct fprobe { unsigned long nmissed; unsigned int flags; size_t entry_data_size; - int nr_maxactive; fprobe_entry_cb entry_handler; fprobe_exit_cb exit_handler; -- cgit v1.2.3 From 2bc56fdae1ba3fc80ee37a648346abc5f152357d Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 26 Dec 2024 14:15:14 +0900 Subject: ftrace: Add ftrace_get_symaddr to convert fentry_ip to symaddr This introduces ftrace_get_symaddr() which tries to convert fentry_ip passed by ftrace or fgraph callback to symaddr without calling kallsyms API. It returns the symbol address or 0 if it fails to convert it. Cc: Alexei Starovoitov Cc: Florent Revest Cc: Martin KaFai Lau Cc: bpf Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Alan Maguire Cc: Mark Rutland Link: https://lore.kernel.org/173519011487.391279.5450806886342723151.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412061423.K79V55Hd-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202412061804.5VRzF14E-lkp@intel.com/ Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4c553fe9c026..07092dfb21a4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -622,6 +622,19 @@ enum { FTRACE_MAY_SLEEP = (1 << 5), }; +/* Arches can override ftrace_get_symaddr() to convert fentry_ip to symaddr. 
*/ +#ifndef ftrace_get_symaddr +/** + * ftrace_get_symaddr - return the symbol address from fentry_ip + * @fentry_ip: the address of ftrace location + * + * Get the symbol address from @fentry_ip (fast path). If there is no fast + * search path, this returns 0. + * User may need to use kallsyms API to find the symbol address. + */ +#define ftrace_get_symaddr(fentry_ip) (0) +#endif + #ifdef CONFIG_DYNAMIC_FTRACE void ftrace_arch_code_modify_prepare(void); -- cgit v1.2.3 From 1143be17d7acb02a7c4dba6169a33534983f4960 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 07:44:25 -0700 Subject: io_uring/rw: don't mask in f_iocb_flags A previous commit changed overwriting kiocb->ki_flags with ->f_iocb_flags with masking it in. This breaks for retry situations, where we don't necessarily want to retain previously set flags, like IOCB_NOWAIT. The use case needs IOCB_HAS_METADATA to be persistent, but the change makes all flags persistent, which is an issue. Add a request flag to track whether the request has metadata or not, as that is persistent across issues. Fixes: 59a7d12a7fb5 ("io_uring: introduce attributes for read/write and PI support") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73575d545d3c..623d8e798a11 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -480,6 +480,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_HAS_METADATA_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -560,6 +561,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* request has read/write metadata assigned */ + REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From 6b830c6a023ff6e8fe05dbe47a9e5cd276df09ee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:14 +0100 Subject: netlink: specs: mptcp: add missing 'server-side' attr This attribute is added with the 'created' and 'established' events, but the documentation didn't mention it. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-1-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp_pm.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 50589e5dd6a3..b34fd95b6f84 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -13,12 +13,13 @@ * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A new MPTCP connection has been created. It is the good time - * to allocate memory and send ADD_ADDR if needed. Depending on the - * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED - * is sent. + * sport, dport, server-side A new MPTCP connection has been created. It is + * the good time to allocate memory and send ADD_ADDR if needed. 
Depending on + * the traffic-patterns it can take a long time until the + * MPTCP_EVENT_ESTABLISHED is sent. * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport A MPTCP connection is established (can start new subflows). + * sport, dport, server-side A MPTCP connection is established (can start new + * subflows). * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A * new address has been announced by the peer. -- cgit v1.2.3 From bea87657b5ee8e6f18af2833ee4b88212ef52d28 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 21 Dec 2024 12:09:15 +0100 Subject: netlink: specs: mptcp: clearly mention attributes The rendered version of the MPTCP events [1] looked strange, because the whole content of the 'doc' was displayed in the same block. It was then not clear that the first words, not even ended by a period, were the attributes that are defined when such events are emitted. These attributes have now been moved to the end, prefixed by 'Attributes:' and ended with a period. Note that '>-' has been added after 'doc:' to allow ':' in the text below. The documentation in the UAPI header has been auto-generated by: ./tools/net/ynl/ynl-regen.sh Link: https://docs.kernel.org/networking/netlink_spec/mptcp_pm.html#event-type [1] Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241221-net-mptcp-netlink-specs-pm-doc-fixes-v2-2-e54f2db3f844@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp_pm.h | 53 ++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index b34fd95b6f84..84fa8a21dfd0 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -12,32 +12,33 @@ /** * enum mptcp_event_type * @MPTCP_EVENT_UNSPEC: unused event - * @MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A new MPTCP connection has been created. It is - * the good time to allocate memory and send ADD_ADDR if needed. Depending on - * the traffic-patterns it can take a long time until the - * MPTCP_EVENT_ESTABLISHED is sent. - * @MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side A MPTCP connection is established (can start new - * subflows). - * @MPTCP_EVENT_CLOSED: token A MPTCP connection has stopped. - * @MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] A - * new address has been announced by the peer. - * @MPTCP_EVENT_REMOVED: token, rem_id An address has been lost by the peer. - * @MPTCP_EVENT_SUB_ESTABLISHED: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error] A new - * subflow has been established. 'error' should not be set. - * @MPTCP_EVENT_SUB_CLOSED: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] A subflow has been - * closed. An error (copy of sk_err) could be set if an error has been - * detected for this subflow. - * @MPTCP_EVENT_SUB_PRIORITY: token, family, loc_id, rem_id, saddr4 | saddr6, - * daddr4 | daddr6, sport, dport, backup, if_idx [, error] The priority of a - * subflow has changed. 'error' should not be set. - * @MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 A new PM - * listener is created. 
- * @MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 A PM listener - * is closed. + * @MPTCP_EVENT_CREATED: A new MPTCP connection has been created. It is the + * good time to allocate memory and send ADD_ADDR if needed. Depending on the + * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED + * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new + * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, server-side. + * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. + * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. + * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. + * @MPTCP_EVENT_REMOVED: An address has been lost by the peer. Attributes: + * token, rem_id. + * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of + * sk_err) could be set if an error has been detected for this subflow. + * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | + * daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' + * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | + * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: + * family, sport, saddr4 | saddr6. + * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, + * sport, saddr4 | saddr6. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC, -- cgit v1.2.3 From bc3d482dcc062963e7dc20565be2a887e5fc9a2d Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Fri, 27 Dec 2024 15:47:49 +0100 Subject: rv: Simplify manual steps in monitor creation While creating a new monitor in RV, besides generating code from dot2k, there are a few manual steps which can be tedious and error prone, like adding the tracepoints, makefile lines and kconfig. This patch restructures the existing monitors to keep some files in the monitor's folder itself, which can be automatically generated by future versions of dot2k. Monitors have now their own Kconfig and tracepoint snippets. For simplicity, the main tracepoint definition, is moved to the RV directory, it defines only the tracepoint classes and includes the monitor-specific tracepoints, which reside in the monitor directory. Tracepoints and Kconfig no longer need to be copied and adapted from existing ones but only need to be included in the main files. The Makefile remains untouched since there's little advantage in having a separated Makefile for each monitor with a single line and including it in the main RV Makefile. 
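As an illustration of the new layout, the per-monitor tracepoint snippet that the main RV tracepoint header now pulls in can be as small as the existing 'wip' events, roughly (sketch based on the definitions removed below; the per-monitor file name is an assumption):

	/* e.g. monitors/wip/wip_trace.h - included by the main RV tracepoint
	 * header, which declares the event_da_monitor/error_da_monitor classes */
	#ifdef CONFIG_RV_MON_WIP
	DEFINE_EVENT(event_da_monitor, event_wip,
		     TP_PROTO(char *state, char *event, char *next_state, bool final_state),
		     TP_ARGS(state, event, next_state, final_state));

	DEFINE_EVENT(error_da_monitor, error_wip,
		     TP_PROTO(char *state, char *event),
		     TP_ARGS(state, event));
	#endif /* CONFIG_RV_MON_WIP */
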
Cc: Juri Lelli Cc: Thomas Gleixner Cc: John Kacur Link: https://lore.kernel.org/20241227144752.362911-6-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- include/trace/events/rv.h | 142 ---------------------------------------------- 1 file changed, 142 deletions(-) delete mode 100644 include/trace/events/rv.h (limited to 'include') diff --git a/include/trace/events/rv.h b/include/trace/events/rv.h deleted file mode 100644 index 56592da9301c..000000000000 --- a/include/trace/events/rv.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM rv - -#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RV_H - -#include -#include - -#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT -DECLARE_EVENT_CLASS(event_da_monitor, - - TP_PROTO(char *state, char *event, char *next_state, bool final_state), - - TP_ARGS(state, event, next_state, final_state), - - TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) - ), - - TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->final_state = final_state; - ), - - TP_printk("%s x %s -> %s %s", - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? "(final)" : "") -); - -DECLARE_EVENT_CLASS(error_da_monitor, - - TP_PROTO(char *state, char *event), - - TP_ARGS(state, event), - - TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - ), - - TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - ), - - TP_printk("event %s not expected in the state %s", - __entry->event, - __entry->state) -); - -#ifdef CONFIG_RV_MON_WIP -DEFINE_EVENT(event_da_monitor, event_wip, - TP_PROTO(char *state, char *event, char *next_state, bool final_state), - TP_ARGS(state, event, next_state, final_state)); - -DEFINE_EVENT(error_da_monitor, error_wip, - TP_PROTO(char *state, char *event), - TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_WIP */ -#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ - -#ifdef CONFIG_DA_MON_EVENTS_ID -DECLARE_EVENT_CLASS(event_da_monitor_id, - - TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), - - TP_ARGS(id, state, event, next_state, final_state), - - TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) - ), - - TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->id = id; - __entry->final_state = final_state; - ), - - TP_printk("%d: %s x %s -> %s %s", - __entry->id, - __entry->state, - __entry->event, - __entry->next_state, - __entry->final_state ? 
"(final)" : "") -); - -DECLARE_EVENT_CLASS(error_da_monitor_id, - - TP_PROTO(int id, char *state, char *event), - - TP_ARGS(id, state, event), - - TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - ), - - TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - __entry->id = id; - ), - - TP_printk("%d: event %s not expected in the state %s", - __entry->id, - __entry->event, - __entry->state) -); - -#ifdef CONFIG_RV_MON_WWNR -/* id is the pid of the task */ -DEFINE_EVENT(event_da_monitor_id, event_wwnr, - TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), - TP_ARGS(id, state, event, next_state, final_state)); - -DEFINE_EVENT(error_da_monitor_id, error_wwnr, - TP_PROTO(int id, char *state, char *event), - TP_ARGS(id, state, event)); -#endif /* CONFIG_RV_MON_WWNR */ - -#endif /* CONFIG_DA_MON_EVENTS_ID */ -#endif /* _TRACE_RV_H */ - -/* This part ust be outside protection */ -#undef TRACE_INCLUDE_PATH -#include -- cgit v1.2.3 From 9351bbb1b022227644022850bf2160b04e970195 Mon Sep 17 00:00:00 2001 From: Vasileios Amoiridis Date: Sat, 14 Dec 2024 20:14:21 +0100 Subject: iio: core: mark scan_timestamp as __private Since there are no more direct accesses to the indio_dev->scan_timestamp value, it can be marked as __private and use the macro ACCESS_PRIVATE() in order to access it. Like this, static checkers will be able to inform in case someone tries to either write to the value, or read its value directly. Signed-off-by: Vasileios Amoiridis Link: https://patch.msgid.link/20241214191421.94172-5-vassilisamir@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 2 +- include/linux/iio/iio.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 418b1307d3f2..3b8d618bb3df 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -37,7 +37,7 @@ int iio_pop_from_buffer(struct iio_buffer *buffer, void *data); static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, void *data, int64_t timestamp) { - if (indio_dev->scan_timestamp) { + if (ACCESS_PRIVATE(indio_dev, scan_timestamp)) { size_t ts_offset = indio_dev->scan_bytes / sizeof(int64_t) - 1; ((int64_t *)data)[ts_offset] = timestamp; } diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index ae65890d4567..56161e02f002 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -611,7 +611,7 @@ struct iio_dev { const unsigned long *available_scan_masks; unsigned int __private masklength; const unsigned long *active_scan_mask; - bool scan_timestamp; + bool __private scan_timestamp; struct iio_trigger *trig; struct iio_poll_func *pollfunc; struct iio_poll_func *pollfunc_event; -- cgit v1.2.3 From e2f9d754fc5b5dcb53a0df627f386b63f8ba2d68 Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Fri, 20 Dec 2024 10:59:21 +0100 Subject: iio: trigger: stm32-timer: add support for stm32mp25 Add support for STM32MP25 SoC. Use newly introduced compatible to handle this new HW variant. Add TIM20 trigger definitions that can be used by the stm32 analog-to-digital converter. Use compatible data to identify it. As the counter framework is now superseding the deprecated IIO counter interface (IIO_COUNT), don't support it. Only register IIO trigger devices for ADC usage. So, make the valids_table a cfg option. 
Signed-off-by: Fabrice Gasnier Link: https://patch.msgid.link/20241220095927.1122782-4-fabrice.gasnier@foss.st.com Signed-off-by: Jonathan Cameron --- include/linux/iio/timer/stm32-timer-trigger.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/iio/timer/stm32-timer-trigger.h b/include/linux/iio/timer/stm32-timer-trigger.h index 37572e4dc73a..1ee237b56183 100644 --- a/include/linux/iio/timer/stm32-timer-trigger.h +++ b/include/linux/iio/timer/stm32-timer-trigger.h @@ -72,6 +72,12 @@ #define TIM17_OC1 "tim17_oc1" +#define TIM20_OC1 "tim20_oc1" +#define TIM20_OC2 "tim20_oc2" +#define TIM20_OC3 "tim20_oc3" +#define TIM20_TRGO "tim20_trgo" +#define TIM20_TRGO2 "tim20_trgo2" + #if IS_REACHABLE(CONFIG_IIO_STM32_TIMER_TRIGGER) bool is_stm32_timer_trigger(struct iio_trigger *trig); #else -- cgit v1.2.3 From f718faf3940e95d5d34af9041f279f598396ab7d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 17 Dec 2024 00:48:18 +0000 Subject: freezer, sched: Report frozen tasks as 'D' instead of 'R' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") the frozen task stat was reported as 'D' in cgroup v1. However, after rewriting the core freezer logic, the frozen task stat is reported as 'R'. This is confusing, especially when a task with stat of 'S' is frozen. This bug can be reproduced with these steps: $ cd /sys/fs/cgroup/freezer/ $ mkdir test $ sleep 1000 & [1] 739 // task whose stat is 'S' $ echo 739 > test/cgroup.procs $ echo FROZEN > test/freezer.state $ ps -aux | grep 739 root 739 0.1 0.0 8376 1812 pts/0 R 10:56 0:00 sleep 1000 As shown above, a task whose stat is 'S' was changed to 'R' when it was frozen. To solve this regression, simply maintain the same reported state as before the rewrite. [ mingo: Enhanced the changelog and comments ] Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Chen Ridong Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Michal Koutný Link: https://lore.kernel.org/r/20241217004818.3200515-1-chenridong@huaweicloud.com --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 66b311fbd5d6..64934e0830af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); -- cgit v1.2.3 From b651ea8a44aab69f71c5ebeec7e472b03f1b2ca2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Tue, 24 Dec 2024 09:01:32 -0500 Subject: ACPI: platform_profile: Add devm_platform_profile_register() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Platform profile's lifetime is usually tied to a device's lifetime, therefore add a device managed version of platform_profile_register(). 
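A driver conversion would then typically drop its remove-path cleanup and register from probe, along these lines (sketch only; 'struct example_priv' and its 'pprof' member are illustrative, and the handler is assumed to be filled in exactly as it would be for platform_profile_register()):

	static int example_probe(struct platform_device *pdev)
	{
		struct example_priv *priv;
		int err;

		priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
		if (!priv)
			return -ENOMEM;

		/* fill in priv->pprof (callbacks, supported choices, ...) as before */

		/* lifetime now follows the device, no explicit unregister needed */
		err = devm_platform_profile_register(&priv->pprof);
		if (err)
			return dev_err_probe(&pdev->dev, err,
					     "failed to register platform profile\n");

		return 0;
	}
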
Signed-off-by: Kurt Borja Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20241224140131.30362-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 0682bb4c57e5..f1cd4b65e351 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -41,6 +41,7 @@ struct platform_profile_handler { int platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_remove(struct platform_profile_handler *pprof); +int devm_platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_cycle(void); void platform_profile_notify(struct platform_profile_handler *pprof); -- cgit v1.2.3 From 5ffa0dbfdc9fc05acae02d5b0dc766ec778569ac Mon Sep 17 00:00:00 2001 From: Dawid Niedzwiecki Date: Fri, 6 Dec 2024 09:15:13 +0000 Subject: platform/chrome: cros_ec: jump to RW before probing There are EC devices, like FPMCU, that use RWSIG as a method of authenticating RW section. After the authentication succeeds, EC device waits some time before jumping to RW. EC can be probed before the jump, which means there is a time window after jump to RW in which EC won't respond, because it is not initialized. It can cause a communication errors after probing. To avoid such problems, send the RWSIG continue command first, which skips waiting for the jump to RW. Send the command more times, to make sure EC is ready in RW before the start of the actual probing process. If a EC device doesn't support the RWSIG, it will respond with invalid command error code and probing will continue as usual. Signed-off-by: Dawid Niedzwiecki Link: https://lore.kernel.org/r/20241206091514.2538350-2-dawidn@google.com Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index b34ed0cc1f8d..701389c16fa7 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -246,6 +246,8 @@ int cros_ec_cmd_xfer(struct cros_ec_device *ec_dev, int cros_ec_cmd_xfer_status(struct cros_ec_device *ec_dev, struct cros_ec_command *msg); +int cros_ec_rwsig_continue(struct cros_ec_device *ec_dev); + int cros_ec_query_all(struct cros_ec_device *ec_dev); int cros_ec_get_next_event(struct cros_ec_device *ec_dev, -- cgit v1.2.3 From fb1e493426d4da77a1d192fffa4dc55fc4ad5741 Mon Sep 17 00:00:00 2001 From: Rob Barnes Date: Wed, 18 Dec 2024 01:57:59 +0000 Subject: platform/chrome: cros_ec_lpc: Only check for events on MKBP notifies Only check EC for MKBP events when the ACPI notify value indicates the notify is due to an MKBP host event. This reduces unnecessary queries to the EC. Notify value 0x80 is reserved for devices specific notifies. It is used by many devices to indicate various events. It's only used by cros_ec for MKBP events. 
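The intended use in the LPC driver's ACPI notify handler is roughly the following (hypothetical handler shape; cros_ec_lpc_handle_mkbp_events() is a stand-in name for whatever the driver already does to drain MKBP events):

	static void cros_ec_lpc_acpi_notify(acpi_handle device, u32 value, void *data)
	{
		/* 0x80 is a device-specific notify value; cros_ec uses it only for MKBP */
		if (value != ACPI_NOTIFY_CROS_EC_MKBP)
			return;

		/* only now is it worth querying the EC for pending MKBP events */
		cros_ec_lpc_handle_mkbp_events(data);
	}
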
Signed-off-by: Rob Barnes Link: https://lore.kernel.org/r/20241218015759.3558830-1-robbarnes@google.com Signed-off-by: Tzung-Bi Shih --- include/linux/platform_data/cros_ec_proto.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 701389c16fa7..3ec24f445c29 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -41,6 +41,11 @@ #define EC_MAX_REQUEST_OVERHEAD 1 #define EC_MAX_RESPONSE_OVERHEAD 32 +/* + * ACPI notify value for MKBP host event. + */ +#define ACPI_NOTIFY_CROS_EC_MKBP 0x80 + /* * EC panic is not covered by the standard (0-F) ACPI notify values. * Arbitrarily choosing B0 to notify ec panic, which is in the 84-BF -- cgit v1.2.3 From 1b2ff639ff0cb999285d90c57d3f856b91c2aea6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 30 Dec 2024 12:49:02 +0100 Subject: ALSA: Align the syntax of iov_iter helpers with standard ones We introduced a couple of helpers for copying iomem over iov_iter, and the functions were formed like the former copy_from/to_user(), and the return value was adjusted to 0/-EFAULT, which made the code transition a bit easier at that time. OTOH, the standard copy_from/to_iter() functions have different argument orders and the return value, and this difference can be confusing. It's not only confusing but dangerous; actually I did write a wrong code due to that once :-< For reducing the confusion, this patch changes the syntax of those helpers to align with the standard copy_from/to_iter(). The argument order is changed and the return value is the size of copied bytes. The callers of those functions are updated accordingly, too. Link: https://patch.msgid.link/20241230114903.4959-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 67c99ffbf51b..8becb4504887 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -1532,9 +1532,10 @@ static inline u64 pcm_format_to_bits(snd_pcm_format_t pcm_format) dev_dbg((pcm)->card->dev, fmt, ##args) /* helpers for copying between iov_iter and iomem */ -int copy_to_iter_fromio(struct iov_iter *itert, const void __iomem *src, - size_t count); -int copy_from_iter_toio(void __iomem *dst, struct iov_iter *iter, size_t count); +size_t copy_to_iter_fromio(const void __iomem *src, size_t bytes, + struct iov_iter *iter) __must_check; +size_t copy_from_iter_toio(void __iomem *dst, size_t bytes, + struct iov_iter *iter) __must_check; struct snd_pcm_status64 { snd_pcm_state_t state; /* stream state */ -- cgit v1.2.3 From 6fdbc7b9aa20b1db47d13a5f2a4d31fb2f8f3822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Mon, 30 Dec 2024 14:30:27 +0000 Subject: nvmem: specify ->reg_read/reg_write() expected return values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both ->reg_read() and ->reg_write() return values are not easy to deduce. Explicit that they should return zero on success (and negative values otherwise). Such callbacks, in some alternative world, could return the number of bytes in the success case. 
That would be translated to errors in the nvmem core because of checks like: ret = nvmem->reg_write(nvmem->priv, offset, val, bytes); if (ret) { // error case } This mistake is not just theoretical, see commit 28b008751aa2 ("nvmem: rmem: Fix return value of rmem_read()"). Signed-off-by: Théo Lebrun Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20241230143035.265518-4-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 3ebeaa0ded00..515676ebe598 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -92,8 +92,8 @@ struct nvmem_cell_info { * @read_only: Device is read-only. * @root_only: Device is accessibly to root only. * @of_node: If given, this will be used instead of the parent's of_node. - * @reg_read: Callback to read data. - * @reg_write: Callback to write data. + * @reg_read: Callback to read data; return zero if successful. + * @reg_write: Callback to write data; return zero if successful. * @size: Device size. * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. -- cgit v1.2.3 From 4f3d1be4c2f8a22470f3625cbc778ba2e2130def Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 14 Nov 2024 02:18:32 +0900 Subject: compiler.h: add const_true() __builtin_constant_p() is known for not always being able to produce constant expression [1] which led to the introduction of __is_constexpr() [2]. Because of its dependency on __builtin_constant_p(), statically_true() suffers from the same issues. For example: void foo(int a) { /* fail on GCC */ BUILD_BUG_ON_ZERO(statically_true(a)); /* fail on both clang and GCC */ static char arr[statically_true(a) ? 1 : 2]; } For the same reasons why __is_constexpr() was created to cover __builtin_constant_p() edge cases, __is_constexpr() can be used to resolve statically_true() limitations. Note that, somehow, GCC is not always able to fold this: __is_constexpr(x) && (x) It is OK in BUILD_BUG_ON_ZERO() but not in array declarations nor in static_assert(): void bar(int a) { /* success */ BUILD_BUG_ON_ZERO(__is_constexpr(a) && (a)); /* fail on GCC */ static char arr[__is_constexpr(a) && (a) ? 1 : 2]; /* fail on GCC */ static_assert(__is_constexpr(a) && (a)); } Encapsulating the expression in a __builtin_choose_expr() switch resolves all these failed tests. Define a new const_true() macro which, by making use of the __builtin_choose_expr() and __is_constexpr(x) combo, always produces a constant expression. It should be noted that statically_true() is the only one able to fold tautological expressions in which at least one on the operands is not a constant expression. For example: statically_true(true || var) statically_true(var == var) statically_true(var * 0 + 1) statically_true(!(var * 8 % 4)) always evaluates to true, whereas all of these would be false under const_true() if var is not a constant expression [3]. For this reason, usage of const_true() should be the exception. Reflect in the documentation that const_true() is less powerful and that statically_true() is the overall preferred solution. 
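In practice, the main consumer pattern looks like the follow-up bits.h change, i.e. feeding the result into a construct that itself requires a constant expression (the macro name below is illustrative):

	/* Fails the build only when both operands are compile-time constants
	 * and the condition holds; otherwise evaluates to 0 at compile time. */
	#define EXAMPLE_INPUT_CHECK(h, l)	BUILD_BUG_ON_ZERO(const_true((l) > (h)))
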
[1] __builtin_constant_p cannot resolve to const when optimizing Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=19449 [2] commit 3c8ba0d61d04 ("kernel.h: Retain constant expression output for max()/min()") Link: https://git.kernel.org/torvalds/c/3c8ba0d61d04 [3] https://godbolt.org/z/c61PMxqbK CC: Linus Torvalds CC: Rasmus Villemoes CC: Luc Van Oostenryck Reviewed-by: Yury Norov , Signed-off-by: Vincent Mailhol Signed-off-by: Yury Norov --- include/linux/compiler.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 240c632c5b95..bea92d20f9d2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -330,6 +330,28 @@ static inline void *offset_to_ptr(const int *off) */ #define statically_true(x) (__builtin_constant_p(x) && (x)) +/* + * Similar to statically_true() but produces a constant expression + * + * To be used in conjunction with macros, such as BUILD_BUG_ON_ZERO(), + * which require their input to be a constant expression and for which + * statically_true() would otherwise fail. + * + * This is a trade-off: const_true() requires all its operands to be + * compile time constants. Else, it would always returns false even on + * the most trivial cases like: + * + * true || non_const_var + * + * On the opposite, statically_true() is able to fold more complex + * tautologies and will return true on expressions such as: + * + * !(non_const_var * 8 % 4) + * + * For the general case, statically_true() is better. + */ +#define const_true(x) __builtin_choose_expr(__is_constexpr(x), x, false) + /* * This is needed in functions which generate the stack canary, see * arch/x86/kernel/smpboot.c::start_secondary() for an example. -- cgit v1.2.3 From 4463a445a64b719e6f501d80dcc5872dde42eb73 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 14 Nov 2024 02:18:33 +0900 Subject: linux/bits.h: simplify GENMASK_INPUT_CHECK() In GENMASK_INPUT_CHECK(), __builtin_choose_expr(__is_constexpr((l) > (h)), (l) > (h), 0) is the exact expansion of: const_true((l) > (h)) Apply const_true() to simplify GENMASK_INPUT_CHECK(). CC: Linus Torvalds CC: Rasmus Villemoes CC: Luc Van Oostenryck Reviewed-by: Yury Norov , Signed-off-by: Vincent Mailhol Signed-off-by: Yury Norov --- include/linux/bits.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bits.h b/include/linux/bits.h index 60044b608817..61a75d3f294b 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -20,9 +20,8 @@ */ #if !defined(__ASSEMBLY__) #include -#define GENMASK_INPUT_CHECK(h, l) \ - (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ - __is_constexpr((l) > (h)), (l) > (h), 0))) +#include +#define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) #else /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, -- cgit v1.2.3 From 7f15d4abf925f33015fb62973ce2ddb45ce04bb9 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Dec 2024 16:57:39 +0000 Subject: cpu: Remove unused init_cpu_online The last use of init_cpu_online() was removed by the commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Yury Norov --- include/linux/cpumask.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 9278a50d514f..590d8438514c 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1043,7 +1043,6 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); -void init_cpu_online(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -- cgit v1.2.3 From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:17 +0000 Subject: mm: reinstate ability to map write-sealed memfd mappings read-only Patch series "mm: reinstate ability to map write-sealed memfd mappings read-only". In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. This series reworks how we both permit write-sealed mappings being mapped read-only and disallow mprotect() from undoing the write-seal, fixing this regression. We also add a regression test to ensure that we do not accidentally regress this in future. Thanks to Julian Orth for reporting this regression. This patch (of 2): In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. This was previously unnecessarily disallowed, despite the man page documentation indicating that it would be, thereby limiting the usefulness of F_SEAL_WRITE logic. We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE seal (one which disallows future writes to the memfd) to also be used for F_SEAL_WRITE. For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a read-only mapping to disallow mprotect() from overriding the seal - an operation performed by seal_check_write(), invoked from shmem_mmap(), the f_op->mmap() hook used by shmem mappings. By extending this to F_SEAL_WRITE and critically - checking mapping_map_writable() to determine if we may map the memfd AFTER we invoke shmem_mmap() - the desired logic becomes possible. This is because mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will have cleared. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. We reinstate this functionality by moving the check out of shmem_mmap() and instead performing it in do_mmap() at the point at which VMA flags are being determined, which seems in any case to be a more appropriate place in which to make this determination. In order to achieve this we rework memfd seal logic to allow us access to this information using existing logic and eliminate the clearing of VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() instead. 
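The restored behaviour, from userspace's point of view, is the following (minimal sketch of the expected semantics, error handling mostly omitted):

	#define _GNU_SOURCE
	#include <assert.h>
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = memfd_create("example", MFD_ALLOW_SEALING);

		ftruncate(fd, 4096);
		fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);

		/* read-only shared mapping of a write-sealed memfd: must succeed */
		char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
		assert(p != MAP_FAILED);

		/* ...and mprotect() must not be able to make it writable again */
		assert(mprotect(p, 4096, PROT_READ | PROT_WRITE) != 0);

		return 0;
	}
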
Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") Signed-off-by: Lorenzo Stoakes Reported-by: Julian Orth Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/memfd.h | 14 +++++++++++++ include/linux/mm.h | 58 +++++++++++++++++++++++++++++++++++---------------- 2 files changed, 54 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceaf..d437e3070850 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083..fb397918c43d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4101,6 +4101,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowing moving + * forwards. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writess should be disallowed + * going forward. + */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4143,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. 
For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + if (!is_write_sealed(seals)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } -- cgit v1.2.3 From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 16 Dec 2024 15:11:47 +0800 Subject: mm: hugetlb: independent PMD page table shared count The folio refcount may be increased unexpectly through try_get_folio() by caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ... CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. 
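Conceptually, the sharing and unsharing paths then maintain this dedicated counter instead of the folio refcount, along these lines (simplified sketch of the idea only; the helper names are illustrative and the real mm/hugetlb.c hunks are outside the 'include' scope shown here):

	/* sharing an existing PMD page table: bump the dedicated share count */
	static void pmd_table_share(struct ptdesc *ptdesc)
	{
		ptdesc_pmd_pts_inc(ptdesc);
	}

	/* huge_pmd_unshare()-style check: the table counts as shared only while
	 * pt_share_count is non-zero, so unrelated folio refcount bumps (e.g.
	 * from try_get_folio()) can no longer be mistaken for sharing. */
	static bool pmd_table_unshare_one(struct ptdesc *ptdesc)
	{
		if (!ptdesc_pmd_pts_count(ptdesc))
			return false;

		ptdesc_pmd_pts_dec(ptdesc);
		return true;
	}
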
Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Ken Chen Cc: Muchun Song Cc: Nanyong Sun Cc: Jane Chu Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fb397918c43d..b1c3db9cf355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..332cee285662 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. @@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ -- cgit v1.2.3 From 11673247700e2af3a6a95f7b3f1bb80b691c950e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 19 Dec 2024 14:18:28 +0200 Subject: percpu: remove intermediate variable in PERCPU_PTR() The intermediate variable in the PERCPU_PTR() macro results in a kernel panic on boot [1] due to a compiler bug seen when compiling the kernel (+ KASAN) with gcc 11.3.1, but not when compiling with latest gcc (v14.2)/clang(v18.1). To solve it, remove the intermediate variable (which is not needed) and keep the casting that resolves the address space checks. 
[1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 547 Comm: iptables Not tainted 6.13.0-rc1_external_tested-master #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_ct_netns_do_get+0x139/0x540 Code: 03 00 00 48 81 c4 88 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 4d 8d 75 08 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 27 03 00 00 41 8b 45 08 83 c0 RSP: 0018:ffff888116df75e8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff11022dbeebe RCX: ffffffff839a2382 RDX: 0000000000000003 RSI: 0000000000000008 RDI: ffff88842ec46d10 RBP: 0000000000000002 R08: 0000000000000000 R09: fffffbfff0b0860c R10: ffff888116df75e8 R11: 0000000000000001 R12: ffffffff879d6a80 R13: 0000000000000016 R14: 000000000000001e R15: ffff888116df7908 FS: 00007fba01646740(0000) GS:ffff88842ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bd901800d8 CR3: 00000001205f0003 CR4: 0000000000172eb0 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? __mutex_lock+0x2c2/0x1d70 ? nf_ct_netns_do_get+0x139/0x540 ? nf_ct_netns_do_get+0xb5/0x540 ? net_generic+0x1f0/0x1f0 ? __create_object+0x5e/0x80 xt_check_target+0x1f0/0x930 ? textify_hooks.constprop.0+0x110/0x110 ? pcpu_alloc_noprof+0x7cd/0xcf0 ? xt_find_target+0x148/0x1e0 find_check_entry.constprop.0+0x6c0/0x920 ? get_info+0x380/0x380 ? __virt_addr_valid+0x1df/0x3b0 ? kasan_quarantine_put+0xe3/0x200 ? kfree+0x13e/0x3d0 ? translate_table+0xaf5/0x1750 translate_table+0xbd8/0x1750 ? ipt_unregister_table_exit+0x30/0x30 ? __might_fault+0xbb/0x170 do_ipt_set_ctl+0x408/0x1340 ? nf_sockopt_find.constprop.0+0x17b/0x1f0 ? lock_downgrade+0x680/0x680 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? ipt_register_table+0x440/0x440 ? bit_wait_timeout+0x160/0x160 nf_setsockopt+0x6f/0xd0 raw_setsockopt+0x7e/0x200 ? raw_bind+0x590/0x590 ? do_user_addr_fault+0x812/0xd20 do_sock_setsockopt+0x1e2/0x3f0 ? move_addr_to_user+0x90/0x90 ? lock_downgrade+0x680/0x680 __sys_setsockopt+0x9e/0x100 __x64_sys_setsockopt+0xb9/0x150 ? 
do_syscall_64+0x33/0x140 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fba015134ce Code: 0f 1f 40 00 48 8b 15 59 69 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 0f 1f 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 21 RSP: 002b:00007ffd9de6f388 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 000055bd9017f490 RCX: 00007fba015134ce RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000500 R08: 0000000000000560 R09: 0000000000000052 R10: 000055bd901800e0 R11: 0000000000000246 R12: 000055bd90180140 R13: 000055bd901800e0 R14: 000055bd9017f498 R15: 000055bd9017ff10 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc mlx4_ib mlx4_en mlx4_core rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi fuse ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_core ---[ end trace 0000000000000000 ]--- [akpm@linux-foundation.org: simplification, per Uros] Link: https://lkml.kernel.org/r/20241219121828.2120780-1-gal@nvidia.com Fixes: dabddd687c9e ("percpu: cast percpu pointer in PERCPU_PTR() via unsigned long") Signed-off-by: Gal Pressman Closes: https://lore.kernel.org/all/7590f546-4021-4602-9252-0d525de35b52@nvidia.com Cc: Uros Bizjak Cc: Bill Wendling Cc: Christoph Lameter Cc: Dennis Zhou Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e3879..5b520fe86b60 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP -- cgit v1.2.3 From 202580b60229345dc2637099f10c8a8857c1fdc2 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 20 Dec 2024 15:35:07 +0530 Subject: soc: ti: pruss: Fix pruss APIs PRUSS APIs in pruss_driver.h produce lots of compilation errors when CONFIG_TI_PRUSS is not set. 
The errors and warnings, warning: returning 'void *' from a function with return type 'int' makes integer from pointer without a cast [-Wint-conversion] error: expected identifier or '(' before '{' token Fix these warnings and errors by fixing the return type of pruss APIs as well as removing the misplaced semicolon from pruss_cfg_xfr_enable() Fixes: 0211cc1e4fbb ("soc: ti: pruss: Add helper functions to set GPI mode, MII_RT_event and XFR") Signed-off-by: MD Danish Anwar Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20241220100508.1554309-2-danishanwar@ti.com Signed-off-by: Nishanth Menon --- include/linux/pruss_driver.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pruss_driver.h b/include/linux/pruss_driver.h index c9a31c567e85..2e18fef1a2e1 100644 --- a/include/linux/pruss_driver.h +++ b/include/linux/pruss_driver.h @@ -144,32 +144,32 @@ static inline int pruss_release_mem_region(struct pruss *pruss, static inline int pruss_cfg_get_gpmux(struct pruss *pruss, enum pruss_pru_id pru_id, u8 *mux) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_set_gpmux(struct pruss *pruss, enum pruss_pru_id pru_id, u8 mux) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_gpimode(struct pruss *pruss, enum pruss_pru_id pru_id, enum pruss_gpi_mode mode) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_miirt_enable(struct pruss *pruss, bool enable) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } static inline int pruss_cfg_xfr_enable(struct pruss *pruss, enum pru_type pru_type, - bool enable); + bool enable) { - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; } #endif /* CONFIG_TI_PRUSS */ -- cgit v1.2.3 From 7bac65687510038390a0a54cbe14fba08d037e46 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 19 Dec 2024 22:20:41 +0530 Subject: scsi: ufs: qcom: Power off the PHY if it was already powered on in ufs_qcom_power_up_sequence() PHY might already be powered on during ufs_qcom_power_up_sequence() in a couple of cases: 1. During UFSHCD_QUIRK_REINIT_AFTER_MAX_GEAR_SWITCH quirk 2. Resuming from spm_lvl = 5 suspend In those cases, it is necessary to call phy_power_off() and phy_exit() in ufs_qcom_power_up_sequence() function to power off the PHY before calling phy_init() and phy_power_on(). Case (1) is doing it via ufs_qcom_reinit_notify() callback, but case (2) is not handled. So to satisfy both cases, call phy_power_off() and phy_exit() if the phy_count is non-zero. And with this change, the reinit_notify() callback is no longer needed. 
This fixes the below UFS resume failure with spm_lvl = 5: ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: Enabling the controller failed ufshcd-qcom 1d84000.ufshc: ufshcd_host_reset_and_restore: Host init failed -5 ufs_device_wlun 0:0:0:49488: ufshcd_wl_resume failed: -5 ufs_device_wlun 0:0:0:49488: PM: dpm_run_callback(): scsi_bus_resume returns -5 ufs_device_wlun 0:0:0:49488: PM: failed to resume async: error -5 Cc: stable@vger.kernel.org # 6.3 Fixes: baf5ddac90dc ("scsi: ufs: ufs-qcom: Add support for reinitializing the UFS device") Reported-by: Ram Kumar Dwivedi Tested-by: Amit Pundir # on SM8550-HDK Reviewed-by: Bart Van Assche Tested-by: Neil Armstrong # on SM8550-QRD Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20241219-ufs-qcom-suspend-fix-v3-1-63c4b95a70b9@linaro.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index d650ae6b58d3..74e5b9960c54 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -329,7 +329,6 @@ struct ufs_pwr_mode_info { * @program_key: program or evict an inline encryption key * @fill_crypto_prdt: initialize crypto-related fields in the PRDT * @event_notify: called to notify important events - * @reinit_notify: called to notify reinit of UFSHCD during max gear switch * @mcq_config_resource: called to configure MCQ platform resources * @get_hba_mac: reports maximum number of outstanding commands supported by * the controller. Should be implemented for UFSHCI 4.0 or later @@ -381,7 +380,6 @@ struct ufs_hba_variant_ops { void *prdt, unsigned int num_segments); void (*event_notify)(struct ufs_hba *hba, enum ufs_event_type evt, void *data); - void (*reinit_notify)(struct ufs_hba *); int (*mcq_config_resource)(struct ufs_hba *hba); int (*get_hba_mac)(struct ufs_hba *hba); int (*op_runtime_config)(struct ufs_hba *hba); -- cgit v1.2.3 From 6e67b32087e34965bbcd421251ba42a8e5667669 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 23 Dec 2024 18:01:10 +0000 Subject: scsi: iscsi: Remove unused iscsi_create_session() iscsi_create_session() last use was removed in 2008 by commit 756135215ec7 ("[SCSI] iscsi: remove session and host binding in libiscsi") Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20241223180110.50266-1-linux@treblig.org Signed-off-by: Martin K. 
Petersen --- include/scsi/scsi_transport_iscsi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index bd1243657c01..5474494a1e99 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -447,10 +447,6 @@ extern int iscsi_add_session(struct iscsi_cls_session *session, unsigned int target_id); extern int iscsi_session_event(struct iscsi_cls_session *session, enum iscsi_uevent_e event); -extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, - struct iscsi_transport *t, - int dd_size, - unsigned int target_id); extern void iscsi_force_destroy_session(struct iscsi_cls_session *session); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -- cgit v1.2.3 From 75d0c649eca47af21533d9636723574116fe6987 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:45 -0800 Subject: scsi: ufs: crypto: Add ufs_hba_from_crypto_profile() Add a helper function that encapsulates a container_of expression. For now there are two users but soon there will be more. Tested-by: Bartosz Golaszewski # sm8650 Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20241213041958.202565-3-ebiggers@kernel.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index ce7667b020e2..ec0cbffc44ce 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1211,6 +1211,14 @@ static inline size_t ufshcd_sg_entry_size(const struct ufs_hba *hba) ({ (void)(hba); BUILD_BUG_ON(sg_entry_size != sizeof(struct ufshcd_sg_entry)); }) #endif +#ifdef CONFIG_SCSI_UFS_CRYPTO +static inline struct ufs_hba * +ufs_hba_from_crypto_profile(struct blk_crypto_profile *profile) +{ + return container_of(profile, struct ufs_hba, crypto_profile); +} +#endif + static inline size_t ufshcd_get_ucd_size(const struct ufs_hba *hba) { return sizeof(struct utp_transfer_cmd_desc) + SG_ALL * ufshcd_sg_entry_size(hba); -- cgit v1.2.3 From 409f21010d92a9a5c3b66bacd3b853f4d71ca26c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Dec 2024 20:19:47 -0800 Subject: scsi: ufs: crypto: Remove ufs_hba_variant_ops::program_key There are no longer any implementations of ufs_hba_variant_ops::program_key, so remove it. As a result, ufshcd_program_key() no longer can return an error, so also clean it up to return void. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20241213041958.202565-5-ebiggers@kernel.org Signed-off-by: Martin K. 
Petersen --- include/ufs/ufshcd.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index ec0cbffc44ce..4d9d9cf6ee46 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -324,7 +324,6 @@ struct ufs_pwr_mode_info { * @phy_initialization: used to initialize phys * @device_reset: called to issue a reset pulse on the UFS device * @config_scaling_param: called to configure clock scaling parameters - * @program_key: program or evict an inline encryption key * @fill_crypto_prdt: initialize crypto-related fields in the PRDT * @event_notify: called to notify important events * @reinit_notify: called to notify reinit of UFSHCD during max gear switch @@ -372,8 +371,6 @@ struct ufs_hba_variant_ops { void (*config_scaling_param)(struct ufs_hba *hba, struct devfreq_dev_profile *profile, struct devfreq_simple_ondemand_data *data); - int (*program_key)(struct ufs_hba *hba, - const union ufs_crypto_cfg_entry *cfg, int slot); int (*fill_crypto_prdt)(struct ufs_hba *hba, const struct bio_crypt_ctx *crypt_ctx, void *prdt, unsigned int num_segments); -- cgit v1.2.3 From b59dbb91f7636a89b54ab8fff756afe320ba6549 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:09 +0000 Subject: KVM: arm64: nv: Add handling of EL2-specific timer registers Add the required handling for EL2 and EL02 registers, as well as EL1 registers used in the E2H context. This includes handling the virtual timer accesses when CNTHCTL_EL2.EL1TVT or CNTHCTL_EL2.EL1TVCT are set. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-2-maz@kernel.org Signed-off-by: Marc Zyngier --- include/clocksource/arm_arch_timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index cbbc9a6dc571..877dcbb2601a 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -22,6 +22,8 @@ #define CNTHCTL_EVNTDIR (1 << 3) #define CNTHCTL_EVNTI (0xF << 4) #define CNTHCTL_ECV (1 << 12) +#define CNTHCTL_EL1TVT (1 << 13) +#define CNTHCTL_EL1TVCT (1 << 14) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, -- cgit v1.2.3 From 4bad3068cfa9fc38dd767441871e0edab821105b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:10 +0000 Subject: KVM: arm64: nv: Sync nested timer state with FEAT_NV2 Emulating the timers with FEAT_NV2 is a bit odd, as the timers can be reconfigured behind our back without the hypervisor even noticing. In the VHE case, that's an actual regression in the architecture... 
Co-developed-by: Christoffer Dall Signed-off-by: Christoffer Dall Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-3-maz@kernel.org Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index fd650a8789b9..6e3f6b7ff2b2 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -98,6 +98,7 @@ int __init kvm_timer_hyp_init(bool has_gic); int kvm_timer_enable(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu); void kvm_timer_sync_user(struct kvm_vcpu *vcpu); bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); void kvm_timer_update_run(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 2cd2a77f9c32f1eaf599fb72cbcd0394938a8b58 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:12 +0000 Subject: KVM: arm64: nv: Use FEAT_ECV to trap access to EL0 timers Although FEAT_NV2 makes most things fast, it also makes it impossible to correctly emulate the timers, as the sysreg accesses are redirected to memory. FEAT_ECV addresses this by giving a hypervisor the ability to trap the EL02 sysregs as well as the virtual timer. Add the required trap setting to make use of the feature, allowing us to elide the ugly resync in the middle of the run loop. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-5-maz@kernel.org Signed-off-by: Marc Zyngier --- include/clocksource/arm_arch_timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 877dcbb2601a..c62811fb4130 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -24,6 +24,8 @@ #define CNTHCTL_ECV (1 << 12) #define CNTHCTL_EL1TVT (1 << 13) #define CNTHCTL_EL1TVCT (1 << 14) +#define CNTHCTL_EL1NVPCT (1 << 15) +#define CNTHCTL_EL1NVVCT (1 << 16) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, -- cgit v1.2.3 From 338f8ea51944d02ea29eadb3d5fa9196e74a100d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:13 +0000 Subject: KVM: arm64: nv: Accelerate EL0 timer read accesses when FEAT_ECV in use Although FEAT_ECV allows us to correctly emulate the timers, it also reduces performances pretty badly. Mitigate this by emulating the CTL/CVAL register reads in the inner run loop, without returning to the general kernel. 
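The effect, in rough terms, is that a trapped virtual-counter read can be satisfied inside the hyp run loop with just the offset applied by hand (sketch only; timer_get_offset() is the helper added in the hunk below, register write-back and PC advance are omitted):

	u64 val;

	/* CNTVCT_EL0 read trapped via FEAT_ECV: apply the offset in software */
	val = read_sysreg(cntpct_el0) - timer_get_offset(vcpu_vtimer(vcpu));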
Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-6-maz@kernel.org Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6e3f6b7ff2b2..c1ba31fab6f5 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -156,4 +156,19 @@ static inline bool has_cntpoff(void) return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); } +static inline u64 timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (!ctxt) + return 0; + + if (ctxt->offset.vm_offset) + offset += *ctxt->offset.vm_offset; + if (ctxt->offset.vcpu_offset) + offset += *ctxt->offset.vcpu_offset; + + return offset; +} + #endif -- cgit v1.2.3 From d1e37a50e1d781201768c89314532f6ab87e5a42 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:18 +0000 Subject: KVM: arm64: nv: Sanitise CNTHCTL_EL2 Inject some sanity in CNTHCTL_EL2, ensuring that we don't handle more than we advertise to the guest. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-11-maz@kernel.org Signed-off-by: Marc Zyngier --- include/clocksource/arm_arch_timer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index c62811fb4130..ce6521ad04d1 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -26,6 +26,8 @@ #define CNTHCTL_EL1TVCT (1 << 14) #define CNTHCTL_EL1NVPCT (1 << 15) #define CNTHCTL_EL1NVVCT (1 << 16) +#define CNTHCTL_CNTVMASK (1 << 18) +#define CNTHCTL_CNTPMASK (1 << 19) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, -- cgit v1.2.3 From 0bc9a9e85fcf4ffb69846b961273fde4eb0d03ab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:19 +0000 Subject: KVM: arm64: Work around x1e's CNTVOFF_EL2 bogosity It appears that on Qualcomm's x1e CPU, CNTVOFF_EL2 doesn't really work, specially with HCR_EL2.E2H=1. A non-zero offset results in a screaming virtual timer interrupt, to the tune of a few 100k interrupts per second on a 4 vcpu VM. This is also evidenced by this CPU's inability to correctly run any of the timer selftests. The only case this doesn't break is when this register is set to 0, which breaks VM migration. When HCR_EL2.E2H=0, the timer seems to behave normally, and does not result in an interrupt storm. As a workaround, use the fact that this CPU implements FEAT_ECV, and trap all accesses to the virtual timer and counter, keeping CNTVOFF_EL2 set to zero, and emulate accesses to CVAL/TVAL/CTL and the counter itself, fixing up the timer to account for the missing offset. And if you think this is disgusting, you'd probably be right. 
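In outline the workaround amounts to the following (sketch; cnthctl here stands for the value programmed into CNTHCTL_EL2, and the real code additionally emulates CVAL/TVAL/CTL accesses, folding the missing offset back in via timer_get_offset()):

	if (has_broken_cntvoff() && timer_get_offset(vcpu_vtimer(vcpu))) {
		/* leave CNTVOFF_EL2 at zero and trap the virtual timer/counter instead */
		cnthctl |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT;
	}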
Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-12-maz@kernel.org Signed-off-by: Marc Zyngier --- include/kvm/arm_arch_timer.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index c1ba31fab6f5..681cf0c8b9df 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -151,6 +151,13 @@ void kvm_timer_cpu_down(void); /* CNTKCTL_EL1 valid bits as of DDI0487J.a */ #define CNTKCTL_VALID_BITS (BIT(17) | GENMASK_ULL(9, 0)) +DECLARE_STATIC_KEY_FALSE(broken_cntvoff_key); + +static inline bool has_broken_cntvoff(void) +{ + return static_branch_unlikely(&broken_cntvoff_key); +} + static inline bool has_cntpoff(void) { return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); -- cgit v1.2.3 From bb70b0d48d8eab20644ca0101fedfe23f8a26c59 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 24 Dec 2024 20:37:06 +0200 Subject: devlink: Improve the port attributes description Current PF number description is vague, sometimes interpreted as some PF index. VF number in the PCI specification starts at 1; however in kernel, it starts at 0 for representor model. Improve the description of devlink port attributes PF, VF and SF numbers with these details. Reviewed-by: Sridhar Samudrala Reviewed-by: Shay Drory Reviewed-by: Mark Bloch Reviewed-by: Jiri Pirko Signed-off-by: Parav Pandit Link: https://patch.msgid.link/20241224183706.26571-1-parav@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 58e33959c852..fc79fe2297a1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -35,7 +35,7 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { @@ -47,8 +47,9 @@ struct devlink_port_pci_pf_attrs { /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @controller: Associated controller number - * @pf: Associated PCI PF number for this port. - * @vf: Associated PCI VF for of the PCI PF for this port. + * @pf: associated PCI function number for the devlink port instance + * @vf: associated PCI VF number of a PF for the devlink port instance; + * VF number starts from 0 for the first PCI virtual function * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { @@ -61,8 +62,8 @@ struct devlink_port_pci_vf_attrs { /** * struct devlink_port_pci_sf_attrs - devlink port's PCI SF attributes * @controller: Associated controller number - * @sf: Associated PCI SF for of the PCI PF for this port. - * @pf: Associated PCI PF number for this port. 
+ * @sf: associated SF number of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { -- cgit v1.2.3 From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ 
b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query -- cgit v1.2.3 From 9b6442a3bdd7e0d528122d63c24bd57f6cb05671 Mon Sep 17 00:00:00 2001 From: Vitaliy Shevtsov Date: Wed, 25 Dec 2024 01:45:30 +0000 Subject: ipmi: make ipmi_destroy_user() return void Return value of ipmi_destroy_user() has no meaning, because it's always zero and callers can do nothing with it. And in most cases it's not checked. So make this function return void. This also will eliminate static code analyzer warnings such as unreachable code/redundant comparison when the return value is checked against non-zero value. Found by Linux Verification Center (linuxtesting.org) with Svace. Signed-off-by: Vitaliy Shevtsov Message-ID: <20241225014532.20091-1-v.shevtsov@maxima.ru> Signed-off-by: Corey Minyard --- include/linux/ipmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h index a1c9c0d48ebf..2f74dd90c271 100644 --- a/include/linux/ipmi.h +++ b/include/linux/ipmi.h @@ -126,7 +126,7 @@ int ipmi_create_user(unsigned int if_num, * the users before you destroy the callback structures, it should be * safe, too. */ -int ipmi_destroy_user(struct ipmi_user *user); +void ipmi_destroy_user(struct ipmi_user *user); /* Get the IPMI version of the BMC we are talking to. */ int ipmi_get_version(struct ipmi_user *user, -- cgit v1.2.3 From 401d07d530bfaf7131d6ab9acd32ff12b9a6ddf1 Mon Sep 17 00:00:00 2001 From: Pavan Holla Date: Tue, 31 Dec 2024 13:10:46 +0000 Subject: platform/chrome: Update ChromeOS EC header for UCSI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add EC host commands for reading and writing UCSI structures in the EC. The corresponding kernel driver is cros-ec-ucsi. Also update PD events supported by the EC. 
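For reference, an OPM-side read through the new command could be issued roughly like this (illustrative; cros_ec_cmd() is the usual kernel helper for EC host commands, and the offset/size values are just examples):

	struct ec_params_ucsi_ppm_get req = {
		.offset = 0,	/* e.g. the 16-bit UCSI VERSION field at the start of the data block */
		.size = 2,
	};
	u8 resp[2];
	int ret;

	ret = cros_ec_cmd(ec_dev, 0, EC_CMD_UCSI_PPM_GET,
			  &req, sizeof(req), resp, sizeof(resp));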
Acked-by: Tzung-Bi Shih Signed-off-by: Pavan Holla Signed-off-by: Łukasz Bartosik Link: https://lore.kernel.org/r/20241231131047.1757434-2-ukaszb@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/cros_ec_commands.h | 28 +++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index b3c4993e656e..ecf290a0c98f 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5044,8 +5044,11 @@ struct ec_response_pd_status { #define PD_EVENT_POWER_CHANGE BIT(1) #define PD_EVENT_IDENTITY_RECEIVED BIT(2) #define PD_EVENT_DATA_SWAP BIT(3) +#define PD_EVENT_TYPEC BIT(4) +#define PD_EVENT_PPM BIT(5) + struct ec_response_host_event_status { - uint32_t status; /* PD MCU host event status */ + uint32_t status; /* PD MCU host event status */ } __ec_align4; /* Set USB type-C port role and muxes */ @@ -6105,6 +6108,29 @@ struct ec_response_typec_vdm_response { #undef VDO_MAX_SIZE +/* + * UCSI OPM-PPM commands + * + * These commands are used for communication between OPM and PPM. + * Only UCSI3.0 is tested. + */ + +#define EC_CMD_UCSI_PPM_SET 0x0140 + +/* The data size is stored in the host command protocol header. */ +struct ec_params_ucsi_ppm_set { + uint16_t offset; + uint8_t data[]; +} __ec_align2; + +#define EC_CMD_UCSI_PPM_GET 0x0141 + +/* For 'GET' sub-commands, data will be returned as a raw payload. */ +struct ec_params_ucsi_ppm_get { + uint16_t offset; + uint8_t size; +} __ec_align2; + /*****************************************************************************/ /* The command range 0x200-0x2FF is reserved for Rotor. */ -- cgit v1.2.3 From f1e8bf56320a7fb32095b6c51b707459361b403b Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 24 Dec 2024 21:05:03 +0800 Subject: driver core: Constify API device_find_child() and adapt for various usages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify the following API: struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); To : struct device *device_find_child(struct device *dev, const void *data, device_match_t match); typedef int (*device_match_t)(struct device *dev, const void *data); with the following reasons: - Protect caller's match data @*data which is for comparison and lookup and the API does not actually need to modify @*data. - Make the API's parameters (@match)() and @data have the same type as all of other device finding APIs (bus|class|driver)_find_device(). - All kinds of existing device match functions can be directly taken as the API's argument, they were exported by driver core. Constify the API and adapt for various existing usages. BTW, various subsystem changes are squashed into this commit to meet 'git bisect' requirement, and this commit has the minimal and simplest changes to complement squashing shortcoming, and that may bring extra code improvement. 
Reviewed-by: Alison Schofield Reviewed-by: Takashi Sakamoto Acked-by: Uwe Kleine-König # for drivers/pwm Signed-off-by: Zijun Hu Reviewed-by: Jonathan Cameron Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20241224-const_dfc_done-v5-4-6623037414d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 4 ++-- include/scsi/scsi_transport_iscsi.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 667cb6db9019..0e0bc9bfe0d1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1081,8 +1081,8 @@ int device_for_each_child_reverse(struct device *dev, void *data, int device_for_each_child_reverse_from(struct device *parent, struct device *from, const void *data, int (*fn)(struct device *, const void *)); -struct device *device_find_child(struct device *dev, void *data, - int (*match)(struct device *dev, void *data)); +struct device *device_find_child(struct device *dev, const void *data, + device_match_t match); struct device *device_find_child_by_name(struct device *parent, const char *name); struct device *device_find_any_child(struct device *parent); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index bd1243657c01..4d3baf324900 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -497,8 +497,8 @@ extern void iscsi_destroy_all_flashnode(struct Scsi_Host *shost); extern int iscsi_flashnode_bus_match(struct device *dev, const struct device_driver *drv); extern struct device * -iscsi_find_flashnode_sess(struct Scsi_Host *shost, void *data, - int (*fn)(struct device *dev, void *data)); +iscsi_find_flashnode_sess(struct Scsi_Host *shost, const void *data, + device_match_t fn); extern struct device * iscsi_find_flashnode_conn(struct iscsi_bus_flash_session *fnode_sess); -- cgit v1.2.3 From adf908c965798c33d1148393927a7c0c5d08053c Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 24 Dec 2024 21:05:08 +0800 Subject: driver core: Introduce an device matching API device_match_type() Introduce device_match_type() for purposes below: - Test if a device matches with a specified device type. - As argument of various device finding APIs to find a device with specified type. device_find_child() will use it to simplify operations later. 
Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20241224-const_dfc_done-v5-9-6623037414d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/bus.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index cdc4757217f9..bc3fd74bb763 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -131,6 +131,7 @@ typedef int (*device_match_t)(struct device *dev, const void *data); /* Generic device matching functions that all busses can use to match with */ int device_match_name(struct device *dev, const void *name); +int device_match_type(struct device *dev, const void *type); int device_match_of_node(struct device *dev, const void *np); int device_match_fwnode(struct device *dev, const void *fwnode); int device_match_devt(struct device *dev, const void *pdevt); -- cgit v1.2.3 From 56a50667cbcfaf95eea9128d5676af94e54b51a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 1 Nov 2024 23:09:51 +0100 Subject: i2c: Replace list-based mechanism for handling auto-detected clients So far a list is used to track auto-detected clients per driver. The same functionality can be achieved much simpler by flagging auto-detected clients. Two notes regarding the usage of driver_for_each_device: In our case it can't fail, however the function is annotated __must_check. So a little workaround is needed to avoid a compiler warning. Then we may remove nodes from the list over which we iterate. This is safe, see the explanation at the beginning of lib/klist.c. Signed-off-by: Heiner Kallweit [wsa: fixed description of the new flag] Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 388ce71a29a9..072a62244e14 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -244,7 +244,6 @@ enum i2c_driver_flags { * @id_table: List of I2C devices supported by this driver * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) - * @clients: List of detected clients we created (for i2c-core use only) * @flags: A bitmask of flags defined in &enum i2c_driver_flags * * The driver.owner field should be set to the module owner of this driver. @@ -299,7 +298,6 @@ struct i2c_driver { /* Device detection callback for automatic device creation */ int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; - struct list_head clients; u32 flags; }; @@ -334,6 +332,7 @@ struct i2c_client { #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ +#define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ -- cgit v1.2.3 From 3cfe39b3a845593a485ab1c716615979004ef9f6 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 1 Nov 2024 23:11:39 +0100 Subject: i2c: Replace list-based mechanism for handling userspace-created clients Similar to the list of auto-detected clients, we can also replace the list of userspace-created clients with flagging such client devices. 
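With the flags in place, tearing down such clients becomes a walk over the adapter's children rather than list manipulation; roughly (sketch, not the exact core code):

static int i2c_adapter_del_userspace_client(struct device *dev, void *dummy)
{
	struct i2c_client *client = i2c_verify_client(dev);

	if (client && (client->flags & I2C_CLIENT_USER))
		i2c_unregister_device(client);

	return 0;
}

	/* e.g. during adapter teardown */
	device_for_each_child(&adapter->dev, NULL, i2c_adapter_del_userspace_client);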
Signed-off-by: Heiner Kallweit [wsa: fixed description of the new flag; reordered new code in 'device_store' to have single exit point; fixed whitespace errors; folded cleanup patch into this one] Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 072a62244e14..66fb3d6cf686 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -313,8 +313,6 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @init_irq: IRQ that was set at initialization * @irq: indicates the IRQ generated by this device (if any) - * @detected: member of an i2c_driver.clients list or i2c-core's - * userspace_devices list * @slave_cb: Callback when I2C slave mode of an adapter is used. The adapter * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources @@ -333,6 +331,7 @@ struct i2c_client { #define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_AUTO 0x100 /* client was auto-detected */ +#define I2C_CLIENT_USER 0x200 /* client was userspace-created */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ @@ -344,7 +343,6 @@ struct i2c_client { struct device dev; /* the device structure */ int init_irq; /* irq set at initialization */ int irq; /* irq issued by device */ - struct list_head detected; #if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif @@ -750,9 +748,6 @@ struct i2c_adapter { char name[48]; struct completion dev_released; - struct mutex userspace_clients_lock; - struct list_head userspace_clients; - struct i2c_bus_recovery_info *bus_recovery_info; const struct i2c_adapter_quirks *quirks; -- cgit v1.2.3 From 45d339fefaa3dcd237038769e0d34584fb867390 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 19 Dec 2024 14:23:36 +0200 Subject: RDMA/mlx5: Enable multiplane mode only when it is supported Driver queries vport_cxt.num_plane and enables multiplane when it is greater then 0, but some old FWs (versions from x.40.1000 till x.42.1000), report vport_cxt.num_plane = 1 unexpectedly. Fix it by querying num_plane only when HCA_CAP2.multiplane bit is set. 
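The fix therefore gates the query on the new capability bit, roughly as below (sketch; mlx5_query_num_plane() is a stand-in for the actual vport-context query, only MLX5_CAP_GEN_2() is the real accessor):

	vport_ctx->num_plane = 0;
	if (MLX5_CAP_GEN_2(mdev, multiplane))
		vport_ctx->num_plane = mlx5_query_num_plane(mdev);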
Fixes: 2a5db20fa532 ("RDMA/mlx5: Add support to multi-plane device and port") Link: https://patch.msgid.link/r/1ef901acdf564716fcf550453cf5e94f343777ec.1734610916.git.leon@kernel.org Cc: stable@vger.kernel.org Reported-by: Francesco Poli Closes: https://lore.kernel.org/all/nvs4i2v7o6vn6zhmtq4sgazy2hu5kiulukxcntdelggmznnl7h@so3oul6uwgbl/ Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Reviewed-by: Michal Swiatkowski Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..48d47181c7cd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2119,7 +2119,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0x1]; u8 sf_eq_usage[0x1]; - u8 reserved_at_d3[0xd]; + u8 reserved_at_d3[0x5]; + u8 multiplane[0x1]; + u8 reserved_at_d9[0x7]; u8 cross_vhca_object_to_object_supported[0x20]; -- cgit v1.2.3 From 7a637e5e27a68fd52327a80136d5d0184c43888f Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Tue, 3 Dec 2024 10:59:20 +0100 Subject: drm/msm: Expose uche trap base via uapi This adds MSM_PARAM_UCHE_TRAP_BASE that will be used by Mesa implementation for VK_KHR_shader_clock and GL_ARB_shader_clock. Signed-off-by: Danylo Piliaiev Patchwork: https://patchwork.freedesktop.org/patch/627036/ Signed-off-by: Rob Clark --- include/uapi/drm/msm_drm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index b916aab80dde..2342cb90857e 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -90,6 +90,7 @@ struct drm_msm_timespec { #define MSM_PARAM_RAYTRACING 0x11 /* RO */ #define MSM_PARAM_UBWC_SWIZZLE 0x12 /* RO */ #define MSM_PARAM_MACROTILE_MODE 0x13 /* RO */ +#define MSM_PARAM_UCHE_TRAP_BASE 0x14 /* RO */ /* For backwards compat. The original support for preemption was based on * a single ring per priority level so # of priority levels equals the # -- cgit v1.2.3 From 7687c66c18c66d4ccd9949c6f641c0e7b5773483 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 13 Dec 2024 10:08:23 -0800 Subject: kunit: platform: Resolve 'struct completion' warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the header is included in a test without certain other headers, it produces compiler warnings like: In file included from [...] ../include/kunit/platform_device.h:15:57: warning: ‘struct completion’ declared inside parameter list will not be visible outside of this definition or declaration 15 | struct completion *x); | ^~~~~~~~~~ Add a 'struct completion' forward declaration to resolve this. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412241958.dbAImJsA-lkp@intel.com/ Signed-off-by: Brian Norris Reviewed-by: David Gow Link: https://lore.kernel.org/r/20241213180841.3023843-1-briannorris@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/kunit/platform_device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kunit/platform_device.h b/include/kunit/platform_device.h index 0fc0999d2420..f8236a8536f7 100644 --- a/include/kunit/platform_device.h +++ b/include/kunit/platform_device.h @@ -2,6 +2,7 @@ #ifndef _KUNIT_PLATFORM_DRIVER_H #define _KUNIT_PLATFORM_DRIVER_H +struct completion; struct kunit; struct platform_device; struct platform_driver; -- cgit v1.2.3 From 96ea081ed52bf077cad6d00153b6fba68e510767 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 20 Dec 2024 12:18:18 -0800 Subject: bpf: Reject struct_ops registration that uses module ptr and the module btf_id is missing There is a UAF report in the bpf_struct_ops when CONFIG_MODULES=n. In particular, the report is on tcp_congestion_ops that has a "struct module *owner" member. For struct_ops that has a "struct module *owner" member, it can be extended either by the regular kernel module or by the bpf_struct_ops. bpf_try_module_get() will be used to do the refcounting and different refcount is done based on the owner pointer. When CONFIG_MODULES=n, the btf_id of the "struct module" is missing: WARN: resolve_btfids: unresolved symbol module Thus, the bpf_try_module_get() cannot do the correct refcounting. Not all subsystem's struct_ops requires the "struct module *owner" member. e.g. the recent sched_ext_ops. This patch is to disable bpf_struct_ops registration if the struct_ops has the "struct module *" member and the "struct module" btf_id is missing. The btf_type_is_fwd() helper is moved to the btf.h header file for this test. This has happened since the beginning of bpf_struct_ops which has gone through many changes. The Fixes tag is set to a recent commit that this patch can apply cleanly. Considering CONFIG_MODULES=n is not common and the age of the issue, targeting for bpf-next also. Fixes: 1611603537a4 ("bpf: Create argument information for nullable arguments.") Reported-by: Robert Morris Closes: https://lore.kernel.org/bpf/74665.1733669976@localhost/ Signed-off-by: Martin KaFai Lau Tested-by: Eduard Zingerman Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20241220201818.127152-1-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 4214e76c9168..2a08a2b55592 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -353,6 +353,11 @@ static inline bool btf_type_is_scalar(const struct btf_type *t) return btf_type_is_int(t) || btf_type_is_enum(t); } +static inline bool btf_type_is_fwd(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + static inline bool btf_type_is_typedef(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; -- cgit v1.2.3 From 3e4863d24818a41db42b4f2680715f204657839e Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 16 Dec 2024 19:53:11 +0000 Subject: dt-bindings: pinctrl: renesas: Add alpha-numerical port support for RZ/V2H RZ/V2H has ports P0-P9 and PA-PB. Add support for defining alpha-numerical ports in DT using RZV2H_* macros. 
Signed-off-by: Biju Das Acked-by: Rob Herring (Arm) Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20241216195325.164212-2-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- .../pinctrl/renesas,r9a09g057-pinctrl.h | 31 ++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/dt-bindings/pinctrl/renesas,r9a09g057-pinctrl.h (limited to 'include') diff --git a/include/dt-bindings/pinctrl/renesas,r9a09g057-pinctrl.h b/include/dt-bindings/pinctrl/renesas,r9a09g057-pinctrl.h new file mode 100644 index 000000000000..2e83bf43160b --- /dev/null +++ b/include/dt-bindings/pinctrl/renesas,r9a09g057-pinctrl.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for Renesas RZ/V2H family pinctrl bindings. + * + * Copyright (C) 2024 Renesas Electronics Corp. + * + */ + +#ifndef __DT_BINDINGS_PINCTRL_RENESAS_R9A09G057_PINCTRL_H__ +#define __DT_BINDINGS_PINCTRL_RENESAS_R9A09G057_PINCTRL_H__ + +#include + +/* RZV2H_Px = Offset address of PFC_P_mn - 0x20 */ +#define RZV2H_P0 0 +#define RZV2H_P1 1 +#define RZV2H_P2 2 +#define RZV2H_P3 3 +#define RZV2H_P4 4 +#define RZV2H_P5 5 +#define RZV2H_P6 6 +#define RZV2H_P7 7 +#define RZV2H_P8 8 +#define RZV2H_P9 9 +#define RZV2H_PA 10 +#define RZV2H_PB 11 + +#define RZV2H_PORT_PINMUX(b, p, f) RZG2L_PORT_PINMUX(RZV2H_P##b, p, f) +#define RZV2H_GPIO(port, pin) RZG2L_GPIO(RZV2H_P##port, pin) + +#endif /* __DT_BINDINGS_PINCTRL_RENESAS_R9A09G057_PINCTRL_H__ */ -- cgit v1.2.3 From 5c7fb203d0dbfbfeed51991a4f98499b245634a7 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 16 Dec 2024 19:53:12 +0000 Subject: dt-bindings: pinctrl: renesas: Document RZ/G3E SoC Add documentation for the pin controller found on the Renesas RZ/G3E (R9A09G047) SoC. The RZ/G3E PFC is similar to the RZ/V2H SoC but has more pins(P00-PS3). Acked-by: Conor Dooley Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/20241216195325.164212-3-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- .../pinctrl/renesas,r9a09g047-pinctrl.h | 41 ++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 include/dt-bindings/pinctrl/renesas,r9a09g047-pinctrl.h (limited to 'include') diff --git a/include/dt-bindings/pinctrl/renesas,r9a09g047-pinctrl.h b/include/dt-bindings/pinctrl/renesas,r9a09g047-pinctrl.h new file mode 100644 index 000000000000..5917096720bd --- /dev/null +++ b/include/dt-bindings/pinctrl/renesas,r9a09g047-pinctrl.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * This header provides constants for Renesas RZ/G3E family pinctrl bindings. + * + * Copyright (C) 2024 Renesas Electronics Corp. 
+ * + */ + +#ifndef __DT_BINDINGS_PINCTRL_RENESAS_R9A09G047_PINCTRL_H__ +#define __DT_BINDINGS_PINCTRL_RENESAS_R9A09G047_PINCTRL_H__ + +#include + +/* RZG3E_Px = Offset address of PFC_P_mn - 0x20 */ +#define RZG3E_P0 0 +#define RZG3E_P1 1 +#define RZG3E_P2 2 +#define RZG3E_P3 3 +#define RZG3E_P4 4 +#define RZG3E_P5 5 +#define RZG3E_P6 6 +#define RZG3E_P7 7 +#define RZG3E_P8 8 +#define RZG3E_PA 10 +#define RZG3E_PB 11 +#define RZG3E_PC 12 +#define RZG3E_PD 13 +#define RZG3E_PE 14 +#define RZG3E_PF 15 +#define RZG3E_PG 16 +#define RZG3E_PH 17 +#define RZG3E_PJ 19 +#define RZG3E_PK 20 +#define RZG3E_PL 21 +#define RZG3E_PM 22 +#define RZG3E_PS 28 + +#define RZG3E_PORT_PINMUX(b, p, f) RZG2L_PORT_PINMUX(RZG3E_P##b, p, f) +#define RZG3E_GPIO(port, pin) RZG2L_GPIO(RZG3E_P##port, pin) + +#endif /* __DT_BINDINGS_PINCTRL_RENESAS_R9A09G047_PINCTRL_H__ */ -- cgit v1.2.3 From 7fa4817340161a34d5b4ca39e96d6318d37c1d3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 27 Dec 2024 14:48:29 -0800 Subject: crypto: ahash - make hash walk functions private to ahash.c Due to the removal of the Niagara2 SPU driver, crypto_hash_walk_first(), crypto_hash_walk_done(), crypto_hash_walk_last(), and struct crypto_hash_walk are now only used in crypto/ahash.c. Therefore, make them all private to crypto/ahash.c. I.e. un-export the two functions that were exported, make the functions static, and move the struct definition to the .c file. As part of this, move the functions to earlier in the file to avoid needing to add forward declarations. Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- include/crypto/internal/hash.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include') diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 58967593b6b4..84da3424decc 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -12,20 +12,6 @@ #include struct ahash_request; -struct scatterlist; - -struct crypto_hash_walk { - char *data; - - unsigned int offset; - unsigned int flags; - - struct page *pg; - unsigned int entrylen; - - unsigned int total; - struct scatterlist *sg; -}; struct ahash_instance { void (*free)(struct ahash_instance *inst); @@ -57,15 +43,6 @@ struct crypto_shash_spawn { struct crypto_spawn base; }; -int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); -int crypto_hash_walk_first(struct ahash_request *req, - struct crypto_hash_walk *walk); - -static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk) -{ - return !(walk->entrylen | walk->total); -} - int crypto_register_ahash(struct ahash_alg *alg); void crypto_unregister_ahash(struct ahash_alg *alg); int crypto_register_ahashes(struct ahash_alg *algs, int count); -- cgit v1.2.3 From 6af45d7df1099ccac634b36f8cdfa32fbca8c1d1 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 24 Dec 2024 03:47:53 +0200 Subject: ASoC: hdmi-codec: pass data to get_dai_id too The upcoming DRM connector HDMI codec implementation is going to use codec-specific data in the .get_dai_id to get drm_connector. Pass data to the callback, as it is done with other hdmi_codec_ops callbacks. 
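A user of the callback can then recover its private context directly from @data, e.g. (hypothetical driver; the structure and field names are illustrative):

static int my_hdmi_get_dai_id(struct snd_soc_component *component,
			      struct device_node *endpoint, void *data)
{
	struct my_encoder *enc = data;	/* hdmi_codec_pdata.data set at registration */

	/* map the OF graph endpoint to one of this encoder's DAIs */
	return enc->dai_port;
}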
Acked-by: Mark Brown Tested-by: Dave Stevenson Link: https://patchwork.freedesktop.org/patch/msgid/20241224-drm-bridge-hdmi-connector-v10-1-dc89577cd438@linaro.org Signed-off-by: Dmitry Baryshkov --- include/sound/hdmi-codec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/hdmi-codec.h b/include/sound/hdmi-codec.h index 5e1a9eafd10f..b3407b47b4a7 100644 --- a/include/sound/hdmi-codec.h +++ b/include/sound/hdmi-codec.h @@ -105,7 +105,8 @@ struct hdmi_codec_ops { * Optional */ int (*get_dai_id)(struct snd_soc_component *comment, - struct device_node *endpoint); + struct device_node *endpoint, + void *data); /* * Hook callback function to handle connector plug event. -- cgit v1.2.3 From bb1d67bf82fbd2c550fa637e0b8a966ee81a293b Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 24 Dec 2024 03:47:54 +0200 Subject: ASoC: hdmi-codec: move no_capture_mute to struct hdmi_codec_pdata The no_capture_mute flag might differ from platform to platform, especially in the case of the wrapping implementations, like the upcoming DRM HDMI Codec framework. Move the flag next to all other flags in struct hdmi_codec_pdata. Acked-by: Mark Brown Tested-by: Dave Stevenson Link: https://patchwork.freedesktop.org/patch/msgid/20241224-drm-bridge-hdmi-connector-v10-2-dc89577cd438@linaro.org Signed-off-by: Dmitry Baryshkov --- include/sound/hdmi-codec.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/hdmi-codec.h b/include/sound/hdmi-codec.h index b3407b47b4a7..b220072cfa1b 100644 --- a/include/sound/hdmi-codec.h +++ b/include/sound/hdmi-codec.h @@ -115,9 +115,6 @@ struct hdmi_codec_ops { int (*hook_plugged_cb)(struct device *dev, void *data, hdmi_codec_plugged_cb fn, struct device *codec_dev); - - /* bit field */ - unsigned int no_capture_mute:1; }; /* HDMI codec initalization data */ @@ -129,6 +126,7 @@ struct hdmi_codec_pdata { uint spdif:1; uint no_spdif_playback:1; uint no_spdif_capture:1; + uint no_capture_mute:1; int max_i2s_channels; void *data; }; -- cgit v1.2.3 From baf616647fe6f857a0cf2187197de31e9bb17a71 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 24 Dec 2024 03:47:55 +0200 Subject: drm/connector: implement generic HDMI audio helpers Several DRM drivers implement HDMI codec support (despite its name it applies to both HDMI and DisplayPort drivers). Implement generic framework to be used by these drivers. This removes a requirement to implement get_eld() callback and provides default implementation for codec's plug handling. 
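A connector driver opting in would then do roughly the following at init time (names hypothetical; the signature is the one declared in drm_hdmi_audio_helper.h below):

static const struct drm_connector_hdmi_audio_funcs my_hdmi_audio_funcs = {
	.prepare     = my_hdmi_audio_prepare,
	.shutdown    = my_hdmi_audio_shutdown,
	.mute_stream = my_hdmi_audio_mute_stream,
};

	ret = drm_connector_hdmi_audio_init(connector, dev,
					    &my_hdmi_audio_funcs,
					    8,		/* max I2S playback channels */
					    false,	/* no S/PDIF port */
					    1);		/* sound DAI port */
	if (ret)
		return ret;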
Acked-by: Maxime Ripard Tested-by: Dave Stevenson Link: https://patchwork.freedesktop.org/patch/msgid/20241224-drm-bridge-hdmi-connector-v10-3-dc89577cd438@linaro.org Signed-off-by: Dmitry Baryshkov --- include/drm/display/drm_hdmi_audio_helper.h | 22 ++++++ include/drm/drm_connector.h | 116 ++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 include/drm/display/drm_hdmi_audio_helper.h (limited to 'include') diff --git a/include/drm/display/drm_hdmi_audio_helper.h b/include/drm/display/drm_hdmi_audio_helper.h new file mode 100644 index 000000000000..c9a6faef4109 --- /dev/null +++ b/include/drm/display/drm_hdmi_audio_helper.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: MIT */ + +#ifndef DRM_DISPLAY_HDMI_AUDIO_HELPER_H_ +#define DRM_DISPLAY_HDMI_AUDIO_HELPER_H_ + +#include + +struct drm_connector; +struct drm_connector_hdmi_audio_funcs; + +struct device; + +int drm_connector_hdmi_audio_init(struct drm_connector *connector, + struct device *hdmi_codec_dev, + const struct drm_connector_hdmi_audio_funcs *funcs, + unsigned int max_i2s_playback_channels, + bool spdif_playback, + int sound_dai_port); +void drm_connector_hdmi_audio_plugged_notify(struct drm_connector *connector, + bool plugged); + +#endif diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index d1be19242a5c..1d4c27948e87 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -46,6 +46,8 @@ struct drm_property_blob; struct drm_printer; struct drm_privacy_screen; struct edid; +struct hdmi_codec_daifmt; +struct hdmi_codec_params; struct i2c_adapter; enum drm_connector_force { @@ -1141,6 +1143,53 @@ struct drm_connector_state { struct drm_connector_hdmi_state hdmi; }; +struct drm_connector_hdmi_audio_funcs { + /** + * @startup: + * + * Called when ASoC starts an audio stream setup. The + * @startup() is optional. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + int (*startup)(struct drm_connector *connector); + + /** + * @prepare: + * Configures HDMI-encoder for audio stream. Can be called + * multiple times for each setup. Mandatory. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + int (*prepare)(struct drm_connector *connector, + struct hdmi_codec_daifmt *fmt, + struct hdmi_codec_params *hparms); + + /** + * @shutdown: + * + * Shut down the audio stream. Mandatory. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + void (*shutdown)(struct drm_connector *connector); + + /** + * @mute_stream: + * + * Mute/unmute HDMI audio stream. The @mute_stream callback is + * optional. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + int (*mute_stream)(struct drm_connector *connector, + bool enable, int direction); +}; + /** * struct drm_connector_hdmi_funcs - drm_hdmi_connector control functions */ @@ -1660,6 +1709,68 @@ struct drm_cmdline_mode { bool tv_mode_specified; }; +/** + * struct drm_connector_hdmi_audio - DRM gemeric HDMI Codec-related structure + * + * HDMI drivers usually incorporate a HDMI Codec. This structure expresses the + * generic HDMI Codec as used by the DRM HDMI Codec framework. + */ +struct drm_connector_hdmi_audio { + /** + * @funcs: + * + * Implementation of the HDMI codec functionality to be used by the DRM + * HDMI Codec framework. + */ + const struct drm_connector_hdmi_audio_funcs *funcs; + + /** + * @codec_pdev: + * + * Platform device created to hold the HDMI Codec. It will be + * automatically unregistered during drm_connector_cleanup(). 
+ */ + struct platform_device *codec_pdev; + + /** + * @lock: + * + * Mutex to protect @last_state, @plugged_cb and @plugged_cb_dev. + */ + struct mutex lock; + + /** + * @plugged_cb: + * + * Callback to be called when the HDMI sink get plugged to or unplugged + * from this connector. This is assigned by the framework when + * requested by the ASoC code. + */ + void (*plugged_cb)(struct device *dev, bool plugged); + + /** + * @plugged_cb_dev: + * + * The data for @plugged_cb(). It is being provided by the ASoC. + */ + struct device *plugged_cb_dev; + + /** + * @last_state: + * + * Last plugged state recored by the framework. It is used to correctly + * report the state to @plugged_cb(). + */ + bool last_state; + + /** + * @dai_port: + * + * The port in DT that is used for the Codec DAI. + */ + int dai_port; +}; + /* * struct drm_connector_hdmi - DRM Connector HDMI-related structure */ @@ -2121,6 +2232,11 @@ struct drm_connector { * @hdmi: HDMI-related variable and properties. */ struct drm_connector_hdmi hdmi; + + /** + * @hdmi_audio: HDMI codec properties and non-DRM state. + */ + struct drm_connector_hdmi_audio hdmi_audio; }; #define obj_to_connector(x) container_of(x, struct drm_connector, base) -- cgit v1.2.3 From 0beba3f9d366c6df10e5b080fc99c45ac17248ed Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 24 Dec 2024 03:47:56 +0200 Subject: drm/bridge: connector: add support for HDMI codec framework Add necessary glue code to be able to use new HDMI codec framework from the DRM bridge drivers. The drm_bridge implements a limited set of the hdmi_codec_ops interface, with the functions accepting both drm_connector and drm_bridge instead of just a generic void pointer. This framework is integrated with the DRM HDMI Connector framework, but can also be used for DisplayPort connectors. Reviewed-by: Maxime Ripard Tested-by: Dave Stevenson Link: https://patchwork.freedesktop.org/patch/msgid/20241224-drm-bridge-hdmi-connector-v10-4-dc89577cd438@linaro.org Signed-off-by: Dmitry Baryshkov --- include/drm/drm_bridge.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include') diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h index e8d735b7f6a4..4b84faf14e36 100644 --- a/include/drm/drm_bridge.h +++ b/include/drm/drm_bridge.h @@ -41,6 +41,8 @@ struct drm_display_info; struct drm_minor; struct drm_panel; struct edid; +struct hdmi_codec_daifmt; +struct hdmi_codec_params; struct i2c_adapter; /** @@ -676,6 +678,57 @@ struct drm_bridge_funcs { enum hdmi_infoframe_type type, const u8 *buffer, size_t len); + /** + * @hdmi_audio_startup: + * + * Called when ASoC starts an audio stream setup. The + * @hdmi_audio_startup() is optional. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + int (*hdmi_audio_startup)(struct drm_connector *connector, + struct drm_bridge *bridge); + + /** + * @prepare: + * Configures HDMI-encoder for audio stream. Can be called multiple + * times for each setup. Mandatory if HDMI audio is enabled in the + * bridge's configuration. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + int (*hdmi_audio_prepare)(struct drm_connector *connector, + struct drm_bridge *bridge, + struct hdmi_codec_daifmt *fmt, + struct hdmi_codec_params *hparms); + + /** + * @hdmi_audio_shutdown: + * + * Shut down the audio stream. Mandatory if HDMI audio is enabled in + * the bridge's configuration. 
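On the bridge side the opt-in is data-driven: the bridge fills in the new fields and implements the hdmi_audio_* hooks, e.g. (sketch; values and the funcs struct are illustrative):

	bridge->hdmi_audio_dev = dev;
	bridge->hdmi_audio_max_i2s_playback_channels = 8;
	bridge->hdmi_audio_spdif_playback = 0;
	bridge->hdmi_audio_dai_port = 2;	/* matches the sound DAI in DT; -1 disables audio */
	bridge->funcs = &my_bridge_funcs;	/* provides .hdmi_audio_prepare/.hdmi_audio_shutdown */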
+ * + * Returns: + * 0 on success, a negative error code otherwise + */ + void (*hdmi_audio_shutdown)(struct drm_connector *connector, + struct drm_bridge *bridge); + + /** + * @hdmi_audio_mute_stream: + * + * Mute/unmute HDMI audio stream. The @hdmi_audio_mute_stream callback + * is optional. + * + * Returns: + * 0 on success, a negative error code otherwise + */ + int (*hdmi_audio_mute_stream)(struct drm_connector *connector, + struct drm_bridge *bridge, + bool enable, int direction); + /** * @debugfs_init: * @@ -859,6 +912,27 @@ struct drm_bridge { * @DRM_BRIDGE_OP_HDMI is set. */ unsigned int max_bpc; + + /** + * @hdmi_audio_dev: device to be used as a parent for the HDMI Codec + */ + struct device *hdmi_audio_dev; + + /** + * @hdmi_audio_max_i2s_playback_channels: maximum number of playback + * I2S channels for the HDMI codec + */ + int hdmi_audio_max_i2s_playback_channels; + + /** + * @hdmi_audio_spdif_playback: set if HDMI codec has S/PDIF playback port + */ + unsigned int hdmi_audio_spdif_playback : 1; + + /** + * @hdmi_audio_dai_port: sound DAI port, -1 if it is not enabled + */ + int hdmi_audio_dai_port; }; static inline struct drm_bridge * -- cgit v1.2.3 From ab716b74dc9dd4903b9006f473137e1aa624af56 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 24 Dec 2024 03:47:58 +0200 Subject: drm/display/hdmi: implement hotplug functions The HDMI Connectors need to perform a variety of tasks when the HDMI connector state changes. Such tasks include setting or invalidating CEC address, notifying HDMI codec driver, updating scrambler data, etc. Implementing such tasks in a driver-specific callbacks is error prone. Start implementing the generic helper function (currently handling only the HDMI Codec framework) to be used by drivers utilizing HDMI Connector framework. 
Reviewed-by: Maxime Ripard Tested-by: Dave Stevenson Link: https://patchwork.freedesktop.org/patch/msgid/20241224-drm-bridge-hdmi-connector-v10-6-dc89577cd438@linaro.org Signed-off-by: Dmitry Baryshkov --- include/drm/display/drm_hdmi_state_helper.h | 5 +++++ include/drm/drm_connector.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/drm/display/drm_hdmi_state_helper.h b/include/drm/display/drm_hdmi_state_helper.h index d6d65da6d8f9..9ae19f3caf72 100644 --- a/include/drm/display/drm_hdmi_state_helper.h +++ b/include/drm/display/drm_hdmi_state_helper.h @@ -8,6 +8,8 @@ struct drm_connector; struct drm_connector_state; struct hdmi_audio_infoframe; +enum drm_connector_status; + void __drm_atomic_helper_connector_hdmi_reset(struct drm_connector *connector, struct drm_connector_state *new_conn_state); @@ -19,6 +21,9 @@ int drm_atomic_helper_connector_hdmi_update_audio_infoframe(struct drm_connector int drm_atomic_helper_connector_hdmi_clear_audio_infoframe(struct drm_connector *connector); int drm_atomic_helper_connector_hdmi_update_infoframes(struct drm_connector *connector, struct drm_atomic_state *state); +void drm_atomic_helper_connector_hdmi_hotplug(struct drm_connector *connector, + enum drm_connector_status status); +void drm_atomic_helper_connector_hdmi_force(struct drm_connector *connector); enum drm_mode_status drm_hdmi_connector_mode_valid(struct drm_connector *connector, diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 1d4c27948e87..f13d597370a3 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -45,6 +45,7 @@ struct drm_property; struct drm_property_blob; struct drm_printer; struct drm_privacy_screen; +struct drm_edid; struct edid; struct hdmi_codec_daifmt; struct hdmi_codec_params; @@ -1247,6 +1248,21 @@ struct drm_connector_hdmi_funcs { int (*write_infoframe)(struct drm_connector *connector, enum hdmi_infoframe_type type, const u8 *buffer, size_t len); + + /** + * @read_edid: + * + * This callback is used by the framework as a replacement for reading + * the EDID from connector->ddc. It is still recommended to provide + * connector->ddc instead of implementing this callback. Returned EDID + * should be freed via the drm_edid_free(). + * + * The @read_edid callback is optional. + * + * Returns: + * Valid EDID on success, NULL in case of failure. + */ + const struct drm_edid *(*read_edid)(struct drm_connector *connector); }; /** -- cgit v1.2.3 From af6505e5745b9f3a670de405b08b73573343c15c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. 
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/fs.h | 14 +++++++++++++- include/uapi/linux/fs.h | 6 +++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' -- cgit v1.2.3 From d7bde4f27ceef3dc6d72010a20d4da23db835a32 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Dec 2024 12:55:18 -0500 Subject: Revert "libfs: Add simple_offset_empty()" simple_empty() and simple_offset_empty() perform the same task. The latter's use as a canary to find bugs has not found any new issues. A subsequent patch will remove the use of the mtree for iterating directory contents, so revert back to using a similar mechanism for determining whether a directory is indeed empty. Only one such mechanism is ever needed. 
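
For illustration, a schematic rmdir path of the kind that goes back to simple_empty() after this revert; foo_rmdir() is hypothetical and not taken from any particular filesystem:

#include <linux/fs.h>

static int foo_rmdir(struct inode *dir, struct dentry *dentry)
{
	/* simple_empty() checks the in-memory children directly; no
	 * offset-map based canary is needed any more. */
	if (!simple_empty(dentry))
		return -ENOTEMPTY;

	drop_nlink(d_inode(dentry));
	drop_nlink(dir);
	/* ...plus the filesystem's usual unlink bookkeeping... */
	return 0;
}
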
Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/20241228175522.1854234-3-cel@kernel.org Reviewed-by: Yang Erkun Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..f7efc6866ebc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3468,7 +3468,6 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); -int simple_offset_empty(struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, -- cgit v1.2.3 From 9897831de614f1d8d5184547f0e7bf7665eed436 Mon Sep 17 00:00:00 2001 From: Vasily Khoruzhick Date: Fri, 3 Jan 2025 23:36:57 -0800 Subject: dt-bindings: clock: sunxi: Export PLL_VIDEO_2X and PLL_MIPI Export PLL_VIDEO_2X and PLL_MIPI, these will be used to explicitly select TCON0 clock parent in dts Fixes: ca1170b69968 ("clk: sunxi-ng: a64: force select PLL_MIPI in TCON0 mux") Reviewed-by: Dragan Simic Reviewed-by: Chen-Yu Tsai Tested-by: Frank Oltmanns # on PinePhone Tested-by: Stuart Gathman # on OG Pinebook Signed-off-by: Vasily Khoruzhick Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250104074035.1611136-2-anarsoul@gmail.com Signed-off-by: Chen-Yu Tsai --- include/dt-bindings/clock/sun50i-a64-ccu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/sun50i-a64-ccu.h b/include/dt-bindings/clock/sun50i-a64-ccu.h index 175892189e9d..4f220ea7a23c 100644 --- a/include/dt-bindings/clock/sun50i-a64-ccu.h +++ b/include/dt-bindings/clock/sun50i-a64-ccu.h @@ -44,7 +44,9 @@ #define _DT_BINDINGS_CLK_SUN50I_A64_H_ #define CLK_PLL_VIDEO0 7 +#define CLK_PLL_VIDEO0_2X 8 #define CLK_PLL_PERIPH0 11 +#define CLK_PLL_MIPI 17 #define CLK_CPUX 21 #define CLK_BUS_MIPI_DSI 28 -- cgit v1.2.3 From 7716d085531bf797c882ed67eda184ac58a387a8 Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Mon, 30 Dec 2024 16:13:52 +0100 Subject: iio: gts-helper: add helpers to ease searches of gain_sel and new_gain This helper functions reduce the burden in the drivers that want to fetch a gain and time selector for a given scale or a new optimal gain. The former is currently achieved by calling iio_gts_find_gain_sel_for_scale_using_time() for the current time selector, and then iterating over the rest of time selectors if the gain selector was not found. The latter requires a combination of multiple iio-gts helpers to find the new gain, look for an optimal gain if there was no exact match, and set a minimum gain if the optimal gain is not in the range of available gains. Provide simpler workflows by means of functions that address common patterns in the users of the iio-gts helpers. 
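
A sketch of the simplified consumer pattern, assuming a hypothetical sensor driver; only iio_gts_find_gain_time_sel_for_scale() comes from this series, and foo_hw_write_gain_time() is made up:

#include <linux/iio/iio-gts-helper.h>

/* Stand-in for the sensor's register write (hypothetical). */
static int foo_hw_write_gain_time(int gain_sel, int time_sel);

static int foo_set_scale(struct iio_gts *gts, int scale_int, int scale_nano)
{
	int gain_sel, time_sel, ret;

	/* One call replaces the old "try the current time selector, then
	 * iterate the remaining ones" pattern. */
	ret = iio_gts_find_gain_time_sel_for_scale(gts, scale_int, scale_nano,
						   &gain_sel, &time_sel);
	if (ret)
		return ret;

	return foo_hw_write_gain_time(gain_sel, time_sel);
}
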
Acked-by: Matti Vaittinen Signed-off-by: Javier Carrasco Link: https://patch.msgid.link/20241230-veml3235_scale-v3-1-48a5795e2f64@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio-gts-helper.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/iio/iio-gts-helper.h b/include/linux/iio/iio-gts-helper.h index 9cb6c80dea71..e5de7a124bad 100644 --- a/include/linux/iio/iio-gts-helper.h +++ b/include/linux/iio/iio-gts-helper.h @@ -188,6 +188,9 @@ int iio_gts_total_gain_to_scale(struct iio_gts *gts, int total_gain, int iio_gts_find_gain_sel_for_scale_using_time(struct iio_gts *gts, int time_sel, int scale_int, int scale_nano, int *gain_sel); +int iio_gts_find_gain_time_sel_for_scale(struct iio_gts *gts, int scale_int, + int scale_nano, int *gain_sel, + int *time_sel); int iio_gts_get_scale(struct iio_gts *gts, int gain, int time, int *scale_int, int *scale_nano); int iio_gts_find_new_gain_sel_by_old_gain_time(struct iio_gts *gts, @@ -196,6 +199,9 @@ int iio_gts_find_new_gain_sel_by_old_gain_time(struct iio_gts *gts, int iio_gts_find_new_gain_by_old_gain_time(struct iio_gts *gts, int old_gain, int old_time, int new_time, int *new_gain); +int iio_gts_find_new_gain_by_gain_time_min(struct iio_gts *gts, int old_gain, + int old_time, int new_time, + int *new_gain, bool *in_range); int iio_gts_avail_times(struct iio_gts *gts, const int **vals, int *type, int *length); int iio_gts_all_avail_scales(struct iio_gts *gts, const int **vals, int *type, -- cgit v1.2.3 From 3479c7549fb1dfa7a1db4efb7347c7b8ef50de4b Mon Sep 17 00:00:00 2001 From: Zhongqiu Duan Date: Thu, 2 Jan 2025 17:14:26 +0000 Subject: tcp/dccp: allow a connection when sk_max_ack_backlog is zero If the backlog of listen() is set to zero, sk_acceptq_is_full() allows one connection to be made, but inet_csk_reqsk_queue_is_full() does not. When the net.ipv4.tcp_syncookies is zero, inet_csk_reqsk_queue_is_full() will cause an immediate drop before the sk_acceptq_is_full() check in tcp_conn_request(), resulting in no connection can be made. This patch tries to keep consistent with 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes."). 
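
For reference, a userspace sketch of the case this restores: a listener created with a backlog of zero should still be able to take one connection. Illustrative only, error handling omitted:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int make_zero_backlog_listener(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = 0,				/* any free port */
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	bind(fd, (struct sockaddr *)&addr, sizeof(addr));
	/* With this fix a single connection can still be established even
	 * though the backlog is zero, matching sk_acceptq_is_full(). */
	listen(fd, 0);
	return fd;
}
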
Link: https://lore.kernel.org/netdev/20250102080258.53858-1-kuniyu@amazon.com/ Fixes: ef547f2ac16b ("tcp: remove max_qlen_log") Signed-off-by: Zhongqiu Duan Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250102171426.915276-1-dzq.aishenghu0@gmail.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 3c82fad904d4..c7f42844c79a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -282,7 +282,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { - return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog); + return inet_csk_reqsk_queue_len(sk) > READ_ONCE(sk->sk_max_ack_backlog); } bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); -- cgit v1.2.3 From 7ccbe076d987598b04b4b9c9b61f042291f9cc77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micka=C3=ABl=20Sala=C3=BCn?= Date: Fri, 22 Nov 2024 15:33:31 +0100 Subject: lsm: Only build lsm_audit.c if CONFIG_SECURITY and CONFIG_AUDIT are set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_AUDIT is set, its CONFIG_NET dependency is also set, and the dev_get_by_index and init_net symbols (used by dump_common_audit_data) are found by the linker. dump_common_audit_data() should then failed to build when CONFIG_NET is not set. However, because the compiler is smart, it knows that audit_log_start() always return NULL when !CONFIG_AUDIT, and it doesn't build the body of common_lsm_audit(). As a side effect, dump_common_audit_data() is not built and the linker doesn't error out because of missing symbols. Let's only build lsm_audit.o when CONFIG_SECURITY and CONFIG_AUDIT are both set, which is checked with the new CONFIG_HAS_SECURITY_AUDIT. ipv4_skb_to_auditdata() and ipv6_skb_to_auditdata() are only used by Smack if CONFIG_AUDIT is set, so they don't need fake implementations. Because common_lsm_audit() is used in multiple places without CONFIG_AUDIT checks, add a fake implementation. Link: https://lore.kernel.org/r/20241122143353.59367-2-mic@digikod.net Cc: Casey Schaufler Cc: James Morris Cc: Paul Moore Cc: Serge E. 
Hallyn Signed-off-by: Mickaël Salaün Signed-off-by: Paul Moore --- include/linux/lsm_audit.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 97a8b21eb033..c2b01380262c 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -116,14 +116,28 @@ struct common_audit_data { #define v4info fam.v4 #define v6info fam.v6 +#ifdef CONFIG_AUDIT + int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto); +#if IS_ENABLED(CONFIG_IPV6) int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto); +#endif /* IS_ENABLED(CONFIG_IPV6) */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)); +#else /* CONFIG_AUDIT */ + +static inline void common_lsm_audit(struct common_audit_data *a, + void (*pre_audit)(struct audit_buffer *, void *), + void (*post_audit)(struct audit_buffer *, void *)) +{ +} + +#endif /* CONFIG_AUDIT */ + #endif -- cgit v1.2.3 From 6aeb4f836480617be472de767c4cb09c1060a067 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jan 2025 08:33:57 +0100 Subject: block: remove bio_add_pc_page Lift bio_split_rw_at into blk_rq_append_bio so that it validates the hardware limits. With this all passthrough callers can simply add bio_add_page to build the bio and delay checking for exceeding of limits to this point instead of doing it for each page. While this looks like adding a new expensive loop over all bio_vecs, blk_rq_append_bio is already doing that just to counter the number of segments. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250103073417.459715-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 1eec59699100..4b79bf50f4f0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,8 +413,6 @@ int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); -extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, - unsigned int, unsigned int); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, -- cgit v1.2.3 From 02ee5d69e3baf2796ba75b928fcbc9cf7884c5e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jan 2025 08:33:58 +0100 Subject: block: remove blk_rq_bio_prep There is not real point in a helper just to assign three values to four fields, especially when the surrounding code is working on the neighbor fields directly. 
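
What callers end up doing instead is simply the open-coded assignments; a schematic version follows (the foo_attach_bio() wrapper is illustrative, the field assignments mirror the helper body removed in the hunk below):

#include <linux/blk-mq.h>

static void foo_attach_bio(struct request *rq, struct bio *bio,
			   unsigned int nr_segs)
{
	/* The three assignments the removed helper used to wrap. */
	rq->nr_phys_segments = nr_segs;
	rq->__data_len = bio->bi_iter.bi_size;
	rq->bio = rq->biotail = bio;
}
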
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250103073417.459715-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7f6c482ebf54..6340293511c9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -978,14 +978,6 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; -} - void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); -- cgit v1.2.3 From 600258d555f0710b9c47fb78d2d80a4aecd608cc Mon Sep 17 00:00:00 2001 From: Alexandre Cassen Date: Thu, 2 Jan 2025 12:11:11 +0200 Subject: xfrm: delete intermediate secpath entry in packet offload mode Packets handled by hardware have added secpath as a way to inform XFRM core code that this path was already handled. That secpath is not needed at all after policy is checked and it is removed later in the stack. However, in the case of IP forwarding is enabled (/proc/sys/net/ipv4/ip_forward), that secpath is not removed and packets which already were handled are reentered to the driver TX path with xfrm_offload set. The following kernel panic is observed in mlx5 in such case: mlx5_core 0000:04:00.0 enp4s0f0np0: Link up mlx5_core 0000:04:00.1 enp4s0f1np1: Link up Initializing XFRM netlink socket IPsec XFRM device driver BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page PGD 0 P4D 0 Oops: Oops: 0010 [#1] PREEMPT SMP CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc1-alex #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6. RSP: 0018:ffffb87380003800 EFLAGS: 00010206 RAX: ffff8df004e02600 RBX: ffffb873800038d8 RCX: 00000000ffff98cf RDX: ffff8df00733e108 RSI: ffff8df00521fb80 RDI: ffff8df001661f00 RBP: ffffb87380003850 R08: ffff8df013980000 R09: 0000000000000010 R10: 0000000000000002 R11: 0000000000000002 R12: ffff8df001661f00 R13: ffff8df00521fb80 R14: ffff8df00733e108 R15: ffff8df011faf04e FS: 0000000000000000(0000) GS:ffff8df46b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000106384000 CR4: 0000000000350ef0 Call Trace: ? show_regs+0x63/0x70 ? __die_body+0x20/0x60 ? __die+0x2b/0x40 ? page_fault_oops+0x15c/0x550 ? do_user_addr_fault+0x3ed/0x870 ? exc_page_fault+0x7f/0x190 ? asm_exc_page_fault+0x27/0x30 mlx5e_ipsec_handle_tx_skb+0xe7/0x2f0 [mlx5_core] mlx5e_xmit+0x58e/0x1980 [mlx5_core] ? __fib_lookup+0x6a/0xb0 dev_hard_start_xmit+0x82/0x1d0 sch_direct_xmit+0xfe/0x390 __dev_queue_xmit+0x6d8/0xee0 ? __fib_lookup+0x6a/0xb0 ? internal_add_timer+0x48/0x70 ? mod_timer+0xe2/0x2b0 neigh_resolve_output+0x115/0x1b0 __neigh_update+0x26a/0xc50 neigh_update+0x14/0x20 arp_process+0x2cb/0x8e0 ? __napi_build_skb+0x5e/0x70 arp_rcv+0x11e/0x1c0 ? dev_gro_receive+0x574/0x820 __netif_receive_skb_list_core+0x1cf/0x1f0 netif_receive_skb_list_internal+0x183/0x2a0 napi_complete_done+0x76/0x1c0 mlx5e_napi_poll+0x234/0x7a0 [mlx5_core] __napi_poll+0x2d/0x1f0 net_rx_action+0x1a6/0x370 ? 
atomic_notifier_call_chain+0x3b/0x50 ? irq_int_handler+0x15/0x20 [mlx5_core] handle_softirqs+0xb9/0x2f0 ? handle_irq_event+0x44/0x60 irq_exit_rcu+0xdb/0x100 common_interrupt+0x98/0xc0 asm_common_interrupt+0x27/0x40 RIP: 0010:pv_native_safe_halt+0xb/0x10 Code: 09 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 0f 22 0f 1f 84 00 00 00 00 00 90 eb 07 0f 00 2d 7f e9 36 00 fb 40 00 83 ff 07 77 21 89 ff ff 24 fd 88 3d a1 bd 0f 21 f8 RSP: 0018:ffffffffbe603de8 EFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000f92f46680 RDX: 0000000000000037 RSI: 00000000ffffffff RDI: 00000000000518d4 RBP: ffffffffbe603df0 R08: 000000cd42e4dffb R09: ffffffffbe603d70 R10: 0000004d80d62680 R11: 0000000000000001 R12: ffffffffbe60bf40 R13: 0000000000000000 R14: 0000000000000000 R15: ffffffffbe60aff8 ? default_idle+0x9/0x20 arch_cpu_idle+0x9/0x10 default_idle_call+0x29/0xf0 do_idle+0x1f2/0x240 cpu_startup_entry+0x2c/0x30 rest_init+0xe7/0x100 start_kernel+0x76b/0xb90 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0xc0/0x110 ? setup_ghcb+0xe/0x130 common_startup_64+0x13e/0x141 Modules linked in: esp4_offload esp4 xfrm_interface xfrm6_tunnel tunnel4 tunnel6 xfrm_user xfrm_algo binfmt_misc intel_rapl_msr intel_rapl_common kvm_amd ccp kvm input_leds serio_raw qemu_fw_cfg sch_fq_codel dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua efi_pstore ip_tables x_tables autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx libcrc32c raid1 raid0 mlx5_core crct10dif_pclmul crc32_pclmul polyval_clmulni polyval_generic ghash_clmulni_intel sha256_ssse3 sha1_ssse3 ahci mlxfw i2c_i801 libahci i2c_mux i2c_smbus psample virtio_rng pci_hyperv_intf aesni_intel crypto_simd cryptd CR2: 0000000000000000 ---[ end trace 0000000000000000 ]--- RIP: 0010:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6. 
RSP: 0018:ffffb87380003800 EFLAGS: 00010206 RAX: ffff8df004e02600 RBX: ffffb873800038d8 RCX: 00000000ffff98cf RDX: ffff8df00733e108 RSI: ffff8df00521fb80 RDI: ffff8df001661f00 RBP: ffffb87380003850 R08: ffff8df013980000 R09: 0000000000000010 R10: 0000000000000002 R11: 0000000000000002 R12: ffff8df001661f00 R13: ffff8df00521fb80 R14: ffff8df00733e108 R15: ffff8df011faf04e FS: 0000000000000000(0000) GS:ffff8df46b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000106384000 CR4: 0000000000350ef0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x3b800000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: 5958372ddf62 ("xfrm: add RX datapath protection for IPsec packet offload mode") Signed-off-by: Alexandre Cassen Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 32c09e85a64c..2c4eda6a8596 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1224,9 +1224,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (xo) { x = xfrm_input_state(skb); - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) - return (xo->flags & CRYPTO_DONE) && - (xo->status & CRYPTO_SUCCESS); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + bool check = (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + + /* The packets here are plain ones and secpath was + * needed to indicate that hardware already handled + * them and there is no need to do nothing in addition. + * + * Consume secpath which was set by drivers. + */ + secpath_reset(skb); + return check; + } } return __xfrm_check_nopolicy(net, skb, dir) || -- cgit v1.2.3 From b9ed315d3c4c0c294a4348edb6874d489bac47fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 21 Dec 2024 00:46:56 +0100 Subject: netkit: Allow for configuring needed_{head,tail}room Allow the user to configure needed_{head,tail}room for both netkit devices. The idea is similar to 163e529200af ("veth: implement ndo_set_rx_headroom") with the difference that the two parameters can be specified upon device creation. By default the current behavior stays as is which is needed_{head,tail}room is 0. In case of Cilium, for example, the netkit devices are not enslaved into a bridge or openvswitch device (rather, BPF-based redirection is used out of tcx), and as such these parameters are not propagated into the Pod's netns via peer device. Given Cilium can run in vxlan/geneve tunneling mode (needed_headroom) and/or be used in combination with WireGuard (needed_{head,tail}room), allow the Cilium CNI plugin to specify these two upon netkit device creation. 
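
A rough userspace sketch of passing the new attributes when creating a netkit pair; the attribute names are the ones added in the uapi hunk below, but the 16-bit payload width and the exact nesting shown here are assumptions - check the netkit rtnetlink policy before relying on them:

#include <libmnl/libmnl.h>
#include <linux/if_link.h>

static void netkit_put_rooms(struct nlmsghdr *nlh, __u16 headroom, __u16 tailroom)
{
	struct nlattr *linkinfo, *data;

	linkinfo = mnl_attr_nest_start(nlh, IFLA_LINKINFO);
	mnl_attr_put_strz(nlh, IFLA_INFO_KIND, "netkit");

	data = mnl_attr_nest_start(nlh, IFLA_INFO_DATA);
	/* Attribute names from the uapi hunk below; u16 payload assumed. */
	mnl_attr_put_u16(nlh, IFLA_NETKIT_HEADROOM, headroom);
	mnl_attr_put_u16(nlh, IFLA_NETKIT_TAILROOM, tailroom);
	mnl_attr_nest_end(nlh, data);

	mnl_attr_nest_end(nlh, linkinfo);
}
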
Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/bpf/20241220234658.490686-1-daniel@iogearbox.net --- include/uapi/linux/if_link.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2575e0cd9b48..2fa2c265dcba 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1315,6 +1315,8 @@ enum { IFLA_NETKIT_MODE, IFLA_NETKIT_SCRUB, IFLA_NETKIT_PEER_SCRUB, + IFLA_NETKIT_HEADROOM, + IFLA_NETKIT_TAILROOM, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3 From 34d813e45ecb8e84f7154509b1acf7dda57ef09f Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 4 Jan 2025 09:58:28 +0200 Subject: drm/display: hdmi-state-helper: add drm_display_mode declaration Add forward-declaration for the struct drm_display_mode, missed in the commit 47368ab437fd ("drm/display: hdmi: add generic mode_valid helper") Fixes: 47368ab437fd ("drm/display: hdmi: add generic mode_valid helper") Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20250104-hdmi-state-display-mode-v1-1-3c06d36e726f@linaro.org Signed-off-by: Maxime Ripard --- include/drm/display/drm_hdmi_state_helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/drm/display/drm_hdmi_state_helper.h b/include/drm/display/drm_hdmi_state_helper.h index 9ae19f3caf72..44ec5c4a7503 100644 --- a/include/drm/display/drm_hdmi_state_helper.h +++ b/include/drm/display/drm_hdmi_state_helper.h @@ -6,6 +6,7 @@ struct drm_atomic_state; struct drm_connector; struct drm_connector_state; +struct drm_display_mode; struct hdmi_audio_infoframe; enum drm_connector_status; -- cgit v1.2.3 From 5bb494d5cbb9a3403ba8b1c8bc145b42fc119078 Mon Sep 17 00:00:00 2001 From: Gao Shiyuan Date: Sat, 4 Jan 2025 00:58:08 +0800 Subject: iommu/amd: remove return value of amd_iommu_detect The return value of amd_iommu_detect is not used, so remove it and is consistent with other iommu detect functions. Signed-off-by: Gao Shiyuan Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250103165808.80939-1-gaoshiyuan@baidu.com Signed-off-by: Joerg Roedel --- include/linux/amd-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 2b90c48a6a87..062fbd4c9b77 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -31,11 +31,11 @@ struct amd_iommu_pi_data { struct task_struct; struct pci_dev; -extern int amd_iommu_detect(void); +extern void amd_iommu_detect(void); #else /* CONFIG_AMD_IOMMU */ -static inline int amd_iommu_detect(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } #endif /* CONFIG_AMD_IOMMU */ -- cgit v1.2.3 From dadf03cfd4eaa09f1d0e8b2521de1e11d3e3bec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:23 +0000 Subject: io_uring/cmd: rename struct uring_cache to io_uring_cmd_data In preparation for making this more generically available for ->uring_cmd() usage that needs stable command data, rename it and move it to io_uring/cmd.h instead. 
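
Taken together with the two follow-up patches below (which add the op_data field and an accessor), a ->uring_cmd() handler could keep stable per-request state roughly like this; the foo_* names are hypothetical:

#include <linux/io_uring/cmd.h>
#include <linux/slab.h>

struct foo_cmd_state {
	u64 user_addr;
	u32 len;
};

static int foo_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct io_uring_cmd_data *data = io_uring_cmd_get_async_data(cmd);
	struct foo_cmd_state *state = data->op_data;

	if (!state) {
		state = kzalloc(sizeof(*state), GFP_KERNEL);
		if (!state)
			return -ENOMEM;
		/* uring_cmd frees op_data when the request is cleaned up. */
		data->op_data = state;
	}

	/* ...issue or continue the operation using the stable state... */
	return 0;
}
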
Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55e..24cff2b9b9d4 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -18,6 +18,10 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +struct io_uring_cmd_data { + struct io_uring_sqe sqes[2]; +}; + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; -- cgit v1.2.3 From 3347fa658a1baecd61b007787d031b729cd86537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:24 +0000 Subject: io_uring/cmd: add per-op data to struct io_uring_cmd_data In case an op handler for ->uring_cmd() needs stable storage for user data, it can allocate io_uring_cmd_data->op_data and use it for the duration of the request. When the request gets cleaned up, uring_cmd will free it automatically. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 24cff2b9b9d4..3df6636ec3a3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -20,6 +20,7 @@ struct io_uring_cmd { struct io_uring_cmd_data { struct io_uring_sqe sqes[2]; + void *op_data; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -- cgit v1.2.3 From b0af20d33f63c74985a6dd98344326e5111b2fea Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:25 +0000 Subject: io_uring: add io_uring_cmd_get_async_data helper Add a helper function in include/linux/io_uring/cmd.h to read the async_data pointer from a struct io_uring_cmd. Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 3df6636ec3a3..b0aeec834c1d 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -118,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->task; } +static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->async_data; +} + #endif /* _LINUX_IO_URING_CMD_H */ -- cgit v1.2.3 From 1156b5e8be98c97087f8971609c852e418daf03b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:57 +0530 Subject: regulator: Guard of_regulator_bulk_get_all() with CONFIG_OF Since the definition is in drivers/regulator/of_regulator.c and compiled only if CONFIG_OF is enabled, building the consumer driver without CONFIG_OF and with CONFIG_REGULATOR will result in below build error: ERROR: modpost: "of_regulator_bulk_get_all" [drivers/pci/pwrctrl/pci-pwrctl-slot.ko] undefined! 
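
For context, a consumer-side sketch of the call that triggered the error; with the stub in place it compiles in all configurations and simply reports zero supplies when CONFIG_OF is disabled (the foo_* driver is hypothetical):

#include <linux/device.h>
#include <linux/of.h>
#include <linux/regulator/consumer.h>

static int foo_enable_supplies(struct device *dev)
{
	struct regulator_bulk_data *supplies;
	int num;

	num = of_regulator_bulk_get_all(dev, dev_of_node(dev), &supplies);
	if (num < 0)
		return num;	/* error from the OF-enabled path */
	if (num == 0)
		return 0;	/* nothing described, or the !CONFIG_OF stub */

	return regulator_bulk_enable(num, supplies);
}
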
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412181640.12Iufkvd-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250104115058.19216-2-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 8c3c372ad735..85be83c8fa17 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -175,6 +175,8 @@ struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, struct device_node *node, const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); #else static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, @@ -189,6 +191,13 @@ static inline struct regulator *__must_check devm_of_regulator_get_optional(stru { return ERR_PTR(-ENODEV); } + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + #endif int regulator_register_supply_alias(struct device *dev, const char *id, @@ -223,8 +232,6 @@ int regulator_disable_deferred(struct regulator *regulator, int ms); int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); @@ -483,12 +490,6 @@ static inline int devm_regulator_bulk_get(struct device *dev, int num_consumers, return 0; } -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - static inline int devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, -- cgit v1.2.3 From 907af7d6e0c8cf4086b1bc5218281b2ca09f130b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:58 +0530 Subject: regulator: Move OF_ API declarations/definitions outside CONFIG_REGULATOR Since these are hidden inside CONFIG_REGULATOR, building the consumer drivers without CONFIG_REGULATOR will result in the following build error: >> drivers/pci/pwrctrl/slot.c:39:15: error: implicit declaration of function 'of_regulator_bulk_get_all'; did you mean 'regulator_bulk_get'? [-Werror=implicit-function-declaration] 39 | ret = of_regulator_bulk_get_all(dev, dev_of_node(dev), | ^~~~~~~~~~~~~~~~~~~~~~~~~ | regulator_bulk_get cc1: some warnings being treated as errors This also removes the duplicated definitions that were possibly added to fix the build issues. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501020407.HmQQQKa0-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250104115058.19216-3-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 78 ++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 85be83c8fa17..bcba3935c6f9 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -168,38 +168,6 @@ int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); -#if IS_ENABLED(CONFIG_OF) -struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); -#else -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - -#endif - int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id); @@ -380,20 +348,6 @@ devm_regulator_get_optional(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - static inline void regulator_put(struct regulator *regulator) { } @@ -701,6 +655,38 @@ regulator_is_equal(struct regulator *reg1, struct regulator *reg2) } #endif +#if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_REGULATOR) +struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); +#else +static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + +#endif + static inline 
int regulator_set_voltage_triplet(struct regulator *regulator, int min_uV, int target_uV, int max_uV) -- cgit v1.2.3 From 8c588fe965d95fd2d18e1a1d55489b9237eeddfe Mon Sep 17 00:00:00 2001 From: Stephen Gordon Date: Mon, 6 Jan 2025 16:06:58 +1100 Subject: ASoC: simple_card: Improve debugging messages These fields are from the dai_link, not the dai. Signed-off-by: Stephen Gordon Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20250106050659.57924-2-gordoste@iinet.net.au Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 3360d9eab068..af79f680c72a 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -264,9 +264,9 @@ static inline void simple_util_debug_info(struct simple_util_priv *priv) simple_util_debug_dai(priv, "codec", dai); if (link->name) - dev_dbg(dev, "dai name = %s\n", link->name); + dev_dbg(dev, "link name = %s\n", link->name); if (link->dai_fmt) - dev_dbg(dev, "dai format = %04x\n", link->dai_fmt); + dev_dbg(dev, "link format = %04x\n", link->dai_fmt); if (props->adata.convert_rate) dev_dbg(dev, "convert_rate = %d\n", props->adata.convert_rate); if (props->adata.convert_channels) -- cgit v1.2.3 From f1a92bb81a6cf9ae9a50d175be8b396bfabcb487 Mon Sep 17 00:00:00 2001 From: Stephen Gordon Date: Mon, 6 Jan 2025 16:06:59 +1100 Subject: ASoC: simple_card: Show if link is unidirectional It is handy to know whether the link has playback-only/capture-only flags when debugging. Signed-off-by: Stephen Gordon Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20250106050659.57924-3-gordoste@iinet.net.au Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index af79f680c72a..f6bfd485c31a 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -267,6 +267,10 @@ static inline void simple_util_debug_info(struct simple_util_priv *priv) dev_dbg(dev, "link name = %s\n", link->name); if (link->dai_fmt) dev_dbg(dev, "link format = %04x\n", link->dai_fmt); + if (link->playback_only) + dev_dbg(dev, "link has playback_only"); + if (link->capture_only) + dev_dbg(dev, "link has capture_only"); if (props->adata.convert_rate) dev_dbg(dev, "convert_rate = %d\n", props->adata.convert_rate); if (props->adata.convert_channels) -- cgit v1.2.3 From 2caca8fc7aad9ea9a6ea3ed26ed146b1e5f06fab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:14:37 +0100 Subject: block: use page_to_phys in bvec_phys Use page_to_phys instead of open coding it now that it is available in an architecture independent way. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106081437.798213-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f41c7f0ef91e..ba8f52d48b94 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -286,12 +286,7 @@ static inline void *bvec_virt(struct bio_vec *bvec) */ static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) { - /* - * Note this open codes page_to_phys because page_to_phys is defined in - * , which we don't want to pull in here. 
If it ever moves to - * a sensible place we should start using it. - */ - return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset; + return page_to_phys(bvec->bv_page) + bvec->bv_offset; } #endif /* __LINUX_BVEC_H */ -- cgit v1.2.3 From 6e1d75f778d644d02147d8e61ca2cef033ce045d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Dec 2024 13:55:53 +1100 Subject: sunrpc/svc: use store_release_wake_up() svc_thread_init_status() contains an open-coded store_release_wake_up(). It is cleaner to use that function directly rather than needing to remember the barrier. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e68fecf6eab5..e4f09f58d58c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -327,12 +327,7 @@ static inline bool svc_thread_should_stop(struct svc_rqst *rqstp) */ static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err) { - rqstp->rq_err = err; - /* memory barrier ensures assignment to error above is visible before - * waitqueue_active() test below completes. - */ - smp_mb(); - wake_up_var(&rqstp->rq_err); + store_release_wake_up(&rqstp->rq_err, err); if (err) kthread_exit(1); } -- cgit v1.2.3 From eccbbc7c00a5aae5e704d4002adfaf4c3fa4b30d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Dec 2024 11:41:26 +1100 Subject: nfsd: don't use sv_nrthreads in connection limiting calculations. The heuristic for limiting the number of incoming connections to nfsd currently uses sv_nrthreads - allowing more connections if more threads were configured. A future patch will allow number of threads to grow dynamically so that there will be no need to configure sv_nrthreads. So we need a different solution for limiting connections. It isn't clear what problem is solved by limiting connections (as mentioned in a code comment) but the most likely problem is a connection storm - many connections that are not doing productive work. These will be closed after about 6 minutes already but it might help to slow down a storm. This patch adds a per-connection flag XPT_PEER_VALID which indicates that the peer has presented a filehandle for which it has some sort of access. i.e the peer is known to be trusted in some way. We now only count connections which have NOT been determined to be valid. There should be relative few of these at any given time. If the number of non-validated peer exceed a limit - currently 64 - we close the oldest non-validated peer to avoid having too many of these useless connections. Note that this patch significantly changes the meaning of the various configuration parameters for "max connections". The next patch will remove all of these. 
Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 +- include/linux/sunrpc/svc_xprt.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e4f09f58d58c..4f9418cbf8c9 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -81,7 +81,7 @@ struct svc_serv { unsigned int sv_xdrsize; /* XDR buffer size */ struct list_head sv_permsocks; /* all permanent sockets */ struct list_head sv_tempsocks; /* all temporary sockets */ - int sv_tmpcnt; /* count of temporary sockets */ + int sv_tmpcnt; /* count of temporary "valid" sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ char * sv_name; /* service name */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0981e35a9fed..7064ebbd550b 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -99,8 +99,24 @@ enum { XPT_HANDSHAKE, /* xprt requests a handshake */ XPT_TLS_SESSION, /* transport-layer security established */ XPT_PEER_AUTH, /* peer has been authenticated */ + XPT_PEER_VALID, /* peer has presented a filehandle that + * it has access to. It is NOT counted + * in ->sv_tmpcnt. + */ }; +static inline void svc_xprt_set_valid(struct svc_xprt *xpt) +{ + if (test_bit(XPT_TEMP, &xpt->xpt_flags) && + !test_and_set_bit(XPT_PEER_VALID, &xpt->xpt_flags)) { + struct svc_serv *serv = xpt->xpt_server; + + spin_lock(&serv->sv_lock); + serv->sv_tmpcnt -= 1; + spin_unlock(&serv->sv_lock); + } +} + static inline void unregister_xpt_user(struct svc_xprt *xpt, struct svc_xpt_user *u) { spin_lock(&xpt->xpt_lock); -- cgit v1.2.3 From a4b853f183a19a88ad635f9ae8ba97e7cb377a23 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Dec 2024 11:41:27 +1100 Subject: sunrpc: remove all connection limit configuration Now that the connection limit only apply to unconfirmed connections, there is no need to configure it. So remove all the configuration and fix the number of unconfirmed connections as always 64 - which is now given a name: XPT_MAX_TMP_CONN Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ---- include/linux/sunrpc/svc_xprt.h | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 4f9418cbf8c9..74658cca0f38 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -72,10 +72,6 @@ struct svc_serv { spinlock_t sv_lock; unsigned int sv_nprogs; /* Number of sv_programs */ unsigned int sv_nrthreads; /* # of server threads */ - unsigned int sv_maxconn; /* max connections allowed or - * '0' causing max to be based - * on number of threads. */ - unsigned int sv_max_payload; /* datagram payload size */ unsigned int sv_max_mesg; /* max_payload + 1 page for overheads */ unsigned int sv_xdrsize; /* XDR buffer size */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 7064ebbd550b..72be60952579 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -105,6 +105,12 @@ enum { */ }; +/* + * Maximum number of "tmp" connections - those without XPT_PEER_VALID - + * permitted on any service. 
+ */ +#define XPT_MAX_TMP_CONN 64 + static inline void svc_xprt_set_valid(struct svc_xprt *xpt) { if (test_bit(XPT_TEMP, &xpt->xpt_flags) && -- cgit v1.2.3 From 2f55dbe4e2072c9e99298c6f37473778a98c9107 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 25 Dec 2024 14:59:05 +0800 Subject: SUNRPC: introduce cache_check_rcu to help check in rcu context This is a prepare patch to add cache_check_rcu, will use it with follow patch. Suggested-by: NeilBrown Signed-off-by: Yang Erkun Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/cache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index 35766963dd14..e783132e481f 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -222,6 +222,8 @@ static inline bool cache_is_expired(struct cache_detail *detail, struct cache_he return detail->flush_time >= h->last_refresh; } +extern int cache_check_rcu(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp); extern int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp); extern void cache_flush(void); -- cgit v1.2.3 From e7602bb4f3a1234df8b75728ac3260bcb8242612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:10 +0100 Subject: block: remove BLK_MQ_F_NO_SCHED The only queues that really can't support a scheduler are those that do not have a gendisk associated with them, and thus can't be used for non-passthrough commands. In addition to those null_blk can optionally set the flag, which is a bad odd. Replace the null_blk usage with BLK_MQ_F_NO_SCHED_BY_DEFAULT to keep the expected semantics and then remove BLK_MQ_F_NO_SCHED as the non-disk queues never call into elevator_init_mq or blk_register_queue which adds the sysfs attributes. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106083531.799976-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6340293511c9..f2ff0ffa0535 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -676,8 +676,6 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, - /* Do not allow an I/O scheduler to be configured. */ - BLK_MQ_F_NO_SCHED = 1 << 5, /* * Select 'none' during queue registration in case of a single hwq -- cgit v1.2.3 From ce32496ec1abe866225f2e2005ceda68cf4c7bf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:11 +0100 Subject: block: simplify tag allocation policy selection Use a plain BLK_MQ_F_* flag to select the round robin tag selection instead of overlaying an enum with just two possible values into the flags space. Doing so allows adding a BLK_MQ_F_MAX sentinel for simplified overflow checking in the messy debugfs helpers. 
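
On the driver side the conversion is then just a flag on the tag set (or tag_alloc_policy_rr = true in a SCSI host template); a hypothetical example:

#include <linux/blk-mq.h>

static int foo_setup_tags(struct blk_mq_tag_set *set,
			  const struct blk_mq_ops *ops)
{
	set->ops = ops;
	set->nr_hw_queues = 1;
	set->queue_depth = 64;
	/* Round-robin tag allocation is now a plain flag instead of
	 * BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_RR). */
	set->flags = BLK_MQ_F_TAG_RR;

	return blk_mq_alloc_tag_set(set);
}
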
Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250106083531.799976-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 22 +++++++--------------- include/linux/libata.h | 4 ++-- include/scsi/scsi_host.h | 6 ++++-- 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2ff0ffa0535..a0a9007cc1e3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -296,13 +296,6 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, }; -/* Keep alloc_policy_name[] in sync with the definitions below */ -enum { - BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */ - BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */ - BLK_TAG_ALLOC_MAX -}; - /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device @@ -677,20 +670,19 @@ enum { BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, + /* + * Alloc tags on a round-robin base instead of the first available one. + */ + BLK_MQ_F_TAG_RR = 1 << 5, + /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. */ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, - BLK_MQ_F_ALLOC_POLICY_START_BIT = 7, - BLK_MQ_F_ALLOC_POLICY_BITS = 1, + + BLK_MQ_F_MAX = 1 << 7, }; -#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ - ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ - ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) -#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ - ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ - << BLK_MQ_F_ALLOC_POLICY_START_BIT) #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) diff --git a/include/linux/libata.h b/include/linux/libata.h index c1a85d46eba6..be5183d75736 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1467,13 +1467,13 @@ extern const struct attribute_group *ata_common_sdev_groups[]; #define ATA_SUBBASE_SHT(drv_name) \ __ATA_BASE_SHT(drv_name), \ .can_queue = ATA_DEF_QUEUE, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd) \ __ATA_BASE_SHT(drv_name), \ .can_queue = drv_qd, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure #define ATA_BASE_SHT(drv_name) \ diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 2b4ab0369ffb..02823d6af37d 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -438,8 +438,10 @@ struct scsi_host_template { */ short cmd_per_lun; - /* If use block layer to manage tags, this is tag allocation policy */ - int tag_alloc_policy; + /* + * Allocate tags starting from last allocated tag. + */ + bool tag_alloc_policy_rr : 1; /* * Track QUEUE_FULL events and reduce queue depth on demand. -- cgit v1.2.3 From b168ed458ddecc176f3b9a1f4bcd83d7a4541c14 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 4 Dec 2024 15:31:11 +0100 Subject: kernel/cgroup: Add "dmem" memory accounting cgroup This code is based on the RDMA and misc cgroup initially, but now uses page_counter. It uses the same min/low/max semantics as the memory cgroup as a result. There's a small mismatch as TTM uses u64, and page_counter long pages. In practice it's not a problem. 32-bits systems don't really come with >=4GB cards and as long as we're consistently wrong with units, it's fine. 
The device page size may not be in the same units as kernel page size, and each region might also have a different page size (VRAM vs GART for example). The interface is simple: - Call dmem_cgroup_register_region() - Use dmem_cgroup_try_charge to check if you can allocate a chunk of memory, use dmem_cgroup__uncharge when freeing it. This may return an error code, or -EAGAIN when the cgroup limit is reached. In that case a reference to the limiting pool is returned. - The limiting cs can be used as compare function for dmem_cgroup_state_evict_valuable. - After having evicted enough, drop reference to limiting cs with dmem_cgroup_pool_state_put. This API allows you to limit device resources with cgroups. You can see the supported cards in /sys/fs/cgroup/dmem.capacity You need to echo +dmem to cgroup.subtree_control, and then you can partition device memory. Co-developed-by: Friedrich Vock Signed-off-by: Friedrich Vock Co-developed-by: Maxime Ripard Signed-off-by: Maarten Lankhorst Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20241204143112.1250983-1-dev@lankhorst.se Signed-off-by: Maxime Ripard --- include/linux/cgroup_dmem.h | 66 +++++++++++++++++++++++++++++++++++++++++++ include/linux/cgroup_subsys.h | 4 +++ include/linux/page_counter.h | 2 +- 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 include/linux/cgroup_dmem.h (limited to 'include') diff --git a/include/linux/cgroup_dmem.h b/include/linux/cgroup_dmem.h new file mode 100644 index 000000000000..dd4869f1d736 --- /dev/null +++ b/include/linux/cgroup_dmem.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2023-2024 Intel Corporation + */ + +#ifndef _CGROUP_DMEM_H +#define _CGROUP_DMEM_H + +#include +#include + +struct dmem_cgroup_pool_state; + +/* Opaque definition of a cgroup region, used internally */ +struct dmem_cgroup_region; + +#if IS_ENABLED(CONFIG_CGROUP_DMEM) +struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) __printf(2,3); +void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region); +int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool); +void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size); +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low); + +void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool); +#else +static inline __printf(2,3) struct dmem_cgroup_region * +dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) 
+{ + return NULL; +} + +static inline void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region) +{ } + +static inline int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size, + struct dmem_cgroup_pool_state **ret_pool, + struct dmem_cgroup_pool_state **ret_limit_pool) +{ + *ret_pool = NULL; + + if (ret_limit_pool) + *ret_limit_pool = NULL; + + return 0; +} + +static inline void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size) +{ } + +static inline +bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool, + struct dmem_cgroup_pool_state *test_pool, + bool ignore_low, bool *ret_hit_low) +{ + return true; +} + +static inline void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool) +{ } + +#endif +#endif /* _CGROUP_DMEM_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 445235487230..3fd0bcbf3080 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -65,6 +65,10 @@ SUBSYS(rdma) SUBSYS(misc) #endif +#if IS_ENABLED(CONFIG_CGROUP_DMEM) +SUBSYS(dmem) +#endif + /* * The following subsystems are not supported on the default hierarchy. */ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 79dbd8bc35a7..46406f3fe34d 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -96,7 +96,7 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = usage; } -#ifdef CONFIG_MEMCG +#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM) void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); -- cgit v1.2.3 From 7b0af165e2d4b612de5c40daa586d3b9a40b3af6 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 4 Dec 2024 14:44:02 +0100 Subject: drm/drv: Add drmm managed registration helper for dmem cgroups. Drivers will need to register dmem regions at probe time, so let's give them a drm-managed helper. Signed-off-by: Maarten Lankhorst Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20241204134410.1161769-3-dev@lankhorst.se Signed-off-by: Maxime Ripard --- include/drm/drm_drv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h index 1bbbcb8e2d23..7dd49d9ab505 100644 --- a/include/drm/drm_drv.h +++ b/include/drm/drm_drv.h @@ -34,6 +34,7 @@ #include +struct dmem_cgroup_region; struct drm_fb_helper; struct drm_fb_helper_surface_size; struct drm_file; @@ -438,6 +439,10 @@ void *__devm_drm_dev_alloc(struct device *parent, const struct drm_driver *driver, size_t size, size_t offset); +struct dmem_cgroup_region * +drmm_cgroup_register_region(struct drm_device *dev, + const char *region_name, u64 size); + /** * devm_drm_dev_alloc - Resource managed allocation of a &drm_device instance * @parent: Parent device object -- cgit v1.2.3 From 42b00f445616335becee9142c0f7ef7abfee5c61 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Wed, 4 Dec 2024 11:37:17 -0800 Subject: dt-bindings: clock: qcom: Add SM8750 GCC Add device tree bindings for the global clock controller on Qualcomm SM8750 platform. 
Signed-off-by: Taniya Das Signed-off-by: Melody Olvera Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241204-sm8750_master_clks-v3-5-1a8f31a53a86@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm8750-gcc.h | 226 ++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,sm8750-gcc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm8750-gcc.h b/include/dt-bindings/clock/qcom,sm8750-gcc.h new file mode 100644 index 000000000000..e234595d7f42 --- /dev/null +++ b/include/dt-bindings/clock/qcom,sm8750-gcc.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GCC_SM8750_H +#define _DT_BINDINGS_CLK_QCOM_GCC_SM8750_H + +/* GCC clocks */ +#define GCC_AGGRE_NOC_PCIE_AXI_CLK 0 +#define GCC_AGGRE_UFS_PHY_AXI_CLK 1 +#define GCC_AGGRE_UFS_PHY_AXI_HW_CTL_CLK 2 +#define GCC_AGGRE_USB3_PRIM_AXI_CLK 3 +#define GCC_BOOT_ROM_AHB_CLK 4 +#define GCC_CAM_BIST_MCLK_AHB_CLK 5 +#define GCC_CAMERA_AHB_CLK 6 +#define GCC_CAMERA_HF_AXI_CLK 7 +#define GCC_CAMERA_SF_AXI_CLK 8 +#define GCC_CAMERA_XO_CLK 9 +#define GCC_CFG_NOC_PCIE_ANOC_AHB_CLK 10 +#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK 11 +#define GCC_CNOC_PCIE_SF_AXI_CLK 12 +#define GCC_DDRSS_GPU_AXI_CLK 13 +#define GCC_DDRSS_PCIE_SF_QTB_CLK 14 +#define GCC_DISP_AHB_CLK 15 +#define GCC_DISP_HF_AXI_CLK 16 +#define GCC_EVA_AHB_CLK 17 +#define GCC_EVA_AXI0_CLK 18 +#define GCC_EVA_AXI0C_CLK 19 +#define GCC_EVA_XO_CLK 20 +#define GCC_GP1_CLK 21 +#define GCC_GP1_CLK_SRC 22 +#define GCC_GP2_CLK 23 +#define GCC_GP2_CLK_SRC 24 +#define GCC_GP3_CLK 25 +#define GCC_GP3_CLK_SRC 26 +#define GCC_GPLL0 27 +#define GCC_GPLL0_OUT_EVEN 28 +#define GCC_GPLL1 29 +#define GCC_GPLL4 30 +#define GCC_GPLL7 31 +#define GCC_GPLL9 32 +#define GCC_GPU_CFG_AHB_CLK 33 +#define GCC_GPU_GEMNOC_GFX_CLK 34 +#define GCC_GPU_GPLL0_CLK_SRC 35 +#define GCC_GPU_GPLL0_DIV_CLK_SRC 36 +#define GCC_PCIE_0_AUX_CLK 37 +#define GCC_PCIE_0_AUX_CLK_SRC 38 +#define GCC_PCIE_0_CFG_AHB_CLK 39 +#define GCC_PCIE_0_MSTR_AXI_CLK 40 +#define GCC_PCIE_0_PHY_RCHNG_CLK 41 +#define GCC_PCIE_0_PHY_RCHNG_CLK_SRC 42 +#define GCC_PCIE_0_PIPE_CLK 43 +#define GCC_PCIE_0_PIPE_CLK_SRC 44 +#define GCC_PCIE_0_SLV_AXI_CLK 45 +#define GCC_PCIE_0_SLV_Q2A_AXI_CLK 46 +#define GCC_PCIE_RSCC_CFG_AHB_CLK 47 +#define GCC_PCIE_RSCC_XO_CLK 48 +#define GCC_PDM2_CLK 49 +#define GCC_PDM2_CLK_SRC 50 +#define GCC_PDM_AHB_CLK 51 +#define GCC_PDM_XO4_CLK 52 +#define GCC_QMIP_CAMERA_CMD_AHB_CLK 53 +#define GCC_QMIP_CAMERA_NRT_AHB_CLK 54 +#define GCC_QMIP_CAMERA_RT_AHB_CLK 55 +#define GCC_QMIP_GPU_AHB_CLK 56 +#define GCC_QMIP_PCIE_AHB_CLK 57 +#define GCC_QMIP_VIDEO_CV_CPU_AHB_CLK 58 +#define GCC_QMIP_VIDEO_CVP_AHB_CLK 59 +#define GCC_QMIP_VIDEO_V_CPU_AHB_CLK 60 +#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK 61 +#define GCC_QUPV3_I2C_CORE_CLK 62 +#define GCC_QUPV3_I2C_S0_CLK 63 +#define GCC_QUPV3_I2C_S0_CLK_SRC 64 +#define GCC_QUPV3_I2C_S1_CLK 65 +#define GCC_QUPV3_I2C_S1_CLK_SRC 66 +#define GCC_QUPV3_I2C_S2_CLK 67 +#define GCC_QUPV3_I2C_S2_CLK_SRC 68 +#define GCC_QUPV3_I2C_S3_CLK 69 +#define GCC_QUPV3_I2C_S3_CLK_SRC 70 +#define GCC_QUPV3_I2C_S4_CLK 71 +#define GCC_QUPV3_I2C_S4_CLK_SRC 72 +#define GCC_QUPV3_I2C_S5_CLK 73 +#define GCC_QUPV3_I2C_S5_CLK_SRC 74 +#define GCC_QUPV3_I2C_S6_CLK 75 +#define GCC_QUPV3_I2C_S6_CLK_SRC 76 +#define GCC_QUPV3_I2C_S7_CLK 77 +#define GCC_QUPV3_I2C_S7_CLK_SRC 78 
+#define GCC_QUPV3_I2C_S8_CLK 79 +#define GCC_QUPV3_I2C_S8_CLK_SRC 80 +#define GCC_QUPV3_I2C_S9_CLK 81 +#define GCC_QUPV3_I2C_S9_CLK_SRC 82 +#define GCC_QUPV3_I2C_S_AHB_CLK 83 +#define GCC_QUPV3_WRAP1_CORE_2X_CLK 84 +#define GCC_QUPV3_WRAP1_CORE_CLK 85 +#define GCC_QUPV3_WRAP1_QSPI_REF_CLK 86 +#define GCC_QUPV3_WRAP1_QSPI_REF_CLK_SRC 87 +#define GCC_QUPV3_WRAP1_S0_CLK 88 +#define GCC_QUPV3_WRAP1_S0_CLK_SRC 89 +#define GCC_QUPV3_WRAP1_S1_CLK 90 +#define GCC_QUPV3_WRAP1_S1_CLK_SRC 91 +#define GCC_QUPV3_WRAP1_S2_CLK 92 +#define GCC_QUPV3_WRAP1_S2_CLK_SRC 93 +#define GCC_QUPV3_WRAP1_S3_CLK 94 +#define GCC_QUPV3_WRAP1_S3_CLK_SRC 95 +#define GCC_QUPV3_WRAP1_S4_CLK 96 +#define GCC_QUPV3_WRAP1_S4_CLK_SRC 97 +#define GCC_QUPV3_WRAP1_S5_CLK 98 +#define GCC_QUPV3_WRAP1_S5_CLK_SRC 99 +#define GCC_QUPV3_WRAP1_S6_CLK 100 +#define GCC_QUPV3_WRAP1_S6_CLK_SRC 101 +#define GCC_QUPV3_WRAP1_S7_CLK 102 +#define GCC_QUPV3_WRAP1_S7_CLK_SRC 103 +#define GCC_QUPV3_WRAP2_CORE_2X_CLK 104 +#define GCC_QUPV3_WRAP2_CORE_CLK 105 +#define GCC_QUPV3_WRAP2_IBI_CTRL_0_CLK_SRC 106 +#define GCC_QUPV3_WRAP2_IBI_CTRL_2_CLK 107 +#define GCC_QUPV3_WRAP2_IBI_CTRL_3_CLK 108 +#define GCC_QUPV3_WRAP2_S0_CLK 109 +#define GCC_QUPV3_WRAP2_S0_CLK_SRC 110 +#define GCC_QUPV3_WRAP2_S1_CLK 111 +#define GCC_QUPV3_WRAP2_S1_CLK_SRC 112 +#define GCC_QUPV3_WRAP2_S2_CLK 113 +#define GCC_QUPV3_WRAP2_S2_CLK_SRC 114 +#define GCC_QUPV3_WRAP2_S3_CLK 115 +#define GCC_QUPV3_WRAP2_S3_CLK_SRC 116 +#define GCC_QUPV3_WRAP2_S4_CLK 117 +#define GCC_QUPV3_WRAP2_S4_CLK_SRC 118 +#define GCC_QUPV3_WRAP2_S5_CLK 119 +#define GCC_QUPV3_WRAP2_S5_CLK_SRC 120 +#define GCC_QUPV3_WRAP2_S6_CLK 121 +#define GCC_QUPV3_WRAP2_S6_CLK_SRC 122 +#define GCC_QUPV3_WRAP2_S7_CLK 123 +#define GCC_QUPV3_WRAP2_S7_CLK_SRC 124 +#define GCC_QUPV3_WRAP_1_M_AHB_CLK 125 +#define GCC_QUPV3_WRAP_1_S_AHB_CLK 126 +#define GCC_QUPV3_WRAP_2_IBI_2_AHB_CLK 127 +#define GCC_QUPV3_WRAP_2_IBI_3_AHB_CLK 128 +#define GCC_QUPV3_WRAP_2_M_AHB_CLK 129 +#define GCC_QUPV3_WRAP_2_S_AHB_CLK 130 +#define GCC_SDCC2_AHB_CLK 131 +#define GCC_SDCC2_APPS_CLK 132 +#define GCC_SDCC2_APPS_CLK_SRC 133 +#define GCC_SDCC4_AHB_CLK 134 +#define GCC_SDCC4_APPS_CLK 135 +#define GCC_SDCC4_APPS_CLK_SRC 136 +#define GCC_UFS_PHY_AHB_CLK 137 +#define GCC_UFS_PHY_AXI_CLK 138 +#define GCC_UFS_PHY_AXI_CLK_SRC 139 +#define GCC_UFS_PHY_AXI_HW_CTL_CLK 140 +#define GCC_UFS_PHY_ICE_CORE_CLK 141 +#define GCC_UFS_PHY_ICE_CORE_CLK_SRC 142 +#define GCC_UFS_PHY_ICE_CORE_HW_CTL_CLK 143 +#define GCC_UFS_PHY_PHY_AUX_CLK 144 +#define GCC_UFS_PHY_PHY_AUX_CLK_SRC 145 +#define GCC_UFS_PHY_PHY_AUX_HW_CTL_CLK 146 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK 147 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK_SRC 148 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK 149 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK_SRC 150 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK 151 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK_SRC 152 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK 153 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC 154 +#define GCC_UFS_PHY_UNIPRO_CORE_HW_CTL_CLK 155 +#define GCC_USB30_PRIM_MASTER_CLK 156 +#define GCC_USB30_PRIM_MASTER_CLK_SRC 157 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK 158 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC 159 +#define GCC_USB30_PRIM_MOCK_UTMI_POSTDIV_CLK_SRC 160 +#define GCC_USB30_PRIM_SLEEP_CLK 161 +#define GCC_USB3_PRIM_PHY_AUX_CLK 162 +#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC 163 +#define GCC_USB3_PRIM_PHY_COM_AUX_CLK 164 +#define GCC_USB3_PRIM_PHY_PIPE_CLK 165 +#define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC 166 +#define GCC_VIDEO_AHB_CLK 167 +#define GCC_VIDEO_AXI0_CLK 168 +#define GCC_VIDEO_AXI1_CLK 169 
+#define GCC_VIDEO_XO_CLK 170 + +/* GCC power domains */ +#define GCC_PCIE_0_GDSC 0 +#define GCC_PCIE_0_PHY_GDSC 1 +#define GCC_UFS_MEM_PHY_GDSC 2 +#define GCC_UFS_PHY_GDSC 3 +#define GCC_USB30_PRIM_GDSC 4 +#define GCC_USB3_PHY_GDSC 5 + +/* GCC resets */ +#define GCC_CAMERA_BCR 0 +#define GCC_DISPLAY_BCR 1 +#define GCC_EVA_BCR 2 +#define GCC_GPU_BCR 3 +#define GCC_PCIE_0_BCR 4 +#define GCC_PCIE_0_LINK_DOWN_BCR 5 +#define GCC_PCIE_0_NOCSR_COM_PHY_BCR 6 +#define GCC_PCIE_0_PHY_BCR 7 +#define GCC_PCIE_0_PHY_NOCSR_COM_PHY_BCR 8 +#define GCC_PCIE_PHY_BCR 9 +#define GCC_PCIE_PHY_CFG_AHB_BCR 10 +#define GCC_PCIE_PHY_COM_BCR 11 +#define GCC_PCIE_RSCC_BCR 12 +#define GCC_PDM_BCR 13 +#define GCC_QUPV3_WRAPPER_1_BCR 14 +#define GCC_QUPV3_WRAPPER_2_BCR 15 +#define GCC_QUPV3_WRAPPER_I2C_BCR 16 +#define GCC_QUSB2PHY_PRIM_BCR 17 +#define GCC_QUSB2PHY_SEC_BCR 18 +#define GCC_SDCC2_BCR 19 +#define GCC_SDCC4_BCR 20 +#define GCC_UFS_PHY_BCR 21 +#define GCC_USB30_PRIM_BCR 22 +#define GCC_USB3_DP_PHY_PRIM_BCR 23 +#define GCC_USB3_DP_PHY_SEC_BCR 24 +#define GCC_USB3_PHY_PRIM_BCR 25 +#define GCC_USB3_PHY_SEC_BCR 26 +#define GCC_USB3PHY_PHY_PRIM_BCR 27 +#define GCC_USB3PHY_PHY_SEC_BCR 28 +#define GCC_VIDEO_AXI0_CLK_ARES 29 +#define GCC_VIDEO_AXI1_CLK_ARES 30 +#define GCC_VIDEO_BCR 31 +#define GCC_EVA_AXI0_CLK_ARES 32 +#define GCC_EVA_AXI0C_CLK_ARES 33 + +#endif -- cgit v1.2.3 From 8817c21a45b62c17f18417efbd0b04a3805a1e23 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Wed, 4 Dec 2024 11:37:19 -0800 Subject: dt-bindings: clock: qcom: Document the SM8750 TCSR Clock Controller Add bindings documentation for the SM8750 Clock Controller. Acked-by: Krzysztof Kozlowski Signed-off-by: Melody Olvera Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20241204-sm8750_master_clks-v3-7-1a8f31a53a86@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm8750-tcsr.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,sm8750-tcsr.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm8750-tcsr.h b/include/dt-bindings/clock/qcom,sm8750-tcsr.h new file mode 100644 index 000000000000..1c502ac7c7f4 --- /dev/null +++ b/include/dt-bindings/clock/qcom,sm8750-tcsr.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_TCSR_CC_SM8750_H +#define _DT_BINDINGS_CLK_QCOM_TCSR_CC_SM8750_H + +/* TCSR_CC clocks */ +#define TCSR_PCIE_0_CLKREF_EN 0 +#define TCSR_UFS_CLKREF_EN 1 +#define TCSR_USB2_CLKREF_EN 2 +#define TCSR_USB3_CLKREF_EN 3 + +#endif -- cgit v1.2.3 From 4f1a62e2b3961946a924c093bc2bdd44a2a46c9d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 6 Jan 2025 14:44:29 +0100 Subject: dt-bindings: clock: qcom,sm8550-dispcc: Add SM8750 DISPCC Add bindings for the Qualcomm SM8750 Display Clock Controller (DISPCC). Bindings are similar to existing SM8550 and SM8650 (same clock inputs), but the clock hierarchy is quite different and these are not compatible devices. The binding header was copied from downstream sources, so I retained original copyrights. 
Acked-by: Conor Dooley Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250106-sm8750-dispcc-v2-1-6f42beda6317@linaro.org Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm8750-dispcc.h | 112 +++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,sm8750-dispcc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm8750-dispcc.h b/include/dt-bindings/clock/qcom,sm8750-dispcc.h new file mode 100644 index 000000000000..dafb5069c96a --- /dev/null +++ b/include/dt-bindings/clock/qcom,sm8750-dispcc.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2022, The Linux Foundation. All rights reserved. + * Copyright (c) 2023, Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2024, Linaro Ltd. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_SM8750_DISP_CC_H +#define _DT_BINDINGS_CLK_QCOM_SM8750_DISP_CC_H + +/* DISP_CC clocks */ +#define DISP_CC_ESYNC0_CLK 0 +#define DISP_CC_ESYNC0_CLK_SRC 1 +#define DISP_CC_ESYNC1_CLK 2 +#define DISP_CC_ESYNC1_CLK_SRC 3 +#define DISP_CC_MDSS_ACCU_SHIFT_CLK 4 +#define DISP_CC_MDSS_AHB1_CLK 5 +#define DISP_CC_MDSS_AHB_CLK 6 +#define DISP_CC_MDSS_AHB_CLK_SRC 7 +#define DISP_CC_MDSS_BYTE0_CLK 8 +#define DISP_CC_MDSS_BYTE0_CLK_SRC 9 +#define DISP_CC_MDSS_BYTE0_DIV_CLK_SRC 10 +#define DISP_CC_MDSS_BYTE0_INTF_CLK 11 +#define DISP_CC_MDSS_BYTE1_CLK 12 +#define DISP_CC_MDSS_BYTE1_CLK_SRC 13 +#define DISP_CC_MDSS_BYTE1_DIV_CLK_SRC 14 +#define DISP_CC_MDSS_BYTE1_INTF_CLK 15 +#define DISP_CC_MDSS_DPTX0_AUX_CLK 16 +#define DISP_CC_MDSS_DPTX0_AUX_CLK_SRC 17 +#define DISP_CC_MDSS_DPTX0_CRYPTO_CLK 18 +#define DISP_CC_MDSS_DPTX0_LINK_CLK 19 +#define DISP_CC_MDSS_DPTX0_LINK_CLK_SRC 20 +#define DISP_CC_MDSS_DPTX0_LINK_DIV_CLK_SRC 21 +#define DISP_CC_MDSS_DPTX0_LINK_INTF_CLK 22 +#define DISP_CC_MDSS_DPTX0_PIXEL0_CLK 23 +#define DISP_CC_MDSS_DPTX0_PIXEL0_CLK_SRC 24 +#define DISP_CC_MDSS_DPTX0_PIXEL1_CLK 25 +#define DISP_CC_MDSS_DPTX0_PIXEL1_CLK_SRC 26 +#define DISP_CC_MDSS_DPTX0_USB_ROUTER_LINK_INTF_CLK 27 +#define DISP_CC_MDSS_DPTX1_AUX_CLK 28 +#define DISP_CC_MDSS_DPTX1_AUX_CLK_SRC 29 +#define DISP_CC_MDSS_DPTX1_CRYPTO_CLK 30 +#define DISP_CC_MDSS_DPTX1_LINK_CLK 31 +#define DISP_CC_MDSS_DPTX1_LINK_CLK_SRC 32 +#define DISP_CC_MDSS_DPTX1_LINK_DIV_CLK_SRC 33 +#define DISP_CC_MDSS_DPTX1_LINK_INTF_CLK 34 +#define DISP_CC_MDSS_DPTX1_PIXEL0_CLK 35 +#define DISP_CC_MDSS_DPTX1_PIXEL0_CLK_SRC 36 +#define DISP_CC_MDSS_DPTX1_PIXEL1_CLK 37 +#define DISP_CC_MDSS_DPTX1_PIXEL1_CLK_SRC 38 +#define DISP_CC_MDSS_DPTX1_USB_ROUTER_LINK_INTF_CLK 39 +#define DISP_CC_MDSS_DPTX2_AUX_CLK 40 +#define DISP_CC_MDSS_DPTX2_AUX_CLK_SRC 41 +#define DISP_CC_MDSS_DPTX2_CRYPTO_CLK 42 +#define DISP_CC_MDSS_DPTX2_LINK_CLK 43 +#define DISP_CC_MDSS_DPTX2_LINK_CLK_SRC 44 +#define DISP_CC_MDSS_DPTX2_LINK_DIV_CLK_SRC 45 +#define DISP_CC_MDSS_DPTX2_LINK_INTF_CLK 46 +#define DISP_CC_MDSS_DPTX2_PIXEL0_CLK 47 +#define DISP_CC_MDSS_DPTX2_PIXEL0_CLK_SRC 48 +#define DISP_CC_MDSS_DPTX2_PIXEL1_CLK 49 +#define DISP_CC_MDSS_DPTX2_PIXEL1_CLK_SRC 50 +#define DISP_CC_MDSS_DPTX3_AUX_CLK 51 +#define DISP_CC_MDSS_DPTX3_AUX_CLK_SRC 52 +#define DISP_CC_MDSS_DPTX3_CRYPTO_CLK 53 +#define DISP_CC_MDSS_DPTX3_LINK_CLK 54 +#define DISP_CC_MDSS_DPTX3_LINK_CLK_SRC 55 +#define DISP_CC_MDSS_DPTX3_LINK_DIV_CLK_SRC 56 +#define DISP_CC_MDSS_DPTX3_LINK_INTF_CLK 57 +#define DISP_CC_MDSS_DPTX3_PIXEL0_CLK 58 +#define DISP_CC_MDSS_DPTX3_PIXEL0_CLK_SRC 59 +#define DISP_CC_MDSS_ESC0_CLK 60 
+#define DISP_CC_MDSS_ESC0_CLK_SRC 61 +#define DISP_CC_MDSS_ESC1_CLK 62 +#define DISP_CC_MDSS_ESC1_CLK_SRC 63 +#define DISP_CC_MDSS_MDP1_CLK 64 +#define DISP_CC_MDSS_MDP_CLK 65 +#define DISP_CC_MDSS_MDP_CLK_SRC 66 +#define DISP_CC_MDSS_MDP_LUT1_CLK 67 +#define DISP_CC_MDSS_MDP_LUT_CLK 68 +#define DISP_CC_MDSS_NON_GDSC_AHB_CLK 69 +#define DISP_CC_MDSS_PCLK0_CLK 70 +#define DISP_CC_MDSS_PCLK0_CLK_SRC 71 +#define DISP_CC_MDSS_PCLK1_CLK 72 +#define DISP_CC_MDSS_PCLK1_CLK_SRC 73 +#define DISP_CC_MDSS_PCLK2_CLK 74 +#define DISP_CC_MDSS_PCLK2_CLK_SRC 75 +#define DISP_CC_MDSS_RSCC_AHB_CLK 76 +#define DISP_CC_MDSS_RSCC_VSYNC_CLK 77 +#define DISP_CC_MDSS_VSYNC1_CLK 78 +#define DISP_CC_MDSS_VSYNC_CLK 79 +#define DISP_CC_MDSS_VSYNC_CLK_SRC 80 +#define DISP_CC_OSC_CLK 81 +#define DISP_CC_OSC_CLK_SRC 82 +#define DISP_CC_PLL0 83 +#define DISP_CC_PLL1 84 +#define DISP_CC_PLL2 85 +#define DISP_CC_SLEEP_CLK 86 +#define DISP_CC_SLEEP_CLK_SRC 87 +#define DISP_CC_XO_CLK 88 +#define DISP_CC_XO_CLK_SRC 89 + +/* DISP_CC resets */ +#define DISP_CC_MDSS_CORE_BCR 0 +#define DISP_CC_MDSS_CORE_INT2_BCR 1 +#define DISP_CC_MDSS_RSCC_BCR 2 + +/* DISP_CC GDSCR */ +#define MDSS_GDSC 0 +#define MDSS_INT2_GDSC 1 + +#endif -- cgit v1.2.3 From 3f9f5cd005f5b5243eaa2647d40b9857fa1a901d Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 2 Jan 2025 17:34:18 +0100 Subject: sctp: Prepare sctp_v4_get_dst() to dscp_t conversion. Define inet_sk_dscp() to get a dscp_t value from struct inet_sock, so that sctp_v4_get_dst() can easily set ->flowi4_tos from a dscp_t variable. For the SCTP_DSCP_SET_MASK case, we can just use inet_dsfield_to_dscp() to get a dscp_t value. Then, when converting ->flowi4_tos from __u8 to dscp_t, we'll just have to drop the inet_dscp_to_dsfield() conversion function. Signed-off-by: Guillaume Nault Acked-by: Xin Long Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/1a645f4a0bc60ad18e7c0916642883ce8a43c013.1735835456.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 3ccbad881d74..1086256549fa 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -302,6 +303,11 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL; } +static inline dscp_t inet_sk_dscp(const struct inet_sock *inet) +{ + return inet_dsfield_to_dscp(READ_ONCE(inet->tos)); +} + #define inet_test_bit(nr, sk) \ test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags) #define inet_set_bit(nr, sk) \ -- cgit v1.2.3 From c0f1cbf795095c21b92a46fa1dc47a7b787ce538 Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Fri, 3 Jan 2025 15:31:34 +0800 Subject: dt-bindings: clock: qcom: Add CMN PLL clock controller for IPQ SoC The CMN PLL controller provides clocks to networking hardware blocks and to GCC on Qualcomm IPQ9574 SoC. It receives input clock from the on-chip Wi-Fi, and produces output clocks at fixed rates. These output rates are predetermined, and are unrelated to the input clock rate. The primary purpose of CMN PLL is to supply clocks to the networking hardware such as PPE (packet process engine), PCS and the externally connected switch or PHY device. The CMN PLL block also outputs fixed rate clocks to GCC, such as 24 MHZ as XO clock and 32 KHZ as sleep clock supplied to GCC. 
Signed-off-by: Luo Jie Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250103-qcom_ipq_cmnpll-v8-1-c89fb4d4849d@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,ipq-cmn-pll.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,ipq-cmn-pll.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,ipq-cmn-pll.h b/include/dt-bindings/clock/qcom,ipq-cmn-pll.h new file mode 100644 index 000000000000..936e92b3b62c --- /dev/null +++ b/include/dt-bindings/clock/qcom,ipq-cmn-pll.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_IPQ_CMN_PLL_H +#define _DT_BINDINGS_CLK_QCOM_IPQ_CMN_PLL_H + +/* CMN PLL core clock. */ +#define CMN_PLL_CLK 0 + +/* The output clocks from CMN PLL of IPQ9574. */ +#define XO_24MHZ_CLK 1 +#define SLEEP_32KHZ_CLK 2 +#define PCS_31P25MHZ_CLK 3 +#define NSS_1200MHZ_CLK 4 +#define PPE_353MHZ_CLK 5 +#define ETH0_50MHZ_CLK 6 +#define ETH1_50MHZ_CLK 7 +#define ETH2_50MHZ_CLK 8 +#define ETH_25MHZ_CLK 9 +#endif -- cgit v1.2.3 From 95fc45d1dea8e1253f8ec58abc5befb71553d666 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 21:05:14 +0000 Subject: ax25: rcu protect dev->ax25_ptr syzbot found a lockdep issue [1]. We should remove ax25 RTNL dependency in ax25_setsockopt() This should also fix a variety of possible UAF in ax25. [1] WARNING: possible circular locking dependency detected 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Not tainted ------------------------------------------------------ syz.5.1818/12806 is trying to acquire lock: ffffffff8fcb3988 (rtnl_mutex){+.+.}-{4:4}, at: ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 but task is already holding lock: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1618 [inline] ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: ax25_setsockopt+0x209/0xe90 net/ax25/af_ax25.c:574 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_AX25){+.+.}-{0:0}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 lock_sock_nested+0x48/0x100 net/core/sock.c:3642 lock_sock include/net/sock.h:1618 [inline] ax25_kill_by_device net/ax25/af_ax25.c:101 [inline] ax25_device_event+0x24d/0x580 net/ax25/af_ax25.c:146 notifier_call_chain+0x1a5/0x3f0 kernel/notifier.c:85 __dev_notify_flags+0x207/0x400 dev_change_flags+0xf0/0x1a0 net/core/dev.c:9026 dev_ifsioc+0x7c8/0xe70 net/core/dev_ioctl.c:563 dev_ioctl+0x719/0x1340 net/core/dev_ioctl.c:820 sock_do_ioctl+0x240/0x460 net/socket.c:1234 sock_ioctl+0x626/0x8e0 net/socket.c:1339 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf5/0x170 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (rtnl_mutex){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x1ac/0xee0 kernel/locking/mutex.c:735 ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 do_sock_setsockopt+0x3af/0x720 net/socket.c:2324 __sys_setsockopt net/socket.c:2349 [inline] __do_sys_setsockopt net/socket.c:2355 [inline] __se_sys_setsockopt net/socket.c:2352 [inline] __x64_sys_setsockopt+0x1ee/0x280 net/socket.c:2352 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_AX25); lock(rtnl_mutex); lock(sk_lock-AF_AX25); lock(rtnl_mutex); *** DEADLOCK *** 1 lock held by syz.5.1818/12806: #0: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1618 [inline] #0: ffff8880617ac258 (sk_lock-AF_AX25){+.+.}-{0:0}, at: ax25_setsockopt+0x209/0xe90 net/ax25/af_ax25.c:574 stack backtrace: CPU: 1 UID: 0 PID: 12806 Comm: syz.5.1818 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 print_circular_bug+0x13a/0x1b0 kernel/locking/lockdep.c:2074 check_noncircular+0x36a/0x4a0 kernel/locking/lockdep.c:2206 check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1397/0x2100 kernel/locking/lockdep.c:5226 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5849 __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x1ac/0xee0 kernel/locking/mutex.c:735 ax25_setsockopt+0xa55/0xe90 net/ax25/af_ax25.c:680 do_sock_setsockopt+0x3af/0x720 net/socket.c:2324 __sys_setsockopt net/socket.c:2349 [inline] __do_sys_setsockopt net/socket.c:2355 [inline] __se_sys_setsockopt net/socket.c:2352 [inline] __x64_sys_setsockopt+0x1ee/0x280 net/socket.c:2352 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f7b62385d29 Fixes: c433570458e4 ("ax25: fix a use-after-free in ax25_fillin_cb()") 
Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250103210514.87290-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- include/net/ax25.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2593019ad5b1..e84602e0226c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2261,7 +2261,7 @@ struct net_device { void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) - void *ax25_ptr; + struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; diff --git a/include/net/ax25.h b/include/net/ax25.h index cb622d84cd0c..4ee141aae0a2 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -231,6 +231,7 @@ typedef struct ax25_dev { #endif refcount_t refcount; bool device_up; + struct rcu_head rcu; } ax25_dev; typedef struct ax25_cb { @@ -290,9 +291,8 @@ static inline void ax25_dev_hold(ax25_dev *ax25_dev) static inline void ax25_dev_put(ax25_dev *ax25_dev) { - if (refcount_dec_and_test(&ax25_dev->refcount)) { - kfree(ax25_dev); - } + if (refcount_dec_and_test(&ax25_dev->refcount)) + kfree_rcu(ax25_dev, rcu); } static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { @@ -335,9 +335,9 @@ void ax25_digi_invert(const ax25_digi *, ax25_digi *); extern spinlock_t ax25_dev_lock; #if IS_ENABLED(CONFIG_AX25) -static inline ax25_dev *ax25_dev_ax25dev(struct net_device *dev) +static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev) { - return dev->ax25_ptr; + return rcu_dereference_rtnl(dev->ax25_ptr); } #endif -- cgit v1.2.3 From cee3947b1aed42f71f99ce4e5d1410ee8670621a Mon Sep 17 00:00:00 2001 From: Wasim Nazir Date: Sun, 29 Dec 2024 20:53:27 +0530 Subject: dt-bindings: arm: qcom,ids: add SoC ID for QCS9075 Add the unique ID for Qualcomm QCS9075 SoC. This value is used to differentiate the SoC across qcom targets. Acked-by: Rob Herring (Arm) Signed-off-by: Wasim Nazir Link: https://lore.kernel.org/r/20241229152332.3068172-2-quic_wasimn@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/arm/qcom,ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h index e850dc3a1ad3..1b3e0176dcb7 100644 --- a/include/dt-bindings/arm/qcom,ids.h +++ b/include/dt-bindings/arm/qcom,ids.h @@ -284,6 +284,7 @@ #define QCOM_ID_QCS9100 667 #define QCOM_ID_QCS8300 674 #define QCOM_ID_QCS8275 675 +#define QCOM_ID_QCS9075 676 #define QCOM_ID_QCS615 680 /* -- cgit v1.2.3 From 0a0693fb2642604b4e14390dbf792f36e3485aaa Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 24 Dec 2024 12:12:14 +0200 Subject: dt-bindings: clock: qcom,mmcc-msm8960: add LCDC-related clocks APQ8064 / MSM8960 have separate LVDS / LCDC clock, driving the MDP4 LCD controller. Add corresponding indices to clock controller bindings. 
Signed-off-by: Dmitry Baryshkov Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241224-apq8064-fix-mmcc-v1-2-c95d2e2bf143@linaro.org Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,mmcc-msm8960.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,mmcc-msm8960.h b/include/dt-bindings/clock/qcom,mmcc-msm8960.h index 81714fc859c5..717431d735c1 100644 --- a/include/dt-bindings/clock/qcom,mmcc-msm8960.h +++ b/include/dt-bindings/clock/qcom,mmcc-msm8960.h @@ -133,5 +133,7 @@ #define VCAP_CLK 124 #define VCAP_NPL_CLK 125 #define PLL15 126 +#define DSI2_PIXEL_LVDS_SRC 127 +#define LVDS_CLK 128 #endif -- cgit v1.2.3 From 46e6075287e68e1d3d0ea8ecda610064636e0854 Mon Sep 17 00:00:00 2001 From: Manikanta Mylavarapu Date: Tue, 17 Dec 2024 17:09:09 +0530 Subject: dt-bindings: clock: qcom: gcc-ipq5424: remove apss_dbg clock macro The gcc_apss_dbg clk is access protected by trust zone, and accessing it results in a kernel crash. Therefore remove the gcc_apss_dbg_clk macro. Acked-by: Krzysztof Kozlowski Signed-off-by: Manikanta Mylavarapu Link: https://lore.kernel.org/r/20241217113909.3522305-3-quic_mmanikan@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,ipq5424-gcc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,ipq5424-gcc.h b/include/dt-bindings/clock/qcom,ipq5424-gcc.h index 755ce7a71c7c..9f14680052a0 100644 --- a/include/dt-bindings/clock/qcom,ipq5424-gcc.h +++ b/include/dt-bindings/clock/qcom,ipq5424-gcc.h @@ -12,7 +12,6 @@ #define GPLL2 2 #define GPLL2_OUT_MAIN 3 #define GCC_SLEEP_CLK_SRC 4 -#define GCC_APSS_DBG_CLK 5 #define GCC_USB0_EUD_AT_CLK 6 #define GCC_PCIE0_AXI_M_CLK_SRC 7 #define GCC_PCIE0_AXI_M_CLK 8 -- cgit v1.2.3 From fbb9a9d263a68f60a16c8ba5a51d6198d67171cd Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:31 +0000 Subject: net: phylink: add support for PCS supported_interfaces bitmap Add support for the PCS to specify which interfaces it supports, which can be used by MAC drivers to build the main supported_interfaces bitmap. Phylink also validates that the PCS returned by the MAC driver supports the interface that the MAC was asked for. An empty supported_interfaces bitmap from the PCS indicates that it does not provide this information, and we handle that appropriately. Reviewed-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tTffL-007RoD-1Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5462cc6a37dc..4b7a20620b49 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -393,6 +393,8 @@ struct phylink_pcs_ops; /** * struct phylink_pcs - PHYLINK PCS instance + * @supported_interfaces: describing which PHY_INTERFACE_MODE_xxx + * are supported by this PCS. * @ops: a pointer to the &struct phylink_pcs_ops structure * @phylink: pointer to &struct phylink_config * @neg_mode: provide PCS neg mode via "mode" argument @@ -409,6 +411,7 @@ struct phylink_pcs_ops; * the PCS driver. 
*/ struct phylink_pcs { + DECLARE_PHY_INTERFACE_MASK(supported_interfaces); const struct phylink_pcs_ops *ops; struct phylink *phylink; bool neg_mode; -- cgit v1.2.3 From 2410719cdd49d9b062e87dddaf5ec990edafc6e3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 3 Jan 2025 11:16:56 +0000 Subject: net: pcs: xpcs: make xpcs_get_interfaces() static xpcs_get_interfaces() should no longer be used outside of the XPCS code, so make it static. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1tTffk-007Roi-JM@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-xpcs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index b5b5d17998b8..733f4ddd2ef1 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -50,7 +50,6 @@ struct dw_xpcs; struct phylink_pcs *xpcs_to_phylink_pcs(struct dw_xpcs *xpcs); int xpcs_get_an_mode(struct dw_xpcs *xpcs, phy_interface_t interface); -void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create_mdiodev(struct mii_bus *bus, int addr); -- cgit v1.2.3 From a8b56cb27d47ecba7b26041c05fe423130ee7a24 Mon Sep 17 00:00:00 2001 From: Manikanta Mylavarapu Date: Tue, 10 Dec 2024 12:11:09 +0530 Subject: dt-bindings: clock: qcom: gcc-ipq5424: add gcc_xo_clk macro The GCC_XO_CLK is required for the functionality of the WiFi copy engine block. Therefore, add the GCC_XO_CLK macro. Signed-off-by: Manikanta Mylavarapu Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241210064110.130466-2-quic_mmanikan@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,ipq5424-gcc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,ipq5424-gcc.h b/include/dt-bindings/clock/qcom,ipq5424-gcc.h index 9f14680052a0..c15ad16923bd 100644 --- a/include/dt-bindings/clock/qcom,ipq5424-gcc.h +++ b/include/dt-bindings/clock/qcom,ipq5424-gcc.h @@ -151,5 +151,6 @@ #define GCC_PCIE3_RCHNG_CLK_SRC 142 #define GCC_PCIE3_RCHNG_CLK 143 #define GCC_IM_SLEEP_CLK 144 +#define GCC_XO_CLK 145 #endif -- cgit v1.2.3 From d76bf5267a6bcf25b95dfa1e71da0d3af3435682 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 17 Dec 2024 12:00:24 +0200 Subject: media: cec: include linux/debugfs.h and linux/seq_file.h where needed Having cec.h include linux/debugfs.h leads to all users of all cec headers include and depend on debugfs.h and its dependencies for no reason. Drop the include from cec.h, and include debugfs.h and seq_file.h where needed. Sort all the modified include lists while at it. Signed-off-by: Jani Nikula Signed-off-by: Hans Verkuil --- include/media/cec.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/media/cec.h b/include/media/cec.h index 16b412b3131b..0c8e86115b6f 100644 --- a/include/media/cec.h +++ b/include/media/cec.h @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 7bd72a4aa226c3ef752bcd6b33c54f6e85efcc60 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 4 Jan 2025 17:21:48 +0900 Subject: rtnetlink: Add rtnl_net_lock_killable(). rtnl_lock_killable() is used only in register_netdev() and will be converted to per-netns RTNL. Let's unexport it and add the corresponding helper. 
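To illustrate the intent, a rough sketch of a caller is shown below; this is
not the actual register_netdev() conversion, and the example function name is
made up:

/* Illustrative sketch only: take the per-netns RTNL killably around
 * device registration, mirroring what register_netdev() does today
 * with the global lock. */
static int example_register_netdev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	int err;

	err = rtnl_net_lock_killable(net);
	if (err)
		return err;

	err = register_netdevice(dev);

	rtnl_net_unlock(net);
	return err;
}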
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3b9d132cbc9e..4bc2ee0b10b0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -102,6 +102,7 @@ void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); int rtnl_net_trylock(struct net *net); +int rtnl_net_lock_killable(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); @@ -138,6 +139,11 @@ static inline int rtnl_net_trylock(struct net *net) return rtnl_trylock(); } +static inline int rtnl_net_lock_killable(struct net *net) +{ + return rtnl_lock_killable(); +} + static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); -- cgit v1.2.3 From 7f2ef1bfc758f0f206eac863ff8ee417d5bb1493 Mon Sep 17 00:00:00 2001 From: Bibek Kumar Patro Date: Thu, 12 Dec 2024 20:44:00 +0530 Subject: iommu/arm-smmu: Add support for PRR bit setup Add an adreno-smmu-priv interface for drm/msm to call into arm-smmu-qcom and initiate the "Partially Resident Region" (PRR) bit setup or reset sequence as per request. This will be used by GPU to setup the PRR bit and related configuration registers through adreno-smmu private interface instead of directly poking the smmu hardware. Suggested-by: Rob Clark Signed-off-by: Bibek Kumar Patro Link: https://lore.kernel.org/r/20241212151402.159102-4-quic_bibekkum@quicinc.com Signed-off-by: Will Deacon --- include/linux/adreno-smmu-priv.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/adreno-smmu-priv.h b/include/linux/adreno-smmu-priv.h index c637e0997f6d..abec23c7744f 100644 --- a/include/linux/adreno-smmu-priv.h +++ b/include/linux/adreno-smmu-priv.h @@ -50,6 +50,11 @@ struct adreno_smmu_fault_info { * the GPU driver must call resume_translation() * @resume_translation: Resume translation after a fault * + * @set_prr_bit: [optional] Configure the GPU's Partially Resident + * Region (PRR) bit in the ACTLR register. + * @set_prr_addr: [optional] Configure the PRR_CFG_*ADDR register with + * the physical address of PRR page passed from GPU + * driver. * * The GPU driver (drm/msm) and adreno-smmu work together for controlling * the GPU's SMMU instance. This is by necessity, as the GPU is directly @@ -67,6 +72,8 @@ struct adreno_smmu_priv { void (*get_fault_info)(const void *cookie, struct adreno_smmu_fault_info *info); void (*set_stall)(const void *cookie, bool enabled); void (*resume_translation)(const void *cookie, bool terminate); + void (*set_prr_bit)(const void *cookie, bool set); + void (*set_prr_addr)(const void *cookie, phys_addr_t page_addr); }; #endif /* __ADRENO_SMMU_PRIV_H */ -- cgit v1.2.3 From c79a39dc8d060b9e64e8b0fa9d245d44befeefbe Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Mon, 11 Nov 2024 20:13:29 -0800 Subject: pps: Fix a use-after-free On a board running ntpd and gpsd, I'm seeing a consistent use-after-free in sys_exit() from gpsd when rebooting: pps pps1: removed ------------[ cut here ]------------ kobject: '(null)' (00000000db4bec24): is not initialized, yet kobject_put() is being called. 
WARNING: CPU: 2 PID: 440 at lib/kobject.c:734 kobject_put+0x120/0x150 CPU: 2 UID: 299 PID: 440 Comm: gpsd Not tainted 6.11.0-rc6-00308-gb31c44928842 #1 Hardware name: Raspberry Pi 4 Model B Rev 1.1 (DT) pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : kobject_put+0x120/0x150 lr : kobject_put+0x120/0x150 sp : ffffffc0803d3ae0 x29: ffffffc0803d3ae0 x28: ffffff8042dc9738 x27: 0000000000000001 x26: 0000000000000000 x25: ffffff8042dc9040 x24: ffffff8042dc9440 x23: ffffff80402a4620 x22: ffffff8042ef4bd0 x21: ffffff80405cb600 x20: 000000000008001b x19: ffffff8040b3b6e0 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 696e6920746f6e20 x14: 7369203a29343263 x13: 205d303434542020 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000 x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: kobject_put+0x120/0x150 cdev_put+0x20/0x3c __fput+0x2c4/0x2d8 ____fput+0x1c/0x38 task_work_run+0x70/0xfc do_exit+0x2a0/0x924 do_group_exit+0x34/0x90 get_signal+0x7fc/0x8c0 do_signal+0x128/0x13b4 do_notify_resume+0xdc/0x160 el0_svc+0xd4/0xf8 el0t_64_sync_handler+0x140/0x14c el0t_64_sync+0x190/0x194 ---[ end trace 0000000000000000 ]--- ...followed by more symptoms of corruption, with similar stacks: refcount_t: underflow; use-after-free. kernel BUG at lib/list_debug.c:62! Kernel panic - not syncing: Oops - BUG: Fatal exception This happens because pps_device_destruct() frees the pps_device with the embedded cdev immediately after calling cdev_del(), but, as the comment above cdev_del() notes, fops for previously opened cdevs are still callable even after cdev_del() returns. I think this bug has always been there: I can't explain why it suddenly started happening every time I reboot this particular board. In commit d953e0e837e6 ("pps: Fix a use-after free bug when unregistering a source."), George Spelvin suggested removing the embedded cdev. That seems like the simplest way to fix this, so I've implemented his suggestion, using __register_chrdev() with pps_idr becoming the source of truth for which minor corresponds to which device. But now that pps_idr defines userspace visibility instead of cdev_add(), we need to be sure the pps->dev refcount can't reach zero while userspace can still find it again. So, the idr_remove() call moves to pps_unregister_cdev(), and pps_idr now holds a reference to pps->dev. 
pps_core: source serial1 got cdev (251:1) <...> pps pps1: removed pps_core: unregistering pps1 pps_core: deallocating pps1 Fixes: d953e0e837e6 ("pps: Fix a use-after free bug when unregistering a source.") Cc: stable@vger.kernel.org Signed-off-by: Calvin Owens Reviewed-by: Michal Schmidt Link: https://lore.kernel.org/r/a17975fd5ae99385791929e563f72564edbcf28f.1731383727.git.calvin@wbinvd.org Signed-off-by: Greg Kroah-Hartman --- include/linux/pps_kernel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 78c8ac4951b5..c7abce28ed29 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -56,8 +56,7 @@ struct pps_device { unsigned int id; /* PPS source unique ID */ void const *lookup_cookie; /* For pps_lookup_dev() only */ - struct cdev cdev; - struct device *dev; + struct device dev; struct fasync_struct *async_queue; /* fasync method */ spinlock_t lock; }; -- cgit v1.2.3 From aff028a8192d056d346541c5fc7d88c0eb43412c Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 10 Dec 2024 08:51:21 -0800 Subject: iommu/io-pgtable-arm: Add way to debug pgtable walk Add an io-pgtable method to walk the pgtable returning the raw PTEs that would be traversed for a given iova access. Signed-off-by: Rob Clark Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241210165127.600817-4-robdclark@gmail.com [will: Removed 'arm_lpae_io_pgtable_walk_data::level' per Mostafa] Signed-off-by: Will Deacon --- include/linux/io-pgtable.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index ce86b09ae80f..bba2a51c87d2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -180,12 +180,22 @@ struct io_pgtable_cfg { }; }; +/** + * struct arm_lpae_io_pgtable_walk_data - information from a pgtable walk + * + * @ptes: The recorded PTE values from the walk + */ +struct arm_lpae_io_pgtable_walk_data { + u64 ptes[4]; +}; + /** * struct io_pgtable_ops - Page table manipulation API for IOMMU drivers. * * @map_pages: Map a physically contiguous range of pages of the same size. * @unmap_pages: Unmap a range of virtually contiguous pages of the same size. * @iova_to_phys: Translate iova to physical address. + * @pgtable_walk: (optional) Perform a page table walk for a given iova. * * These functions map directly onto the iommu_ops member functions with * the same names. @@ -199,6 +209,7 @@ struct io_pgtable_ops { struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + int (*pgtable_walk)(struct io_pgtable_ops *ops, unsigned long iova, void *wd); int (*read_and_clear_dirty)(struct io_pgtable_ops *ops, unsigned long iova, size_t size, unsigned long flags, -- cgit v1.2.3 From 92d6254f58120011c93610b4cb7def214409731d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Dec 2024 18:07:31 +0100 Subject: sysfs: constify macro BIN_ATTRIBUTE_GROUPS() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As there is only one in-tree user, avoid a transition phase and switch that user in the same commit. As there are some interdependencies between the constness of the different symbols in the s390 driver, covert the whole driver at once. 
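For illustration, a minimal user of the constified macro could look like the
sketch below; the attribute and group names are invented, and
bin_attr_example_blob is assumed to be defined elsewhere in the driver (for
example via BIN_ATTR_RO()):

/* Hypothetical example: the backing array is now fully const. */
static const struct bin_attribute *const example_attrs[] = {
	&bin_attr_example_blob,	/* assumed to exist in the driver */
	NULL,
};
BIN_ATTRIBUTE_GROUPS(example);	/* provides example_group and example_groups */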
Signed-off-by: Thomas Weißschuh
Link: https://lore.kernel.org/r/20241205-sysfs-const-bin_attr-groups_macro-v1-1-ac5e855031e8@weissschuh.net
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/sysfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 0f2fcd244523..b4368377fac9 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -293,7 +293,7 @@ __ATTRIBUTE_GROUPS(_name)
 
 #define BIN_ATTRIBUTE_GROUPS(_name)				\
 static const struct attribute_group _name##_group = {		\
-	.bin_attrs = _name##_attrs,				\
+	.bin_attrs_new = _name##_attrs,				\
 };								\
 __ATTRIBUTE_GROUPS(_name)
 
-- 
cgit v1.2.3


From e91609f1c3b0ce06d80b1b3bd0e6b942782be016 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven
Date: Tue, 7 Jan 2025 09:37:29 +0100
Subject: dt-bindings: clock: renesas,r9a08g045-vbattb: Fix include guard

Add the missing "RENESAS" part to the include guard.

Signed-off-by: Geert Uytterhoeven
Link: https://lore.kernel.org/34953d1e9f472e4f29533ed06cf092dd3c0d1178.1736238939.git.geert+renesas@glider.be
---
 include/dt-bindings/clock/renesas,r9a08g045-vbattb.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/renesas,r9a08g045-vbattb.h b/include/dt-bindings/clock/renesas,r9a08g045-vbattb.h
index 67774eafad06..4cc8fc34b23c 100644
--- a/include/dt-bindings/clock/renesas,r9a08g045-vbattb.h
+++ b/include/dt-bindings/clock/renesas,r9a08g045-vbattb.h
@@ -2,12 +2,12 @@
  *
  * Copyright (C) 2024 Renesas Electronics Corp.
  */
-#ifndef __DT_BINDINGS_CLOCK_R9A08G045_VBATTB_H__
-#define __DT_BINDINGS_CLOCK_R9A08G045_VBATTB_H__
+#ifndef __DT_BINDINGS_CLOCK_RENESAS_R9A08G045_VBATTB_H__
+#define __DT_BINDINGS_CLOCK_RENESAS_R9A08G045_VBATTB_H__
 
 #define VBATTB_XC		0
 #define VBATTB_XBYP		1
 #define VBATTB_MUX		2
 #define VBATTB_VBATTCLK		3
 
-#endif /* __DT_BINDINGS_CLOCK_R9A08G045_VBATTB_H__ */
+#endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A08G045_VBATTB_H__ */
-- 
cgit v1.2.3


From 1bd13edbbed6e7e396f1aab92b224a4775218e68 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Fri, 27 Dec 2024 13:07:57 +0900
Subject: tracing/hist: Add poll(POLLIN) support on hist file

Add poll syscall support on the `hist` file. The waiter will be woken
up with POLLIN when the histogram is updated.

Currently, there is no way to wait for a specific event in userspace,
so the user needs to peek at `trace` periodically or wait on
`trace_pipe`. But it is not a good idea to peek at `trace` for an event
that happens at random, and `trace_pipe` does not return until a page
is filled with events.

This allows a user to wait for a specific event on the `hist` file.
The user can set a histogram trigger on the event they want to monitor
and poll() on its `hist` file. Since this poll() returns POLLIN, the
next poll() will return soon unless a read() happens on that hist file.

NOTE: To read the hist file again, you must set the file offset to 0,
but just for monitoring the event, you may not need to read the
histogram.
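As an illustration, a minimal user-space consumer could look like the sketch
below (the tracefs path and the chosen event are only examples):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/kernel/tracing/events/sched/sched_switch/hist",
		      O_RDONLY);
	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	if (fd < 0)
		return 1;

	/* Blocks until the histogram for this event is updated. */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		lseek(fd, 0, SEEK_SET);	/* the offset must go back to 0 */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
	}

	close(fd);
	return 0;
}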
Cc: Shuah Khan Cc: Mathieu Desnoyers Link: https://lore.kernel.org/173527247756.464571.14236296701625509931.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..02cde1174487 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,6 +673,20 @@ struct trace_event_file { atomic_t tm_ref; /* trigger-mode reference counter */ }; +#ifdef CONFIG_HIST_TRIGGERS +extern struct irq_work hist_poll_work; +extern wait_queue_head_t hist_poll_wq; + +static inline void hist_poll_wakeup(void) +{ + if (wq_has_sleeper(&hist_poll_wq)) + irq_work_queue(&hist_poll_work); +} + +#define hist_poll_wait(file, wait) \ + poll_wait(file, &hist_poll_wq, wait) +#endif + #define __TRACE_EVENT_FLAGS(name, value) \ static int __init trace_init_flags_##name(void) \ { \ -- cgit v1.2.3 From 85b60ca9ad2c94661acf86a0c11278246cc5ea86 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Mon, 6 Jan 2025 18:16:25 +0530 Subject: x86/sev: Add Secure TSC support for SNP guests Add support for Secure TSC in SNP-enabled guests. Secure TSC allows guests to securely use RDTSC/RDTSCP instructions, ensuring that the parameters used cannot be altered by the hypervisor once the guest is launched. Secure TSC-enabled guests need to query TSC information from the AMD Security Processor. This communication channel is encrypted between the AMD Security Processor and the guest, with the hypervisor acting merely as a conduit to deliver the guest messages to the AMD Security Processor. Each message is protected with AEAD (AES-256 GCM). [ bp: Zap a stray newline over amd_cc_platform_has() while at it, simplify CC_ATTR_GUEST_SNP_SECURE_TSC check ] Signed-off-by: Nikunj A Dadhania Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250106124633.1418972-6-nikunj@amd.com --- include/linux/cc_platform.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index caa4b4430634..0bf7d33a1048 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -81,6 +81,14 @@ enum cc_attr { */ CC_ATTR_GUEST_SEV_SNP, + /** + * @CC_ATTR_GUEST_SNP_SECURE_TSC: SNP Secure TSC is active. + * + * The platform/OS is running as a guest/virtual machine and actively + * using AMD SEV-SNP Secure TSC feature. + */ + CC_ATTR_GUEST_SNP_SECURE_TSC, + /** * @CC_ATTR_HOST_SEV_SNP: AMD SNP enabled on the host. * -- cgit v1.2.3 From 7b16e60b31202254c62a29f5c709ffb42684b6f9 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 15:44:03 +0000 Subject: soundwire: SDCA: Add additional SDCA address macros Compliment the existing macro to construct an SDCA control address with macros to extract the constituent parts, and validation of such an address. Also update the masks for the original macro to use GENMASK to make mental comparisons with the included comment on the address format easier. 
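As a rough illustration (the function, entity, control and channel numbers
below are arbitrary), the new helpers are intended to round-trip with the
existing constructor:

/* Illustrative only: build an address, then pull the fields back out. */
static void sdca_addr_example(void)
{
	u32 reg = SDW_SDCA_CTL(1, 2, 3, 4);

	WARN_ON(!SDW_SDCA_VALID_CTL(reg));
	WARN_ON(SDW_SDCA_CTL_FUNC(reg) != 1);
	WARN_ON(SDW_SDCA_CTL_ENT(reg) != 2);
	WARN_ON(SDW_SDCA_CTL_CSEL(reg) != 3);
	WARN_ON(SDW_SDCA_CTL_CNUM(reg) != 4);
}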
Acked-by: Vinod Koul Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107154408.814455-2-ckeepax@opensource.cirrus.com Reviewed-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_registers.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h index 658b10fa5b20..0a5939285583 100644 --- a/include/linux/soundwire/sdw_registers.h +++ b/include/linux/soundwire/sdw_registers.h @@ -4,6 +4,9 @@ #ifndef __SDW_REGISTERS_H #define __SDW_REGISTERS_H +#include +#include + /* * SDW registers as defined by MIPI 1.2 Spec */ @@ -329,16 +332,27 @@ * 2:0 Control Number[2:0] */ -#define SDW_SDCA_CTL(fun, ent, ctl, ch) (BIT(30) | \ - (((fun) & 0x7) << 22) | \ - (((ent) & 0x40) << 15) | \ - (((ent) & 0x3f) << 7) | \ - (((ctl) & 0x30) << 15) | \ - (((ctl) & 0x0f) << 3) | \ - (((ch) & 0x38) << 12) | \ - ((ch) & 0x07)) +#define SDW_SDCA_CTL(fun, ent, ctl, ch) (BIT(30) | \ + (((fun) & GENMASK(2, 0)) << 22) | \ + (((ent) & BIT(6)) << 15) | \ + (((ent) & GENMASK(5, 0)) << 7) | \ + (((ctl) & GENMASK(5, 4)) << 15) | \ + (((ctl) & GENMASK(3, 0)) << 3) | \ + (((ch) & GENMASK(5, 3)) << 12) | \ + ((ch) & GENMASK(2, 0))) + +#define SDW_SDCA_CTL_FUNC(reg) FIELD_GET(GENMASK(24, 22), (reg)) +#define SDW_SDCA_CTL_ENT(reg) ((FIELD_GET(BIT(21), (reg)) << 6) | \ + FIELD_GET(GENMASK(12, 7), (reg))) +#define SDW_SDCA_CTL_CSEL(reg) ((FIELD_GET(GENMASK(20, 19), (reg)) << 4) | \ + FIELD_GET(GENMASK(6, 3), (reg))) +#define SDW_SDCA_CTL_CNUM(reg) ((FIELD_GET(GENMASK(17, 15), (reg)) << 3) | \ + FIELD_GET(GENMASK(2, 0), (reg))) #define SDW_SDCA_MBQ_CTL(reg) ((reg) | BIT(13)) #define SDW_SDCA_NEXT_CTL(reg) ((reg) | BIT(14)) +/* Check the reserved and fixed bits in address */ +#define SDW_SDCA_VALID_CTL(reg) (((reg) & (GENMASK(31, 25) | BIT(18) | BIT(13))) == BIT(30)) + #endif /* __SDW_REGISTERS_H */ -- cgit v1.2.3 From b21468e83b787ab31aa9df8f429d2ca61def0cc9 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 15:44:04 +0000 Subject: ASoC: SDCA: Update list of entity_0 controls Update the list of entity_0 controls to better match version v1.0 of the SDCA specification. Remove both INTSTAT_CLEAR and INT_ENABLE as these are no longer used, and add some missing controls and bits into the enum. Also rename the SDCA_CONTROL prefix to SDCA_CTL because this better matches the macros in the sdw_registers.h header, and the names can get quite long so saving a few characters is definitely a plus. 
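As an example of how these definitions combine with the SDCA address macros, a
driver could check the Entity 0 Function Status roughly as in the sketch
below; the function number and the regmap are assumptions for illustration:

/* Illustrative sketch: check whether function 1 reports itself busy. */
static bool example_function_busy(struct regmap *regmap)
{
	u32 addr = SDW_SDCA_CTL(1, 0, SDCA_CTL_ENTITY_0_FUNCTION_STATUS, 0);
	unsigned int val;

	if (regmap_read(regmap, addr, &val))
		return false;

	return val & SDCA_CTL_ENTITY_0_FUNCTION_BUSY;
}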
Signed-off-by: Charles Keepax
Link: https://patch.msgid.link/20250107154408.814455-3-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown
---
 include/sound/sdca_function.h | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h
index a01eec86b9a6..9dc5bfec07e5 100644
--- a/include/sound/sdca_function.h
+++ b/include/sound/sdca_function.h
@@ -42,14 +42,31 @@ enum sdca_function_type {
 #define SDCA_FUNCTION_TYPE_HID_NAME			"HID"
 
 enum sdca_entity0_controls {
-	SDCA_CONTROL_ENTITY_0_COMMIT_GROUP_MASK		= 0x01,
-	SDCA_CONTROL_ENTITY_0_INTSTAT_CLEAR		= 0x02,
-	SDCA_CONTROL_ENTITY_0_INT_ENABLE		= 0x03,
-	SDCA_CONTROL_ENTITY_0_FUNCTION_SDCA_VERSION	= 0x04,
-	SDCA_CONTROL_ENTITY_0_FUNCTION_TOPOLOGY		= 0x05,
-	SDCA_CONTROL_ENTITY_0_FUNCTION_MANUFACTURER_ID	= 0x06,
-	SDCA_CONTROL_ENTITY_0_FUNCTION_ID		= 0x07,
-	SDCA_CONTROL_ENTITY_0_FUNCTION_VERSION		= 0x08
+	SDCA_CTL_ENTITY_0_COMMIT_GROUP_MASK		= 0x01,
+	SDCA_CTL_ENTITY_0_FUNCTION_SDCA_VERSION		= 0x04,
+	SDCA_CTL_ENTITY_0_FUNCTION_TYPE			= 0x05,
+	SDCA_CTL_ENTITY_0_FUNCTION_MANUFACTURER_ID	= 0x06,
+	SDCA_CTL_ENTITY_0_FUNCTION_ID			= 0x07,
+	SDCA_CTL_ENTITY_0_FUNCTION_VERSION		= 0x08,
+	SDCA_CTL_ENTITY_0_FUNCTION_EXTENSION_ID		= 0x09,
+	SDCA_CTL_ENTITY_0_FUNCTION_EXTENSION_VERSION	= 0x0A,
+	SDCA_CTL_ENTITY_0_FUNCTION_STATUS		= 0x10,
+	SDCA_CTL_ENTITY_0_FUNCTION_ACTION		= 0x11,
+	SDCA_CTL_ENTITY_0_MATCHING_GUID			= 0x12,
+	SDCA_CTL_ENTITY_0_DEVICE_MANUFACTURER_ID	= 0x2C,
+	SDCA_CTL_ENTITY_0_DEVICE_PART_ID		= 0x2D,
+	SDCA_CTL_ENTITY_0_DEVICE_VERSION		= 0x2E,
+	SDCA_CTL_ENTITY_0_DEVICE_SDCA_VERSION		= 0x2F,
+
+	/* Function Status Bits */
+	SDCA_CTL_ENTITY_0_DEVICE_NEWLY_ATTACHED		= BIT(0),
+	SDCA_CTL_ENTITY_0_INTS_DISABLED_ABNORMALLY	= BIT(1),
+	SDCA_CTL_ENTITY_0_STREAMING_STOPPED_ABNORMALLY	= BIT(2),
+	SDCA_CTL_ENTITY_0_FUNCTION_FAULT		= BIT(3),
+	SDCA_CTL_ENTITY_0_UMP_SEQUENCE_FAULT		= BIT(4),
+	SDCA_CTL_ENTITY_0_FUNCTION_NEEDS_INITIALIZATION	= BIT(5),
+	SDCA_CTL_ENTITY_0_FUNCTION_HAS_BEEN_RESET	= BIT(6),
+	SDCA_CTL_ENTITY_0_FUNCTION_BUSY			= BIT(7),
 };
 
 #endif
-- 
cgit v1.2.3


From fdd9ef3dce98e035d21c17fac587cb6e3c7706fd Mon Sep 17 00:00:00 2001
From: Charles Keepax
Date: Tue, 7 Jan 2025 15:44:05 +0000
Subject: regmap: sdw-mbq: Add support for further MBQ register sizes

SoundWire MBQ register maps typically contain a variety of register
sizes, which does not map well onto the regmap abstraction, since
regmap expects a register map to have a consistent size. Currently the
MBQ register map only allows 16-bit registers to be defined, which
leads to complex CODEC driver implementations with both an 8-bit
register map and a 16-bit MBQ map: every control then needs custom get
and put handlers so it can reach the right register map. Furthermore,
32-bit MBQ quantities are not currently supported.

Add support for additional MBQ sizes and, to avoid the complexity of
multiple register maps, treat the val_size as a maximum size for the
register map. Within the regmap, use an ancillary callback to determine
how many bytes to actually read/write to the hardware for a specific
register. If no callback is defined, the behaviour defaults back to the
existing fixed-size register map behaviour.
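For example, a codec driver could describe a mixed-width register map roughly
as in the sketch below; the register address, the 16/32-bit split and all the
example_* names are invented for illustration:

/* Illustrative only: one 32-bit register in an otherwise 16-bit map. */
static int example_mbq_size(struct device *dev, unsigned int reg)
{
	return (reg == 0x4002) ? 4 : 2;
}

static const struct regmap_sdw_mbq_cfg example_mbq_cfg = {
	.mbq_size = example_mbq_size,
};

static const struct regmap_config example_regmap_config = {
	.reg_bits = 32,
	.val_bits = 32,	/* maximum width used anywhere in the map */
};

static int example_init_regmap(struct sdw_slave *slave)
{
	struct regmap *regmap;

	regmap = devm_regmap_init_sdw_mbq_cfg(slave, &example_regmap_config,
					      &example_mbq_cfg);
	return PTR_ERR_OR_ZERO(regmap);
}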
Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107154408.814455-4-ckeepax@opensource.cirrus.com Reviewed-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- include/linux/regmap.h | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index fd41baccbf3e..dd96a22f5657 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -506,6 +506,17 @@ struct regmap_range_cfg { unsigned int window_len; }; +/** + * struct regmap_sdw_mbq_cfg - Configuration for Multi-Byte Quantities + * + * @mbq_size: Callback returning the actual size of the given register. + * + * Provides additional configuration required for SoundWire MBQ register maps. + */ +struct regmap_sdw_mbq_cfg { + int (*mbq_size)(struct device *dev, unsigned int reg); +}; + struct regmap_async; typedef int (*regmap_hw_write)(void *context, const void *data, @@ -652,6 +663,7 @@ struct regmap *__regmap_init_sdw(struct sdw_slave *sdw, const char *lock_name); struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, const struct regmap_config *config, + const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, const char *lock_name); struct regmap *__regmap_init_spi_avmm(struct spi_device *spi, @@ -713,6 +725,7 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw, const char *lock_name); struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, const struct regmap_config *config, + const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, const char *lock_name); struct regmap *__devm_regmap_init_slimbus(struct slim_device *slimbus, @@ -942,7 +955,22 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ - sdw, config) + sdw, config, NULL) + +/** + * regmap_init_sdw_mbq_cfg() - Initialise MBQ SDW register map with config + * + * @sdw: Device that will be interacted with + * @config: Configuration for register map + * @mbq_config: Properties for the MBQ registers + * + * The return value will be an ERR_PTR() on error or a valid pointer + * to a struct regmap. The regmap will be automatically freed by the + * device management code. + */ +#define regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ + __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ + sdw, config, mbq_config) /** * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave @@ -1155,7 +1183,22 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define devm_regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, #config, \ - sdw, config) + sdw, config, NULL) + +/** + * devm_regmap_init_sdw_mbq_cfg() - Initialise managed MBQ SDW register map with config + * + * @sdw: Device that will be interacted with + * @config: Configuration for register map + * @mbq_config: Properties for the MBQ registers + * + * The return value will be an ERR_PTR() on error or a valid pointer + * to a struct regmap. The regmap will be automatically freed by the + * device management code. 
+ */ +#define devm_regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ + __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, \ + #config, sdw, config, mbq_config) /** * devm_regmap_init_slimbus() - Initialise managed register map -- cgit v1.2.3 From 5bc493bf0c37c157bf2eb364e55a1c6f8bc43a69 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 15:44:06 +0000 Subject: regmap: sdw-mbq: Add support for SDCA deferred controls The SDCA specification allows for controls to be deferred. In the case of a deferred control the device will return COMMAND_IGNORED to the 8-bit operation that would cause the value to commit. Which is the final 8-bits on a write, or the first 8-bits on a read. In the case of receiving a defer, the regmap will poll the SDCA function busy bit, after which the transaction will be retried, returning an error if the function busy does not clear within a chip specific timeout. Since this is common SDCA functionality which is the 99% use-case for MBQs it makes sense to incorporate this functionality into the register map. If no MBQ configuration is specified, the behaviour will default to the existing behaviour. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107154408.814455-5-ckeepax@opensource.cirrus.com Reviewed-by: Pierre-Louis Bossart Signed-off-by: Mark Brown --- include/linux/regmap.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index dd96a22f5657..198067d3cf10 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -510,11 +510,26 @@ struct regmap_range_cfg { * struct regmap_sdw_mbq_cfg - Configuration for Multi-Byte Quantities * * @mbq_size: Callback returning the actual size of the given register. + * @deferrable: Callback returning true if the hardware can defer + * transactions to the given register. Deferral should + * only be used by SDCA parts and typically which controls + * are deferrable will be specified in either as a hard + * coded list or from the DisCo tables in the platform + * firmware. + * + * @timeout_us: The time in microseconds after which waiting for a deferred + * transaction should time out. + * @retry_us: The time in microseconds between polls of the function busy + * status whilst waiting for an opportunity to retry a deferred + * transaction. * * Provides additional configuration required for SoundWire MBQ register maps. */ struct regmap_sdw_mbq_cfg { int (*mbq_size)(struct device *dev, unsigned int reg); + bool (*deferrable)(struct device *dev, unsigned int reg); + unsigned long timeout_us; + unsigned long retry_us; }; struct regmap_async; -- cgit v1.2.3 From b04ce63859793e3439b394976b8d29e785d4d69a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 11 Dec 2024 11:23:35 +0100 Subject: i2c: davinci: kill platform data There are no more board file users of this driver. The platform data structure is only used internally. Two of the four fields it stores are not used at all anymore. Pull the remainder into the driver data struct and shrink code by removing parts that are now dead code. 
Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20241211102337.37956-1-brgl@bgdev.pl Signed-off-by: Andi Shyti --- include/linux/platform_data/i2c-davinci.h | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 include/linux/platform_data/i2c-davinci.h (limited to 'include') diff --git a/include/linux/platform_data/i2c-davinci.h b/include/linux/platform_data/i2c-davinci.h deleted file mode 100644 index 98967df07468..000000000000 --- a/include/linux/platform_data/i2c-davinci.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * DaVinci I2C controller platform_device info - * - * Author: Vladimir Barinov, MontaVista Software, Inc. - * - * 2007 (c) MontaVista Software, Inc. This file is licensed under - * the terms of the GNU General Public License version 2. This program - * is licensed "as is" without any warranty of any kind, whether express - * or implied. -*/ - -#ifndef __ASM_ARCH_I2C_H -#define __ASM_ARCH_I2C_H - -/* All frequencies are expressed in kHz */ -struct davinci_i2c_platform_data { - unsigned int bus_freq; /* standard bus frequency (kHz) */ - unsigned int bus_delay; /* post-transaction delay (usec) */ - bool gpio_recovery; /* Use GPIO recovery method */ - bool has_pfunc; /* Chip has a ICPFUNC register */ -}; - -/* for board setup code */ -void davinci_init_i2c(struct davinci_i2c_platform_data *); - -#endif /* __ASM_ARCH_I2C_H */ -- cgit v1.2.3 From 1b960cd19311c0bb653afa3633aaa9ef8edcfdde Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Jan 2025 09:09:24 +0000 Subject: net: watchdog: rename __dev_watchdog_up() and dev_watchdog_down() In commit d7811e623dd4 ("[NET]: Drop tx lock in dev_watchdog_up") dev_watchdog_up() became a simple wrapper for __netdev_watchdog_up() Herbert also said : "In 2.6.19 we can eliminate the unnecessary __dev_watchdog_up and replace it with dev_watchdog_up." This patch consolidates things to have only two functions, with a common prefix. - netdev_watchdog_up(), exported for the sake of one freescale driver. This replaces __netdev_watchdog_up() and dev_watchdog_up(). - netdev_watchdog_down(), static to net/sched/sch_generic.c This replaces dev_watchdog_down(). Signed-off-by: Eric Dumazet Cc: Herbert Xu Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250105090924.1661822-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e84602e0226c..1812564b5204 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4295,7 +4295,7 @@ static inline bool netif_carrier_ok(const struct net_device *dev) unsigned long dev_trans_start(struct net_device *dev); -void __netdev_watchdog_up(struct net_device *dev); +void netdev_watchdog_up(struct net_device *dev); void netif_carrier_on(struct net_device *dev); void netif_carrier_off(struct net_device *dev); -- cgit v1.2.3 From 2fa8b4383d24c1c788528ed4377d9f07c6c10227 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 6 Jan 2025 11:59:24 +0000 Subject: net: dsa: remove get_mac_eee() method The get_mac_eee() is no longer called by the core DSA code, nor are there any implementations of this method. Remove it. 
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tUllU-007UzL-KV@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4aeedb296d67..9640d5c67f56 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -991,8 +991,6 @@ struct dsa_switch_ops { bool (*support_eee)(struct dsa_switch *ds, int port); int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_keee *e); - int (*get_mac_eee)(struct dsa_switch *ds, int port, - struct ethtool_keee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); -- cgit v1.2.3 From d8c2e5f33acec38cf478c509c65646d029cc378e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 09:46:20 -0800 Subject: if_vlan: fix kdoc warnings While merging net to net-next I noticed that the kdoc above __vlan_get_protocol_offset() has the wrong function name. Fix that and all the other kdoc warnings in this file. Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250106174620.1855269-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index d495cbdb52cb..38456b42cdb5 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -176,6 +176,7 @@ struct netpoll; * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats + * @netpoll: netpoll instance "propagated" down to @real_dev */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; @@ -414,6 +415,8 @@ static inline int __vlan_insert_tag(struct sk_buff *skb, * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, @@ -443,6 +446,8 @@ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) @@ -461,6 +466,8 @@ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. + * + * Return: modified @skb on success, NULL on error (@skb is freed). */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, @@ -582,7 +589,7 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) } /** - * vlan_get_protocol - get protocol EtherType. + * __vlan_get_protocol_offset() - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @mac_offset: MAC offset @@ -808,9 +815,11 @@ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, * @h1: Pointer to vlan header * @h2: Pointer to vlan header * - * Compare two vlan headers, returns 0 if equal. + * Compare two vlan headers. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. 
+ * + * Return: 0 if equal, arbitrary non-zero value if not equal. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) -- cgit v1.2.3 From 030de8eafdcbc3d6c087bddb5450aea2b29520e5 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 12 Dec 2024 00:25:50 +0000 Subject: dt-bindings: clock: Add Qualcomm SM6115 LPASS clock controller SM6115 (and its derivatives or similar SoCs) has an LPASS clock controller block which provides audio-related resets. Add bindings for it. Cc: Konrad Dybcio Cc: Konrad Dybcio Cc: Srinivas Kandagatla Reviewed-by: Krzysztof Kozlowski Signed-off-by: Konrad Dybcio [alexey.klimov slightly changed the commit message] Signed-off-by: Alexey Klimov Link: https://lore.kernel.org/r/20241212002551.2902954-2-alexey.klimov@linaro.org [bjorn: Adjusted Konrad's address] Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm6115-lpasscc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,sm6115-lpasscc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm6115-lpasscc.h b/include/dt-bindings/clock/qcom,sm6115-lpasscc.h new file mode 100644 index 000000000000..799274517c9a --- /dev/null +++ b/include/dt-bindings/clock/qcom,sm6115-lpasscc.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2023, Linaro Ltd. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_LPASSCC_SM6115_H +#define _DT_BINDINGS_CLK_QCOM_LPASSCC_SM6115_H + +/* LPASS CC */ +#define LPASS_SWR_TX_CONFIG_CGCR 0 + +/* LPASS_AUDIO CC */ +#define LPASS_AUDIO_SWR_RX_CGCR 0 + +#endif -- cgit v1.2.3 From 2e1c78bf674024375de6eea33e113acc3473d2e2 Mon Sep 17 00:00:00 2001 From: Varadarajan Narayanan Date: Fri, 13 Dec 2024 16:28:07 +0530 Subject: dt-bindings: interconnect: Add Qualcomm IPQ5424 support Add master/slave ids for Qualcomm IPQ5424 Network-On-Chip interfaces. This will be used by the gcc-ipq5424 driver for providing interconnect services using the icc-clk framework. 
Signed-off-by: Varadarajan Narayanan Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241213105808.674620-1-quic_varada@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/interconnect/qcom,ipq5424.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 include/dt-bindings/interconnect/qcom,ipq5424.h (limited to 'include') diff --git a/include/dt-bindings/interconnect/qcom,ipq5424.h b/include/dt-bindings/interconnect/qcom,ipq5424.h new file mode 100644 index 000000000000..a770356112ee --- /dev/null +++ b/include/dt-bindings/interconnect/qcom,ipq5424.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +#ifndef INTERCONNECT_QCOM_IPQ5424_H +#define INTERCONNECT_QCOM_IPQ5424_H + +#define MASTER_ANOC_PCIE0 0 +#define SLAVE_ANOC_PCIE0 1 +#define MASTER_CNOC_PCIE0 2 +#define SLAVE_CNOC_PCIE0 3 +#define MASTER_ANOC_PCIE1 4 +#define SLAVE_ANOC_PCIE1 5 +#define MASTER_CNOC_PCIE1 6 +#define SLAVE_CNOC_PCIE1 7 +#define MASTER_ANOC_PCIE2 8 +#define SLAVE_ANOC_PCIE2 9 +#define MASTER_CNOC_PCIE2 10 +#define SLAVE_CNOC_PCIE2 11 +#define MASTER_ANOC_PCIE3 12 +#define SLAVE_ANOC_PCIE3 13 +#define MASTER_CNOC_PCIE3 14 +#define SLAVE_CNOC_PCIE3 15 +#define MASTER_CNOC_USB 16 +#define SLAVE_CNOC_USB 17 + +#endif /* INTERCONNECT_QCOM_IPQ5424_H */ -- cgit v1.2.3 From 9696d9ae016573568dfd65dd2a92d6e8d277b25b Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Mon, 25 Nov 2024 15:24:40 -0800 Subject: hyperv: Move hv_connection_id to hyperv-tlfs.h This definition is in the wrong file; it is part of the TLFS doc. Signed-off-by: Nuno Das Neves Acked-by: Wei Liu Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1732577084-2122-2-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1732577084-2122-2-git-send-email-nunodasneves@linux.microsoft.com> --- include/asm-generic/hyperv-tlfs.h | 9 +++++++++ include/linux/hyperv.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index 814207e7c37f..52274c9aefef 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -871,4 +871,13 @@ struct hv_mmio_write_input { u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; } __packed; +/* Define connection identifier type. */ +union hv_connection_id { + u32 asu32; + struct { + u32 id:24; + u32 reserved:8; + } u; +}; + #endif diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 02a226bcf0ed..b0dbba3b9108 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -768,15 +768,6 @@ struct vmbus_close_msg { struct vmbus_channel_close_channel msg; }; -/* Define connection identifier type. */ -union hv_connection_id { - u32 asu32; - struct { - u32 id:24; - u32 reserved:8; - } u; -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, -- cgit v1.2.3 From b989531e0bd7704dc837e37f8d16bbc921e6cf6c Mon Sep 17 00:00:00 2001 From: Yongji Xie Date: Tue, 19 Nov 2024 15:42:38 +0800 Subject: vduse: relicense under GPL-2.0 OR BSD-3-Clause Dual-license the vduse kernel header file to dual GPL-2.0 OR BSD-3-Clause license to make it possible to ship it with DPDK (under BSD-3-Clause) for older distros. Signed-off-by: Yongji Xie Message-Id: <20241119074238.38299-1-xieyongji@bytedance.com> Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/vduse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 11bd48c72c6c..68a627d04afa 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ #ifndef _UAPI_VDUSE_H_ #define _UAPI_VDUSE_H_ -- cgit v1.2.3 From 86b525bed2758878e788c9fb6b8fb281fd61bdb0 Mon Sep 17 00:00:00 2001 From: Rodolfo Giometti Date: Fri, 8 Nov 2024 08:31:12 +0100 Subject: drivers pps: add PPS generators support Sometimes one needs to be able not only to catch PPS signals but to produce them also. For example, running a distributed simulation, which requires computers' clock to be synchronized very tightly. This patch adds PPS generators class in order to have a well-defined interface for these devices. Signed-off-by: Rodolfo Giometti Link: https://lore.kernel.org/r/20241108073115.759039-2-giometti@enneenne.com Signed-off-by: Greg Kroah-Hartman --- include/linux/pps_gen_kernel.h | 78 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/pps_gen.h | 37 ++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 include/linux/pps_gen_kernel.h create mode 100644 include/uapi/linux/pps_gen.h (limited to 'include') diff --git a/include/linux/pps_gen_kernel.h b/include/linux/pps_gen_kernel.h new file mode 100644 index 000000000000..022ea0ac4440 --- /dev/null +++ b/include/linux/pps_gen_kernel.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PPS generator API kernel header + * + * Copyright (C) 2024 Rodolfo Giometti + */ + +#ifndef LINUX_PPS_GEN_KERNEL_H +#define LINUX_PPS_GEN_KERNEL_H + +#include +#include +#include + +/* + * Global defines + */ + +#define PPS_GEN_MAX_SOURCES 16 /* should be enough... */ + +struct pps_gen_device; + +/** + * struct pps_gen_source_info - the specific PPS generator info + * @use_system_clock: true, if the system clock is used to generate pulses + * @get_time: query the time stored into the generator clock + * @enable: enable/disable the PPS pulses generation + * + * This is the main generator struct where all needed information must be + * placed before calling the pps_gen_register_source(). 
+ */ +struct pps_gen_source_info { + bool use_system_clock; + + int (*get_time)(struct pps_gen_device *pps_gen, + struct timespec64 *time); + int (*enable)(struct pps_gen_device *pps_gen, bool enable); + +/* private: internal use only */ + struct module *owner; + struct device *parent; /* for device_create */ +}; + +/* The main struct */ +struct pps_gen_device { + struct pps_gen_source_info info; /* PSS generator info */ + bool enabled; /* PSS generator status */ + + unsigned int event; + unsigned int sequence; + + unsigned int last_ev; /* last PPS event id */ + wait_queue_head_t queue; /* PPS event queue */ + + unsigned int id; /* PPS generator unique ID */ + struct cdev cdev; + struct device *dev; + struct fasync_struct *async_queue; /* fasync method */ + spinlock_t lock; +}; + +/* + * Global variables + */ + +extern const struct attribute_group *pps_gen_groups[]; + +/* + * Exported functions + */ + +extern struct pps_gen_device *pps_gen_register_source( + struct pps_gen_source_info *info); +extern void pps_gen_unregister_source(struct pps_gen_device *pps_gen); +extern void pps_gen_event(struct pps_gen_device *pps_gen, + unsigned int event, void *data); + +#endif /* LINUX_PPS_GEN_KERNEL_H */ diff --git a/include/uapi/linux/pps_gen.h b/include/uapi/linux/pps_gen.h new file mode 100644 index 000000000000..60a5d0fcfa68 --- /dev/null +++ b/include/uapi/linux/pps_gen.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * PPS generator API header + * + * Copyright (C) 2024 Rodolfo Giometti + */ + +#ifndef _PPS_GEN_H_ +#define _PPS_GEN_H_ + +#include +#include + +/** + * struct pps_gen_event - the PPS generator events + * @event: the event type + * @sequence: the event sequence number + * + * Userspace can get the last PPS generator event by using the + * ioctl(pps_gen, PPS_GEN_FETCHEVENT, ...) syscall. + * The sequence field can be used to save the last event ID, while in the + * event field is stored the last event type. Currently known event is: + * + * PPS_GEN_EVENT_MISSEDPULSE : last pulse was not generated + */ +struct pps_gen_event { + unsigned int event; + unsigned int sequence; +}; + +#define PPS_GEN_EVENT_MISSEDPULSE 1 + +#define PPS_GEN_SETENABLE _IOW('p', 0xb1, unsigned int *) +#define PPS_GEN_USESYSTEMCLOCK _IOR('p', 0xb2, unsigned int *) +#define PPS_GEN_FETCHEVENT _IOR('p', 0xb3, struct pps_gen_event *) + +#endif /* _PPS_GEN_H_ */ -- cgit v1.2.3 From 567a311d0a1a433f1e5bff508f3eb7ebfa189aa3 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 20 Dec 2024 00:29:57 +0100 Subject: VMCI: remove unused ioctl definitions IOCTL_VMCI_SOCKETS_VERSION and IOCTL_VMCI_SOCKETS_GET_AF_VALUE were never implemented, because VSOCK ended up being implemented as a generic mechanism with a static AF value. Likewise, IOCTL_VMCI_SOCKETS_GET_LOCAL_CID ended up being implemented as IOCTL_VM_SOCKETS_GET_LOCAL_CID. This isn't a UAPI header, so it should be fine to remove the unused values. I've left a comment noting IOCTL_VM_SOCKETS_GET_LOCAL_CID is in the VMCI range to avoid unintentional reuse. 
Signed-off-by: Alyssa Ross Acked-by: Vishnu Dasa Link: https://lore.kernel.org/r/fzdcrz4yfedokmbm22h2iwsluix4jwejwaltuwcdr6kz3yu6eu@nue5xc6ayevo Signed-off-by: Greg Kroah-Hartman --- include/linux/vmw_vmci_defs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h index 6fb663b36f72..c2df94696593 100644 --- a/include/linux/vmw_vmci_defs.h +++ b/include/linux/vmw_vmci_defs.h @@ -453,9 +453,7 @@ enum { #define IOCTL_VMCI_CTX_GET_CPT_STATE _IO(7, 0xb1) #define IOCTL_VMCI_CTX_SET_CPT_STATE _IO(7, 0xb2) #define IOCTL_VMCI_GET_CONTEXT_ID _IO(7, 0xb3) -#define IOCTL_VMCI_SOCKETS_VERSION _IO(7, 0xb4) -#define IOCTL_VMCI_SOCKETS_GET_AF_VALUE _IO(7, 0xb8) -#define IOCTL_VMCI_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) +/*IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9)*/ #define IOCTL_VMCI_SET_NOTIFY _IO(7, 0xcb) /* 1995 */ /*IOCTL_VMMON_START _IO(7, 0xd1)*/ /* 2001 */ -- cgit v1.2.3 From d75abf2f9f2e584633bd1072267a2ff25deb422b Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:42 -0600 Subject: ntsync: Return the fd from NTSYNC_IOC_CREATE_SEM. Simplify the user API a bit by returning the fd as return value from the ioctl instead of through the argument pointer. Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-2-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index dcfa38fdc93c..27d8cb3dd5b7 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -11,12 +11,11 @@ #include struct ntsync_sem_args { - __u32 sem; __u32 count; __u32 max; }; -#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) +#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) -- cgit v1.2.3 From 5ec43d6b0328a2383525a2b90306c9077480e0bf Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:43 -0600 Subject: ntsync: Rename NTSYNC_IOC_SEM_POST to NTSYNC_IOC_SEM_RELEASE. Use the more common "release" terminology, which is also the term used by NT, instead of "post" (which is used by POSIX). Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-3-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 27d8cb3dd5b7..9af9d8125553 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -17,6 +17,6 @@ struct ntsync_sem_args { #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) -#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) +#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #endif -- cgit v1.2.3 From b4a7b5fe3f5149fa35278807e0dc13ddb093f4b8 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:44 -0600 Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ANY. This corresponds to part of the functionality of the NT syscall NtWaitForMultipleObjects(). Specifically, it implements the behaviour where the third argument (wait_any) is TRUE, and it does not handle alertable waits. Those features have been split out into separate patches to ease review. 
This patch therefore implements the wait/wake infrastructure which comprises the core of ntsync's functionality. NTSYNC_IOC_WAIT_ANY is a vectored wait function similar to poll(). Unlike poll(), it "consumes" objects when they are signaled. For semaphores, this means decreasing one from the internal counter. At most one object can be consumed by this function. This wait/wake model is fundamentally different from that used anywhere else in the kernel, and for that reason ntsync does not use any existing infrastructure, such as futexes, kernel mutexes or semaphores, or wait_event(). Up to 64 objects can be waited on at once. As soon as one is signaled, the object with the lowest index is consumed, and that index is returned via the "index" field. A timeout is supported. The timeout is passed as a u64 nanosecond value, which represents absolute time measured against either the MONOTONIC or REALTIME clock (controlled by the flags argument). If U64_MAX is passed, the ioctl waits indefinitely. This ioctl validates that all objects belong to the relevant device. This is not necessary for any technical reason related to NTSYNC_IOC_WAIT_ANY, but will be necessary for NTSYNC_IOC_WAIT_ALL introduced in the following patch. Some padding fields are added for alignment and for fields which will be added in future patches (split out to ease review). Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-4-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 9af9d8125553..40ffdc41d5bb 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -15,7 +15,21 @@ struct ntsync_sem_args { __u32 max; }; +#define NTSYNC_WAIT_REALTIME 0x1 + +struct ntsync_wait_args { + __u64 timeout; + __u64 objs; + __u32 count; + __u32 index; + __u32 flags; + __u32 pad[3]; +}; + +#define NTSYNC_MAX_WAIT_COUNT 64 + #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) +#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) -- cgit v1.2.3 From cdbb997822806cc1071619b67aef485bb2b921b1 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:45 -0600 Subject: ntsync: Introduce NTSYNC_IOC_WAIT_ALL. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is similar to NTSYNC_IOC_WAIT_ANY, but waits until all of the objects are simultaneously signaled, and then acquires all of them as a single atomic operation. Because acquisition of multiple objects is atomic, some complex locking is required. We cannot simply spin-lock multiple objects simultaneously, as that may disable preëmption for a problematically long time. Instead, modifying any object which may be involved in a wait-all operation takes a device-wide sleeping mutex, "wait_all_lock", instead of the normal object spinlock. Because wait-for-all is a rare operation, in order to optimize wait-for-any, this lock is only taken when necessary. "all_hint" is used to mark objects which are involved in a wait-for-all operation, and if an object is not, only its spinlock is taken. The locking scheme used here was written by Peter Zijlstra. 
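For orientation, a minimal user-space sketch of the interface described above (not part of the series; error handling and cleanup are omitted, and the /dev/ntsync path and one-second timeout are the example's own assumptions). It creates two semaphores and waits for either of them; using NTSYNC_IOC_WAIT_ALL instead would wait until both can be consumed atomically:

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <time.h>
#include <linux/ntsync.h>

int wait_for_either(void)
{
	struct ntsync_sem_args sem = { .count = 0, .max = 1 };
	struct ntsync_wait_args wait = { 0 };
	struct timespec now;
	int dev, objs[2];

	dev = open("/dev/ntsync", O_RDWR);
	objs[0] = ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem);	/* object fd */
	objs[1] = ioctl(dev, NTSYNC_IOC_CREATE_SEM, &sem);

	clock_gettime(CLOCK_MONOTONIC, &now);		/* flags == 0 selects MONOTONIC */
	wait.timeout = (now.tv_sec + 1) * 1000000000ULL + now.tv_nsec;
	wait.objs = (uintptr_t)objs;
	wait.count = 2;

	if (ioctl(dev, NTSYNC_IOC_WAIT_ANY, &wait) < 0)
		return -1;			/* e.g. timed out */

	return wait.index;	/* lowest signaled index; that object was consumed */
}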
Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-5-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 40ffdc41d5bb..d64420ffbcd7 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -30,6 +30,7 @@ struct ntsync_wait_args { #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) +#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) -- cgit v1.2.3 From 5bc2479a3585be35ea32e431bc82282bbf5a5d5b Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:46 -0600 Subject: ntsync: Introduce NTSYNC_IOC_CREATE_MUTEX. This corresponds to the NT syscall NtCreateMutant(). An NT mutex is recursive, with a 32-bit recursion counter. When acquired via NtWaitForMultipleObjects(), the recursion counter is incremented by one. The OS records the thread which acquired it. However, in order to keep this driver self-contained, the owning thread ID is managed by user-space, and passed as a parameter to all relevant ioctls. The initial owner and recursion count, if any, are specified when the mutex is created. Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-6-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index d64420ffbcd7..bb7fb94f5856 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -15,6 +15,11 @@ struct ntsync_sem_args { __u32 max; }; +struct ntsync_mutex_args { + __u32 owner; + __u32 count; +}; + #define NTSYNC_WAIT_REALTIME 0x1 struct ntsync_wait_args { @@ -23,7 +28,8 @@ struct ntsync_wait_args { __u32 count; __u32 index; __u32 flags; - __u32 pad[3]; + __u32 owner; + __u32 pad[2]; }; #define NTSYNC_MAX_WAIT_COUNT 64 @@ -31,6 +37,7 @@ struct ntsync_wait_args { #define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) +#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) -- cgit v1.2.3 From 31ca7bb8e8539b454414a2c4f8ad643c939cb0cf Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:47 -0600 Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_UNLOCK. This corresponds to the NT syscall NtReleaseMutant(). This syscall decrements the mutex's recursion count by one, and returns the previous value. If the mutex is not owned by the current task, the function instead fails and returns -EPERM.
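A matching user-space sketch (again not from the series; gettid() as the owner identifier is only this example's convention, since the driver treats the owner as an opaque value managed entirely by user-space):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ntsync.h>

int mutex_example(int dev)		/* dev: fd of the ntsync character device */
{
	struct ntsync_mutex_args mutex = {
		.owner = gettid(),	/* initial owner */
		.count = 1,		/* initial recursion count */
	};
	int mutex_fd, ret;

	mutex_fd = ioctl(dev, NTSYNC_IOC_CREATE_MUTEX, &mutex);
	if (mutex_fd < 0)
		return -1;

	/* Fails with EPERM if mutex.owner does not own the mutex. */
	ret = ioctl(mutex_fd, NTSYNC_IOC_MUTEX_UNLOCK, &mutex);
	/* On success the previous recursion count is reported back in mutex.count. */

	close(mutex_fd);
	return ret;
}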
Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-7-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index bb7fb94f5856..9186304b253c 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -40,5 +40,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) +#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) #endif -- cgit v1.2.3 From ecc2ee361466e4a22e95b1c27d90f8cbd4f22e93 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:48 -0600 Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_KILL. This does not correspond to any NT syscall. Rather, when a thread dies, it should be called by the NT emulator for each mutex, with the TID of the dying thread. NT mutexes are robust (in the pthread sense). When an NT thread dies, any mutexes it owned are immediately released. Acquisition of those mutexes by other threads will return a special value indicating that the mutex was abandoned, like EOWNERDEAD returned from pthread_mutex_lock(), and EOWNERDEAD is indeed used here for that purpose. Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-8-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 9186304b253c..633958d90be3 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -41,5 +41,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) +#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) #endif -- cgit v1.2.3 From 4c7404b9c2b572b42dc63bfde5e862290dab53b5 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:49 -0600 Subject: ntsync: Introduce NTSYNC_IOC_CREATE_EVENT. This correspond to the NT syscall NtCreateEvent(). An NT event holds a single bit of state denoting whether it is signaled or unsignaled. There are two types of events: manual-reset and automatic-reset. When an automatic-reset event is acquired via a wait function, its state is reset to unsignaled. Manual-reset events are not affected by wait functions. Whether the event is manual-reset, and its initial state, are specified at creation time. 
Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-9-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 633958d90be3..e08aa02f4dcc 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -20,6 +20,11 @@ struct ntsync_mutex_args { __u32 count; }; +struct ntsync_event_args { + __u32 manual; + __u32 signaled; +}; + #define NTSYNC_WAIT_REALTIME 0x1 struct ntsync_wait_args { @@ -38,6 +43,7 @@ struct ntsync_wait_args { #define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) #define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) #define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) +#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) -- cgit v1.2.3 From 2dcba6fc15a442b1cf1df8cdd0da12f20db38c43 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:50 -0600 Subject: ntsync: Introduce NTSYNC_IOC_EVENT_SET. This corresponds to the NT syscall NtSetEvent(). This sets the event to the signaled state, and returns its previous state. Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-10-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index e08aa02f4dcc..db2512f6e3f4 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -48,5 +48,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) +#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) #endif -- cgit v1.2.3 From bbb9797514b20c316314b78a439f87fd16b7c509 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:51 -0600 Subject: ntsync: Introduce NTSYNC_IOC_EVENT_RESET. This corresponds to the NT syscall NtResetEvent(). This sets the event to the unsignaled state, and returns its previous state. Signed-off-by: Elizabeth Figura Acked-by: Peter Zijlstra (Intel) Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241213193511.457338-11-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index db2512f6e3f4..d74c4e4f93d8 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -49,5 +49,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) +#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) #endif -- cgit v1.2.3 From 12b29d3008e6bd5118af716ae2971fe6823b117a Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:52 -0600 Subject: ntsync: Introduce NTSYNC_IOC_EVENT_PULSE. This corresponds to the NT syscall NtPulseEvent(). 
This wakes up any waiters as if the event had been set, but does not set the event, instead resetting it if it had been signalled. Thus, for a manual-reset event, all waiters are woken, whereas for an auto-reset event, at most one waiter is woken. Signed-off-by: Elizabeth Figura Acked-by: Peter Zijlstra (Intel) Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241213193511.457338-12-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index d74c4e4f93d8..9eab7666b8b9 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -50,5 +50,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) +#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) #endif -- cgit v1.2.3 From a948f4177c3cce5f27648696da95c62c0253a099 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:53 -0600 Subject: ntsync: Introduce NTSYNC_IOC_SEM_READ. This corresponds to the NT syscall NtQuerySemaphore(). This returns the current count and maximum count of the semaphore. Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-13-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 9eab7666b8b9..e36b4cfb7d34 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -51,5 +51,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) +#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) #endif -- cgit v1.2.3 From 0b3c31449d28c83c0b8a8a7be569aa30d70b7764 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:54 -0600 Subject: ntsync: Introduce NTSYNC_IOC_MUTEX_READ. This corresponds to the NT syscall NtQueryMutant(). This returns the recursion count, owner, and abandoned state of the mutex. Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-14-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index e36b4cfb7d34..207646e63ef3 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -52,5 +52,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) +#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) #endif -- cgit v1.2.3 From e864071a630cfbbd55251e7b45461003f4f79877 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:55 -0600 Subject: ntsync: Introduce NTSYNC_IOC_EVENT_READ. This corresponds to the NT syscall NtQueryEvent(). This returns the signaled state of the event and whether it is manual-reset. 
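Taken together, the event ioctls can be exercised as in this user-space sketch (not from the series; error checking is omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/ntsync.h>

int event_example(int dev)		/* dev: fd of the ntsync character device */
{
	struct ntsync_event_args event = {
		.manual = 1,		/* manual-reset: waits do not clear it */
		.signaled = 0,
	};
	__u32 prev;
	int event_fd;

	event_fd = ioctl(dev, NTSYNC_IOC_CREATE_EVENT, &event);
	if (event_fd < 0)
		return -1;

	ioctl(event_fd, NTSYNC_IOC_EVENT_SET, &prev);	/* prev holds the old state (0) */

	ioctl(event_fd, NTSYNC_IOC_EVENT_READ, &event);
	/* event.signaled is now 1 and event.manual is still 1. */

	close(event_fd);
	return 0;
}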
Signed-off-by: Elizabeth Figura Link: https://lore.kernel.org/r/20241213193511.457338-15-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index 207646e63ef3..b9d208a8c00f 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -53,5 +53,6 @@ struct ntsync_wait_args { #define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) #define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) #define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) +#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) #endif -- cgit v1.2.3 From a138179a59d47690b069fbded1be12f47648ef07 Mon Sep 17 00:00:00 2001 From: Elizabeth Figura Date: Fri, 13 Dec 2024 13:34:56 -0600 Subject: ntsync: Introduce alertable waits. NT waits can optionally be made "alertable". This is a special channel for thread wakeup that is mildly similar to SIGIO. A thread has an internal single bit of "alerted" state, and if a thread is alerted while an alertable wait, the wait will return a special value, consume the "alerted" state, and will not consume any of its objects. Alerts are implemented using events; the user-space NT emulator is expected to create an internal ntsync event for each thread and pass that event to wait functions. Signed-off-by: Elizabeth Figura Acked-by: Peter Zijlstra (Intel) Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241213193511.457338-16-zfigura@codeweavers.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/ntsync.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h index b9d208a8c00f..6d06793512b1 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h @@ -34,7 +34,8 @@ struct ntsync_wait_args { __u32 index; __u32 flags; __u32 owner; - __u32 pad[2]; + __u32 alert; + __u32 pad; }; #define NTSYNC_MAX_WAIT_COUNT 64 -- cgit v1.2.3 From 3a5446612a3f2579c751ddb77c5e16b9a0d47001 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:59 +0200 Subject: sched,arm64: Handle CPU isolation on last resort fallback rq selection When a kthread or any other task has an affinity mask that is fully offline or unallowed, the scheduler reaffines the task to all possible CPUs as a last resort. This default decision doesn't mix up very well with nohz_full CPUs that are part of the possible cpumask but don't want to be disturbed by unbound kthreads or even detached pinned user tasks. Make the fallback affinity setting aware of nohz_full. 
Suggested-by: Michal Hocko Acked-by: Will Deacon Signed-off-by: Frederic Weisbecker --- include/linux/mmu_context.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index bbaec80c78c5..ac01dc4eb2ce 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -24,6 +24,7 @@ static inline void leave_mm(void) { } #ifndef task_cpu_possible_mask # define task_cpu_possible_mask(p) cpu_possible_mask # define task_cpu_possible(cpu, p) true +# define task_cpu_fallback_mask(p) housekeeping_cpumask(HK_TYPE_TICK) #else # define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) #endif -- cgit v1.2.3 From d1a89197589c4a77468298ef2b14ff4084c4ea29 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:01 +0200 Subject: kthread: Default affine kthread to its preferred NUMA node Kthreads attached to a preferred NUMA node for their task structure allocation can also be assumed to run preferrably within that same node. A more precise affinity is usually notified by calling kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup. For the others, a default affinity to the node is desired and sometimes implemented with more or less success when it comes to deal with hotplug events and nohz_full / CPU Isolation interactions: - kcompactd is affine to its node and handles hotplug but not CPU Isolation - kswapd is affine to its node and ignores hotplug and CPU Isolation - A bunch of drivers create their kthreads on a specific node and don't take care about affining further. Handle that default node affinity preference at the generic level instead, provided a kthread is created on an actual node and doesn't apply any specific affinity such as a given CPU or a custom cpumask to bind to before its first wake-up. This generic handling is aware of CPU hotplug events and CPU isolation such that: * When a housekeeping CPU goes up that is part of the node of a given kthread, the related task is re-affined to that own node if it was previously running on the default last resort online housekeeping set from other nodes. * When a housekeeping CPU goes down while it was part of the node of a kthread, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the same node or, as a last resort, to all housekeepers from other nodes. Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index a04b73c40173..6cc5e484547c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -240,6 +240,7 @@ enum cpuhp_state { CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RANDOM_ONLINE, CPUHP_AP_RCUTREE_ONLINE, + CPUHP_AP_KTHREADS_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40, -- cgit v1.2.3 From 4d13f4304fa43471bfea101658a11feec7b28ac0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:04 +0200 Subject: kthread: Implement preferred affinity Affining kthreads follow either of four existing different patterns: 1) Per-CPU kthreads must stay affine to a single CPU and never execute relevant code on any other CPU. This is currently handled by smpboot code which takes care of CPU-hotplug operations. 
2) Kthreads that _have_ to be affine to a specific set of CPUs and can't run anywhere else. The affinity is set through kthread_bind_mask() and the subsystem takes care by itself to handle CPU-hotplug operations. 3) Kthreads that prefer to be affine to a specific NUMA node. That preferred affinity is applied by default when an actual node ID is passed on kthread creation, provided the kthread is not per-CPU and no call to kthread_bind_mask() has been issued before the first wake-up. 4) Similar to the previous point but kthreads have a preferred affinity different than a node. It is set manually like any other task and CPU-hotplug is supposed to be handled by the relevant subsystem so that the task is properly reaffined whenever a given CPU from the preferred affinity comes up. Also care must be taken so that the preferred affinity doesn't cross housekeeping cpumask boundaries. Provide a function to handle the last usecase, mostly reusing the current node default affinity infrastructure. kthread_affine_preferred() is introduced, to be used just like kthread_bind_mask(), right after kthread creation and before the first wake up. The kthread is then affine right away to the cpumask passed through the API if it has online housekeeping CPUs. Otherwise it will be affine to all online housekeeping CPUs as a last resort. As with node affinity, it is aware of CPU hotplug events such that: * When a housekeeping CPU goes up that is part of the preferred affinity of a given kthread, the related task is re-affined to that preferred affinity if it was previously running on the default last resort online housekeeping set. * When a housekeeping CPU goes down while it was part of the preferred affinity of a kthread, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the preferred affinity or, as a last resort, to all housekeepers from other nodes. Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- include/linux/kthread.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index b11f53c1ba2e..30209bdf83a2 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -85,6 +85,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data, void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask); int kthread_stop(struct task_struct *k); int kthread_stop_put(struct task_struct *k); bool kthread_should_stop(void); -- cgit v1.2.3 From 41f70d8e16349c65abdc0dd88a7d0ab94e5ce639 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:06 +0200 Subject: kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format kthread_create_on_cpu() uses the CPU argument as an implicit and unique printf argument to add to the format whereas kthread_create_worker_on_cpu() still relies on explicitly passing the printf arguments. This difference in behaviour is error prone and doesn't help standardizing per-CPU kthread names. Unify the behaviours and convert kthread_create_worker_on_cpu() to use the printf behaviour of kthread_create_on_cpu(). 
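As a sketch of the unified behaviour (the "acme" name is invented), a caller now passes only the format string and lets the kthread core substitute the CPU number, exactly as kthread_create_on_cpu() already did:

#include <linux/kthread.h>

static struct kthread_worker *acme_start_worker(unsigned int cpu)
{
	/* The trailing "%u" is filled in with @cpu by the kthread code. */
	return kthread_create_worker_on_cpu(cpu, 0, "acme/%u");
}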
Signed-off-by: Frederic Weisbecker --- include/linux/kthread.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 30209bdf83a2..0c66e7c1092a 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -187,13 +187,24 @@ extern void __kthread_init_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); -__printf(2, 3) -struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...); +__printf(3, 4) +struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, + int node, + const char namefmt[], ...); -__printf(3, 4) struct kthread_worker * +#define kthread_create_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ + namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ +}) + +struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...); + const char namefmt[]); bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); -- cgit v1.2.3 From b04e317b522630b46f78ee62ecbdc5734e8d43de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:07 +0200 Subject: treewide: Introduce kthread_run_worker[_on_cpu]() kthread_create() creates a kthread without running it yet. kthread_run() creates a kthread and runs it. On the other hand, kthread_create_worker() creates a kthread worker and runs it. This difference in behaviours is confusing. Also there is no way to create a kthread worker and affine it using kthread_bind_mask() or kthread_affine_preferred() before starting it. Consolidate the behaviours and introduce kthread_run_worker[_on_cpu]() that behaves just like kthread_run(). kthread_create_worker[_on_cpu]() will now only create a kthread worker without starting it. Signed-off-by: Frederic Weisbecker Signed-off-by: Dan Carpenter --- include/linux/kthread.h | 48 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0c66e7c1092a..8d27403888ce 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -193,19 +193,53 @@ struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, const char namefmt[], ...); #define kthread_create_worker(flags, namefmt, ...) \ -({ \ - struct kthread_worker *__kw \ - = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ - namefmt, ## __VA_ARGS__); \ - if (!IS_ERR(__kw)) \ - wake_up_process(__kw->task); \ - __kw; \ + kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); + +/** + * kthread_run_worker - create and wake a kthread worker. + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. + * + * Description: Convenient wrapper for kthread_create_worker() followed by + * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). + */ +#define kthread_run_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); +/** + * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. 
+ * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_worker_on_cpu() + * followed by wake_up_process(). Returns the kthread_worker or + * ERR_PTR(-ENOMEM). + */ +static inline struct kthread_worker * +kthread_run_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[]) +{ + struct kthread_worker *kw; + + kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); + if (!IS_ERR(kw)) + wake_up_process(kw->task); + + return kw; +} + bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); -- cgit v1.2.3 From b9371866799d67a80be0ea9e01bd41987db22f26 Mon Sep 17 00:00:00 2001 From: Md Sadre Alam Date: Mon, 6 Jan 2025 18:45:58 +0530 Subject: mtd: rawnand: qcom: Fix build issue on x86 architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a buffer overflow issue in qcom_clear_bam_transaction by using struct_group to group related fields and avoid FORTIFY_SOURCE warnings. On x86 architecture, the following error occurs due to warnings being treated as errors: In function ‘fortify_memset_chk’, inlined from ‘qcom_clear_bam_transaction’ at drivers/mtd/nand/qpic_common.c:88:2: ./include/linux/fortify-string.h:480:25: error: call to ‘__write_overflow_field’ declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 480 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ LD [M] drivers/mtd/nand/nandcore.o CC [M] drivers/w1/masters/mxc_w1.o cc1: all warnings being treated as errors This patch addresses the issue by grouping the related fields in struct bam_transaction using struct_group and updating the memset call accordingly. 
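The underlying pattern, shown here as a generic sketch rather than the actual QPIC code (the example_* names are invented), wraps the fields that are cleared together in struct_group() so that the memset() targets a named member with a known size and FORTIFY_SOURCE can prove the write stays in bounds:

#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>

struct example_txn {
	struct_group(positions,
		u32 ce_pos;
		u32 ce_start;
		u32 sgl_pos;
		u32 sgl_start;
	);
};

static void example_clear_positions(struct example_txn *txn)
{
	/* Clears all four members instead of writing past the first one. */
	memset(&txn->positions, 0, sizeof(txn->positions));
}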
Fixes: 8c52932da5e6 ("mtd: rawnand: qcom: cleanup qcom_nandc driver") Signed-off-by: Md Sadre Alam Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index e79c79775eb8..4d9b736ff8b7 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -254,14 +254,17 @@ struct bam_transaction { struct dma_async_tx_descriptor *last_data_desc; struct dma_async_tx_descriptor *last_cmd_desc; struct completion txn_done; - u32 bam_ce_pos; - u32 bam_ce_start; - u32 cmd_sgl_pos; - u32 cmd_sgl_start; - u32 tx_sgl_pos; - u32 tx_sgl_start; - u32 rx_sgl_pos; - u32 rx_sgl_start; + struct_group(bam_positions, + u32 bam_ce_pos; + u32 bam_ce_start; + u32 cmd_sgl_pos; + u32 cmd_sgl_start; + u32 tx_sgl_pos; + u32 tx_sgl_start; + u32 rx_sgl_pos; + u32 rx_sgl_start; + + ); }; /* -- cgit v1.2.3 From f90877dd7fb5085dd9abd6399daf63dd2969fc90 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 8 Jan 2025 23:44:45 +0100 Subject: seccomp: Stub for !CONFIG_SECCOMP When using !CONFIG_SECCOMP with CONFIG_GENERIC_ENTRY, the randconfig bots found the following snag: kernel/entry/common.c: In function 'syscall_trace_enter': >> kernel/entry/common.c:52:23: error: implicit declaration of function '__secure_computing' [-Wimplicit-function-declaration] 52 | ret = __secure_computing(NULL); | ^~~~~~~~~~~~~~~~~~ Since generic entry calls __secure_computing() unconditionally, fix this by moving the stub out of the ifdef clause for CONFIG_HAVE_ARCH_SECCOMP_FILTER so it's always available. Link: https://lore.kernel.org/oe-kbuild-all/202501061240.Fzk9qiFZ-lkp@intel.com/ Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250108-seccomp-stub-2-v2-1-74523d49420f@linaro.org Signed-off-by: Kees Cook --- include/linux/seccomp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 341980599c71..e45531455d3b 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -55,10 +55,10 @@ struct seccomp_data; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { return 0; } -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif +static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } static inline long prctl_get_seccomp(void) { -- cgit v1.2.3 From 1d45a1cd9f3ae849db868e07e5fee5e5b37eff55 Mon Sep 17 00:00:00 2001 From: Gaurav Kashyap Date: Thu, 12 Dec 2024 20:19:51 -0800 Subject: firmware: qcom: scm: add calls for wrapped key support Add helper functions for the SCM calls required to support hardware-wrapped inline storage encryption keys. These SCM calls manage wrapped keys via Qualcomm's Hardware Key Manager (HWKM), which can only be accessed from TrustZone. QCOM_SCM_ES_GENERATE_ICE_KEY and QCOM_SCM_ES_IMPORT_ICE_KEY create a new long-term wrapped key, with the former making the hardware generate the key and the latter importing a raw key. QCOM_SCM_ES_PREPARE_ICE_KEY converts the key to ephemerally-wrapped form so that it can be used for inline storage encryption. These are planned to be wired up to new ioctls via the blk-crypto framework; see the proposed documentation for the hardware-wrapped keys feature for more information. 
Similarly there's also QCOM_SCM_ES_DERIVE_SW_SECRET which derives a "software secret" from an ephemerally-wrapped key and will be wired up to the corresponding operation in the blk_crypto_profile. These will all be used by the ICE driver in drivers/soc/qcom/ice.c. [EB: merged related patches, fixed error handling, fixed naming, fixed docs for size parameters, fixed qcom_scm_has_wrapped_key_support(), improved comments, improved commit message.] Signed-off-by: Gaurav Kashyap Signed-off-by: Bartosz Golaszewski Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20241213041958.202565-9-ebiggers@kernel.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_scm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index 4621aec0328c..983e1591bbba 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -105,6 +105,14 @@ bool qcom_scm_ice_available(void); int qcom_scm_ice_invalidate_key(u32 index); int qcom_scm_ice_set_key(u32 index, const u8 *key, u32 key_size, enum qcom_scm_ice_cipher cipher, u32 data_unit_size); +bool qcom_scm_has_wrapped_key_support(void); +int qcom_scm_derive_sw_secret(const u8 *eph_key, size_t eph_key_size, + u8 *sw_secret, size_t sw_secret_size); +int qcom_scm_generate_ice_key(u8 *lt_key, size_t lt_key_size); +int qcom_scm_prepare_ice_key(const u8 *lt_key, size_t lt_key_size, + u8 *eph_key, size_t eph_key_size); +int qcom_scm_import_ice_key(const u8 *raw_key, size_t raw_key_size, + u8 *lt_key, size_t lt_key_size); bool qcom_scm_hdcp_available(void); int qcom_scm_hdcp_req(struct qcom_scm_hdcp_req *req, u32 req_cnt, u32 *resp); -- cgit v1.2.3 From 80818fdc068eaab729bb793d790ae9fd053f7053 Mon Sep 17 00:00:00 2001 From: Terry Tritton Date: Fri, 20 Dec 2024 19:23:18 +0000 Subject: HID: fix generic desktop D-Pad controls The addition of the "System Do Not Disturb" event code caused the Generic Desktop D-Pad configuration to be skipped. This commit allows both to be configured without conflicting with each other. Fixes: 22d6d060ac77 ("input: Add support for "Do Not Disturb"") Signed-off-by: Terry Tritton Reviewed-by: Aseda Aboagye Reviewed-by: Carlos Llamas Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hid.h b/include/linux/hid.h index d11e9c9a5f15..cdc0dc13c87f 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -218,6 +218,7 @@ struct hid_item { #define HID_GD_DOWN 0x00010091 #define HID_GD_RIGHT 0x00010092 #define HID_GD_LEFT 0x00010093 +#define HID_GD_DO_NOT_DISTURB 0x0001009b /* Microsoft Win8 Wireless Radio Controls CA usage codes */ #define HID_GD_RFKILL_BTN 0x000100c6 #define HID_GD_RFKILL_LED 0x000100c7 -- cgit v1.2.3 From 6657d899ce35a313d7e0e7ddc0988aa80d304ca8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 25 Dec 2024 01:55:08 +0000 Subject: HID: intel-ish-hid: Remove unused ishtp_cl_tx_empty ishtp_cl_tx_empty() was added in 2018 by commit a1c40ce62fd2 ("HID: intel-ish-hid: ishtp: add helper functions for client buffer operation") but has remained unused. Remove it. Signed-off-by: Dr. 
David Alan Gilbert Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 771622650247..dfbf7d9d7bb5 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -100,7 +100,6 @@ void ishtp_cl_destroy_connection(struct ishtp_cl *cl, bool reset); int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length); int ishtp_cl_flush_queues(struct ishtp_cl *cl); int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb); -bool ishtp_cl_tx_empty(struct ishtp_cl *cl); struct ishtp_cl_rb *ishtp_cl_rx_get_rb(struct ishtp_cl *cl); void *ishtp_get_client_data(struct ishtp_cl *cl); void ishtp_set_client_data(struct ishtp_cl *cl, void *data); -- cgit v1.2.3 From 4751113f24048f96cb696ff8d939e38530dcdfc1 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:41 +0800 Subject: HID: intel-thc-hid: intel-quickspi: Add THC QuickSPI driver hid layer Add HID Low level driver callbacks and hid probe function to register QucikSPI as a HID driver, and external touch device as a HID device. Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-spi.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 include/linux/hid-over-spi.h (limited to 'include') diff --git a/include/linux/hid-over-spi.h b/include/linux/hid-over-spi.h new file mode 100644 index 000000000000..ddbe41c5d8fd --- /dev/null +++ b/include/linux/hid-over-spi.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2024 Intel Corporation */ + +#ifndef _HID_OVER_SPI_H_ +#define _HID_OVER_SPI_H_ + +/** + * struct hidspi_dev_descriptor - HIDSPI device descriptor definition + * @dev_desc_len: The length of the complete device descriptor, fixed to 0x18 (24). + * @bcd_ver: The version number of the HIDSPI protocol supported. + * In binary coded decimal (BCD) format. Must be fixed to 0x0300. + * @rep_desc_len: The length of the report descriptor + * @max_input_len: The length of the largest possible HID input (or feature) report + * @max_output_len: The length of the largest output (or feature) report + * @max_frag_len: The length of the largest fragment, where a fragment represents + * the body of an input report. + * @vendor_id: Device manufacturers vendor ID + * @product_id: Device unique model/product ID + * @version_id: Device’s unique version + * @flags: Specify flags for the device’s operation + * @reserved: Reserved and should be 0 + */ +struct hidspi_dev_descriptor { + __le16 dev_desc_len; + __le16 bcd_ver; + __le16 rep_desc_len; + __le16 max_input_len; + __le16 max_output_len; + __le16 max_frag_len; + __le16 vendor_id; + __le16 product_id; + __le16 version_id; + __le16 flags; + __le32 reserved; +}; + +#endif /* _HID_OVER_SPI_H_ */ -- cgit v1.2.3 From 9d8d51735a3af40b722346931a6a1e50227df4b5 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:43 +0800 Subject: HID: intel-thc-hid: intel-quickspi: Add HIDSPI protocol implementation Intel QuickSPI driver uses THC hardware to accelerate HID over SPI (HIDSPI) protocol flow. This patch implements all data flows described in HID over SPI protocol SPEC by using THC hardware layer APIs. 
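To hint at how the input report header defined further below is meant to be decoded by the driver (a sketch only, not code from this patch; recv_buf and body_len are placeholders and FIELD_GET() comes from linux/bitfield.h):

	u32 hdr = le32_to_cpu(*(__le32 *)recv_buf);

	if (FIELD_GET(HIDSPI_INPUT_HEADER_SYNC, hdr) != 0x5A ||
	    FIELD_GET(HIDSPI_INPUT_HEADER_VER, hdr) != 0x3)
		return -EINVAL;

	/* The report length field counts 4-byte words */
	body_len = FIELD_GET(HIDSPI_INPUT_HEADER_REPORT_LEN, hdr) * sizeof(u32);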
HID over SPI SPEC: https://www.microsoft.com/download/details.aspx?id=103325 Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-spi.h | 118 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) (limited to 'include') diff --git a/include/linux/hid-over-spi.h b/include/linux/hid-over-spi.h index ddbe41c5d8fd..da5a14b5e89b 100644 --- a/include/linux/hid-over-spi.h +++ b/include/linux/hid-over-spi.h @@ -4,6 +4,120 @@ #ifndef _HID_OVER_SPI_H_ #define _HID_OVER_SPI_H_ +#include +#include + +/* Input report type definition in HIDSPI protocol */ +enum input_report_type { + INVALID_INPUT_REPORT_TYPE_0 = 0, + DATA = 1, + INVALID_TYPE_2 = 2, + RESET_RESPONSE = 3, + COMMAND_RESPONSE = 4, + GET_FEATURE_RESPONSE = 5, + INVALID_TYPE_6 = 6, + DEVICE_DESCRIPTOR_RESPONSE = 7, + REPORT_DESCRIPTOR_RESPONSE = 8, + SET_FEATURE_RESPONSE = 9, + OUTPUT_REPORT_RESPONSE = 10, + GET_INPUT_REPORT_RESPONSE = 11, + INVALID_INPUT_REPORT_TYPE = 0xF, +}; + +/* Output report type definition in HIDSPI protocol */ +enum output_report_type { + INVALID_OUTPUT_REPORT_TYPE_0 = 0, + DEVICE_DESCRIPTOR = 1, + REPORT_DESCRIPTOR = 2, + SET_FEATURE = 3, + GET_FEATURE = 4, + OUTPUT_REPORT = 5, + GET_INPUT_REPORT = 6, + COMMAND_CONTENT = 7, +}; + +/* Set power command ID for output report */ +#define HIDSPI_SET_POWER_CMD_ID 1 + +/* Power state definition in HIDSPI protocol */ +enum hidspi_power_state { + HIDSPI_ON = 1, + HIDSPI_SLEEP = 2, + HIDSPI_OFF = 3, +}; + +/** + * Input report header definition in HIDSPI protocol + * Report header size is 32bits, it includes: + * protocol_ver: [0:3] Current supported HIDSPI protocol version, must be 0x3 + * reserved0: [4:7] Reserved bits + * input_report_len: [8:21] Input report length in number bytes divided by 4 + * last_frag_flag: [22]Indicate if this packet is last fragment. 
+ * 1 - indicates last fragment + * 0 - indicates additional fragments + * reserved1: [23] Reserved bits + * @sync_const: [24:31] Used to validate input report header, must be 0x5A + */ +#define HIDSPI_INPUT_HEADER_SIZE sizeof(u32) +#define HIDSPI_INPUT_HEADER_VER GENMASK(3, 0) +#define HIDSPI_INPUT_HEADER_REPORT_LEN GENMASK(21, 8) +#define HIDSPI_INPUT_HEADER_LAST_FLAG BIT(22) +#define HIDSPI_INPUT_HEADER_SYNC GENMASK(31, 24) + +/** + * struct input_report_body_header - Input report body header definition in HIDSPI protocol + * @input_report_type: indicate input report type, reference to enum input_report_type + * @content_len: this input report body packet length + * @content_id: indicate this input report's report id + */ +struct input_report_body_header { + u8 input_report_type; + __le16 content_len; + u8 content_id; +} __packed; + +#define HIDSPI_INPUT_BODY_HEADER_SIZE sizeof(struct input_report_body_header) + +/** + * struct input_report_body - Input report body definition in HIDSPI protocol + * @body_hdr: input report body header + * @content: input report body content + */ +struct input_report_body { + struct input_report_body_header body_hdr; + u8 content[]; +} __packed; + +#define HIDSPI_INPUT_BODY_SIZE(content_len) ((content_len) + HIDSPI_INPUT_BODY_HEADER_SIZE) + +/** + * struct output_report_header - Output report header definition in HIDSPI protocol + * @report_type: output report type, reference to enum output_report_type + * @content_len: length of content + * @content_id: 0x00 - descriptors + * report id - Set/Feature feature or Input/Output Reports + * command opcode - for commands + */ +struct output_report_header { + u8 report_type; + __le16 content_len; + u8 content_id; +} __packed; + +#define HIDSPI_OUTPUT_REPORT_HEADER_SIZE sizeof(struct output_report_header) + +/** + * struct output_report - Output report definition in HIDSPI protocol + * @output_hdr: output report header + * @content: output report content + */ +struct output_report { + struct output_report_header output_hdr; + u8 content[]; +} __packed; + +#define HIDSPI_OUTPUT_REPORT_SIZE(content_len) ((content_len) + HIDSPI_OUTPUT_REPORT_HEADER_SIZE) + /** * struct hidspi_dev_descriptor - HIDSPI device descriptor definition * @dev_desc_len: The length of the complete device descriptor, fixed to 0x18 (24). @@ -34,4 +148,8 @@ struct hidspi_dev_descriptor { __le32 reserved; }; +#define HIDSPI_DEVICE_DESCRIPTOR_SIZE sizeof(struct hidspi_dev_descriptor) +#define HIDSPI_INPUT_DEVICE_DESCRIPTOR_SIZE \ + (HIDSPI_INPUT_BODY_HEADER_SIZE + HIDSPI_DEVICE_DESCRIPTOR_SIZE) + #endif /* _HID_OVER_SPI_H_ */ -- cgit v1.2.3 From ba38d7f87f159c94f947d5b8336f763ec760d4ea Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:47 +0800 Subject: HID: intel-thc-hid: intel-quicki2c: Add THC QuickI2C driver hid layer Add HID Low level driver callbacks and hid probe function to register QucikI2C as a HID driver, and external touch device as a HID device. 
Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-i2c.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 include/linux/hid-over-i2c.h (limited to 'include') diff --git a/include/linux/hid-over-i2c.h b/include/linux/hid-over-i2c.h new file mode 100644 index 000000000000..b70626723a38 --- /dev/null +++ b/include/linux/hid-over-i2c.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright 2024 Intel Corporation */ + +#ifndef _HID_OVER_I2C_H_ +#define _HID_OVER_I2C_H_ + +/** + * struct hidi2c_dev_descriptor - HIDI2C device descriptor definition + * @dev_desc_len: The length of the complete device descriptor, fixed to 0x1E (30). + * @bcd_ver: The version number of the HIDI2C protocol supported. + * In binary coded decimal (BCD) format. + * @report_desc_len: The length of the report descriptor + * @report_desc_reg: The register address to retrieve report descriptor + * @input_reg: the register address to retrieve input report + * @max_input_len: The length of the largest possible HID input (or feature) report + * @output_reg: the register address to send output report + * @max_output_len: The length of the largest output (or feature) report + * @cmd_reg: the register address to send command + * @data_reg: the register address to send command data + * @vendor_id: Device manufacturers vendor ID + * @product_id: Device unique model/product ID + * @version_id: Device’s unique version + * @reserved0: Reserved and should be 0 + * @reserved1: Reserved and should be 0 + */ +struct hidi2c_dev_descriptor { + __le16 dev_desc_len; + __le16 bcd_ver; + __le16 report_desc_len; + __le16 report_desc_reg; + __le16 input_reg; + __le16 max_input_len; + __le16 output_reg; + __le16 max_output_len; + __le16 cmd_reg; + __le16 data_reg; + __le16 vendor_id; + __le16 product_id; + __le16 version_id; + __le16 reserved0; + __le16 reserved1; +} __packed; + +#endif /* _HID_OVER_I2C_H_ */ -- cgit v1.2.3 From 6fc761385bcf62b235b6b48b1db32e2558a7904a Mon Sep 17 00:00:00 2001 From: Even Xu Date: Mon, 6 Jan 2025 10:31:49 +0800 Subject: HID: intel-thc-hid: intel-quicki2c: Add HIDI2C protocol implementation Intel QuickI2C driver uses THC hardware to accelerate HID over I2C (HIDI2C) protocol flow. This patch implements all data flows described in HID over I2C protocol SPEC by using THC hardware layer APIs. 
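To give an idea of how the command bitfields introduced below are intended to be filled (a sketch only, not part of the patch; FIELD_PREP() comes from linux/bitfield.h):

	/* GET_REPORT for feature report id 2 */
	u16 cmd = FIELD_PREP(HIDI2C_CMD_OPCODE, HIDI2C_GET_REPORT) |
		  FIELD_PREP(HIDI2C_CMD_REPORT_TYPE, HIDI2C_FEATURE) |
		  FIELD_PREP(HIDI2C_CMD_REPORT_ID, 2);

For report IDs of 15 and above, HIDI2C_CMD_REPORT_ID would instead carry HIDI2C_CMD_MAX_RI and the real ID would go into the optional HIDI2C_CMD_3RD_BYTE field.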
HID over I2C SPEC: https://learn.microsoft.com/en-us/previous-versions/windows/hardware/design/dn642101(v=vs.85) Co-developed-by: Xinpeng Sun Signed-off-by: Xinpeng Sun Signed-off-by: Even Xu Tested-by: Rui Zhang Tested-by: Mark Pearson Reviewed-by: Srinivas Pandruvada Reviewed-by: Mark Pearson Tested-by: Aaron Ma Signed-off-by: Jiri Kosina --- include/linux/hid-over-i2c.h | 73 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include') diff --git a/include/linux/hid-over-i2c.h b/include/linux/hid-over-i2c.h index b70626723a38..3b1a0208a6b8 100644 --- a/include/linux/hid-over-i2c.h +++ b/include/linux/hid-over-i2c.h @@ -1,9 +1,80 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2024 Intel Corporation */ +#include + #ifndef _HID_OVER_I2C_H_ #define _HID_OVER_I2C_H_ +#define HIDI2C_REG_LEN sizeof(__le16) + +/* Input report type definition in HIDI2C protocol */ +enum hidi2c_report_type { + HIDI2C_RESERVED = 0, + HIDI2C_INPUT, + HIDI2C_OUTPUT, + HIDI2C_FEATURE, +}; + +/* Power state type definition in HIDI2C protocol */ +enum hidi2c_power_state { + HIDI2C_ON, + HIDI2C_SLEEP, +}; + +/* Opcode type definition in HIDI2C protocol */ +enum hidi2c_opcode { + HIDI2C_RESET = 1, + HIDI2C_GET_REPORT, + HIDI2C_SET_REPORT, + HIDI2C_GET_IDLE, + HIDI2C_SET_IDLE, + HIDI2C_GET_PROTOCOL, + HIDI2C_SET_PROTOCOL, + HIDI2C_SET_POWER, +}; + +/** + * struct hidi2c_report_packet - Report packet definition in HIDI2C protocol + * @len: data field length + * @data: HIDI2C report packet data + */ +struct hidi2c_report_packet { + __le16 len; + u8 data[]; +} __packed; + +#define HIDI2C_LENGTH_LEN sizeof(__le16) + +#define HIDI2C_PACKET_LEN(data_len) ((data_len) + HIDI2C_LENGTH_LEN) +#define HIDI2C_DATA_LEN(pkt_len) ((pkt_len) - HIDI2C_LENGTH_LEN) + +#define HIDI2C_CMD_MAX_RI 0x0F + +/** + * HIDI2C command data packet - Command packet definition in HIDI2C protocol + * @report_id: [0:3] report id (<15) for features or output reports + * @report_type: [4:5] indicate report type, reference to hidi2c_report_type + * @reserved0: [6:7] reserved bits + * @opcode: [8:11] command operation code, reference to hidi2c_opcode + * @reserved1: [12:15] reserved bits + * @report_id_optional: [23:16] appended 3rd byte. + * If the report_id in the low byte is set to the + * sentinel value (HIDI2C_CMD_MAX_RI), then this + * optional third byte represents the report id (>=15) + * Otherwise, not this 3rd byte. + */ + +#define HIDI2C_CMD_LEN sizeof(__le16) +#define HIDI2C_CMD_LEN_OPT (sizeof(__le16) + 1) +#define HIDI2C_CMD_REPORT_ID GENMASK(3, 0) +#define HIDI2C_CMD_REPORT_TYPE GENMASK(5, 4) +#define HIDI2C_CMD_OPCODE GENMASK(11, 8) +#define HIDI2C_CMD_OPCODE GENMASK(11, 8) +#define HIDI2C_CMD_3RD_BYTE GENMASK(23, 16) + +#define HIDI2C_HID_DESC_BCDVERSION 0x100 + /** * struct hidi2c_dev_descriptor - HIDI2C device descriptor definition * @dev_desc_len: The length of the complete device descriptor, fixed to 0x1E (30). 
@@ -41,4 +112,6 @@ struct hidi2c_dev_descriptor { __le16 reserved1; } __packed; +#define HIDI2C_DEV_DESC_LEN sizeof(struct hidi2c_dev_descriptor) + #endif /* _HID_OVER_I2C_H_ */ -- cgit v1.2.3 From 3675a926feefdf3afabea12f806f31ea582065e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 28 Dec 2024 09:43:41 +0100 Subject: sysfs: constify bin_attribute argument of sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most users use this function through the BIN_ATTR_SIMPLE* macros, they can handle the switch transparently. Also adapt the two non-macro users in the same change. Signed-off-by: Thomas Weißschuh Acked-by: Madhavan Srinivasan Reviewed-by: Mahesh Salgaonkar Tested-by: Aditya Gupta Link: https://lore.kernel.org/r/20241228-sysfs-const-bin_attr-simple-v2-1-7c6f3f1767a3@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index b4368377fac9..18f7e1fd093c 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -511,7 +511,7 @@ __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, char *buf, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count); #else /* CONFIG_SYSFS */ @@ -774,7 +774,7 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj, - struct bin_attribute *attr, + const struct bin_attribute *attr, char *buf, loff_t off, size_t count) { -- cgit v1.2.3 From 09a897432637aa0b99545ce13d57760cf0cb09d1 Mon Sep 17 00:00:00 2001 From: Shree Ramamoorthy Date: Tue, 17 Dec 2024 14:49:35 -0600 Subject: mfd: tps65219: Remove unused macros & add regmap.h These macros are not used by the driver, and the structs are accounted for with the addition of the linux/regmap.h file. Signed-off-by: Shree Ramamoorthy Link: https://lore.kernel.org/r/20241217204935.1012106-3-s-ramamoorthy@ti.com Signed-off-by: Lee Jones --- include/linux/mfd/tps65219.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/tps65219.h b/include/linux/mfd/tps65219.h index e6826e34e2a6..546bceec7173 100644 --- a/include/linux/mfd/tps65219.h +++ b/include/linux/mfd/tps65219.h @@ -10,14 +10,9 @@ #include #include +#include #include -struct regmap; -struct regmap_irq_chip_data; - -#define TPS65219_1V35 1350000 -#define TPS65219_1V8 1800000 - /* TPS chip id list */ #define TPS65219 0xF0 -- cgit v1.2.3 From 26769582bf353ae613f5113a1414ff3a80e08264 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Tue, 17 Dec 2024 12:11:41 -0600 Subject: mfd: syscon: Remove the platform driver support The platform driver is dead code. It is not used by DT platforms since commit bdb0066df96e ("mfd: syscon: Decouple syscon interface from platform devices") which said: For non-DT based platforms, this patch keeps syscon platform driver structure so that syscon can be probed and such non-DT based drivers can use syscon_regmap_lookup_by_pdev API and access regmap handles. Once all users of "syscon_regmap_lookup_by_pdev" migrated to DT based, we can completely remove platform driver of syscon, and keep only helper functions to get regmap handles. 
The last user of syscon_regmap_lookup_by_pdevname() was removed in 2018. syscon_regmap_lookup_by_pdevname() was then removed in 2019, but that commit failed to remove the rest of the platform driver. Signed-off-by: Rob Herring (Arm) Tested-by: Krzysztof Kozlowski Tested-by: Will McVicker Acked-by: Liviu Dudau Reviewed-by: Pankaj Dubey Tested-by: Pankaj Dubey Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20241217-syscon-fixes-v2-2-4f56d750541d@kernel.org Signed-off-by: Lee Jones --- include/linux/platform_data/syscon.h | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 include/linux/platform_data/syscon.h (limited to 'include') diff --git a/include/linux/platform_data/syscon.h b/include/linux/platform_data/syscon.h deleted file mode 100644 index 2c089dd3e2bd..000000000000 --- a/include/linux/platform_data/syscon.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef PLATFORM_DATA_SYSCON_H -#define PLATFORM_DATA_SYSCON_H - -struct syscon_platform_data { - const char *label; -}; - -#endif -- cgit v1.2.3 From e61e6c415ba9ff2b32bb6780ce1b17d1d76238f1 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 7 Jan 2025 02:48:12 -0800 Subject: net/mlx5: use do_aux_work for PHC overflow checks The overflow_work is using system wq to do overflow checks and updates for PHC device timecounter, which might be overhelmed by other tasks. But there is dedicated kthread in PTP subsystem designed for such things. This patch changes the work queue to proper align with PTP subsystem and to avoid overloading system work queue. The adjfine() function acts the same way as overflow check worker, we can postpone ptp aux worker till the next overflow period after adjfine() was called. Reviewed-by: Dragos Tatulea Signed-off-by: Vadim Fedorenko Acked-by: Tariq Toukan Link: https://patch.msgid.link/20250107104812.380225-1-vadfed@meta.com Signed-off-by: Paolo Abeni --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ea48eb879a0f..fed666c5bd16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -691,7 +691,6 @@ struct mlx5_timer { struct timecounter tc; u32 nominal_c_mult; unsigned long overflow_period; - struct delayed_work overflow_work; }; struct mlx5_clock { -- cgit v1.2.3 From d1c444b47100d81a4b8c84aa3ac1c8159c22066a Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Tue, 17 Dec 2024 20:46:26 +0530 Subject: HID: amd_sfh: Add support to export device operating states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to export device operating states, such as laptop placement, platform types and propagate this data to AMD PMF driver for use in actions. To retrieve the device operating states data, SRA sensor support need to be enabled in AMD SFH driver. So add support to enable the SRA sensor. Also, remove explicit assignments to sensor_index enum. 
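A minimal sketch of how a consumer such as the AMD PMF driver can query the new data (illustrative only; apply_on_table_policy() is a made-up stand-in for whatever action PMF ends up taking):

	struct amd_sfh_info info = {};
	int ret;

	ret = amd_get_sfh_info(&info, MT_SRA);
	if (!ret && info.laptop_placement == ON_TABLE)
		apply_on_table_policy();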
Co-developed-by: Akshata MukundShetty Signed-off-by: Akshata MukundShetty Signed-off-by: Basavaraj Natikar Signed-off-by: Shyam Sundar S K Reviewed-by: Mario Limonciello Acked-by: Jiri Kosina Link: https://lore.kernel.org/r/20241217151627.757477-2-Shyam-sundar.S-k@amd.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/amd-pmf-io.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h index b4f818205216..6fa510f419c0 100644 --- a/include/linux/amd-pmf-io.h +++ b/include/linux/amd-pmf-io.h @@ -18,10 +18,12 @@ * enum sfh_message_type - Query the SFH message type * @MT_HPD: Message ID to know the Human presence info from MP2 FW * @MT_ALS: Message ID to know the Ambient light info from MP2 FW + * @MT_SRA: Message ID to know the SRA data from MP2 FW */ enum sfh_message_type { MT_HPD, MT_ALS, + MT_SRA, }; /** @@ -40,10 +42,23 @@ enum sfh_hpd_info { * struct amd_sfh_info - get HPD sensor info from MP2 FW * @ambient_light: Populates the ambient light information * @user_present: Populates the user presence information + * @platform_type: Operating modes (clamshell, flat, tent, etc.) + * @laptop_placement: Device states (ontable, onlap, outbag) */ struct amd_sfh_info { u32 ambient_light; u8 user_present; + u32 platform_type; + u32 laptop_placement; +}; + +enum laptop_placement { + LP_UNKNOWN = 0, + ON_TABLE, + ON_LAP_MOTION, + IN_BAG, + OUT_OF_BAG, + LP_UNDEFINED, }; int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op); -- cgit v1.2.3 From 33d97a07b3ae6fa713919de4e1864ca04fff8f80 Mon Sep 17 00:00:00 2001 From: Yuyang Huang Date: Tue, 7 Jan 2025 20:43:55 +0900 Subject: netlink: add IPv6 anycast join/leave notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change introduces a mechanism for notifying userspace applications about changes to IPv6 anycast addresses via netlink. It includes: * Addition and deletion of IPv6 anycast addresses are reported using RTM_NEWANYCAST and RTM_DELANYCAST. * A new netlink group (RTNLGRP_IPV6_ACADDR) for subscribing to these notifications. This enables user space applications(e.g. ip monitor) to efficiently track anycast addresses through netlink messages, improving metrics collection and system monitoring. It also unlocks the potential for advanced anycast management in user space, such as hardware offload control and fine grained network control. 
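A minimal userspace sketch of subscribing to the new notifications (error handling omitted; since RTNLGRP_IPV6_ACADDR sits above the 32 groups addressable through nl_groups, membership has to be requested with NETLINK_ADD_MEMBERSHIP):

	struct sockaddr_nl snl = { .nl_family = AF_NETLINK };
	unsigned int grp = RTNLGRP_IPV6_ACADDR;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	bind(fd, (struct sockaddr *)&snl, sizeof(snl));
	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp));
	/* recvmsg() now delivers RTM_NEWANYCAST / RTM_DELANYCAST messages */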
Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Signed-off-by: Yuyang Huang Reviewed-by: David Ahern Link: https://patch.msgid.link/20250107114355.1766086-1-yuyanghuang@google.com Signed-off-by: Paolo Abeni --- include/net/addrconf.h | 3 +++ include/uapi/linux/rtnetlink.h | 8 +++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 58337898fa21..f8f91b2038ea 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -546,4 +546,7 @@ int inet6_fill_ifmcaddr(struct sk_buff *skb, const struct ifmcaddr6 *ifmca, struct inet6_fill_args *args); +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args); #endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5ee94c511a28..66c3903d29cf 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -100,7 +100,11 @@ enum { RTM_GETMULTICAST, #define RTM_GETMULTICAST RTM_GETMULTICAST - RTM_GETANYCAST = 62, + RTM_NEWANYCAST = 60, +#define RTM_NEWANYCAST RTM_NEWANYCAST + RTM_DELANYCAST, +#define RTM_DELANYCAST RTM_DELANYCAST + RTM_GETANYCAST, #define RTM_GETANYCAST RTM_GETANYCAST RTM_NEWNEIGHTBL = 64, @@ -783,6 +787,8 @@ enum rtnetlink_groups { #define RTNLGRP_IPV4_MCADDR RTNLGRP_IPV4_MCADDR RTNLGRP_IPV6_MCADDR, #define RTNLGRP_IPV6_MCADDR RTNLGRP_IPV6_MCADDR + RTNLGRP_IPV6_ACADDR, +#define RTNLGRP_IPV6_ACADDR RTNLGRP_IPV6_ACADDR __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) -- cgit v1.2.3 From 601731fc7c6111bbca49ce3c9499c2e4d426079d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 15 Nov 2024 14:46:09 +0100 Subject: netfilter: conntrack: add conntrack event timestamp Nadia Pinaeva writes: I am working on a tool that allows collecting network performance metrics by using conntrack events. Start time of a conntrack entry is used to evaluate seen_reply latency, therefore the sooner it is timestamped, the better the precision is. In particular, when using this tool to compare the performance of the same feature implemented using iptables/nftables/OVS it is crucial to have the entry timestamped earlier to see any difference. At this time, conntrack events can only get timestamped at recv time in userspace, so there can be some delay between the event being generated and the userspace process consuming the message. There is sys/net/netfilter/nf_conntrack_timestamp, which adds a 64bit timestamp (ns resolution) that records start and stop times, but its not suited for this either, start time is the 'hashtable insertion time', not 'conntrack allocation time'. There is concern that moving the start-time moment to conntrack allocation will add overhead in case of flooding, where conntrack entries are allocated and released right away without getting inserted into the hashtable. Also, even if this was changed it would not with events other than new (start time) and destroy (stop time). Pablo suggested to add new CTA_TIMESTAMP_EVENT, this adds this feature. The timestamp is recorded in case both events are requested and the sys/net/netfilter/nf_conntrack_timestamp toggle is enabled. 
Reported-by: Nadia Pinaeva Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 12 ++++++++++++ include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 0c1dac318e02..8dcf7c371ee9 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -12,6 +12,7 @@ #include #include #include +#include enum nf_ct_ecache_state { NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ @@ -20,6 +21,9 @@ enum nf_ct_ecache_state { struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + local64_t timestamp; /* event timestamp, in nanoseconds */ +#endif u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ u32 missed; /* missed events */ @@ -108,6 +112,14 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) if (e == NULL) return; +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + /* renew only if this is the first cached event, so that the + * timestamp reflects the first, not the last, generated event. + */ + if (local64_read(&e->timestamp) && READ_ONCE(e->cache) == 0) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif + set_bit(event, &e->cache); #endif } diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index c2ac7269acf7..43233af75b9d 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -57,6 +57,7 @@ enum ctattr_type { CTA_SYNPROXY, CTA_FILTER, CTA_STATUS_MASK, + CTA_TIMESTAMP_EVENT, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) -- cgit v1.2.3 From dd2395162c07e4102fc83878dc394ff63f2eaae8 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 9 Jan 2025 00:14:41 +0000 Subject: ASoC: remove disable_route_checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No driver is using disable_route_checks, let's remove it. Because snd_soc_dapm_add_routes() itself will indicate detail error when failed, this patch removes duplicate dev_err() not only dev_warn() in error case. Signed-off-by: Kuninori Morimoto Suggested-by: Amadeusz Sławiński Tested-by: Amadeusz Sławiński Link: https://patch.msgid.link/87tta8268e.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 4f5d411e3823..1e09ff084247 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1118,7 +1118,6 @@ struct snd_soc_card { unsigned int instantiated:1; unsigned int topology_shortname_created:1; unsigned int fully_routed:1; - unsigned int disable_route_checks:1; unsigned int probed:1; unsigned int component_chaining:1; -- cgit v1.2.3 From 8fc7e23a9bd851e6997d3ea2ea1c3475b91862d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:01 +0100 Subject: fs: reformat the statx definition The comments after the declaration are becoming rather unreadable with long enough comments. Move them into lines of their own. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-2-hch@lst.de Reviewed-by: John Garry Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/uapi/linux/stat.h | 95 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 887a25286441..8b35d7d511a2 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -98,43 +98,92 @@ struct statx_timestamp { */ struct statx { /* 0x00 */ - __u32 stx_mask; /* What results were written [uncond] */ - __u32 stx_blksize; /* Preferred general I/O size [uncond] */ - __u64 stx_attributes; /* Flags conveying information about the file [uncond] */ + /* What results were written [uncond] */ + __u32 stx_mask; + + /* Preferred general I/O size [uncond] */ + __u32 stx_blksize; + + /* Flags conveying information about the file [uncond] */ + __u64 stx_attributes; + /* 0x10 */ - __u32 stx_nlink; /* Number of hard links */ - __u32 stx_uid; /* User ID of owner */ - __u32 stx_gid; /* Group ID of owner */ - __u16 stx_mode; /* File mode */ + /* Number of hard links */ + __u32 stx_nlink; + + /* User ID of owner */ + __u32 stx_uid; + + /* Group ID of owner */ + __u32 stx_gid; + + /* File mode */ + __u16 stx_mode; __u16 __spare0[1]; + /* 0x20 */ - __u64 stx_ino; /* Inode number */ - __u64 stx_size; /* File size */ - __u64 stx_blocks; /* Number of 512-byte blocks allocated */ - __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ + /* Inode number */ + __u64 stx_ino; + + /* File size */ + __u64 stx_size; + + /* Number of 512-byte blocks allocated */ + __u64 stx_blocks; + + /* Mask to show what's supported in stx_attributes */ + __u64 stx_attributes_mask; + /* 0x40 */ - struct statx_timestamp stx_atime; /* Last access time */ - struct statx_timestamp stx_btime; /* File creation time */ - struct statx_timestamp stx_ctime; /* Last attribute change time */ - struct statx_timestamp stx_mtime; /* Last data modification time */ + /* Last access time */ + struct statx_timestamp stx_atime; + + /* File creation time */ + struct statx_timestamp stx_btime; + + /* Last attribute change time */ + struct statx_timestamp stx_ctime; + + /* Last data modification time */ + struct statx_timestamp stx_mtime; + /* 0x80 */ - __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */ + /* Device ID of special file [if bdev/cdev] */ + __u32 stx_rdev_major; __u32 stx_rdev_minor; - __u32 stx_dev_major; /* ID of device containing file [uncond] */ + + /* ID of device containing file [uncond] */ + __u32 stx_dev_major; __u32 stx_dev_minor; + /* 0x90 */ __u64 stx_mnt_id; - __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ - __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ + + /* Memory buffer alignment for direct I/O */ + __u32 stx_dio_mem_align; + + /* File offset alignment for direct I/O */ + __u32 stx_dio_offset_align; + /* 0xa0 */ - __u64 stx_subvol; /* Subvolume identifier */ - __u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ - __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* Subvolume identifier */ + __u64 stx_subvol; + + /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_min; + + /* Max atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; + /* 0xb0 */ - __u32 stx_atomic_write_segments_max; /* Max atomic write segment 
count */ + /* Max atomic write segment count */ + __u32 stx_atomic_write_segments_max; + __u32 __spare1[1]; + /* 0xb8 */ __u64 __spare3[9]; /* Spare space for future expansion */ + /* 0x100 */ }; -- cgit v1.2.3 From 7ed6cbe0f8caa6ee38a2dc8f1b925acb904cc01f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Jan 2025 09:31:02 +0100 Subject: fs: add STATX_DIO_READ_ALIGN Add a separate dio read align field, as many out of place write file systems can easily do reads aligned to the device sector size, but require bigger alignment for writes. This is usually papered over by falling back to buffered I/O for smaller writes and doing read-modify-write cycles, but performance for this sucks, so applications benefit from knowing the actual write alignment. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250109083109.1441561-3-hch@lst.de Reviewed-by: John Garry Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/stat.h | 1 + include/uapi/linux/stat.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stat.h b/include/linux/stat.h index 3d900c86981c..9d8382e23a9c 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -52,6 +52,7 @@ struct kstat { u64 mnt_id; u32 dio_mem_align; u32 dio_offset_align; + u32 dio_read_offset_align; u64 change_cookie; u64 subvol; u32 atomic_write_unit_min; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 8b35d7d511a2..f78ee3670dd5 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -179,7 +179,8 @@ struct statx { /* Max atomic write segment count */ __u32 stx_atomic_write_segments_max; - __u32 __spare1[1]; + /* File offset alignment for direct I/O reads */ + __u32 stx_dio_read_offset_align; /* 0xb8 */ __u64 __spare3[9]; /* Spare space for future expansion */ @@ -213,6 +214,7 @@ struct statx { #define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ #define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */ #define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ +#define STATX_DIO_READ_ALIGN 0x00020000U /* Want/got dio read alignment info */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ -- cgit v1.2.3 From 344bac8f0d73fe970cd9f5b2f132906317d29e8b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:05 +0100 Subject: fs: kill MNT_ONRB Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist instead of keeping it with mnt->mnt_list. This allows us to use RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB. This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't set in @mnt->mnt_flags even though the mount was present in the mount rbtree of the mount namespace. The root cause is the following race. When a btrfs subvolume is mounted a temporary mount is created: btrfs_get_tree_subvol() { mnt = fc_mount() // Register the newly allocated mount with sb->mounts: lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); } and registered on sb->s_mounts. Later it is added to an anonymous mount namespace via mount_subvol(): -> mount_subvol() -> mount_subtree() -> alloc_mnt_ns() mnt_add_to_ns() vfs_path_lookup() put_mnt_ns() The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. 
If someone concurrently does a ro remount: reconfigure_super() -> sb_prepare_remount_readonly() { list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { } all mounts registered in sb->s_mounts are visited and first MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally MNT_WRITE_HOLD is removed again. The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race so MNT_ONRB might be lost. Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Cc: # v6.8+ Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1] Signed-off-by: Christian Brauner --- include/linux/mount.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..04213d8ef837 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -64,7 +64,6 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From 67d676bb135cd4de9647616e73cfd059ef57c9a6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:43 +0100 Subject: rculist: add list_bidir_{del,prev}_rcu() Currently there is no primitive for retrieving the previous list member. To do this we need a new deletion primitive that doesn't poison the prev pointer and a corresponding retrieval helper. Note that it is not valid to use both list_del_rcu() and list_bidir_del_rcu() on the same list.
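A short usage sketch (names are illustrative; the lock, the containing object and its rcu_head member are whatever already protects the list in the caller):

	/* Writer: unlink without poisoning ->prev, then defer the free */
	spin_lock(&ns->lock);
	list_bidir_del_rcu(&obj->node);
	spin_unlock(&ns->lock);
	kfree_rcu(obj, rcu);

	/* Reader: step backwards under RCU protection */
	struct list_head *prev;

	rcu_read_lock();
	prev = rcu_dereference(list_bidir_prev_rcu(&obj->node));
	rcu_read_unlock();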
+ * + * In contrast to list_del_rcu() doesn't poison the prev pointer thus + * allowing backwards traversal via list_bidir_prev_rcu(). + * + * Note: list_empty() on entry does not return true after this because + * the entry is in a special undefined state that permits RCU-based + * lockfree reverse traversal. In particular this means that we can not + * poison the forward and backwards pointers that may still be used for + * walking the list. + * + * The caller must take whatever precautions are necessary (such as + * holding appropriate locks) to avoid racing with another list-mutation + * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on + * this same list. However, it is perfectly legal to run concurrently + * with the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + * + * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on + * the same list. + * + * Note that the caller is not permitted to immediately free + * the newly deleted entry. Instead, either synchronize_rcu() + * or call_rcu() must be used to defer freeing until an RCU + * grace period has elapsed. + */ +static inline void list_bidir_del_rcu(struct list_head *entry) +{ + __list_del_entry(entry); +} + /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. -- cgit v1.2.3 From 0fefeade90e74bc8f40ab0e460f483565c492e28 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:05:46 +0100 Subject: spi: spi-mem: Extend spi-mem operations with a per-operation maximum frequency In the spi subsystem, the bus frequency is derived as follows: - the controller may expose a minimum and maximum operating frequency - the hardware description, through the spi peripheral properties, advise what is the maximum acceptable frequency from a device/wiring point of view. Transfers must be observed at a frequency which fits both (so in practice, the lowest maximum). Actually, this second point mixes two information and already takes the lowest frequency among: - what the spi device is capable of (what is written in the component datasheet) - what the wiring allows (electromagnetic sensibility, crossovers, terminations, antenna effect, etc). This logic works until spi devices are no longer capable of sustaining their highest frequency regardless of the operation. Spi memories are typically subject to such variation. Some devices are capable of spitting their internally stored data (essentially in read mode) at a very fast rate, typically up to 166MHz on Winbond SPI-NAND chips, using "fast" commands. However, some of the low-end operations, such as regular page read-from-cache commands, are more limited and can only be executed at 54MHz at most. This is currently a problem in the SPI-NAND subsystem. Another situation, even if not yet supported, will be with DTR commands, when the data is latched on both edges of the clock. The same chips as mentioned previously are in this case limited to 80MHz. Yet another example might be continuous reads, which, under certain circumstances, can also run at most at 104 or 120MHz. As a matter of fact, the "one frequency per chip" policy is outdated and more fine grain configuration is needed: we need to allow per-operation frequency limitations. So far, all datasheets I encountered advertise a maximum default frequency, which need to be lowered for certain specific operations. 
So based on the current infrastructure, we can still expect firmware (device trees in general) to continued advertising the same maximum speed which is a mix between the PCB limitations and the chip maximum capability, and expect per-operation lower frequencies when this is relevant. Add a `struct spi_mem_op` member to carry this information. Not providing this field explicitly from upper layers means that there is no further constraint and the default spi device maximum speed will be carried instead. The SPI_MEM_OP() macro is also expanded with an optional frequency argument, because virtually all operations can be subject to such a limitation, and this will allow for a smooth and discrete transition. For controller drivers which do not implement the spi-mem interface, the per-transfer speed is also set acordingly to a lower (than the maximum default) speed when relevant. Acked-by: Pratyush Yadav Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-1-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index c46d2b8029be..84ec52498792 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -68,6 +68,9 @@ enum spi_mem_data_dir { SPI_MEM_DATA_OUT, }; +#define SPI_MEM_OP_MAX_FREQ(__freq) \ + .max_freq = __freq + /** * struct spi_mem_op - describes a SPI memory operation * @cmd.nbytes: number of opcode bytes (only 1 or 2 are valid). The opcode is @@ -97,6 +100,9 @@ enum spi_mem_data_dir { * operation does not involve transferring data * @data.buf.in: input buffer (must be DMA-able) * @data.buf.out: output buffer (must be DMA-able) + * @max_freq: frequency limitation wrt this operation. 0 means there is no + * specific constraint and the highest achievable frequency can be + * attempted. */ struct spi_mem_op { struct { @@ -135,14 +141,17 @@ struct spi_mem_op { const void *out; } buf; } data; + + unsigned int max_freq; }; -#define SPI_MEM_OP(__cmd, __addr, __dummy, __data) \ +#define SPI_MEM_OP(__cmd, __addr, __dummy, __data, ...) \ { \ .cmd = __cmd, \ .addr = __addr, \ .dummy = __dummy, \ .data = __data, \ + __VA_ARGS__ \ } /** @@ -371,6 +380,7 @@ bool spi_mem_default_supports_op(struct spi_mem *mem, #endif /* CONFIG_SPI_MEM */ int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op); +void spi_mem_adjust_op_freq(struct spi_mem *mem, struct spi_mem_op *op); bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op); -- cgit v1.2.3 From 1248c9b8d54120950fda10fbeb98fb8932b4d45c Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:05:47 +0100 Subject: spi: spi-mem: Add a new controller capability There are spi devices with multiple frequency limitations depending on the invoked command. We probably do not want to afford running at the lowest supported frequency all the time, so if we want to get the most of our hardware, we need to allow per-operation frequency limitations. Among all the SPI memory controllers, I believe all are capable of changing the spi frequency on the fly. Some of the drivers do not make any frequency setup though. And some others will derive a per chip prescaler value which will be used forever. Actually changing the frequency on the fly is something new in Linux, so we need to carefully flag the drivers which do and do not support it. 
A controller capability is created for that, and the presence for this capability will always be checked before accepting such pattern. Signed-off-by: Miquel Raynal Reviewed-by: Tudor Ambarus Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-2-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 84ec52498792..c7a7719c2648 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -311,11 +311,13 @@ struct spi_controller_mem_ops { * @ecc: Supports operations with error correction * @swap16: Supports swapping bytes on a 16 bit boundary when configured in * Octal DTR + * @per_op_freq: Supports per operation frequency switching */ struct spi_controller_mem_caps { bool dtr; bool ecc; bool swap16; + bool per_op_freq; }; #define spi_mem_controller_is_capable(ctlr, cap) \ -- cgit v1.2.3 From d1f85873d2d62d6980e68d21d3a21f20b0664cc3 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:06:03 +0100 Subject: spi: spi-mem: Reorder spi-mem macro assignments Follow the order in which all the `struct spi_mem_op` members are defined. This is purely aesthetics, there is no functional change. Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-18-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index c7a7719c2648..ca6ea01c40f8 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -15,16 +15,16 @@ #define SPI_MEM_OP_CMD(__opcode, __buswidth) \ { \ + .nbytes = 1, \ .buswidth = __buswidth, \ .opcode = __opcode, \ - .nbytes = 1, \ } #define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth) \ { \ .nbytes = __nbytes, \ - .val = __val, \ .buswidth = __buswidth, \ + .val = __val, \ } #define SPI_MEM_OP_NO_ADDR { } @@ -39,18 +39,18 @@ #define SPI_MEM_OP_DATA_IN(__nbytes, __buf, __buswidth) \ { \ + .buswidth = __buswidth, \ .dir = SPI_MEM_DATA_IN, \ .nbytes = __nbytes, \ .buf.in = __buf, \ - .buswidth = __buswidth, \ } #define SPI_MEM_OP_DATA_OUT(__nbytes, __buf, __buswidth) \ { \ + .buswidth = __buswidth, \ .dir = SPI_MEM_DATA_OUT, \ .nbytes = __nbytes, \ .buf.out = __buf, \ - .buswidth = __buswidth, \ } #define SPI_MEM_OP_NO_DATA { } -- cgit v1.2.3 From f0006897a96c736623ddeb9b68c3880eb5cdebe7 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 24 Dec 2024 18:06:04 +0100 Subject: spi: spi-mem: Create macros for DTR operation We do have macros for defining command, address, dummy and data cycles. We also have a .dtr flag that implies sampling the bus on both edges, but there are currently no macros enabling it. 
We might make use of such macros, so let's create: - SPI_MEM_DTR_OP_CMD - SPI_MEM_DTR_OP_ADDR - SPI_MEM_DTR_OP_DUMMY - SPI_MEM_DTR_OP_DATA_OUT - SPI_MEM_DTR_OP_DATA_OUT Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20241224-winbond-6-11-rc1-quad-support-v2-19-ad218dbc406f@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index ca6ea01c40f8..306c05dd1378 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -20,6 +20,14 @@ .opcode = __opcode, \ } +#define SPI_MEM_DTR_OP_CMD(__opcode, __buswidth) \ + { \ + .nbytes = 1, \ + .opcode = __opcode, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_ADDR(__nbytes, __val, __buswidth) \ { \ .nbytes = __nbytes, \ @@ -27,6 +35,14 @@ .val = __val, \ } +#define SPI_MEM_DTR_OP_ADDR(__nbytes, __val, __buswidth) \ + { \ + .nbytes = __nbytes, \ + .val = __val, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_NO_ADDR { } #define SPI_MEM_OP_DUMMY(__nbytes, __buswidth) \ @@ -35,6 +51,13 @@ .buswidth = __buswidth, \ } +#define SPI_MEM_DTR_OP_DUMMY(__nbytes, __buswidth) \ + { \ + .nbytes = __nbytes, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_NO_DUMMY { } #define SPI_MEM_OP_DATA_IN(__nbytes, __buf, __buswidth) \ @@ -45,6 +68,15 @@ .buf.in = __buf, \ } +#define SPI_MEM_DTR_OP_DATA_IN(__nbytes, __buf, __buswidth) \ + { \ + .dir = SPI_MEM_DATA_IN, \ + .nbytes = __nbytes, \ + .buf.in = __buf, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_DATA_OUT(__nbytes, __buf, __buswidth) \ { \ .buswidth = __buswidth, \ @@ -53,6 +85,15 @@ .buf.out = __buf, \ } +#define SPI_MEM_DTR_OP_DATA_OUT(__nbytes, __buf, __buswidth) \ + { \ + .dir = SPI_MEM_DATA_OUT, \ + .nbytes = __nbytes, \ + .buf.out = __buf, \ + .buswidth = __buswidth, \ + .dtr = true, \ + } + #define SPI_MEM_OP_NO_DATA { } /** -- cgit v1.2.3 From 155c5bf26f983e9988333eeb0ef217138304d13b Mon Sep 17 00:00:00 2001 From: guanjing Date: Fri, 20 Dec 2024 09:33:35 +0100 Subject: firewall: remove misplaced semicolon from stm32_firewall_get_firewall Remove misplaced colon in stm32_firewall_get_firewall() which results in a syntax error when the code is compiled without CONFIG_STM32_FIREWALL. 
Fixes: 5c9668cfc6d7 ("firewall: introduce stm32_firewall framework") Signed-off-by: guanjing Reviewed-by: Gatien Chevallier Signed-off-by: Alexandre Torgue Signed-off-by: Arnd Bergmann --- include/linux/bus/stm32_firewall_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index 18e0a2fc3816..5178b72bc920 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -115,7 +115,7 @@ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 su #else /* CONFIG_STM32_FIREWALL */ int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall, - unsigned int nb_firewall); + unsigned int nb_firewall) { return -ENODEV; } -- cgit v1.2.3 From e2b6e5e4877ac898b61338dc20491cd837af79b2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 9 Dec 2024 11:41:11 +0900 Subject: jump_label: Define guard() for jump_label_lock Link: https://lore.kernel.org/all/173371207108.480397.12818384744149153972.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/jump_label.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f5a2727ca4a9..fdb79dd1ebd8 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -75,6 +75,7 @@ #include #include +#include extern bool static_key_initialized; @@ -347,6 +348,8 @@ static inline void static_key_disable(struct static_key *key) #endif /* CONFIG_JUMP_LABEL */ +DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock()) + #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled -- cgit v1.2.3 From e68bda71a2384e4463c96bac958912b4c5e58502 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Mon, 25 Nov 2024 15:24:42 -0800 Subject: hyperv: Add new Hyper-V headers in include/hyperv These headers contain definitions for regular Hyper-V guests (as in hyperv-tlfs.h), as well as interfaces for more privileged guests like the root partition (aka Dom0). These files are derived from headers exported from Hyper-V, rather than being derived from the TLFS document. (Although, to preserve compatibility with existing Linux code, some definitions are copied directly from hyperv-tlfs.h too). The new files follow a naming convention according to their original use: - hdk "host development kit" - gdk "guest development kit" With postfix "_mini" implying userspace-only headers, and "_ext" for extended hypercalls. The use of multiple files and their original names is primarily to keep the provenance of exactly where they came from in Hyper-V code, which is helpful for manual maintenance and extension of these definitions. Microsoft maintainers importing new definitions should take care to put them in the right file. However, Linux kernel code that uses any of the definitions need not be aware of the multiple files or assign any meaning to the new names. Linux kernel code should always just include hvhdk.h Note the new headers contain both arm64 and x86_64 definitions. Some are guarded by #ifdefs, and some are instead prefixed with the architecture, e.g. hv_x64_*. These conventions are kept from Hyper-V code as another tactic to simplify the process of importing and maintaining the definitions, rather than splitting them up into their own files in arch/x86/ and arch/arm64/. 
These headers are a step toward importing headers directly from Hyper-V in the future, similar to Xen public files in include/xen/interface/. Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Reviewed-by: Michael Kelley Signed-off-by: Roman Kisel Link: https://lore.kernel.org/r/1732577084-2122-4-git-send-email-nunodasneves@linux.microsoft.com Link: https://lore.kernel.org/r/20250108222138.1623703-2-romank@linux.microsoft.com Signed-off-by: Wei Liu --- include/hyperv/hvgdk.h | 308 ++++++++++ include/hyperv/hvgdk_ext.h | 46 ++ include/hyperv/hvgdk_mini.h | 1348 +++++++++++++++++++++++++++++++++++++++++++ include/hyperv/hvhdk.h | 733 +++++++++++++++++++++++ include/hyperv/hvhdk_mini.h | 311 ++++++++++ 5 files changed, 2746 insertions(+) create mode 100644 include/hyperv/hvgdk.h create mode 100644 include/hyperv/hvgdk_ext.h create mode 100644 include/hyperv/hvgdk_mini.h create mode 100644 include/hyperv/hvhdk.h create mode 100644 include/hyperv/hvhdk_mini.h (limited to 'include') diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h new file mode 100644 index 000000000000..dd6d4939ea29 --- /dev/null +++ b/include/hyperv/hvgdk.h @@ -0,0 +1,308 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Type definitions for the Microsoft Hypervisor. + */ +#ifndef _HV_HVGDK_H +#define _HV_HVGDK_H + +#include "hvgdk_mini.h" +#include "hvgdk_ext.h" + +/* + * The guest OS needs to register the guest ID with the hypervisor. + * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V TLFS specification. + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. 
+ * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0 - Distro specific identification + */ + +#define HV_LINUX_VENDOR_ID 0x8100 + +/* HV_VMX_ENLIGHTENED_VMCS */ +struct hv_enlightened_vmcs { + u32 revision_id; + u32 abort; + + u16 host_es_selector; + u16 host_cs_selector; + u16 host_ss_selector; + u16 host_ds_selector; + u16 host_fs_selector; + u16 host_gs_selector; + u16 host_tr_selector; + + u16 padding16_1; + + u64 host_ia32_pat; + u64 host_ia32_efer; + + u64 host_cr0; + u64 host_cr3; + u64 host_cr4; + + u64 host_ia32_sysenter_esp; + u64 host_ia32_sysenter_eip; + u64 host_rip; + u32 host_ia32_sysenter_cs; + + u32 pin_based_vm_exec_control; + u32 vm_exit_controls; + u32 secondary_vm_exec_control; + + u64 io_bitmap_a; + u64 io_bitmap_b; + u64 msr_bitmap; + + u16 guest_es_selector; + u16 guest_cs_selector; + u16 guest_ss_selector; + u16 guest_ds_selector; + u16 guest_fs_selector; + u16 guest_gs_selector; + u16 guest_ldtr_selector; + u16 guest_tr_selector; + + u32 guest_es_limit; + u32 guest_cs_limit; + u32 guest_ss_limit; + u32 guest_ds_limit; + u32 guest_fs_limit; + u32 guest_gs_limit; + u32 guest_ldtr_limit; + u32 guest_tr_limit; + u32 guest_gdtr_limit; + u32 guest_idtr_limit; + + u32 guest_es_ar_bytes; + u32 guest_cs_ar_bytes; + u32 guest_ss_ar_bytes; + u32 guest_ds_ar_bytes; + u32 guest_fs_ar_bytes; + u32 guest_gs_ar_bytes; + u32 guest_ldtr_ar_bytes; + u32 guest_tr_ar_bytes; + + u64 guest_es_base; + u64 guest_cs_base; + u64 guest_ss_base; + u64 guest_ds_base; + u64 guest_fs_base; + u64 guest_gs_base; + u64 guest_ldtr_base; + u64 guest_tr_base; + u64 guest_gdtr_base; + u64 guest_idtr_base; + + u64 padding64_1[3]; + + u64 vm_exit_msr_store_addr; + u64 vm_exit_msr_load_addr; + u64 vm_entry_msr_load_addr; + + u64 cr3_target_value0; + u64 cr3_target_value1; + u64 cr3_target_value2; + u64 cr3_target_value3; + + u32 page_fault_error_code_mask; + u32 page_fault_error_code_match; + + u32 cr3_target_count; + u32 vm_exit_msr_store_count; + u32 vm_exit_msr_load_count; + u32 vm_entry_msr_load_count; + + u64 tsc_offset; + u64 virtual_apic_page_addr; + u64 vmcs_link_pointer; + + u64 guest_ia32_debugctl; + u64 guest_ia32_pat; + u64 guest_ia32_efer; + + u64 guest_pdptr0; + u64 guest_pdptr1; + u64 guest_pdptr2; + u64 guest_pdptr3; + + u64 guest_pending_dbg_exceptions; + u64 guest_sysenter_esp; + u64 guest_sysenter_eip; + + u32 guest_activity_state; + u32 guest_sysenter_cs; + + u64 cr0_guest_host_mask; + u64 cr4_guest_host_mask; + u64 cr0_read_shadow; + u64 cr4_read_shadow; + u64 guest_cr0; + u64 guest_cr3; + u64 guest_cr4; + u64 guest_dr7; + + u64 host_fs_base; + u64 host_gs_base; + u64 host_tr_base; + u64 host_gdtr_base; + u64 host_idtr_base; + u64 host_rsp; + + u64 ept_pointer; + + u16 virtual_processor_id; + u16 padding16_2[3]; + + u64 padding64_2[5]; + u64 guest_physical_address; + + u32 vm_instruction_error; + u32 vm_exit_reason; + u32 vm_exit_intr_info; + u32 vm_exit_intr_error_code; + u32 idt_vectoring_info_field; + u32 idt_vectoring_error_code; + u32 vm_exit_instruction_len; + u32 vmx_instruction_info; + + u64 exit_qualification; + u64 exit_io_instruction_ecx; + u64 exit_io_instruction_esi; + u64 exit_io_instruction_edi; + u64 exit_io_instruction_eip; + + u64 guest_linear_address; + u64 guest_rsp; + u64 guest_rflags; + + u32 guest_interruptibility_info; + u32 cpu_based_vm_exec_control; + u32 exception_bitmap; + u32 vm_entry_controls; + u32 
vm_entry_intr_info_field; + u32 vm_entry_exception_error_code; + u32 vm_entry_instruction_len; + u32 tpr_threshold; + + u64 guest_rip; + + u32 hv_clean_fields; + u32 padding32_1; + u32 hv_synthetic_controls; + struct { + u32 nested_flush_hypercall:1; + u32 msr_bitmap:1; + u32 reserved:30; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u32 padding32_2; + u64 hv_vm_id; + u64 partition_assist_page; + u64 padding64_4[4]; + u64 guest_bndcfgs; + u64 guest_ia32_perf_global_ctrl; + u64 guest_ia32_s_cet; + u64 guest_ssp; + u64 guest_ia32_int_ssp_table_addr; + u64 guest_ia32_lbr_ctl; + u64 padding64_5[2]; + u64 xss_exit_bitmap; + u64 encls_exiting_bitmap; + u64 host_ia32_perf_global_ctrl; + u64 tsc_multiplier; + u64 host_ia32_s_cet; + u64 host_ssp; + u64 host_ia32_int_ssp_table_addr; + u64 padding64_6; +} __packed; +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0 + + +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP BIT(0) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP BIT(1) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2 BIT(2) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1 BIT(3) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC BIT(4) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT BIT(5) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY BIT(6) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN BIT(7) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR BIT(8) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT BIT(9) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC BIT(10) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1 BIT(11) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2 BIT(12) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER BIT(13) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1 BIT(14) +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL BIT(15) + +#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF + +/* + * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by + * pairing it with architecturally impossible exit reasons. Bit 28 is set only + * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit + * is pending. I.e. it will never be set by hardware for non-SMI exits (there + * are only three), nor will it ever be set unless the VMM is an STM. + */ +#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031 + +/* + * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose + * SVM enlightenments to guests. This is documented in the TLFS doc. + * Note on naming: SVM_NESTED_ENLIGHTENED_VMCB_FIELDS + */ +struct hv_vmcb_enlightenments { + struct __packed hv_enlightenments_control { + u32 nested_flush_hypercall : 1; + u32 msr_bitmap : 1; + u32 enlightened_npt_tlb: 1; + u32 reserved : 29; + } __packed hv_enlightenments_control; + u32 hv_vp_id; + u64 hv_vm_id; + u64 partition_assist_page; + u64 reserved; +} __packed; + +/* + * Hyper-V uses the software reserved clean bit in VMCB. + */ +#define HV_VMCB_NESTED_ENLIGHTENMENTS 31 + +/* Synthetic VM-Exit */ +#define HV_SVM_EXITCODE_ENL 0xf0000000 +#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1) + +/* VM_PARTITION_ASSIST_PAGE */ +struct hv_partition_assist_pg { + u32 tlb_lock_count; +}; + +/* Define connection identifier type. 
*/ +union hv_connection_id { + u32 asu32; + struct { + u32 id : 24; + u32 reserved : 8; + } __packed u; +}; + +struct hv_input_unmap_gpa_pages { + u64 target_partition_id; + u64 target_gpa_base; + u32 unmap_flags; + u32 padding; +} __packed; + +#endif /* #ifndef _HV_HVGDK_H */ diff --git a/include/hyperv/hvgdk_ext.h b/include/hyperv/hvgdk_ext.h new file mode 100644 index 000000000000..641b591ee61f --- /dev/null +++ b/include/hyperv/hvgdk_ext.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Type definitions for the Microsoft Hypervisor. + */ +#ifndef _HV_HVGDK_EXT_H +#define _HV_HVGDK_EXT_H + +#include "hvgdk_mini.h" + +/* Extended hypercalls */ +#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 +#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 + +/* Extended hypercalls */ +enum { /* HV_EXT_CALL */ + HV_EXTCALL_QUERY_CAPABILITIES = 0x8001, + HV_EXTCALL_MEMORY_HEAT_HINT = 0x8003, +}; + +/* HV_EXT_OUTPUT_QUERY_CAPABILITIES */ +#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8) + +enum { /* HV_EXT_MEMORY_HEAT_HINT_TYPE */ + HV_EXTMEM_HEAT_HINT_COLD = 0, + HV_EXTMEM_HEAT_HINT_HOT = 1, + HV_EXTMEM_HEAT_HINT_COLD_DISCARD = 2, + HV_EXTMEM_HEAT_HINT_MAX +}; + +/* + * The whole argument should fit in a page to be able to pass to the hypervisor + * in one hypercall. + */ +#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \ + ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \ + sizeof(union hv_gpa_page_range)) + +/* HvExtCallMemoryHeatHint hypercall */ +#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2 +struct hv_memory_hint { /* HV_EXT_INPUT_MEMORY_HEAT_HINT */ + u64 heat_type : 2; /* HV_EXTMEM_HEAT_HINT_* */ + u64 reserved : 62; + union hv_gpa_page_range ranges[]; +} __packed; + +#endif /* _HV_HVGDK_EXT_H */ diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h new file mode 100644 index 000000000000..155615175965 --- /dev/null +++ b/include/hyperv/hvgdk_mini.h @@ -0,0 +1,1348 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Type definitions for the Microsoft hypervisor. 
+ */ +#ifndef _HV_HVGDK_MINI_H +#define _HV_HVGDK_MINI_H + +#include +#include + +struct hv_u128 { + u64 low_part; + u64 high_part; +} __packed; + +/* NOTE: when adding below, update hv_status_to_string() */ +#define HV_STATUS_SUCCESS 0x0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3 +#define HV_STATUS_INVALID_ALIGNMENT 0x4 +#define HV_STATUS_INVALID_PARAMETER 0x5 +#define HV_STATUS_ACCESS_DENIED 0x6 +#define HV_STATUS_INVALID_PARTITION_STATE 0x7 +#define HV_STATUS_OPERATION_DENIED 0x8 +#define HV_STATUS_UNKNOWN_PROPERTY 0x9 +#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA +#define HV_STATUS_INSUFFICIENT_MEMORY 0xB +#define HV_STATUS_INVALID_PARTITION_ID 0xD +#define HV_STATUS_INVALID_VP_INDEX 0xE +#define HV_STATUS_NOT_FOUND 0x10 +#define HV_STATUS_INVALID_PORT_ID 0x11 +#define HV_STATUS_INVALID_CONNECTION_ID 0x12 +#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13 +#define HV_STATUS_NOT_ACKNOWLEDGED 0x14 +#define HV_STATUS_INVALID_VP_STATE 0x15 +#define HV_STATUS_NO_RESOURCES 0x1D +#define HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20 +#define HV_STATUS_INVALID_LP_INDEX 0x41 +#define HV_STATUS_INVALID_REGISTER_VALUE 0x50 +#define HV_STATUS_OPERATION_FAILED 0x71 +#define HV_STATUS_TIME_OUT 0x78 +#define HV_STATUS_CALL_PENDING 0x79 +#define HV_STATUS_VTL_ALREADY_ENABLED 0x86 + +/* + * The Hyper-V TimeRefCount register and the TSC + * page provide a guest VM clock with 100ns tick rate + */ +#define HV_CLOCK_HZ (NSEC_PER_SEC / 100) + +#define HV_HYP_PAGE_SHIFT 12 +#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) +#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) + +#define HV_PARTITION_ID_INVALID ((u64)0) +#define HV_PARTITION_ID_SELF ((u64)-1) + +/* Hyper-V specific model specific registers (MSRs) */ + +#if defined(CONFIG_X86) +/* HV_X64_SYNTHETIC_MSR */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_HYPERCALL 0x40000001 +#define HV_X64_MSR_VP_INDEX 0x40000002 +#define HV_X64_MSR_RESET 0x40000003 +#define HV_X64_MSR_VP_RUNTIME 0x40000010 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. 
*/ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SIRBP 0x40000085 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + +/* Define synthetic interrupt controller model specific registers for nested hypervisor */ +#define HV_X64_MSR_NESTED_SCONTROL 0x40001080 +#define HV_X64_MSR_NESTED_SVERSION 0x40001081 +#define HV_X64_MSR_NESTED_SIEFP 0x40001082 +#define HV_X64_MSR_NESTED_SIMP 0x40001083 +#define HV_X64_MSR_NESTED_EOM 0x40001084 +#define HV_X64_MSR_NESTED_SINT0 0x40001090 + +/* + * Synthetic Timer MSRs. Four timers per vcpu. + */ +#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0 +#define HV_X64_MSR_STIMER0_COUNT 0x400000B1 +#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2 +#define HV_X64_MSR_STIMER1_COUNT 0x400000B3 +#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4 +#define HV_X64_MSR_STIMER2_COUNT 0x400000B5 +#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6 +#define HV_X64_MSR_STIMER3_COUNT 0x400000B7 + +/* Hyper-V guest idle MSR */ +#define HV_X64_MSR_GUEST_IDLE 0x400000F0 + +/* Hyper-V guest crash notification MSR's */ +#define HV_X64_MSR_CRASH_P0 0x40000100 +#define HV_X64_MSR_CRASH_P1 0x40000101 +#define HV_X64_MSR_CRASH_P2 0x40000102 +#define HV_X64_MSR_CRASH_P3 0x40000103 +#define HV_X64_MSR_CRASH_P4 0x40000104 +#define HV_X64_MSR_CRASH_CTL 0x40000105 + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_X64_MSR_CRASH_PARAMS \ + (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) + +#define HV_IPI_LOW_VECTOR 0x10 +#define HV_IPI_HIGH_VECTOR 0xff + +#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +/* Hyper-V Enlightened VMCS version mask in nested features CPUID */ +#define HV_X64_ENLIGHTENED_VMCS_VERSION 0xff + +#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 +#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 + +/* Number of XMM registers used in hypercall input/output */ +#define HV_HYPERCALL_MAX_XMM_REGISTERS 6 + +struct hv_reenlightenment_control { + u64 vector : 8; + u64 reserved1 : 8; + u64 enabled : 1; + u64 reserved2 : 15; + u64 target_vp : 32; +} __packed; + +struct hv_tsc_emulation_status { /* HV_TSC_EMULATION_STATUS */ + u64 inprogress : 1; + u64 reserved : 63; +} __packed; + +struct hv_tsc_emulation_control { /* HV_TSC_INVARIANT_CONTROL */ + u64 enabled : 1; + u64 reserved : 63; +} __packed; + +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 +#define 
HV_EXPOSE_INVARIANT_TSC BIT_ULL(0) + +#endif /* CONFIG_X86 */ + +struct hv_get_partition_id { /* HV_OUTPUT_GET_PARTITION_ID */ + u64 partition_id; +} __packed; + +/* HV_CRASH_CTL_REG_CONTENTS */ +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) + +union hv_reference_tsc_msr { + u64 as_uint64; + struct { + u64 enable : 1; + u64 reserved : 11; + u64 pfn : 52; + } __packed; +}; + +/* The maximum number of sparse vCPU banks which can be encoded by 'struct hv_vpset' */ +#define HV_MAX_SPARSE_VCPU_BANKS (64) +/* The number of vCPUs in one sparse bank */ +#define HV_VCPUS_PER_SPARSE_BANK (64) + +/* Some of Hyper-V structs do not use hv_vpset where linux uses them */ +struct hv_vpset { /* HV_VP_SET */ + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; +} __packed; + +/* + * Version info reported by hypervisor + * Changed to a union for convenience + */ +union hv_hypervisor_version_info { + struct { + u32 build_number; + + u32 minor_version : 16; + u32 major_version : 16; + + u32 service_pack; + + u32 service_number : 24; + u32 service_branch : 8; + }; + struct { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + }; +}; + +/* HV_CPUID_FUNCTION */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 +#define HYPERV_CPUID_CPU_MANAGEMENT_FEATURES 0x40000007 +#define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_ISOLATION_CONFIG 0x4000000C + +#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081 +#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */ + +#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 +/* Support for the extended IOAPIC RTE format */ +#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) + +#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 +#define HYPERV_CPUID_MIN 0x40000005 +#define HYPERV_CPUID_MAX 0x4000ffff + +/* + * HV_X64_HYPERVISOR_FEATURES (EAX), or + * HV_PARTITION_PRIVILEGE_MASK [31-0] + */ +#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) +#define HV_MSR_SYNIC_AVAILABLE BIT(2) +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) +#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) +#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) +#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) +#define HV_MSR_RESET_AVAILABLE BIT(7) +#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) +#define HV_ACCESS_FREQUENCY_MSRS BIT(11) +#define HV_ACCESS_REENLIGHTENMENT BIT(13) +#define HV_ACCESS_TSC_INVARIANT BIT(15) + +/* + * HV_X64_HYPERVISOR_FEATURES (EBX), or + * HV_PARTITION_PRIVILEGE_MASK [63-32] + */ +#define HV_CREATE_PARTITIONS BIT(0) +#define HV_ACCESS_PARTITION_ID BIT(1) +#define HV_ACCESS_MEMORY_POOL BIT(2) +#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_POST_MESSAGES BIT(4) +#define HV_SIGNAL_EVENTS BIT(5) +#define HV_CREATE_PORT BIT(6) +#define HV_CONNECT_PORT BIT(7) +#define HV_ACCESS_STATS BIT(8) +#define HV_DEBUGGING BIT(11) +#define HV_CPU_MANAGEMENT BIT(12) +#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) +#define HV_ISOLATION BIT(22) + +#if defined(CONFIG_X86) +/* HV_X64_HYPERVISOR_FEATURES (EDX) */ +#define HV_X64_MWAIT_AVAILABLE BIT(0) +#define HV_X64_GUEST_DEBUGGING_AVAILABLE BIT(1) +#define HV_X64_PERF_MONITOR_AVAILABLE BIT(2) +#define 
HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE BIT(3) +#define HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE BIT(4) +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE BIT(5) +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) +#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14) +/* + * Support for returning hypercall output block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_XMM_OUTPUT_AVAILABLE BIT(15) +/* stimer Direct Mode is available */ +#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + * These are HYPERV_CPUID_ENLIGHTMENT_INFO.EAX bits. + */ +/* HV_X64_ENLIGHTENMENT_INFORMATION */ +#define HV_X64_AS_SWITCH_RECOMMENDED BIT(0) +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED BIT(1) +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED BIT(2) +#define HV_X64_APIC_ACCESS_RECOMMENDED BIT(3) +#define HV_X64_SYSTEM_RESET_RECOMMENDED BIT(4) +#define HV_X64_RELAXED_TIMING_RECOMMENDED BIT(5) +#define HV_DEPRECATING_AEOI_RECOMMENDED BIT(9) +#define HV_X64_CLUSTER_IPI_RECOMMENDED BIT(10) +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11) +#define HV_X64_HYPERV_NESTED BIT(12) +#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14) +#define HV_X64_USE_MMIO_HYPERCALLS BIT(21) + +/* + * CPU management features identification. + * These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits. + */ +#define HV_X64_START_LOGICAL_PROCESSOR BIT(0) +#define HV_X64_CREATE_ROOT_VIRTUAL_PROCESSOR BIT(1) +#define HV_X64_PERFORMANCE_COUNTER_SYNC BIT(2) +#define HV_X64_RESERVED_IDENTITY_BIT BIT(31) + +/* + * Virtual processor will never share a physical core with another virtual + * processor, except for virtual processors that are reported as sibling SMT + * threads. + */ +#define HV_X64_NO_NONARCH_CORESHARING BIT(18) + +/* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ +#define HV_X64_NESTED_DIRECT_FLUSH BIT(17) +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) +#define HV_X64_NESTED_MSR_BITMAP BIT(19) + +/* Nested features #2. These are HYPERV_CPUID_NESTED_FEATURES.EBX bits. */ +#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL BIT(0) + +/* + * This is specific to AMD and specifies that enlightened TLB flush is + * supported. If guest opts in to this feature, ASID invalidations only + * flushes gva -> hpa mapping entries. To flush the TLB entries derived + * from NPT, hypercalls should be used (HvFlushGuestPhysicalAddressSpace + * or HvFlushGuestPhysicalAddressList). + */ +#define HV_X64_NESTED_ENLIGHTENED_TLB BIT(22) + +/* HYPERV_CPUID_ISOLATION_CONFIG.EAX bits. */ +#define HV_PARAVISOR_PRESENT BIT(0) + +/* HYPERV_CPUID_ISOLATION_CONFIG.EBX bits. 
*/ +#define HV_ISOLATION_TYPE GENMASK(3, 0) +#define HV_SHARED_GPA_BOUNDARY_ACTIVE BIT(5) +#define HV_SHARED_GPA_BOUNDARY_BITS GENMASK(11, 6) + +enum hv_isolation_type { + HV_ISOLATION_TYPE_NONE = 0, /* HV_PARTITION_ISOLATION_TYPE_NONE */ + HV_ISOLATION_TYPE_VBS = 1, + HV_ISOLATION_TYPE_SNP = 2, + HV_ISOLATION_TYPE_TDX = 3 +}; + +union hv_x64_msr_hypercall_contents { + u64 as_uint64; + struct { + u64 enable : 1; + u64 reserved : 11; + u64 guest_physical_address : 52; + } __packed; +}; +#endif /* CONFIG_X86 */ + +#if defined(CONFIG_ARM64) +#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(8) +#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(13) +#endif /* CONFIG_ARM64 */ + +#if defined(CONFIG_X86) +#define HV_MAXIMUM_PROCESSORS 2048 +#elif defined(CONFIG_ARM64) /* CONFIG_X86 */ +#define HV_MAXIMUM_PROCESSORS 320 +#endif /* CONFIG_ARM64 */ + +#define HV_MAX_VP_INDEX (HV_MAXIMUM_PROCESSORS - 1) +#define HV_VP_INDEX_SELF ((u32)-2) +#define HV_ANY_VP ((u32)-1) + +union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ + u64 as_uint64; + struct { + u64 enable : 1; + u64 reserved : 11; + u64 pfn : 52; + } __packed; +}; + +/* Declare the various hypercall operations. */ +/* HV_CALL_CODE */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_ENABLE_VP_VTL 0x000f +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_CREATE_PARTITION 0x0040 +#define HVCALL_INITIALIZE_PARTITION 0x0041 +#define HVCALL_FINALIZE_PARTITION 0x0042 +#define HVCALL_DELETE_PARTITION 0x0043 +#define HVCALL_GET_PARTITION_PROPERTY 0x0044 +#define HVCALL_SET_PARTITION_PROPERTY 0x0045 +#define HVCALL_GET_PARTITION_ID 0x0046 +#define HVCALL_DEPOSIT_MEMORY 0x0048 +#define HVCALL_WITHDRAW_MEMORY 0x0049 +#define HVCALL_MAP_GPA_PAGES 0x004b +#define HVCALL_UNMAP_GPA_PAGES 0x004c +#define HVCALL_CREATE_VP 0x004e +#define HVCALL_DELETE_VP 0x004f +#define HVCALL_GET_VP_REGISTERS 0x0050 +#define HVCALL_SET_VP_REGISTERS 0x0051 +#define HVCALL_DELETE_PORT 0x0058 +#define HVCALL_DISCONNECT_PORT 0x005b +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_POST_DEBUG_DATA 0x0069 +#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a +#define HVCALL_RESET_DEBUG_SESSION 0x006b +#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 +#define HVCALL_GET_SYSTEM_PROPERTY 0x007b +#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c +#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b +#define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 +#define HVCALL_CREATE_PORT 0x0095 +#define HVCALL_CONNECT_PORT 0x0096 +#define HVCALL_START_VP 0x0099 +#define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 +#define HVCALL_DISPATCH_VP 0x00c2 +#define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db +#define HVCALL_MAP_VP_STATE_PAGE 0x00e1 +#define HVCALL_UNMAP_VP_STATE_PAGE 0x00e2 +#define HVCALL_GET_VP_STATE 0x00e3 +#define HVCALL_SET_VP_STATE 0x00e4 +#define HVCALL_MMIO_READ 0x0106 +#define HVCALL_MMIO_WRITE 0x0107 + +/* HV_HYPERCALL_INPUT */ +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_VARHEAD_MASK GENMASK_ULL(26, 17) +#define 
HV_HYPERCALL_RSVD0_MASK GENMASK_ULL(31, 27) +#define HV_HYPERCALL_NESTED BIT_ULL(31) +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_RSVD1_MASK GENMASK_ULL(47, 44) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) +#define HV_HYPERCALL_RSVD2_MASK GENMASK_ULL(63, 60) +#define HV_HYPERCALL_RSVD_MASK (HV_HYPERCALL_RSVD0_MASK | \ + HV_HYPERCALL_RSVD1_MASK | \ + HV_HYPERCALL_RSVD2_MASK) + +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +} __packed; + +/* + * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited + * by the bitwidth of "additional_pages" in union hv_gpa_page_range. + */ +#define HV_MAX_FLUSH_PAGES (2048) +#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB 0 +#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB 1 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */ +union hv_gpa_page_range { + u64 address_space; + struct { + u64 additional_pages : 11; + u64 largepage : 1; + u64 basepfn : 52; + } page; + struct { + u64 reserved : 12; + u64 page_size : 1; + u64 reserved1 : 8; + u64 base_large_pfn : 43; + }; +}; + +/* + * All input flush parameters should be in single page. The max flush + * count is equal with how many entries of union hv_gpa_page_range can + * be populated into the input parameter page. + */ +#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ + sizeof(union hv_gpa_page_range)) + +struct hv_guest_mapping_flush_list { + u64 address_space; + u64 flags; + union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; +}; + +struct hv_tlb_flush { /* HV_INPUT_FLUSH_VIRTUAL_ADDRESS_LIST */ + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +} __packed; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_tlb_flush_ex { + u64 address_space; + u64 flags; + struct hv_vpset hv_vp_set; + u64 gva_list[]; +} __packed; + +struct ms_hyperv_tsc_page { /* HV_REFERENCE_TSC_PAGE */ + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) + +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) +/* Valid SynIC vectors are 16-255. 
*/ +#define HV_SYNIC_FIRST_VALID_VECTOR (16) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + +# + +/* Hyper-V defined statically assigned SINTs */ +#define HV_SYNIC_INTERCEPTION_SINT_INDEX 0x00000000 +#define HV_SYNIC_IOMMU_FAULT_SINT_INDEX 0x00000001 +#define HV_SYNIC_VMBUS_SINT_INDEX 0x00000002 +#define HV_SYNIC_FIRST_UNUSED_SINT_INDEX 0x00000005 + +/* mshv assigned SINT for doorbell */ +#define HV_SYNIC_DOORBELL_SINT_INDEX HV_SYNIC_FIRST_UNUSED_SINT_INDEX + +enum hv_interrupt_type { + HV_X64_INTERRUPT_TYPE_FIXED = 0x0000, + HV_X64_INTERRUPT_TYPE_LOWESTPRIORITY = 0x0001, + HV_X64_INTERRUPT_TYPE_SMI = 0x0002, + HV_X64_INTERRUPT_TYPE_REMOTEREAD = 0x0003, + HV_X64_INTERRUPT_TYPE_NMI = 0x0004, + HV_X64_INTERRUPT_TYPE_INIT = 0x0005, + HV_X64_INTERRUPT_TYPE_SIPI = 0x0006, + HV_X64_INTERRUPT_TYPE_EXTINT = 0x0007, + HV_X64_INTERRUPT_TYPE_LOCALINT0 = 0x0008, + HV_X64_INTERRUPT_TYPE_LOCALINT1 = 0x0009, + HV_X64_INTERRUPT_TYPE_MAXIMUM = 0x000A, +}; + +/* Define synthetic interrupt source. */ +union hv_synic_sint { + u64 as_uint64; + struct { + u64 vector : 8; + u64 reserved1 : 8; + u64 masked : 1; + u64 auto_eoi : 1; + u64 polling : 1; + u64 as_intercept : 1; + u64 proxy : 1; + u64 reserved2 : 43; + } __packed; +}; + +union hv_x64_xsave_xfem_register { + u64 as_uint64; + struct { + u32 low_uint32; + u32 high_uint32; + } __packed; + struct { + u64 legacy_x87 : 1; + u64 legacy_sse : 1; + u64 avx : 1; + u64 mpx_bndreg : 1; + u64 mpx_bndcsr : 1; + u64 avx_512_op_mask : 1; + u64 avx_512_zmmhi : 1; + u64 avx_512_zmm16_31 : 1; + u64 rsvd8_9 : 2; + u64 pasid : 1; + u64 cet_u : 1; + u64 cet_s : 1; + u64 rsvd13_16 : 4; + u64 xtile_cfg : 1; + u64 xtile_data : 1; + u64 rsvd19_63 : 45; + } __packed; +}; + +/* Synthetic timer configuration */ +union hv_stimer_config { /* HV_X64_MSR_STIMER_CONFIG_CONTENTS */ + u64 as_uint64; + struct { + u64 enable : 1; + u64 periodic : 1; + u64 lazy : 1; + u64 auto_enable : 1; + u64 apic_vector : 8; + u64 direct_mode : 1; + u64 reserved_z0 : 3; + u64 sintx : 4; + u64 reserved_z1 : 44; + } __packed; +}; + +/* Define the number of synthetic timers */ +#define HV_SYNIC_STIMER_COUNT (4) + +/* Define port identifier type. */ +union hv_port_id { + u32 asu32; + struct { + u32 id : 24; + u32 reserved : 8; + } __packed u; +}; + +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define hypervisor message types. */ +enum hv_message_type { + HVMSG_NONE = 0x00000000, + + /* Memory access messages. */ + HVMSG_UNMAPPED_GPA = 0x80000000, + HVMSG_GPA_INTERCEPT = 0x80000001, + + /* Timer notification messages. */ + HVMSG_TIMER_EXPIRED = 0x80000010, + + /* Error messages. */ + HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, + HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + + /* + * Opaque intercept message. The original intercept message is only + * accessible from the mapped intercept message page. + */ + HVMSG_OPAQUE_INTERCEPT = 0x8000003F, + + /* Trace buffer complete messages. 
*/ + HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, + + /* Hypercall intercept */ + HVMSG_HYPERCALL_INTERCEPT = 0x80000050, + + /* SynIC intercepts */ + HVMSG_SYNIC_EVENT_INTERCEPT = 0x80000060, + HVMSG_SYNIC_SINT_INTERCEPT = 0x80000061, + HVMSG_SYNIC_SINT_DELIVERABLE = 0x80000062, + + /* Async call completion intercept */ + HVMSG_ASYNC_CALL_COMPLETION = 0x80000070, + + /* Root scheduler messages */ + HVMSG_SCHEDULER_VP_SIGNAL_BITSET = 0x80000100, + HVMSG_SCHEDULER_VP_SIGNAL_PAIR = 0x80000101, + + /* Platform-specific processor intercept messages. */ + HVMSG_X64_IO_PORT_INTERCEPT = 0x80010000, + HVMSG_X64_MSR_INTERCEPT = 0x80010001, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005, + HVMSG_X64_IOMMU_PRQ = 0x80010006, + HVMSG_X64_HALT = 0x80010007, + HVMSG_X64_INTERRUPTION_DELIVERABLE = 0x80010008, + HVMSG_X64_SIPI_INTERCEPT = 0x80010009, +}; + +/* Define the format of the SIMP register */ +union hv_synic_simp { + u64 as_uint64; + struct { + u64 simp_enabled : 1; + u64 preserved : 11; + u64 base_simp_gpa : 52; + } __packed; +}; + +union hv_message_flags { + u8 asu8; + struct { + u8 msg_pending : 1; + u8 reserved : 7; + } __packed; +}; + +struct hv_message_header { + u32 message_type; + u8 payload_size; + union hv_message_flags message_flags; + u8 reserved[2]; + union { + u64 sender; + union hv_port_id port; + }; +} __packed; + +/* + * Message format for notifications delivered via + * intercept message(as_intercept=1) + */ +struct hv_notification_message_payload { + u32 sint_index; +} __packed; + +struct hv_message { + struct hv_message_header header; + union { + u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +} __packed; + +/* Define the synthetic interrupt message page layout. */ +struct hv_message_page { + struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; +} __packed; + +/* Define timer message payload structure. 
*/ +struct hv_timer_message_payload { + __u32 timer_index; + __u32 reserved; + __u64 expiration_time; /* When the timer expired */ + __u64 delivery_time; /* When the message was delivered */ +} __packed; + +struct hv_x64_segment_register { + u64 base; + u32 limit; + u16 selector; + union { + struct { + u16 segment_type : 4; + u16 non_system_segment : 1; + u16 descriptor_privilege_level : 2; + u16 present : 1; + u16 reserved : 4; + u16 available : 1; + u16 _long : 1; + u16 _default : 1; + u16 granularity : 1; + } __packed; + u16 attributes; + }; +} __packed; + +struct hv_x64_table_register { + u16 pad[3]; + u16 limit; + u64 base; +} __packed; + +union hv_input_vtl { + u8 as_uint8; + struct { + u8 target_vtl : 4; + u8 use_target_vtl : 1; + u8 reserved_z : 3; + }; +} __packed; + +struct hv_init_vp_context { + u64 rip; + u64 rsp; + u64 rflags; + + struct hv_x64_segment_register cs; + struct hv_x64_segment_register ds; + struct hv_x64_segment_register es; + struct hv_x64_segment_register fs; + struct hv_x64_segment_register gs; + struct hv_x64_segment_register ss; + struct hv_x64_segment_register tr; + struct hv_x64_segment_register ldtr; + + struct hv_x64_table_register idtr; + struct hv_x64_table_register gdtr; + + u64 efer; + u64 cr0; + u64 cr3; + u64 cr4; + u64 msr_cr_pat; +} __packed; + +struct hv_enable_vp_vtl { + u64 partition_id; + u32 vp_index; + union hv_input_vtl target_vtl; + u8 mbz0; + u16 mbz1; + struct hv_init_vp_context vp_context; +} __packed; + +struct hv_get_vp_from_apic_id_in { + u64 partition_id; + union hv_input_vtl target_vtl; + u8 res[7]; + u32 apic_ids[]; +} __packed; + +struct hv_nested_enlightenments_control { + struct { + u32 directhypercall : 1; + u32 reserved : 31; + } __packed features; + struct { + u32 inter_partition_comm : 1; + u32 reserved : 31; + } __packed hypercall_controls; +} __packed; + +/* Define virtual processor assist page structure. 
*/ +struct hv_vp_assist_page { + u32 apic_assist; + u32 reserved1; + u32 vtl_entry_reason; + u32 vtl_reserved; + u64 vtl_ret_x64rax; + u64 vtl_ret_x64rcx; + struct hv_nested_enlightenments_control nested_control; + u8 enlighten_vmentry; + u8 reserved2[7]; + u64 current_nested_vmcs; + u8 synthetic_time_unhalted_timer_expired; + u8 reserved3[7]; + u8 virtualization_fault_information[40]; + u8 reserved4[8]; + u8 intercept_message[256]; + u8 vtl_ret_actions[256]; +} __packed; + +enum hv_register_name { + /* Suspend Registers */ + HV_REGISTER_EXPLICIT_SUSPEND = 0x00000000, + HV_REGISTER_INTERCEPT_SUSPEND = 0x00000001, + HV_REGISTER_DISPATCH_SUSPEND = 0x00000003, + + /* Version - 128-bit result same as CPUID 0x40000002 */ + HV_REGISTER_HYPERVISOR_VERSION = 0x00000100, + + /* Feature Access (registers are 128 bits) - same as CPUID 0x40000003 - 0x4000000B */ + HV_REGISTER_PRIVILEGES_AND_FEATURES_INFO = 0x00000200, + HV_REGISTER_FEATURES_INFO = 0x00000201, + HV_REGISTER_IMPLEMENTATION_LIMITS_INFO = 0x00000202, + HV_REGISTER_HARDWARE_FEATURES_INFO = 0x00000203, + HV_REGISTER_CPU_MANAGEMENT_FEATURES_INFO = 0x00000204, + HV_REGISTER_SVM_FEATURES_INFO = 0x00000205, + HV_REGISTER_SKIP_LEVEL_FEATURES_INFO = 0x00000206, + HV_REGISTER_NESTED_VIRT_FEATURES_INFO = 0x00000207, + HV_REGISTER_IPT_FEATURES_INFO = 0x00000208, + + /* Guest Crash Registers */ + HV_REGISTER_GUEST_CRASH_P0 = 0x00000210, + HV_REGISTER_GUEST_CRASH_P1 = 0x00000211, + HV_REGISTER_GUEST_CRASH_P2 = 0x00000212, + HV_REGISTER_GUEST_CRASH_P3 = 0x00000213, + HV_REGISTER_GUEST_CRASH_P4 = 0x00000214, + HV_REGISTER_GUEST_CRASH_CTL = 0x00000215, + + /* Misc */ + HV_REGISTER_VP_RUNTIME = 0x00090000, + HV_REGISTER_GUEST_OS_ID = 0x00090002, + HV_REGISTER_VP_INDEX = 0x00090003, + HV_REGISTER_TIME_REF_COUNT = 0x00090004, + HV_REGISTER_CPU_MANAGEMENT_VERSION = 0x00090007, + HV_REGISTER_VP_ASSIST_PAGE = 0x00090013, + HV_REGISTER_VP_ROOT_SIGNAL_COUNT = 0x00090014, + HV_REGISTER_REFERENCE_TSC = 0x00090017, + + /* Hypervisor-defined Registers (Synic) */ + HV_REGISTER_SINT0 = 0x000A0000, + HV_REGISTER_SINT1 = 0x000A0001, + HV_REGISTER_SINT2 = 0x000A0002, + HV_REGISTER_SINT3 = 0x000A0003, + HV_REGISTER_SINT4 = 0x000A0004, + HV_REGISTER_SINT5 = 0x000A0005, + HV_REGISTER_SINT6 = 0x000A0006, + HV_REGISTER_SINT7 = 0x000A0007, + HV_REGISTER_SINT8 = 0x000A0008, + HV_REGISTER_SINT9 = 0x000A0009, + HV_REGISTER_SINT10 = 0x000A000A, + HV_REGISTER_SINT11 = 0x000A000B, + HV_REGISTER_SINT12 = 0x000A000C, + HV_REGISTER_SINT13 = 0x000A000D, + HV_REGISTER_SINT14 = 0x000A000E, + HV_REGISTER_SINT15 = 0x000A000F, + HV_REGISTER_SCONTROL = 0x000A0010, + HV_REGISTER_SVERSION = 0x000A0011, + HV_REGISTER_SIEFP = 0x000A0012, + HV_REGISTER_SIMP = 0x000A0013, + HV_REGISTER_EOM = 0x000A0014, + HV_REGISTER_SIRBP = 0x000A0015, + + HV_REGISTER_NESTED_SINT0 = 0x000A1000, + HV_REGISTER_NESTED_SINT1 = 0x000A1001, + HV_REGISTER_NESTED_SINT2 = 0x000A1002, + HV_REGISTER_NESTED_SINT3 = 0x000A1003, + HV_REGISTER_NESTED_SINT4 = 0x000A1004, + HV_REGISTER_NESTED_SINT5 = 0x000A1005, + HV_REGISTER_NESTED_SINT6 = 0x000A1006, + HV_REGISTER_NESTED_SINT7 = 0x000A1007, + HV_REGISTER_NESTED_SINT8 = 0x000A1008, + HV_REGISTER_NESTED_SINT9 = 0x000A1009, + HV_REGISTER_NESTED_SINT10 = 0x000A100A, + HV_REGISTER_NESTED_SINT11 = 0x000A100B, + HV_REGISTER_NESTED_SINT12 = 0x000A100C, + HV_REGISTER_NESTED_SINT13 = 0x000A100D, + HV_REGISTER_NESTED_SINT14 = 0x000A100E, + HV_REGISTER_NESTED_SINT15 = 0x000A100F, + HV_REGISTER_NESTED_SCONTROL = 0x000A1010, + HV_REGISTER_NESTED_SVERSION = 0x000A1011, + 
HV_REGISTER_NESTED_SIFP = 0x000A1012, + HV_REGISTER_NESTED_SIPP = 0x000A1013, + HV_REGISTER_NESTED_EOM = 0x000A1014, + HV_REGISTER_NESTED_SIRBP = 0x000a1015, + + /* Hypervisor-defined Registers (Synthetic Timers) */ + HV_REGISTER_STIMER0_CONFIG = 0x000B0000, + HV_REGISTER_STIMER0_COUNT = 0x000B0001, + + /* VSM */ + HV_REGISTER_VSM_VP_STATUS = 0x000D0003, +}; + +/* + * Arch compatibility regs for use with hv_set/get_register + */ +#if defined(CONFIG_X86) + +/* + * To support arch-generic code calling hv_set/get_register: + * - On x86, HV_MSR_ indicates an MSR accessed via rdmsrl/wrmsrl + * - On ARM, HV_MSR_ indicates a VP register accessed via hypercall + */ +#define HV_MSR_CRASH_P0 (HV_X64_MSR_CRASH_P0) +#define HV_MSR_CRASH_P1 (HV_X64_MSR_CRASH_P1) +#define HV_MSR_CRASH_P2 (HV_X64_MSR_CRASH_P2) +#define HV_MSR_CRASH_P3 (HV_X64_MSR_CRASH_P3) +#define HV_MSR_CRASH_P4 (HV_X64_MSR_CRASH_P4) +#define HV_MSR_CRASH_CTL (HV_X64_MSR_CRASH_CTL) + +#define HV_MSR_VP_INDEX (HV_X64_MSR_VP_INDEX) +#define HV_MSR_TIME_REF_COUNT (HV_X64_MSR_TIME_REF_COUNT) +#define HV_MSR_REFERENCE_TSC (HV_X64_MSR_REFERENCE_TSC) + +#define HV_MSR_SINT0 (HV_X64_MSR_SINT0) +#define HV_MSR_SVERSION (HV_X64_MSR_SVERSION) +#define HV_MSR_SCONTROL (HV_X64_MSR_SCONTROL) +#define HV_MSR_SIEFP (HV_X64_MSR_SIEFP) +#define HV_MSR_SIMP (HV_X64_MSR_SIMP) +#define HV_MSR_EOM (HV_X64_MSR_EOM) +#define HV_MSR_SIRBP (HV_X64_MSR_SIRBP) + +#define HV_MSR_NESTED_SCONTROL (HV_X64_MSR_NESTED_SCONTROL) +#define HV_MSR_NESTED_SVERSION (HV_X64_MSR_NESTED_SVERSION) +#define HV_MSR_NESTED_SIEFP (HV_X64_MSR_NESTED_SIEFP) +#define HV_MSR_NESTED_SIMP (HV_X64_MSR_NESTED_SIMP) +#define HV_MSR_NESTED_EOM (HV_X64_MSR_NESTED_EOM) +#define HV_MSR_NESTED_SINT0 (HV_X64_MSR_NESTED_SINT0) + +#define HV_MSR_STIMER0_CONFIG (HV_X64_MSR_STIMER0_CONFIG) +#define HV_MSR_STIMER0_COUNT (HV_X64_MSR_STIMER0_COUNT) + +#elif defined(CONFIG_ARM64) /* CONFIG_X86 */ + +#define HV_MSR_CRASH_P0 (HV_REGISTER_GUEST_CRASH_P0) +#define HV_MSR_CRASH_P1 (HV_REGISTER_GUEST_CRASH_P1) +#define HV_MSR_CRASH_P2 (HV_REGISTER_GUEST_CRASH_P2) +#define HV_MSR_CRASH_P3 (HV_REGISTER_GUEST_CRASH_P3) +#define HV_MSR_CRASH_P4 (HV_REGISTER_GUEST_CRASH_P4) +#define HV_MSR_CRASH_CTL (HV_REGISTER_GUEST_CRASH_CTL) + +#define HV_MSR_VP_INDEX (HV_REGISTER_VP_INDEX) +#define HV_MSR_TIME_REF_COUNT (HV_REGISTER_TIME_REF_COUNT) +#define HV_MSR_REFERENCE_TSC (HV_REGISTER_REFERENCE_TSC) + +#define HV_MSR_SINT0 (HV_REGISTER_SINT0) +#define HV_MSR_SCONTROL (HV_REGISTER_SCONTROL) +#define HV_MSR_SIEFP (HV_REGISTER_SIEFP) +#define HV_MSR_SIMP (HV_REGISTER_SIMP) +#define HV_MSR_EOM (HV_REGISTER_EOM) +#define HV_MSR_SIRBP (HV_REGISTER_SIRBP) + +#define HV_MSR_STIMER0_CONFIG (HV_REGISTER_STIMER0_CONFIG) +#define HV_MSR_STIMER0_COUNT (HV_REGISTER_STIMER0_COUNT) + +#endif /* CONFIG_ARM64 */ + +union hv_explicit_suspend_register { + u64 as_uint64; + struct { + u64 suspended : 1; + u64 reserved : 63; + } __packed; +}; + +union hv_intercept_suspend_register { + u64 as_uint64; + struct { + u64 suspended : 1; + u64 reserved : 63; + } __packed; +}; + +union hv_dispatch_suspend_register { + u64 as_uint64; + struct { + u64 suspended : 1; + u64 reserved : 63; + } __packed; +}; + +union hv_arm64_pending_interruption_register { + u64 as_uint64; + struct { + u64 interruption_pending : 1; + u64 interruption_type: 1; + u64 reserved : 30; + u64 error_code : 32; + } __packed; +}; + +union hv_arm64_interrupt_state_register { + u64 as_uint64; + struct { + u64 interrupt_shadow : 1; + u64 reserved : 63; + } __packed; +}; + +union 
hv_arm64_pending_synthetic_exception_event { + u64 as_uint64[2]; + struct { + u8 event_pending : 1; + u8 event_type : 3; + u8 reserved : 4; + u8 rsvd[3]; + u32 exception_type; + u64 context; + } __packed; +}; + +union hv_x64_interrupt_state_register { + u64 as_uint64; + struct { + u64 interrupt_shadow : 1; + u64 nmi_masked : 1; + u64 reserved : 62; + } __packed; +}; + +union hv_x64_pending_interruption_register { + u64 as_uint64; + struct { + u32 interruption_pending : 1; + u32 interruption_type : 3; + u32 deliver_error_code : 1; + u32 instruction_length : 4; + u32 nested_event : 1; + u32 reserved : 6; + u32 interruption_vector : 16; + u32 error_code; + } __packed; +}; + +union hv_register_value { + struct hv_u128 reg128; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + struct hv_x64_segment_register segment; + struct hv_x64_table_register table; + union hv_explicit_suspend_register explicit_suspend; + union hv_intercept_suspend_register intercept_suspend; + union hv_dispatch_suspend_register dispatch_suspend; +#ifdef CONFIG_ARM64 + union hv_arm64_interrupt_state_register interrupt_state; + union hv_arm64_pending_interruption_register pending_interruption; +#endif +#ifdef CONFIG_X86 + union hv_x64_interrupt_state_register interrupt_state; + union hv_x64_pending_interruption_register pending_interruption; +#endif + union hv_arm64_pending_synthetic_exception_event pending_synthetic_exception_event; +}; + +/* NOTE: Linux helper struct - NOT from Hyper-V code. */ +struct hv_output_get_vp_registers { + DECLARE_FLEX_ARRAY(union hv_register_value, values); +}; + +#if defined(CONFIG_ARM64) +/* HvGetVpRegisters returns an array of these output elements */ +struct hv_get_vp_registers_output { + union { + struct { + u32 a; + u32 b; + u32 c; + u32 d; + } as32 __packed; + struct { + u64 low; + u64 high; + } as64 __packed; + }; +}; + +#endif /* CONFIG_ARM64 */ + +struct hv_register_assoc { + u32 name; /* enum hv_register_name */ + u32 reserved1; + u64 reserved2; + union hv_register_value value; +} __packed; + +struct hv_input_get_vp_registers { + u64 partition_id; + u32 vp_index; + union hv_input_vtl input_vtl; + u8 rsvd_z8; + u16 rsvd_z16; + u32 names[]; +} __packed; + +struct hv_input_set_vp_registers { + u64 partition_id; + u32 vp_index; + union hv_input_vtl input_vtl; + u8 rsvd_z8; + u16 rsvd_z16; + struct hv_register_assoc elements[]; +} __packed; + +#define HV_UNMAP_GPA_LARGE_PAGE 0x2 + +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { /* HV_INPUT_SEND_SYNTHETIC_CLUSTER_IPI */ + u32 vector; + u32 reserved; + u64 cpu_mask; +} __packed; + +#define HV_X64_VTL_MASK GENMASK(3, 0) + +/* Hyper-V memory host visibility */ +enum hv_mem_host_visibility { + VMBUS_PAGE_NOT_VISIBLE = 0, + VMBUS_PAGE_VISIBLE_READ_ONLY = 1, + VMBUS_PAGE_VISIBLE_READ_WRITE = 3 +}; + +/* HvCallModifySparseGpaPageHostVisibility hypercall */ +#define HV_MAX_MODIFY_GPA_REP_COUNT ((HV_HYP_PAGE_SIZE / sizeof(u64)) - 2) +struct hv_gpa_range_for_visibility { + u64 partition_id; + u32 host_visibility : 2; + u32 reserved0 : 30; + u32 reserved1; + u64 gpa_page_list[HV_MAX_MODIFY_GPA_REP_COUNT]; +} __packed; + +#if defined(CONFIG_X86) +union hv_msi_address_register { /* HV_MSI_ADDRESS */ + u32 as_uint32; + struct { + u32 reserved1 : 2; + u32 destination_mode : 1; + u32 redirection_hint : 1; + u32 reserved2 : 8; + u32 destination_id : 8; + u32 msi_base : 12; + }; +} __packed; + +union hv_msi_data_register { /* HV_MSI_ENTRY.Data */ + u32 as_uint32; + struct { + u32 vector : 8; + u32 delivery_mode : 3; + u32 reserved1 : 
3; + u32 level_assert : 1; + u32 trigger_mode : 1; + u32 reserved2 : 16; + }; +} __packed; + +union hv_msi_entry { /* HV_MSI_ENTRY */ + + u64 as_uint64; + struct { + union hv_msi_address_register address; + union hv_msi_data_register data; + } __packed; +}; + +#elif defined(CONFIG_ARM64) /* CONFIG_X86 */ + +union hv_msi_entry { + u64 as_uint64[2]; + struct { + u64 address; + u32 data; + u32 reserved; + } __packed; +}; +#endif /* CONFIG_ARM64 */ + +union hv_ioapic_rte { + u64 as_uint64; + + struct { + u32 vector : 8; + u32 delivery_mode : 3; + u32 destination_mode : 1; + u32 delivery_status : 1; + u32 interrupt_polarity : 1; + u32 remote_irr : 1; + u32 trigger_mode : 1; + u32 interrupt_mask : 1; + u32 reserved1 : 15; + + u32 reserved2 : 24; + u32 destination_id : 8; + }; + + struct { + u32 low_uint32; + u32 high_uint32; + }; +} __packed; + +enum hv_interrupt_source { /* HV_INTERRUPT_SOURCE */ + HV_INTERRUPT_SOURCE_MSI = 1, /* MSI and MSI-X */ + HV_INTERRUPT_SOURCE_IOAPIC, +}; + +struct hv_interrupt_entry { /* HV_INTERRUPT_ENTRY */ + u32 source; + u32 reserved1; + union { + union hv_msi_entry msi_entry; + union hv_ioapic_rte ioapic_rte; + }; +} __packed; + +#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 +#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 + +struct hv_device_interrupt_target { /* HV_DEVICE_INTERRUPT_TARGET */ + u32 vector; + u32 flags; /* HV_DEVICE_INTERRUPT_TARGET_* above */ + union { + u64 vp_mask; + struct hv_vpset vp_set; + }; +} __packed; + +struct hv_retarget_device_interrupt { /* HV_INPUT_RETARGET_DEVICE_INTERRUPT */ + u64 partition_id; /* use "self" */ + u64 device_id; + struct hv_interrupt_entry int_entry; + u64 reserved2; + struct hv_device_interrupt_target int_target; +} __packed __aligned(8); + +/* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */ +#define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64 + +struct hv_mmio_read_input { /* HV_INPUT_MEMORY_MAPPED_IO_READ */ + u64 gpa; + u32 size; + u32 reserved; +} __packed; + +struct hv_mmio_read_output { + u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; +} __packed; + +struct hv_mmio_write_input { + u64 gpa; + u32 size; + u32 reserved; + u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; +} __packed; + +#endif /* _HV_HVGDK_MINI_H */ diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h new file mode 100644 index 000000000000..64407c2a3809 --- /dev/null +++ b/include/hyperv/hvhdk.h @@ -0,0 +1,733 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Type definitions for the Microsoft hypervisor. 
+ */ +#ifndef _HV_HVHDK_H +#define _HV_HVHDK_H + +#include + +#include "hvhdk_mini.h" +#include "hvgdk.h" + +/* Bits for dirty mask of hv_vp_register_page */ +#define HV_X64_REGISTER_CLASS_GENERAL 0 +#define HV_X64_REGISTER_CLASS_IP 1 +#define HV_X64_REGISTER_CLASS_XMM 2 +#define HV_X64_REGISTER_CLASS_SEGMENT 3 +#define HV_X64_REGISTER_CLASS_FLAGS 4 + +#define HV_VP_REGISTER_PAGE_VERSION_1 1u + +struct hv_vp_register_page { + u16 version; + u8 isvalid; + u8 rsvdz; + u32 dirty; + union { + struct { + /* General purpose registers + * (HV_X64_REGISTER_CLASS_GENERAL) + */ + union { + struct { + u64 rax; + u64 rcx; + u64 rdx; + u64 rbx; + u64 rsp; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + } __packed; + + u64 gp_registers[16]; + }; + /* Instruction pointer (HV_X64_REGISTER_CLASS_IP) */ + u64 rip; + /* Flags (HV_X64_REGISTER_CLASS_FLAGS) */ + u64 rflags; + } __packed; + + u64 registers[18]; + }; + /* Volatile XMM registers (HV_X64_REGISTER_CLASS_XMM) */ + union { + struct { + struct hv_u128 xmm0; + struct hv_u128 xmm1; + struct hv_u128 xmm2; + struct hv_u128 xmm3; + struct hv_u128 xmm4; + struct hv_u128 xmm5; + } __packed; + + struct hv_u128 xmm_registers[6]; + }; + /* Segment registers (HV_X64_REGISTER_CLASS_SEGMENT) */ + union { + struct { + struct hv_x64_segment_register es; + struct hv_x64_segment_register cs; + struct hv_x64_segment_register ss; + struct hv_x64_segment_register ds; + struct hv_x64_segment_register fs; + struct hv_x64_segment_register gs; + } __packed; + + struct hv_x64_segment_register segment_registers[6]; + }; + /* Misc. control registers (cannot be set via this interface) */ + u64 cr0; + u64 cr3; + u64 cr4; + u64 cr8; + u64 efer; + u64 dr7; + union hv_x64_pending_interruption_register pending_interruption; + union hv_x64_interrupt_state_register interrupt_state; + u64 instruction_emulation_hints; +} __packed; + +#define HV_PARTITION_PROCESSOR_FEATURES_BANKS 2 + +union hv_partition_processor_features { + u64 as_uint64[HV_PARTITION_PROCESSOR_FEATURES_BANKS]; + struct { + u64 sse3_support : 1; + u64 lahf_sahf_support : 1; + u64 ssse3_support : 1; + u64 sse4_1_support : 1; + u64 sse4_2_support : 1; + u64 sse4a_support : 1; + u64 xop_support : 1; + u64 pop_cnt_support : 1; + u64 cmpxchg16b_support : 1; + u64 altmovcr8_support : 1; + u64 lzcnt_support : 1; + u64 mis_align_sse_support : 1; + u64 mmx_ext_support : 1; + u64 amd3dnow_support : 1; + u64 extended_amd3dnow_support : 1; + u64 page_1gb_support : 1; + u64 aes_support : 1; + u64 pclmulqdq_support : 1; + u64 pcid_support : 1; + u64 fma4_support : 1; + u64 f16c_support : 1; + u64 rd_rand_support : 1; + u64 rd_wr_fs_gs_support : 1; + u64 smep_support : 1; + u64 enhanced_fast_string_support : 1; + u64 bmi1_support : 1; + u64 bmi2_support : 1; + u64 hle_support_deprecated : 1; + u64 rtm_support_deprecated : 1; + u64 movbe_support : 1; + u64 npiep1_support : 1; + u64 dep_x87_fpu_save_support : 1; + u64 rd_seed_support : 1; + u64 adx_support : 1; + u64 intel_prefetch_support : 1; + u64 smap_support : 1; + u64 hle_support : 1; + u64 rtm_support : 1; + u64 rdtscp_support : 1; + u64 clflushopt_support : 1; + u64 clwb_support : 1; + u64 sha_support : 1; + u64 x87_pointers_saved_support : 1; + u64 invpcid_support : 1; + u64 ibrs_support : 1; + u64 stibp_support : 1; + u64 ibpb_support: 1; + u64 unrestricted_guest_support : 1; + u64 mdd_support : 1; + u64 fast_short_rep_mov_support : 1; + u64 l1dcache_flush_support : 1; + u64 rdcl_no_support : 1; + u64 
ibrs_all_support : 1; + u64 skip_l1df_support : 1; + u64 ssb_no_support : 1; + u64 rsb_a_no_support : 1; + u64 virt_spec_ctrl_support : 1; + u64 rd_pid_support : 1; + u64 umip_support : 1; + u64 mbs_no_support : 1; + u64 mb_clear_support : 1; + u64 taa_no_support : 1; + u64 tsx_ctrl_support : 1; + /* + * N.B. The final processor feature bit in bank 0 is reserved to + * simplify potential downlevel backports. + */ + u64 reserved_bank0 : 1; + + /* N.B. Begin bank 1 processor features. */ + u64 acount_mcount_support : 1; + u64 tsc_invariant_support : 1; + u64 cl_zero_support : 1; + u64 rdpru_support : 1; + u64 la57_support : 1; + u64 mbec_support : 1; + u64 nested_virt_support : 1; + u64 psfd_support : 1; + u64 cet_ss_support : 1; + u64 cet_ibt_support : 1; + u64 vmx_exception_inject_support : 1; + u64 enqcmd_support : 1; + u64 umwait_tpause_support : 1; + u64 movdiri_support : 1; + u64 movdir64b_support : 1; + u64 cldemote_support : 1; + u64 serialize_support : 1; + u64 tsc_deadline_tmr_support : 1; + u64 tsc_adjust_support : 1; + u64 fzlrep_movsb : 1; + u64 fsrep_stosb : 1; + u64 fsrep_cmpsb : 1; + u64 reserved_bank1 : 42; + } __packed; +}; + +union hv_partition_processor_xsave_features { + struct { + u64 xsave_support : 1; + u64 xsaveopt_support : 1; + u64 avx_support : 1; + u64 reserved1 : 61; + } __packed; + u64 as_uint64; +}; + +struct hv_partition_creation_properties { + union hv_partition_processor_features disabled_processor_features; + union hv_partition_processor_xsave_features + disabled_processor_xsave_features; +} __packed; + +#define HV_PARTITION_SYNTHETIC_PROCESSOR_FEATURES_BANKS 1 + +union hv_partition_synthetic_processor_features { + u64 as_uint64[HV_PARTITION_SYNTHETIC_PROCESSOR_FEATURES_BANKS]; + + struct { + u64 hypervisor_present : 1; + /* Support for HV#1: (CPUID leaves 0x40000000 - 0x40000006)*/ + u64 hv1 : 1; + u64 access_vp_run_time_reg : 1; /* HV_X64_MSR_VP_RUNTIME */ + u64 access_partition_reference_counter : 1; /* HV_X64_MSR_TIME_REF_COUNT */ + u64 access_synic_regs : 1; /* SINT-related registers */ + /* + * Access to HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT. + */ + u64 access_synthetic_timer_regs : 1; + u64 access_intr_ctrl_regs : 1; /* APIC MSRs and VP assist page*/ + /* HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL */ + u64 access_hypercall_regs : 1; + u64 access_vp_index : 1; + u64 access_partition_reference_tsc : 1; + u64 access_guest_idle_reg : 1; + u64 access_frequency_regs : 1; + u64 reserved_z12 : 1; + u64 reserved_z13 : 1; + u64 reserved_z14 : 1; + u64 enable_extended_gva_ranges_for_flush_virtual_address_list : 1; + u64 reserved_z16 : 1; + u64 reserved_z17 : 1; + /* Use fast hypercall output. Corresponds to privilege. */ + u64 fast_hypercall_output : 1; + u64 reserved_z19 : 1; + u64 start_virtual_processor : 1; /* Can start VPs */ + u64 reserved_z21 : 1; + /* Synthetic timers in direct mode. */ + u64 direct_synthetic_timers : 1; + u64 reserved_z23 : 1; + u64 extended_processor_masks : 1; + + /* Enable various hypercalls */ + u64 tb_flush_hypercalls : 1; + u64 synthetic_cluster_ipi : 1; + u64 notify_long_spin_wait : 1; + u64 query_numa_distance : 1; + u64 signal_events : 1; + u64 retarget_device_interrupt : 1; + u64 restore_time : 1; + + /* EnlightenedVmcs nested enlightenment is supported. 
*/ + u64 enlightened_vmcs : 1; + u64 reserved : 31; + } __packed; +}; + +#define HV_MAKE_COMPATIBILITY_VERSION(major_, minor_) \ + ((u32)((major_) << 8 | (minor_))) + +#define HV_COMPATIBILITY_21_H2 HV_MAKE_COMPATIBILITY_VERSION(0X6, 0X9) + +union hv_partition_isolation_properties { + u64 as_uint64; + struct { + u64 isolation_type: 5; + u64 isolation_host_type : 2; + u64 rsvd_z: 5; + u64 shared_gpa_boundary_page_number: 52; + } __packed; +}; + +/* + * Various isolation types supported by MSHV. + */ +#define HV_PARTITION_ISOLATION_TYPE_NONE 0 +#define HV_PARTITION_ISOLATION_TYPE_SNP 2 +#define HV_PARTITION_ISOLATION_TYPE_TDX 3 + +/* + * Various host isolation types supported by MSHV. + */ +#define HV_PARTITION_ISOLATION_HOST_TYPE_NONE 0x0 +#define HV_PARTITION_ISOLATION_HOST_TYPE_HARDWARE 0x1 +#define HV_PARTITION_ISOLATION_HOST_TYPE_RESERVED 0x2 + +/* Note: Exo partition is enabled by default */ +#define HV_PARTITION_CREATION_FLAG_EXO_PARTITION BIT(8) +#define HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED BIT(13) +#define HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED BIT(19) +#define HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE BIT(22) + +struct hv_input_create_partition { + u64 flags; + struct hv_proximity_domain_info proximity_domain_info; + u32 compatibility_version; + u32 padding; + struct hv_partition_creation_properties partition_creation_properties; + union hv_partition_isolation_properties isolation_properties; +} __packed; + +struct hv_output_create_partition { + u64 partition_id; +} __packed; + +struct hv_input_initialize_partition { + u64 partition_id; +} __packed; + +struct hv_input_finalize_partition { + u64 partition_id; +} __packed; + +struct hv_input_delete_partition { + u64 partition_id; +} __packed; + +struct hv_input_get_partition_property { + u64 partition_id; + u32 property_code; /* enum hv_partition_property_code */ + u32 padding; +} __packed; + +struct hv_output_get_partition_property { + u64 property_value; +} __packed; + +struct hv_input_set_partition_property { + u64 partition_id; + u32 property_code; /* enum hv_partition_property_code */ + u32 padding; + u64 property_value; +} __packed; + +enum hv_vp_state_page_type { + HV_VP_STATE_PAGE_REGISTERS = 0, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, + HV_VP_STATE_PAGE_COUNT +}; + +struct hv_input_map_vp_state_page { + u64 partition_id; + u32 vp_index; + u32 type; /* enum hv_vp_state_page_type */ +} __packed; + +struct hv_output_map_vp_state_page { + u64 map_location; /* GPA page number */ +} __packed; + +struct hv_input_unmap_vp_state_page { + u64 partition_id; + u32 vp_index; + u32 type; /* enum hv_vp_state_page_type */ +} __packed; + +struct hv_opaque_intercept_message { + u32 vp_index; +} __packed; + +enum hv_port_type { + HV_PORT_TYPE_MESSAGE = 1, + HV_PORT_TYPE_EVENT = 2, + HV_PORT_TYPE_MONITOR = 3, + HV_PORT_TYPE_DOORBELL = 4 /* Root Partition only */ +}; + +struct hv_port_info { + u32 port_type; /* enum hv_port_type */ + u32 padding; + union { + struct { + u32 target_sint; + u32 target_vp; + u64 rsvdz; + } message_port_info; + struct { + u32 target_sint; + u32 target_vp; + u16 base_flag_number; + u16 flag_count; + u32 rsvdz; + } event_port_info; + struct { + u64 monitor_address; + u64 rsvdz; + } monitor_port_info; + struct { + u32 target_sint; + u32 target_vp; + u64 rsvdz; + } doorbell_port_info; + }; +} __packed; + +struct hv_connection_info { + u32 port_type; + u32 padding; + union { + struct { + u64 rsvdz; + } message_connection_info; + struct { + u64 rsvdz; + } event_connection_info; + struct { + u64 
monitor_address; + } monitor_connection_info; + struct { + u64 gpa; + u64 trigger_value; + u64 flags; + } doorbell_connection_info; + }; +} __packed; + +/* Define synthetic interrupt controller flag constants. */ +#define HV_EVENT_FLAGS_COUNT (256 * 8) +#define HV_EVENT_FLAGS_BYTE_COUNT (256) +#define HV_EVENT_FLAGS32_COUNT (256 / sizeof(u32)) + +/* linux side we create long version of flags to use long bit ops on flags */ +#define HV_EVENT_FLAGS_UL_COUNT (256 / sizeof(ulong)) + +/* Define the synthetic interrupt controller event flags format. */ +union hv_synic_event_flags { + unsigned char flags8[HV_EVENT_FLAGS_BYTE_COUNT]; + u32 flags32[HV_EVENT_FLAGS32_COUNT]; + ulong flags[HV_EVENT_FLAGS_UL_COUNT]; /* linux only */ +}; + +struct hv_synic_event_flags_page { + volatile union hv_synic_event_flags event_flags[HV_SYNIC_SINT_COUNT]; +}; + +#define HV_SYNIC_EVENT_RING_MESSAGE_COUNT 63 + +struct hv_synic_event_ring { + u8 signal_masked; + u8 ring_full; + u16 reserved_z; + u32 data[HV_SYNIC_EVENT_RING_MESSAGE_COUNT]; +} __packed; + +struct hv_synic_event_ring_page { + struct hv_synic_event_ring sint_event_ring[HV_SYNIC_SINT_COUNT]; +}; + +/* Define SynIC control register. */ +union hv_synic_scontrol { + u64 as_uint64; + struct { + u64 enable : 1; + u64 reserved : 63; + } __packed; +}; + +/* Define the format of the SIEFP register */ +union hv_synic_siefp { + u64 as_uint64; + struct { + u64 siefp_enabled : 1; + u64 preserved : 11; + u64 base_siefp_gpa : 52; + } __packed; +}; + +union hv_synic_sirbp { + u64 as_uint64; + struct { + u64 sirbp_enabled : 1; + u64 preserved : 11; + u64 base_sirbp_gpa : 52; + } __packed; +}; + +union hv_interrupt_control { + u64 as_uint64; + struct { + u32 interrupt_type; /* enum hv_interrupt_type */ + u32 level_triggered : 1; + u32 logical_dest_mode : 1; + u32 rsvd : 30; + } __packed; +}; + +struct hv_stimer_state { + struct { + u32 undelivered_msg_pending : 1; + u32 reserved : 31; + } __packed flags; + u32 resvd; + u64 config; + u64 count; + u64 adjustment; + u64 undelivered_exp_time; +} __packed; + +struct hv_synthetic_timers_state { + struct hv_stimer_state timers[HV_SYNIC_STIMER_COUNT]; + u64 reserved[5]; +} __packed; + +union hv_input_delete_vp { + u64 as_uint64[2]; + struct { + u64 partition_id; + u32 vp_index; + u8 reserved[4]; + } __packed; +} __packed; + +struct hv_input_assert_virtual_interrupt { + u64 partition_id; + union hv_interrupt_control control; + u64 dest_addr; /* cpu's apic id */ + u32 vector; + u8 target_vtl; + u8 rsvd_z0; + u16 rsvd_z1; +} __packed; + +struct hv_input_create_port { + u64 port_partition_id; + union hv_port_id port_id; + u8 port_vtl; + u8 min_connection_vtl; + u16 padding; + u64 connection_partition_id; + struct hv_port_info port_info; + struct hv_proximity_domain_info proximity_domain_info; +} __packed; + +union hv_input_delete_port { + u64 as_uint64[2]; + struct { + u64 port_partition_id; + union hv_port_id port_id; + u32 reserved; + }; +} __packed; + +struct hv_input_connect_port { + u64 connection_partition_id; + union hv_connection_id connection_id; + u8 connection_vtl; + u8 rsvdz0; + u16 rsvdz1; + u64 port_partition_id; + union hv_port_id port_id; + u32 reserved2; + struct hv_connection_info connection_info; + struct hv_proximity_domain_info proximity_domain_info; +} __packed; + +union hv_input_disconnect_port { + u64 as_uint64[2]; + struct { + u64 connection_partition_id; + union hv_connection_id connection_id; + u32 is_doorbell: 1; + u32 reserved: 31; + } __packed; +} __packed; + +union hv_input_notify_port_ring_empty { + 
u64 as_uint64; + struct { + u32 sint_index; + u32 reserved; + }; +} __packed; + +struct hv_vp_state_data_xsave { + u64 flags; + union hv_x64_xsave_xfem_register states; +} __packed; + +/* + * For getting and setting VP state, there are two options based on the state type: + * + * 1.) Data that is accessed by PFNs in the input hypercall page. This is used + * for state which may not fit into the hypercall pages. + * 2.) Data that is accessed directly in the input\output hypercall pages. + * This is used for state that will always fit into the hypercall pages. + * + * In the future this could be dynamic based on the size if needed. + * + * Note these hypercalls have an 8-byte aligned variable header size as per the tlfs + */ + +#define HV_GET_SET_VP_STATE_TYPE_PFN BIT(31) + +enum hv_get_set_vp_state_type { + /* HvGetSetVpStateLocalInterruptControllerState - APIC/GIC state */ + HV_GET_SET_VP_STATE_LAPIC_STATE = 0 | HV_GET_SET_VP_STATE_TYPE_PFN, + HV_GET_SET_VP_STATE_XSAVE = 1 | HV_GET_SET_VP_STATE_TYPE_PFN, + HV_GET_SET_VP_STATE_SIM_PAGE = 2 | HV_GET_SET_VP_STATE_TYPE_PFN, + HV_GET_SET_VP_STATE_SIEF_PAGE = 3 | HV_GET_SET_VP_STATE_TYPE_PFN, + HV_GET_SET_VP_STATE_SYNTHETIC_TIMERS = 4, +}; + +struct hv_vp_state_data { + u32 type; + u32 rsvd; + struct hv_vp_state_data_xsave xsave; +} __packed; + +struct hv_input_get_vp_state { + u64 partition_id; + u32 vp_index; + u8 input_vtl; + u8 rsvd0; + u16 rsvd1; + struct hv_vp_state_data state_data; + u64 output_data_pfns[]; +} __packed; + +union hv_output_get_vp_state { + struct hv_synthetic_timers_state synthetic_timers_state; +} __packed; + +union hv_input_set_vp_state_data { + u64 pfns; + u8 bytes; +} __packed; + +struct hv_input_set_vp_state { + u64 partition_id; + u32 vp_index; + u8 input_vtl; + u8 rsvd0; + u16 rsvd1; + struct hv_vp_state_data state_data; + union hv_input_set_vp_state_data data[]; +} __packed; + +/* + * Dispatch state for the VP communicated by the hypervisor to the + * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP. + */ +enum hv_vp_dispatch_state { + HV_VP_DISPATCH_STATE_INVALID = 0, + HV_VP_DISPATCH_STATE_BLOCKED = 1, + HV_VP_DISPATCH_STATE_READY = 2, +}; + +/* + * Dispatch event that caused the current dispatch state on return from + * HVCALL_DISPATCH_VP. 
+ */ +enum hv_vp_dispatch_event { + HV_VP_DISPATCH_EVENT_INVALID = 0x00000000, + HV_VP_DISPATCH_EVENT_SUSPEND = 0x00000001, + HV_VP_DISPATCH_EVENT_INTERCEPT = 0x00000002, +}; + +#define HV_ROOT_SCHEDULER_MAX_VPS_PER_CHILD_PARTITION 1024 +/* The maximum array size of HV_GENERIC_SET (vp_set) buffer */ +#define HV_GENERIC_SET_QWORD_COUNT(max) (((((max) - 1) >> 6) + 1) + 2) + +struct hv_vp_signal_bitset_scheduler_message { + u64 partition_id; + u32 overflow_count; + u16 vp_count; + u16 reserved; + +#define BITSET_BUFFER_SIZE \ + HV_GENERIC_SET_QWORD_COUNT(HV_ROOT_SCHEDULER_MAX_VPS_PER_CHILD_PARTITION) + union { + struct hv_vpset bitset; + u64 bitset_buffer[BITSET_BUFFER_SIZE]; + } vp_bitset; +#undef BITSET_BUFFER_SIZE +} __packed; + +static_assert(sizeof(struct hv_vp_signal_bitset_scheduler_message) <= + (sizeof(struct hv_message) - sizeof(struct hv_message_header))); + +#define HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT \ + (((sizeof(struct hv_message) - sizeof(struct hv_message_header)) / \ + (sizeof(u64 /* partition id */) + sizeof(u32 /* vp index */))) - 1) + +struct hv_vp_signal_pair_scheduler_message { + u32 overflow_count; + u8 vp_count; + u8 reserved1[3]; + + u64 partition_ids[HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT]; + u32 vp_indexes[HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT]; + + u8 reserved2[4]; +} __packed; + +static_assert(sizeof(struct hv_vp_signal_pair_scheduler_message) == + (sizeof(struct hv_message) - sizeof(struct hv_message_header))); + +/* Input and output structures for HVCALL_DISPATCH_VP */ +#define HV_DISPATCH_VP_FLAG_CLEAR_INTERCEPT_SUSPEND 0x1 +#define HV_DISPATCH_VP_FLAG_ENABLE_CALLER_INTERRUPTS 0x2 +#define HV_DISPATCH_VP_FLAG_SET_CALLER_SPEC_CTRL 0x4 +#define HV_DISPATCH_VP_FLAG_SKIP_VP_SPEC_FLUSH 0x8 +#define HV_DISPATCH_VP_FLAG_SKIP_CALLER_SPEC_FLUSH 0x10 +#define HV_DISPATCH_VP_FLAG_SKIP_CALLER_USER_SPEC_FLUSH 0x20 + +struct hv_input_dispatch_vp { + u64 partition_id; + u32 vp_index; + u32 flags; + u64 time_slice; /* in 100ns */ + u64 spec_ctrl; +} __packed; + +struct hv_output_dispatch_vp { + u32 dispatch_state; /* enum hv_vp_dispatch_state */ + u32 dispatch_event; /* enum hv_vp_dispatch_event */ +} __packed; + +#endif /* _HV_HVHDK_H */ diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h new file mode 100644 index 000000000000..f8a39d3e9ce6 --- /dev/null +++ b/include/hyperv/hvhdk_mini.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Type definitions for the Microsoft Hypervisor. + */ +#ifndef _HV_HVHDK_MINI_H +#define _HV_HVHDK_MINI_H + +#include "hvgdk_mini.h" + +/* + * Doorbell connection_info flags. 
+ */ +#define HV_DOORBELL_FLAG_TRIGGER_SIZE_MASK 0x00000007 +#define HV_DOORBELL_FLAG_TRIGGER_SIZE_ANY 0x00000000 +#define HV_DOORBELL_FLAG_TRIGGER_SIZE_BYTE 0x00000001 +#define HV_DOORBELL_FLAG_TRIGGER_SIZE_WORD 0x00000002 +#define HV_DOORBELL_FLAG_TRIGGER_SIZE_DWORD 0x00000003 +#define HV_DOORBELL_FLAG_TRIGGER_SIZE_QWORD 0x00000004 +#define HV_DOORBELL_FLAG_TRIGGER_ANY_VALUE 0x80000000 + +/* Each generic set contains 64 elements */ +#define HV_GENERIC_SET_SHIFT (6) +#define HV_GENERIC_SET_MASK (63) + +enum hv_generic_set_format { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; +#define HV_GENERIC_SET_FORMAT hv_generic_set_format + +enum hv_scheduler_type { + HV_SCHEDULER_TYPE_LP = 1, /* Classic scheduler w/o SMT */ + HV_SCHEDULER_TYPE_LP_SMT = 2, /* Classic scheduler w/ SMT */ + HV_SCHEDULER_TYPE_CORE_SMT = 3, /* Core scheduler */ + HV_SCHEDULER_TYPE_ROOT = 4, /* Root / integrated scheduler */ + HV_SCHEDULER_TYPE_MAX +}; + +enum hv_partition_property_code { + /* Privilege properties */ + HV_PARTITION_PROPERTY_PRIVILEGE_FLAGS = 0x00010000, + HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES = 0x00010001, + + /* Resource properties */ + HV_PARTITION_PROPERTY_GPA_PAGE_ACCESS_TRACKING = 0x00050005, + HV_PARTITION_PROPERTY_UNIMPLEMENTED_MSR_ACTION = 0x00050017, + + /* Compatibility properties */ + HV_PARTITION_PROPERTY_PROCESSOR_XSAVE_FEATURES = 0x00060002, + HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, + HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, +}; + +enum hv_system_property { + /* Add more values when needed */ + HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, +}; + +struct hv_input_get_system_property { + u32 property_id; /* enum hv_system_property */ + union { + u32 as_uint32; + /* More fields to be filled in when needed */ + }; +} __packed; + +struct hv_output_get_system_property { + union { + u32 scheduler_type; /* enum hv_scheduler_type */ + }; +} __packed; + +struct hv_proximity_domain_flags { + u32 proximity_preferred : 1; + u32 reserved : 30; + u32 proximity_info_valid : 1; +} __packed; + +struct hv_proximity_domain_info { + u32 domain_id; + struct hv_proximity_domain_flags flags; +} __packed; + +/* HvDepositMemory hypercall */ +struct hv_deposit_memory { /* HV_INPUT_DEPOSIT_MEMORY */ + u64 partition_id; + u64 gpa_page_list[]; +} __packed; + +struct hv_input_withdraw_memory { + u64 partition_id; + struct hv_proximity_domain_info proximity_domain_info; +} __packed; + +struct hv_output_withdraw_memory { + DECLARE_FLEX_ARRAY(u64, gpa_page_list); +} __packed; + +/* HV Map GPA (Guest Physical Address) Flags */ +#define HV_MAP_GPA_PERMISSIONS_NONE 0x0 +#define HV_MAP_GPA_READABLE 0x1 +#define HV_MAP_GPA_WRITABLE 0x2 +#define HV_MAP_GPA_KERNEL_EXECUTABLE 0x4 +#define HV_MAP_GPA_USER_EXECUTABLE 0x8 +#define HV_MAP_GPA_EXECUTABLE 0xC +#define HV_MAP_GPA_PERMISSIONS_MASK 0xF +#define HV_MAP_GPA_ADJUSTABLE 0x8000 +#define HV_MAP_GPA_NO_ACCESS 0x10000 +#define HV_MAP_GPA_NOT_CACHED 0x200000 +#define HV_MAP_GPA_LARGE_PAGE 0x80000000 + +struct hv_input_map_gpa_pages { + u64 target_partition_id; + u64 target_gpa_base; + u32 map_flags; + u32 padding; + u64 source_gpa_page_list[]; +} __packed; + +union hv_gpa_page_access_state_flags { + struct { + u64 clear_accessed : 1; + u64 set_accessed : 1; + u64 clear_dirty : 1; + u64 set_dirty : 1; + u64 reserved : 60; + } __packed; + u64 as_uint64; +}; + +struct hv_input_get_gpa_pages_access_state { + u64 partition_id; + union hv_gpa_page_access_state_flags flags; + u64 hv_gpa_page_number; +} __packed; + +union 
hv_gpa_page_access_state { + struct { + u8 accessed : 1; + u8 dirty : 1; + u8 reserved: 6; + }; + u8 as_uint8; +} __packed; + +struct hv_lp_startup_status { + u64 hv_status; + u64 substatus1; + u64 substatus2; + u64 substatus3; + u64 substatus4; + u64 substatus5; + u64 substatus6; +} __packed; + +struct hv_input_add_logical_processor { + u32 lp_index; + u32 apic_id; + struct hv_proximity_domain_info proximity_domain_info; +} __packed; + +struct hv_output_add_logical_processor { + struct hv_lp_startup_status startup_status; +} __packed; + +enum { /* HV_SUBNODE_TYPE */ + HV_SUBNODE_ANY = 0, + HV_SUBNODE_SOCKET, + HV_SUBNODE_CLUSTER, + HV_SUBNODE_L3, + HV_SUBNODE_COUNT, + HV_SUBNODE_INVALID = -1 +}; + +struct hv_create_vp { /* HV_INPUT_CREATE_VP */ + u64 partition_id; + u32 vp_index; + u8 padding[3]; + u8 subnode_type; + u64 subnode_id; + struct hv_proximity_domain_info proximity_domain_info; + u64 flags; +} __packed; + +/* HV_INTERRUPT_TRIGGER_MODE */ +enum hv_interrupt_trigger_mode { + HV_INTERRUPT_TRIGGER_MODE_EDGE = 0, + HV_INTERRUPT_TRIGGER_MODE_LEVEL = 1, +}; + +/* HV_DEVICE_INTERRUPT_DESCRIPTOR */ +struct hv_device_interrupt_descriptor { + u32 interrupt_type; + u32 trigger_mode; + u32 vector_count; + u32 reserved; + struct hv_device_interrupt_target target; +} __packed; + +/* HV_INPUT_MAP_DEVICE_INTERRUPT */ +struct hv_input_map_device_interrupt { + u64 partition_id; + u64 device_id; + u32 flags; + u32 base_irt_idx; + struct hv_interrupt_entry logical_interrupt_entry; + struct hv_device_interrupt_descriptor interrupt_descriptor; +} __packed; + +/* HV_OUTPUT_MAP_DEVICE_INTERRUPT */ +struct hv_output_map_device_interrupt { + struct hv_interrupt_entry interrupt_entry; +} __packed; + +/* HV_INPUT_UNMAP_DEVICE_INTERRUPT */ +struct hv_input_unmap_device_interrupt { + u64 partition_id; + u64 device_id; + struct hv_interrupt_entry interrupt_entry; + u32 flags; +} __packed; + +#define HV_SOURCE_SHADOW_NONE 0x0 +#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1 + +struct hv_send_ipi_ex { /* HV_INPUT_SEND_SYNTHETIC_CLUSTER_IPI_EX */ + u32 vector; + u32 reserved; + struct hv_vpset vp_set; +} __packed; + +typedef u16 hv_pci_rid; /* HV_PCI_RID */ +typedef u16 hv_pci_segment; /* HV_PCI_SEGMENT */ +typedef u64 hv_logical_device_id; +union hv_pci_bdf { /* HV_PCI_BDF */ + u16 as_uint16; + + struct { + u8 function : 3; + u8 device : 5; + u8 bus; + }; +} __packed; + +union hv_pci_bus_range { + u16 as_uint16; + + struct { + u8 subordinate_bus; + u8 secondary_bus; + }; +} __packed; + +enum hv_device_type { /* HV_DEVICE_TYPE */ + HV_DEVICE_TYPE_LOGICAL = 0, + HV_DEVICE_TYPE_PCI = 1, + HV_DEVICE_TYPE_IOAPIC = 2, + HV_DEVICE_TYPE_ACPI = 3, +}; + +union hv_device_id { /* HV_DEVICE_ID */ + u64 as_uint64; + + struct { + u64 reserved0 : 62; + u64 device_type : 2; + }; + + /* HV_DEVICE_TYPE_LOGICAL */ + struct { + u64 id : 62; + u64 device_type : 2; + } logical; + + /* HV_DEVICE_TYPE_PCI */ + struct { + union { + hv_pci_rid rid; + union hv_pci_bdf bdf; + }; + + hv_pci_segment segment; + union hv_pci_bus_range shadow_bus_range; + + u16 phantom_function_bits : 2; + u16 source_shadow : 1; + + u16 rsvdz0 : 11; + u16 device_type : 2; + } pci; + + /* HV_DEVICE_TYPE_IOAPIC */ + struct { + u8 ioapic_id; + u8 rsvdz0; + u16 rsvdz1; + u16 rsvdz2; + + u16 rsvdz3 : 14; + u16 device_type : 2; + } ioapic; + + /* HV_DEVICE_TYPE_ACPI */ + struct { + u32 input_mapping_base; + u32 input_mapping_count : 30; + u32 device_type : 2; + } acpi; +} __packed; + +#endif /* _HV_HVHDK_MINI_H */ -- cgit v1.2.3 From 
ef5a3c92a81a1a892ae9edf949625beb68b4bd43 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Mon, 25 Nov 2024 15:24:43 -0800 Subject: hyperv: Switch from hyperv-tlfs.h to hyperv/hvhdk.h Switch to using hvhdk.h everywhere in the kernel. This header includes all the new Hyper-V headers in include/hyperv, which form a superset of the definitions found in hyperv-tlfs.h. This makes it easier to add new Hyper-V interfaces without being restricted to those in the TLFS doc (reflected in hyperv-tlfs.h). To be more consistent with the original Hyper-V code, the names of some definitions are changed slightly. Update those where needed. Update comments in mshyperv.h files to point to include/hyperv for adding new definitions. Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Reviewed-by: Easwar Hariharan Signed-off-by: Roman Kisel Reviewed-by: Easwar Hariharan Link: https://lore.kernel.org/r/1732577084-2122-5-git-send-email-nunodasneves@linux.microsoft.com Link: https://lore.kernel.org/r/20250108222138.1623703-3-romank@linux.microsoft.com Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 7 +++---- include/clocksource/hyperv_timer.h | 2 +- include/linux/hyperv.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8fe7aaab2599..a7bbe504e4f3 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -6,9 +6,8 @@ * independent. See arch//include/asm/mshyperv.h for definitions * that are specific to architecture . * - * Definitions that are specified in the Hyper-V Top Level Functional - * Spec (TLFS) should not go in this file, but should instead go in - * hyperv-tlfs.h. + * Definitions that are derived from Hyper-V code or headers should not go in + * this file, but should instead go in the relevant files in include/hyperv. * * Copyright (C) 2019, Microsoft, Inc. * @@ -25,7 +24,7 @@ #include #include #include -#include +#include #define VTPM_BASE_ADDRESS 0xfed40000 diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h index aa5233b1eba9..d48dd4176fd3 100644 --- a/include/clocksource/hyperv_timer.h +++ b/include/clocksource/hyperv_timer.h @@ -15,7 +15,7 @@ #include #include -#include +#include #define HV_MAX_MAX_DELTA_TICKS 0xffffffff #define HV_MIN_DELTA_TICKS 1 diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b0dbba3b9108..4179add2864b 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #define MAX_PAGE_BUFFER_COUNT 32 #define MAX_MULTIPAGE_BUFFER_COUNT 32 /* 128K */ -- cgit v1.2.3 From 962a4c7ea87884ed44ff48213f00cd5114c357e9 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Mon, 25 Nov 2024 15:24:44 -0800 Subject: hyperv: Remove the now unused hyperv-tlfs.h files Remove all hyperv-tlfs.h files. These are no longer included anywhere. hyperv/hvhdk.h serves the same role, but with an easier path for adding new definitions. Remove the relevant lines in MAINTAINERS. 
Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Reviewed-by: Easwar Hariharan Link: https://lore.kernel.org/r/1732577084-2122-6-git-send-email-nunodasneves@linux.microsoft.com Signed-off-by: Wei Liu Message-ID: <1732577084-2122-6-git-send-email-nunodasneves@linux.microsoft.com> --- include/asm-generic/hyperv-tlfs.h | 883 -------------------------------------- 1 file changed, 883 deletions(-) delete mode 100644 include/asm-generic/hyperv-tlfs.h (limited to 'include') diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h deleted file mode 100644 index 52274c9aefef..000000000000 --- a/include/asm-generic/hyperv-tlfs.h +++ /dev/null @@ -1,883 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -/* - * This file contains definitions from Hyper-V Hypervisor Top-Level Functional - * Specification (TLFS): - * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs - */ - -#ifndef _ASM_GENERIC_HYPERV_TLFS_H -#define _ASM_GENERIC_HYPERV_TLFS_H - -#include -#include -#include - -/* - * While not explicitly listed in the TLFS, Hyper-V always runs with a page size - * of 4096. These definitions are used when communicating with Hyper-V using - * guest physical pages and guest physical page addresses, since the guest page - * size may not be 4096 on all architectures. - */ -#define HV_HYP_PAGE_SHIFT 12 -#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) -#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) - -/* - * Hyper-V provides two categories of flags relevant to guest VMs. The - * "Features" category indicates specific functionality that is available - * to guests on this particular instance of Hyper-V. The "Features" - * are presented in four groups, each of which is 32 bits. The group A - * and B definitions are common across architectures and are listed here. - * However, not all flags are relevant on all architectures. - * - * Groups C and D vary across architectures and are listed in the - * architecture specific portion of hyperv-tlfs.h. Some of these flags exist - * on multiple architectures, but the bit positions are different so they - * cannot appear in the generic portion of hyperv-tlfs.h. - * - * The "Enlightenments" category provides recommendations on whether to use - * specific enlightenments that are available. The Enlighenments are a single - * group of 32 bits, but they vary across architectures and are listed in - * the architecture specific portion of hyperv-tlfs.h. - */ - -/* - * Group A Features. 
- */ - -/* VP Runtime register available */ -#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) -/* Partition Reference Counter available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) -/* Basic SynIC register available */ -#define HV_MSR_SYNIC_AVAILABLE BIT(2) -/* Synthetic Timer registers available */ -#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) -/* Virtual APIC assist and VP assist page registers available */ -#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) -/* Hypercall and Guest OS ID registers available*/ -#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) -/* Access virtual processor index register available*/ -#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) -/* Virtual system reset register available*/ -#define HV_MSR_RESET_AVAILABLE BIT(7) -/* Access statistics page registers available */ -#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) -/* Partition reference TSC register is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) -/* Partition Guest IDLE register is available */ -#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) -/* Partition local APIC and TSC frequency registers available */ -#define HV_ACCESS_FREQUENCY_MSRS BIT(11) -/* AccessReenlightenmentControls privilege */ -#define HV_ACCESS_REENLIGHTENMENT BIT(13) -/* AccessTscInvariantControls privilege */ -#define HV_ACCESS_TSC_INVARIANT BIT(15) - -/* - * Group B features. - */ -#define HV_CREATE_PARTITIONS BIT(0) -#define HV_ACCESS_PARTITION_ID BIT(1) -#define HV_ACCESS_MEMORY_POOL BIT(2) -#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) -#define HV_POST_MESSAGES BIT(4) -#define HV_SIGNAL_EVENTS BIT(5) -#define HV_CREATE_PORT BIT(6) -#define HV_CONNECT_PORT BIT(7) -#define HV_ACCESS_STATS BIT(8) -#define HV_DEBUGGING BIT(11) -#define HV_CPU_MANAGEMENT BIT(12) -#define HV_ENABLE_EXTENDED_HYPERCALLS BIT(20) -#define HV_ISOLATION BIT(22) - -/* - * TSC page layout. - */ -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; -} __packed; - -union hv_reference_tsc_msr { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:11; - u64 pfn:52; - } __packed; -}; - -/* - * The guest OS needs to register the guest ID with the hypervisor. - * The guest ID is a 64 bit entity and the structure of this ID is - * specified in the Hyper-V specification: - * - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx - * - * While the current guideline does not specify how Linux guest ID(s) - * need to be generated, our plan is to publish the guidelines for - * Linux and other guest operating systems that currently are hosted - * on Hyper-V. The implementation here conforms to this yet - * unpublished guidelines. - * - * - * Bit(s) - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source - * 62:56 - Os Type; Linux is 0x100 - * 55:48 - Distro specific identification - * 47:16 - Linux kernel version number - * 15:0 - Distro specific identification - * - * - */ - -#define HV_LINUX_VENDOR_ID 0x8100 - -/* - * Crash notification flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) - -/* Declare the various hypercall operations. 
*/ -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_ENABLE_VP_VTL 0x000f -#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 -#define HVCALL_SEND_IPI 0x000b -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 -#define HVCALL_SEND_IPI_EX 0x0015 -#define HVCALL_GET_PARTITION_ID 0x0046 -#define HVCALL_DEPOSIT_MEMORY 0x0048 -#define HVCALL_CREATE_VP 0x004e -#define HVCALL_GET_VP_REGISTERS 0x0050 -#define HVCALL_SET_VP_REGISTERS 0x0051 -#define HVCALL_POST_MESSAGE 0x005c -#define HVCALL_SIGNAL_EVENT 0x005d -#define HVCALL_POST_DEBUG_DATA 0x0069 -#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a -#define HVCALL_RESET_DEBUG_SESSION 0x006b -#define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 -#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c -#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d -#define HVCALL_RETARGET_INTERRUPT 0x007e -#define HVCALL_START_VP 0x0099 -#define HVCALL_GET_VP_ID_FROM_APIC_ID 0x009a -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 -#define HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY 0x00db -#define HVCALL_MMIO_READ 0x0106 -#define HVCALL_MMIO_WRITE 0x0107 - -/* Extended hypercalls */ -#define HV_EXT_CALL_QUERY_CAPABILITIES 0x8001 -#define HV_EXT_CALL_MEMORY_HEAT_HINT 0x8003 - -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -/* Extended capability bits */ -#define HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT BIT(8) - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -#define HV_PARTITION_ID_SELF ((u64)-1) -#define HV_VP_INDEX_SELF ((u32)-2) - -#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_VARHEAD_MASK GENMASK_ULL(26, 17) -#define HV_HYPERCALL_RSVD0_MASK GENMASK_ULL(31, 27) -#define HV_HYPERCALL_NESTED BIT_ULL(31) -#define HV_HYPERCALL_REP_COMP_OFFSET 32 -#define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) -#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) -#define HV_HYPERCALL_RSVD1_MASK GENMASK_ULL(47, 44) -#define HV_HYPERCALL_REP_START_OFFSET 48 -#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) -#define HV_HYPERCALL_RSVD2_MASK GENMASK_ULL(63, 60) -#define HV_HYPERCALL_RSVD_MASK (HV_HYPERCALL_RSVD0_MASK | \ - HV_HYPERCALL_RSVD1_MASK | \ - HV_HYPERCALL_RSVD2_MASK) - -/* hypercall status code */ -#define HV_STATUS_SUCCESS 0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 -#define HV_STATUS_INVALID_ALIGNMENT 4 -#define HV_STATUS_INVALID_PARAMETER 5 -#define HV_STATUS_ACCESS_DENIED 6 -#define HV_STATUS_OPERATION_DENIED 8 -#define HV_STATUS_INSUFFICIENT_MEMORY 11 -#define HV_STATUS_INVALID_PORT_ID 17 -#define HV_STATUS_INVALID_CONNECTION_ID 18 -#define HV_STATUS_INSUFFICIENT_BUFFERS 19 -#define HV_STATUS_TIME_OUT 120 -#define HV_STATUS_VTL_ALREADY_ENABLED 134 - -/* - * The Hyper-V TimeRefCount register and the TSC - * page provide a guest VM clock with 100ns tick rate - */ -#define HV_CLOCK_HZ (NSEC_PER_SEC/100) - -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) -/* Valid SynIC vectors are 16-255. 
*/ -#define HV_SYNIC_FIRST_VALID_VECTOR (16) - -#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) -#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) -#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) -#define HV_SYNIC_SINT_MASKED (1ULL << 16) -#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) -#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) - -#define HV_SYNIC_STIMER_COUNT (4) - -/* Define synthetic interrupt controller message constants. */ -#define HV_MESSAGE_SIZE (256) -#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) -#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) - -/* - * Define hypervisor message types. Some of the message types - * are x86/x64 specific, but there's no good way to separate - * them out into the arch-specific version of hyperv-tlfs.h - * because C doesn't provide a way to extend enum types. - * Keeping them all in the arch neutral hyperv-tlfs.h seems - * the least messy compromise. - */ -enum hv_message_type { - HVMSG_NONE = 0x00000000, - - /* Memory access messages. */ - HVMSG_UNMAPPED_GPA = 0x80000000, - HVMSG_GPA_INTERCEPT = 0x80000001, - - /* Timer notification messages. */ - HVMSG_TIMER_EXPIRED = 0x80000010, - - /* Error messages. */ - HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, - HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, - - /* Trace buffer complete messages. */ - HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, - - /* Platform-specific processor intercept messages. */ - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, - HVMSG_X64_MSR_INTERCEPT = 0x80010001, - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, - HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, - HVMSG_X64_APIC_EOI = 0x80010004, - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 -}; - -/* Define synthetic interrupt controller message flags. */ -union hv_message_flags { - __u8 asu8; - struct { - __u8 msg_pending:1; - __u8 reserved:7; - } __packed; -}; - -/* Define port identifier type. */ -union hv_port_id { - __u32 asu32; - struct { - __u32 id:24; - __u32 reserved:8; - } __packed u; -}; - -/* Define synthetic interrupt controller message header. */ -struct hv_message_header { - __u32 message_type; - __u8 payload_size; - union hv_message_flags message_flags; - __u8 reserved[2]; - union { - __u64 sender; - union hv_port_id port; - }; -} __packed; - -/* Define synthetic interrupt controller message format. */ -struct hv_message { - struct hv_message_header header; - union { - __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; - } u; -} __packed; - -/* Define the synthetic interrupt message page layout. */ -struct hv_message_page { - struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; -} __packed; - -/* Define timer message payload structure. */ -struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ -} __packed; - - -/* Define synthetic interrupt controller flag constants. */ -#define HV_EVENT_FLAGS_COUNT (256 * 8) -#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) - -/* - * Synthetic timer configuration. - */ -union hv_stimer_config { - u64 as_uint64; - struct { - u64 enable:1; - u64 periodic:1; - u64 lazy:1; - u64 auto_enable:1; - u64 apic_vector:8; - u64 direct_mode:1; - u64 reserved_z0:3; - u64 sintx:4; - u64 reserved_z1:44; - } __packed; -}; - - -/* Define the synthetic interrupt controller event flags format. */ -union hv_synic_event_flags { - unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; -}; - -/* Define SynIC control register. 
*/ -union hv_synic_scontrol { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:63; - } __packed; -}; - -/* Define synthetic interrupt source. */ -union hv_synic_sint { - u64 as_uint64; - struct { - u64 vector:8; - u64 reserved1:8; - u64 masked:1; - u64 auto_eoi:1; - u64 polling:1; - u64 reserved2:45; - } __packed; -}; - -/* Define the format of the SIMP register */ -union hv_synic_simp { - u64 as_uint64; - struct { - u64 simp_enabled:1; - u64 preserved:11; - u64 base_simp_gpa:52; - } __packed; -}; - -/* Define the format of the SIEFP register */ -union hv_synic_siefp { - u64 as_uint64; - struct { - u64 siefp_enabled:1; - u64 preserved:11; - u64 base_siefp_gpa:52; - } __packed; -}; - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[]; -} __packed; - -/* The maximum number of sparse vCPU banks which can be encoded by 'struct hv_vpset' */ -#define HV_MAX_SPARSE_VCPU_BANKS (64) -/* The number of vCPUs in one sparse bank */ -#define HV_VCPUS_PER_SPARSE_BANK (64) - -/* HvCallSendSyntheticClusterIpi hypercall */ -struct hv_send_ipi { - u32 vector; - u32 reserved; - u64 cpu_mask; -} __packed; - -/* HvCallSendSyntheticClusterIpiEx hypercall */ -struct hv_send_ipi_ex { - u32 vector; - u32 reserved; - struct hv_vpset vp_set; -} __packed; - -/* HvFlushGuestPhysicalAddressSpace hypercalls */ -struct hv_guest_mapping_flush { - u64 address_space; - u64 flags; -} __packed; - -/* - * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited - * by the bitwidth of "additional_pages" in union hv_gpa_page_range. - */ -#define HV_MAX_FLUSH_PAGES (2048) -#define HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB 0 -#define HV_GPA_PAGE_RANGE_PAGE_SIZE_1GB 1 - -/* HvFlushGuestPhysicalAddressList, HvExtCallMemoryHeatHint hypercall */ -union hv_gpa_page_range { - u64 address_space; - struct { - u64 additional_pages:11; - u64 largepage:1; - u64 basepfn:52; - } page; - struct { - u64 reserved:12; - u64 page_size:1; - u64 reserved1:8; - u64 base_large_pfn:43; - }; -}; - -/* - * All input flush parameters should be in single page. The max flush - * count is equal with how many entries of union hv_gpa_page_range can - * be populated into the input parameter page. 
- */ -#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ - sizeof(union hv_gpa_page_range)) - -struct hv_guest_mapping_flush_list { - u64 address_space; - u64 flags; - union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; -}; - -/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ -struct hv_tlb_flush { - u64 address_space; - u64 flags; - u64 processor_mask; - u64 gva_list[]; -} __packed; - -/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ -struct hv_tlb_flush_ex { - u64 address_space; - u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; -} __packed; - -/* HvGetPartitionId hypercall (output only) */ -struct hv_get_partition_id { - u64 partition_id; -} __packed; - -/* HvDepositMemory hypercall */ -struct hv_deposit_memory { - u64 partition_id; - u64 gpa_page_list[]; -} __packed; - -struct hv_proximity_domain_flags { - u32 proximity_preferred : 1; - u32 reserved : 30; - u32 proximity_info_valid : 1; -} __packed; - -struct hv_proximity_domain_info { - u32 domain_id; - struct hv_proximity_domain_flags flags; -} __packed; - -struct hv_lp_startup_status { - u64 hv_status; - u64 substatus1; - u64 substatus2; - u64 substatus3; - u64 substatus4; - u64 substatus5; - u64 substatus6; -} __packed; - -/* HvAddLogicalProcessor hypercall */ -struct hv_input_add_logical_processor { - u32 lp_index; - u32 apic_id; - struct hv_proximity_domain_info proximity_domain_info; -} __packed; - -struct hv_output_add_logical_processor { - struct hv_lp_startup_status startup_status; -} __packed; - -enum HV_SUBNODE_TYPE -{ - HvSubnodeAny = 0, - HvSubnodeSocket = 1, - HvSubnodeAmdNode = 2, - HvSubnodeL3 = 3, - HvSubnodeCount = 4, - HvSubnodeInvalid = -1 -}; - -/* HvCreateVp hypercall */ -struct hv_create_vp { - u64 partition_id; - u32 vp_index; - u8 padding[3]; - u8 subnode_type; - u64 subnode_id; - struct hv_proximity_domain_info proximity_domain_info; - u64 flags; -} __packed; - -enum hv_interrupt_source { - HV_INTERRUPT_SOURCE_MSI = 1, /* MSI and MSI-X */ - HV_INTERRUPT_SOURCE_IOAPIC, -}; - -union hv_ioapic_rte { - u64 as_uint64; - - struct { - u32 vector:8; - u32 delivery_mode:3; - u32 destination_mode:1; - u32 delivery_status:1; - u32 interrupt_polarity:1; - u32 remote_irr:1; - u32 trigger_mode:1; - u32 interrupt_mask:1; - u32 reserved1:15; - - u32 reserved2:24; - u32 destination_id:8; - }; - - struct { - u32 low_uint32; - u32 high_uint32; - }; -} __packed; - -struct hv_interrupt_entry { - u32 source; - u32 reserved1; - union { - union hv_msi_entry msi_entry; - union hv_ioapic_rte ioapic_rte; - }; -} __packed; - -/* - * flags for hv_device_interrupt_target.flags - */ -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 - -struct hv_device_interrupt_target { - u32 vector; - u32 flags; - union { - u64 vp_mask; - struct hv_vpset vp_set; - }; -} __packed; - -struct hv_retarget_device_interrupt { - u64 partition_id; /* use "self" */ - u64 device_id; - struct hv_interrupt_entry int_entry; - u64 reserved2; - struct hv_device_interrupt_target int_target; -} __packed __aligned(8); - -/* - * These Hyper-V registers provide information equivalent to the CPUID - * instruction on x86/x64. 
- */ -#define HV_REGISTER_HYPERVISOR_VERSION 0x00000100 /*CPUID 0x40000002 */ -#define HV_REGISTER_FEATURES 0x00000200 /*CPUID 0x40000003 */ -#define HV_REGISTER_ENLIGHTENMENTS 0x00000201 /*CPUID 0x40000004 */ - -/* - * Synthetic register definitions equivalent to MSRs on x86/x64 - */ -#define HV_REGISTER_GUEST_CRASH_P0 0x00000210 -#define HV_REGISTER_GUEST_CRASH_P1 0x00000211 -#define HV_REGISTER_GUEST_CRASH_P2 0x00000212 -#define HV_REGISTER_GUEST_CRASH_P3 0x00000213 -#define HV_REGISTER_GUEST_CRASH_P4 0x00000214 -#define HV_REGISTER_GUEST_CRASH_CTL 0x00000215 - -#define HV_REGISTER_GUEST_OS_ID 0x00090002 -#define HV_REGISTER_VP_INDEX 0x00090003 -#define HV_REGISTER_TIME_REF_COUNT 0x00090004 -#define HV_REGISTER_REFERENCE_TSC 0x00090017 - -#define HV_REGISTER_SINT0 0x000A0000 -#define HV_REGISTER_SCONTROL 0x000A0010 -#define HV_REGISTER_SIEFP 0x000A0012 -#define HV_REGISTER_SIMP 0x000A0013 -#define HV_REGISTER_EOM 0x000A0014 - -#define HV_REGISTER_STIMER0_CONFIG 0x000B0000 -#define HV_REGISTER_STIMER0_COUNT 0x000B0001 - -/* HvGetVpRegisters hypercall input with variable size reg name list*/ -struct hv_get_vp_registers_input { - struct { - u64 partitionid; - u32 vpindex; - u8 inputvtl; - u8 padding[3]; - } header; - struct input { - u32 name0; - u32 name1; - } element[]; -} __packed; - -/* HvGetVpRegisters returns an array of these output elements */ -struct hv_get_vp_registers_output { - union { - struct { - u32 a; - u32 b; - u32 c; - u32 d; - } as32 __packed; - struct { - u64 low; - u64 high; - } as64 __packed; - }; -}; - -/* HvSetVpRegisters hypercall with variable size reg name/value list*/ -struct hv_set_vp_registers_input { - struct { - u64 partitionid; - u32 vpindex; - u8 inputvtl; - u8 padding[3]; - } header; - struct { - u32 name; - u32 padding1; - u64 padding2; - u64 valuelow; - u64 valuehigh; - } element[]; -} __packed; - -enum hv_device_type { - HV_DEVICE_TYPE_LOGICAL = 0, - HV_DEVICE_TYPE_PCI = 1, - HV_DEVICE_TYPE_IOAPIC = 2, - HV_DEVICE_TYPE_ACPI = 3, -}; - -typedef u16 hv_pci_rid; -typedef u16 hv_pci_segment; -typedef u64 hv_logical_device_id; -union hv_pci_bdf { - u16 as_uint16; - - struct { - u8 function:3; - u8 device:5; - u8 bus; - }; -} __packed; - -union hv_pci_bus_range { - u16 as_uint16; - - struct { - u8 subordinate_bus; - u8 secondary_bus; - }; -} __packed; - -union hv_device_id { - u64 as_uint64; - - struct { - u64 reserved0:62; - u64 device_type:2; - }; - - /* HV_DEVICE_TYPE_LOGICAL */ - struct { - u64 id:62; - u64 device_type:2; - } logical; - - /* HV_DEVICE_TYPE_PCI */ - struct { - union { - hv_pci_rid rid; - union hv_pci_bdf bdf; - }; - - hv_pci_segment segment; - union hv_pci_bus_range shadow_bus_range; - - u16 phantom_function_bits:2; - u16 source_shadow:1; - - u16 rsvdz0:11; - u16 device_type:2; - } pci; - - /* HV_DEVICE_TYPE_IOAPIC */ - struct { - u8 ioapic_id; - u8 rsvdz0; - u16 rsvdz1; - u16 rsvdz2; - - u16 rsvdz3:14; - u16 device_type:2; - } ioapic; - - /* HV_DEVICE_TYPE_ACPI */ - struct { - u32 input_mapping_base; - u32 input_mapping_count:30; - u32 device_type:2; - } acpi; -} __packed; - -enum hv_interrupt_trigger_mode { - HV_INTERRUPT_TRIGGER_MODE_EDGE = 0, - HV_INTERRUPT_TRIGGER_MODE_LEVEL = 1, -}; - -struct hv_device_interrupt_descriptor { - u32 interrupt_type; - u32 trigger_mode; - u32 vector_count; - u32 reserved; - struct hv_device_interrupt_target target; -} __packed; - -struct hv_input_map_device_interrupt { - u64 partition_id; - u64 device_id; - u64 flags; - struct hv_interrupt_entry logical_interrupt_entry; - struct 
hv_device_interrupt_descriptor interrupt_descriptor; -} __packed; - -struct hv_output_map_device_interrupt { - struct hv_interrupt_entry interrupt_entry; -} __packed; - -struct hv_input_unmap_device_interrupt { - u64 partition_id; - u64 device_id; - struct hv_interrupt_entry interrupt_entry; -} __packed; - -#define HV_SOURCE_SHADOW_NONE 0x0 -#define HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE 0x1 - -/* - * Version info reported by hypervisor - */ -union hv_hypervisor_version_info { - struct { - u32 build_number; - - u32 minor_version : 16; - u32 major_version : 16; - - u32 service_pack; - - u32 service_number : 24; - u32 service_branch : 8; - }; - struct { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - }; -}; - -/* - * The whole argument should fit in a page to be able to pass to the hypervisor - * in one hypercall. - */ -#define HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES \ - ((HV_HYP_PAGE_SIZE - sizeof(struct hv_memory_hint)) / \ - sizeof(union hv_gpa_page_range)) - -/* HvExtCallMemoryHeatHint hypercall */ -#define HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD 2 -struct hv_memory_hint { - u64 type:2; - u64 reserved:62; - union hv_gpa_page_range ranges[]; -} __packed; - -/* Data structures for HVCALL_MMIO_READ and HVCALL_MMIO_WRITE */ -#define HV_HYPERCALL_MMIO_MAX_DATA_LENGTH 64 - -struct hv_mmio_read_input { - u64 gpa; - u32 size; - u32 reserved; -} __packed; - -struct hv_mmio_read_output { - u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; -} __packed; - -struct hv_mmio_write_input { - u64 gpa; - u32 size; - u32 reserved; - u8 data[HV_HYPERCALL_MMIO_MAX_DATA_LENGTH]; -} __packed; - -/* Define connection identifier type. */ -union hv_connection_id { - u32 asu32; - struct { - u32 id:24; - u32 reserved:8; - } u; -}; - -#endif -- cgit v1.2.3 From 2b624a2c18656ea32e0849e7bc0018ba3c97ca64 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 4 Dec 2024 14:44:03 +0100 Subject: drm/ttm: Handle cgroup based eviction in TTM cgroup resource allocation has to be handled in TTM, so -EAGAIN from cgroups can be converted into -ENOSPC, and the limitcg can be properly evicted in ttm code. When hitting a resource limit through -EAGAIN, the cgroup for which the limit is hit is also returned. This allows eviction to delete only from cgroups which are a subgroup of the current cgroup. The returned CSS is used to determine if eviction is valuable for a given resource, and allows TTM to only target specific resources to lower memory usage. Co-developed-by: Friedrich Vock Signed-off-by: Friedrich Vock Co-developed-by: Maxime Ripard Signed-off-by: Maarten Lankhorst Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20241204134410.1161769-4-dev@lankhorst.se Signed-off-by: Maxime Ripard --- include/drm/ttm/ttm_resource.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h index be034be56ba1..ee688d0c029b 100644 --- a/include/drm/ttm/ttm_resource.h +++ b/include/drm/ttm/ttm_resource.h @@ -38,6 +38,7 @@ #define TTM_MAX_BO_PRIORITY 4U #define TTM_NUM_MEM_TYPES 8 +struct dmem_cgroup_device; struct ttm_device; struct ttm_resource_manager; struct ttm_resource; @@ -211,6 +212,11 @@ struct ttm_resource_manager { * bdev->lru_lock. */ uint64_t usage; + + /** + * @cg: &dmem_cgroup_region used for memory accounting, if not NULL. + */ + struct dmem_cgroup_region *cg; }; /** @@ -239,6 +245,7 @@ struct ttm_bus_placement { * @placement: Placement flags. 
* @bus: Placement on io bus accessible to the CPU * @bo: weak reference to the BO, protected by ttm_device::lru_lock + * @css: cgroup state this resource is charged to * * Structure indicating the placement and space resources used by a * buffer object. @@ -251,6 +258,8 @@ struct ttm_resource { struct ttm_bus_placement bus; struct ttm_buffer_object *bo; + struct dmem_cgroup_pool_state *css; + /** * @lru: Least recently used list, see &ttm_resource_manager.lru */ @@ -432,7 +441,8 @@ void ttm_resource_fini(struct ttm_resource_manager *man, int ttm_resource_alloc(struct ttm_buffer_object *bo, const struct ttm_place *place, - struct ttm_resource **res); + struct ttm_resource **res, + struct dmem_cgroup_pool_state **ret_limit_pool); void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res); bool ttm_resource_intersects(struct ttm_device *bdev, struct ttm_resource *res, -- cgit v1.2.3 From 6a04bb5a2046067681257d5dd69a724856c8fbcb Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 9 Jan 2025 12:50:32 +0200 Subject: drm/xe: remove unused xe_pciids.h harder, add missing PCI ID Commit 493454445c95 ("drm/xe: switch to common PCI ID macros") removed xe_pciids.h via drm-intel-next. In the mean time, commit ae78ec0a52c4 ("drm/xe/ptl: Add another PTL PCI ID") added to xe_pciids.h via drm-xe-next. The two commits were merged in commit 8f109f287fdc ("Merge drm/drm-next into drm-xe-next"), but xe_pciids.h wasn't removed, and the PCI ID wasn't added to pciids.h. Remove xe_pciids.h, and add the PCI ID to pciids.h. Cc: Matt Atwood Cc: Matt Roper Cc: Rodrigo Vivi Fixes: 8f109f287fdc ("Merge drm/drm-next into drm-xe-next") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20241125120921.1bbc1930@canb.auug.org.au Reviewed-by: Tejas Upadhyay Reviewed-by: Matt Roper Link: https://patchwork.freedesktop.org/patch/msgid/20250109105032.2585416-1-jani.nikula@intel.com Signed-off-by: Jani Nikula --- include/drm/intel/pciids.h | 3 +- include/drm/intel/xe_pciids.h | 235 ------------------------------------------ 2 files changed, 2 insertions(+), 236 deletions(-) delete mode 100644 include/drm/intel/xe_pciids.h (limited to 'include') diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index 32480b5563db..7883384acd5e 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -829,6 +829,7 @@ MACRO__(0xB092, ## __VA_ARGS__), \ MACRO__(0xB0A0, ## __VA_ARGS__), \ MACRO__(0xB0A1, ## __VA_ARGS__), \ - MACRO__(0xB0A2, ## __VA_ARGS__) + MACRO__(0xB0A2, ## __VA_ARGS__), \ + MACRO__(0xB0B0, ## __VA_ARGS__) #endif /* __PCIIDS_H__ */ diff --git a/include/drm/intel/xe_pciids.h b/include/drm/intel/xe_pciids.h deleted file mode 100644 index 16d4b8bb590a..000000000000 --- a/include/drm/intel/xe_pciids.h +++ /dev/null @@ -1,235 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2022 Intel Corporation - */ - -#ifndef _XE_PCIIDS_H_ -#define _XE_PCIIDS_H_ - -/* - * Lists below can be turned into initializers for a struct pci_device_id - * by defining INTEL_VGA_DEVICE: - * - * #define INTEL_VGA_DEVICE(id, info) { \ - * 0x8086, id, \ - * ~0, ~0, \ - * 0x030000, 0xff0000, \ - * (unsigned long) info } - * - * And then calling like: - * - * XE_TGL_12_GT1_IDS(INTEL_VGA_DEVICE, ## __VA_ARGS__) - * - * To turn them into something else, just provide a different macro passed as - * first argument. - */ - -/* TGL */ -#define XE_TGL_GT1_IDS(MACRO__, ...) 
\ - MACRO__(0x9A60, ## __VA_ARGS__), \ - MACRO__(0x9A68, ## __VA_ARGS__), \ - MACRO__(0x9A70, ## __VA_ARGS__) - -#define XE_TGL_GT2_IDS(MACRO__, ...) \ - MACRO__(0x9A40, ## __VA_ARGS__), \ - MACRO__(0x9A49, ## __VA_ARGS__), \ - MACRO__(0x9A59, ## __VA_ARGS__), \ - MACRO__(0x9A78, ## __VA_ARGS__), \ - MACRO__(0x9AC0, ## __VA_ARGS__), \ - MACRO__(0x9AC9, ## __VA_ARGS__), \ - MACRO__(0x9AD9, ## __VA_ARGS__), \ - MACRO__(0x9AF8, ## __VA_ARGS__) - -#define XE_TGL_IDS(MACRO__, ...) \ - XE_TGL_GT1_IDS(MACRO__, ## __VA_ARGS__),\ - XE_TGL_GT2_IDS(MACRO__, ## __VA_ARGS__) - -/* RKL */ -#define XE_RKL_IDS(MACRO__, ...) \ - MACRO__(0x4C80, ## __VA_ARGS__), \ - MACRO__(0x4C8A, ## __VA_ARGS__), \ - MACRO__(0x4C8B, ## __VA_ARGS__), \ - MACRO__(0x4C8C, ## __VA_ARGS__), \ - MACRO__(0x4C90, ## __VA_ARGS__), \ - MACRO__(0x4C9A, ## __VA_ARGS__) - -/* DG1 */ -#define XE_DG1_IDS(MACRO__, ...) \ - MACRO__(0x4905, ## __VA_ARGS__), \ - MACRO__(0x4906, ## __VA_ARGS__), \ - MACRO__(0x4907, ## __VA_ARGS__), \ - MACRO__(0x4908, ## __VA_ARGS__), \ - MACRO__(0x4909, ## __VA_ARGS__) - -/* ADL-S */ -#define XE_ADLS_IDS(MACRO__, ...) \ - MACRO__(0x4680, ## __VA_ARGS__), \ - MACRO__(0x4682, ## __VA_ARGS__), \ - MACRO__(0x4688, ## __VA_ARGS__), \ - MACRO__(0x468A, ## __VA_ARGS__), \ - MACRO__(0x468B, ## __VA_ARGS__), \ - MACRO__(0x4690, ## __VA_ARGS__), \ - MACRO__(0x4692, ## __VA_ARGS__), \ - MACRO__(0x4693, ## __VA_ARGS__) - -/* ADL-P */ -#define XE_ADLP_IDS(MACRO__, ...) \ - MACRO__(0x46A0, ## __VA_ARGS__), \ - MACRO__(0x46A1, ## __VA_ARGS__), \ - MACRO__(0x46A2, ## __VA_ARGS__), \ - MACRO__(0x46A3, ## __VA_ARGS__), \ - MACRO__(0x46A6, ## __VA_ARGS__), \ - MACRO__(0x46A8, ## __VA_ARGS__), \ - MACRO__(0x46AA, ## __VA_ARGS__), \ - MACRO__(0x462A, ## __VA_ARGS__), \ - MACRO__(0x4626, ## __VA_ARGS__), \ - MACRO__(0x4628, ## __VA_ARGS__), \ - MACRO__(0x46B0, ## __VA_ARGS__), \ - MACRO__(0x46B1, ## __VA_ARGS__), \ - MACRO__(0x46B2, ## __VA_ARGS__), \ - MACRO__(0x46B3, ## __VA_ARGS__), \ - MACRO__(0x46C0, ## __VA_ARGS__), \ - MACRO__(0x46C1, ## __VA_ARGS__), \ - MACRO__(0x46C2, ## __VA_ARGS__), \ - MACRO__(0x46C3, ## __VA_ARGS__) - -/* ADL-N */ -#define XE_ADLN_IDS(MACRO__, ...) \ - MACRO__(0x46D0, ## __VA_ARGS__), \ - MACRO__(0x46D1, ## __VA_ARGS__), \ - MACRO__(0x46D2, ## __VA_ARGS__), \ - MACRO__(0x46D3, ## __VA_ARGS__), \ - MACRO__(0x46D4, ## __VA_ARGS__) - -/* RPL-S */ -#define XE_RPLS_IDS(MACRO__, ...) \ - MACRO__(0xA780, ## __VA_ARGS__), \ - MACRO__(0xA781, ## __VA_ARGS__), \ - MACRO__(0xA782, ## __VA_ARGS__), \ - MACRO__(0xA783, ## __VA_ARGS__), \ - MACRO__(0xA788, ## __VA_ARGS__), \ - MACRO__(0xA789, ## __VA_ARGS__), \ - MACRO__(0xA78A, ## __VA_ARGS__), \ - MACRO__(0xA78B, ## __VA_ARGS__) - -/* RPL-U */ -#define XE_RPLU_IDS(MACRO__, ...) \ - MACRO__(0xA721, ## __VA_ARGS__), \ - MACRO__(0xA7A1, ## __VA_ARGS__), \ - MACRO__(0xA7A9, ## __VA_ARGS__), \ - MACRO__(0xA7AC, ## __VA_ARGS__), \ - MACRO__(0xA7AD, ## __VA_ARGS__) - -/* RPL-P */ -#define XE_RPLP_IDS(MACRO__, ...) \ - MACRO__(0xA720, ## __VA_ARGS__), \ - MACRO__(0xA7A0, ## __VA_ARGS__), \ - MACRO__(0xA7A8, ## __VA_ARGS__), \ - MACRO__(0xA7AA, ## __VA_ARGS__), \ - MACRO__(0xA7AB, ## __VA_ARGS__) - -/* DG2 */ -#define XE_DG2_G10_IDS(MACRO__, ...) 
\ - MACRO__(0x5690, ## __VA_ARGS__), \ - MACRO__(0x5691, ## __VA_ARGS__), \ - MACRO__(0x5692, ## __VA_ARGS__), \ - MACRO__(0x56A0, ## __VA_ARGS__), \ - MACRO__(0x56A1, ## __VA_ARGS__), \ - MACRO__(0x56A2, ## __VA_ARGS__), \ - MACRO__(0x56BE, ## __VA_ARGS__), \ - MACRO__(0x56BF, ## __VA_ARGS__) - -#define XE_DG2_G11_IDS(MACRO__, ...) \ - MACRO__(0x5693, ## __VA_ARGS__), \ - MACRO__(0x5694, ## __VA_ARGS__), \ - MACRO__(0x5695, ## __VA_ARGS__), \ - MACRO__(0x56A5, ## __VA_ARGS__), \ - MACRO__(0x56A6, ## __VA_ARGS__), \ - MACRO__(0x56B0, ## __VA_ARGS__), \ - MACRO__(0x56B1, ## __VA_ARGS__), \ - MACRO__(0x56BA, ## __VA_ARGS__), \ - MACRO__(0x56BB, ## __VA_ARGS__), \ - MACRO__(0x56BC, ## __VA_ARGS__), \ - MACRO__(0x56BD, ## __VA_ARGS__) - -#define XE_DG2_G12_IDS(MACRO__, ...) \ - MACRO__(0x5696, ## __VA_ARGS__), \ - MACRO__(0x5697, ## __VA_ARGS__), \ - MACRO__(0x56A3, ## __VA_ARGS__), \ - MACRO__(0x56A4, ## __VA_ARGS__), \ - MACRO__(0x56B2, ## __VA_ARGS__), \ - MACRO__(0x56B3, ## __VA_ARGS__) - -#define XE_DG2_IDS(MACRO__, ...) \ - XE_DG2_G10_IDS(MACRO__, ## __VA_ARGS__),\ - XE_DG2_G11_IDS(MACRO__, ## __VA_ARGS__),\ - XE_DG2_G12_IDS(MACRO__, ## __VA_ARGS__) - -#define XE_ATS_M150_IDS(MACRO__, ...) \ - MACRO__(0x56C0, ## __VA_ARGS__), \ - MACRO__(0x56C2, ## __VA_ARGS__) - -#define XE_ATS_M75_IDS(MACRO__, ...) \ - MACRO__(0x56C1, ## __VA_ARGS__) - -#define XE_ATS_M_IDS(MACRO__, ...) \ - XE_ATS_M150_IDS(MACRO__, ## __VA_ARGS__),\ - XE_ATS_M75_IDS(MACRO__, ## __VA_ARGS__) - -/* ARL */ -#define XE_ARL_IDS(MACRO__, ...) \ - MACRO__(0x7D41, ## __VA_ARGS__), \ - MACRO__(0x7D51, ## __VA_ARGS__), \ - MACRO__(0x7D67, ## __VA_ARGS__), \ - MACRO__(0x7DD1, ## __VA_ARGS__), \ - MACRO__(0xB640, ## __VA_ARGS__) - -/* MTL */ -#define XE_MTL_IDS(MACRO__, ...) \ - MACRO__(0x7D40, ## __VA_ARGS__), \ - MACRO__(0x7D45, ## __VA_ARGS__), \ - MACRO__(0x7D55, ## __VA_ARGS__), \ - MACRO__(0x7D60, ## __VA_ARGS__), \ - MACRO__(0x7DD5, ## __VA_ARGS__) - -/* PVC */ -#define XE_PVC_IDS(MACRO__, ...) \ - MACRO__(0x0B69, ## __VA_ARGS__), \ - MACRO__(0x0B6E, ## __VA_ARGS__), \ - MACRO__(0x0BD4, ## __VA_ARGS__), \ - MACRO__(0x0BD5, ## __VA_ARGS__), \ - MACRO__(0x0BD6, ## __VA_ARGS__), \ - MACRO__(0x0BD7, ## __VA_ARGS__), \ - MACRO__(0x0BD8, ## __VA_ARGS__), \ - MACRO__(0x0BD9, ## __VA_ARGS__), \ - MACRO__(0x0BDA, ## __VA_ARGS__), \ - MACRO__(0x0BDB, ## __VA_ARGS__), \ - MACRO__(0x0BE0, ## __VA_ARGS__), \ - MACRO__(0x0BE1, ## __VA_ARGS__), \ - MACRO__(0x0BE5, ## __VA_ARGS__) - -#define XE_LNL_IDS(MACRO__, ...) \ - MACRO__(0x6420, ## __VA_ARGS__), \ - MACRO__(0x64A0, ## __VA_ARGS__), \ - MACRO__(0x64B0, ## __VA_ARGS__) - -#define XE_BMG_IDS(MACRO__, ...) \ - MACRO__(0xE202, ## __VA_ARGS__), \ - MACRO__(0xE20B, ## __VA_ARGS__), \ - MACRO__(0xE20C, ## __VA_ARGS__), \ - MACRO__(0xE20D, ## __VA_ARGS__), \ - MACRO__(0xE212, ## __VA_ARGS__) - -#define XE_PTL_IDS(MACRO__, ...) 
\ - MACRO__(0xB080, ## __VA_ARGS__), \ - MACRO__(0xB081, ## __VA_ARGS__), \ - MACRO__(0xB082, ## __VA_ARGS__), \ - MACRO__(0xB090, ## __VA_ARGS__), \ - MACRO__(0xB091, ## __VA_ARGS__), \ - MACRO__(0xB092, ## __VA_ARGS__), \ - MACRO__(0xB0A0, ## __VA_ARGS__), \ - MACRO__(0xB0A1, ## __VA_ARGS__), \ - MACRO__(0xB0A2, ## __VA_ARGS__), \ - MACRO__(0xB0B0, ## __VA_ARGS__) - -#endif -- cgit v1.2.3 From cacd9ae4bf801ff4125d8961bb9a3ba955e51680 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:17 +0100 Subject: poll_wait: add mb() to fix theoretical race between waitqueue_active() and .poll() As the comment above waitqueue_active() explains, it can only be used if both waker and waiter have mb()'s that pair with each other. However __pollwait() is broken in this respect. This is not pipe-specific, but let's look at pipe_poll() for example: poll_wait(...); // -> __pollwait() -> add_wait_queue() LOAD(pipe->head); LOAD(pipe->head); In theory these LOAD()'s can leak into the critical section inside add_wait_queue() and can happen before list_add(entry, wq_head), in this case pipe_poll() can race with wakeup_pipe_readers/writers which do smp_mb(); if (waitqueue_active(wq_head)) wake_up_interruptible(wq_head); There are more __pollwait()-like functions (grep init_poll_funcptr), and it seems that at least ep_ptable_queue_proc() has the same problem, so the patch adds smp_mb() into poll_wait(). Link: https://lore.kernel.org/all/20250102163320.GA17691@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162717.GA18922@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/poll.h b/include/linux/poll.h index d1ea4f3714a8..fc641b50f129 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,8 +41,16 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) + if (p && p->_qproc && wait_address) { p->_qproc(filp, wait_address, p); + /* + * This memory barrier is paired in the wq_has_sleeper(). + * See the comment above prepare_to_wait(), we need to + * ensure that subsequent tests in this thread can't be + * reordered with __add_wait_queue() in _qproc() paths. + */ + smp_mb(); + } } /* -- cgit v1.2.3 From 10b02a2cfec2f106db4897ad87732db56d71e6fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:24 +0100 Subject: poll_wait: kill the obsolete wait_address check This check is historical and no longer needed, wait_address is never NULL. These days we rely on the poll_table->_qproc check. NULL if select/poll is not going to sleep, or it already has a data to report, or all waiters have already been registered after the 1st iteration. However, poll_table *p can be NULL, see p9_fd_poll() for example, so we can't remove the "p != NULL" check. 
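[Illustration, not part of the patch: a minimal sketch of how a driver ->poll() method typically interacts with poll_wait(); the my_dev/my_poll names and the data_ready flag are hypothetical.]

        /* Hypothetical driver, for illustration only. */
        #include <linux/fs.h>
        #include <linux/poll.h>
        #include <linux/wait.h>

        struct my_dev {
                wait_queue_head_t wq;   /* woken by the producer side */
                bool data_ready;
        };

        static __poll_t my_poll(struct file *file, poll_table *wait)
        {
                struct my_dev *dev = file->private_data;
                __poll_t mask = 0;

                /*
                 * poll_wait() only registers this thread on dev->wq (and issues
                 * the smp_mb() added above) when wait && wait->_qproc is
                 * non-NULL; when the core already has an event to report, or on
                 * later iterations, _qproc is NULL and the call is a no-op.
                 */
                poll_wait(file, &dev->wq, wait);

                if (READ_ONCE(dev->data_ready))
                        mask |= EPOLLIN | EPOLLRDNORM;

                return mask;
        }
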
Link: https://lore.kernel.org/all/20250106180325.GF7233@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162724.GA18926@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/poll.h b/include/linux/poll.h index fc641b50f129..57b6d1ccd8bf 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,7 +41,7 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) { + if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). -- cgit v1.2.3 From b2849867b3a70c2d675ddca01c4e4540f7d3b8e9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:36 +0100 Subject: sock_poll_wait: kill the no longer necessary barrier after poll_wait() Now that poll_wait() provides a full barrier we can remove smp_mb() from sock_poll_wait(). Also, the poll_does_not_wait() check before poll_wait() just adds the unnecessary confusion, kill it. poll_wait() does the same "p && p->_qproc" check. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162736.GA18944@redhat.com Signed-off-by: Christian Brauner --- include/net/sock.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..305f3ae5edc2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2291,7 +2291,7 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) } /** - * sock_poll_wait - place memory barrier behind the poll_wait call. + * sock_poll_wait - wrapper for the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table @@ -2301,15 +2301,12 @@ static inline bool skwq_has_sleeper(struct socket_wq *wq) static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { - if (!poll_does_not_wait(p)) { - poll_wait(filp, &sock->wq.wait, p); - /* We need to be sure we are in sync with the - * socket flags modification. - * - * This memory barrier is paired in the wq_has_sleeper. - */ - smp_mb(); - } + /* Provides a barrier we need to be sure we are in sync + * with the socket flags modification. + * + * This memory barrier is paired in the wq_has_sleeper. + */ + poll_wait(filp, &sock->wq.wait, p); } static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) -- cgit v1.2.3 From f005bf18a57aadf3af1e85a0f0151cb3688ee606 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:43 +0100 Subject: poll: kill poll_does_not_wait() It no longer has users. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162743.GA18947@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/poll.h b/include/linux/poll.h index 57b6d1ccd8bf..12bb18e8b978 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -25,14 +25,14 @@ struct poll_table_struct; -/* +/* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* - * Do not touch the structure directly, use the access functions - * poll_does_not_wait() and poll_requested_events() instead. 
+ * Do not touch the structure directly, use the access function + * poll_requested_events() instead. */ typedef struct poll_table_struct { poll_queue_proc _qproc; @@ -53,16 +53,6 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres } } -/* - * Return true if it is guaranteed that poll will not wait. This is the case - * if the poll() of another file descriptor in the set got an event, so there - * is no need for waiting. - */ -static inline bool poll_does_not_wait(const poll_table *p) -{ - return p == NULL || p->_qproc == NULL; -} - /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has -- cgit v1.2.3 From 30bca65bbbae13f32ee4f2897c55a496ea8132cf Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 7 Jan 2025 18:34:51 +0000 Subject: afs: Make /afs/@cell and /afs/.@cell symlinks Make /afs/@cell a symlink in the /afs dynamic root to match what other AFS clients do rather than doing a substitution in the dentry name. This has the bonus of being tab-expandable also. Further, provide a /afs/.@cell symlink to point to the dotted cell share. Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250107183454.608451-4-dhowells@redhat.com cc: Marc Dionne cc: linux-afs@lists.infradead.org Signed-off-by: Christian Brauner --- include/trace/events/afs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index a0aed1a428a1..de0e2301a037 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -168,12 +168,14 @@ enum yfs_cm_operation { #define afs_cell_traces \ EM(afs_cell_trace_alloc, "ALLOC ") \ EM(afs_cell_trace_free, "FREE ") \ + EM(afs_cell_trace_get_atcell, "GET atcell") \ EM(afs_cell_trace_get_queue_dns, "GET q-dns ") \ EM(afs_cell_trace_get_queue_manage, "GET q-mng ") \ EM(afs_cell_trace_get_queue_new, "GET q-new ") \ EM(afs_cell_trace_get_vol, "GET vol ") \ EM(afs_cell_trace_insert, "INSERT ") \ EM(afs_cell_trace_manage, "MANAGE ") \ + EM(afs_cell_trace_put_atcell, "PUT atcell") \ EM(afs_cell_trace_put_candidate, "PUT candid") \ EM(afs_cell_trace_put_destroy, "PUT destry") \ EM(afs_cell_trace_put_queue_work, "PUT q-work") \ -- cgit v1.2.3 From 1cd9502ee9275c6176a7312863f939cca9506114 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 29 Dec 2024 00:45:28 +0900 Subject: module: get symbol CRC back to unsigned Commit 71810db27c1c ("modversions: treat symbol CRCs as 32 bit quantities") changed the CRC fields to s32 because the __kcrctab and __kcrctab_gpl sections contained relative references to the actual CRC values stored in the .rodata section when CONFIG_MODULE_REL_CRCS=y. Commit 7b4537199a4a ("kbuild: link symbol CRCs at final link, removing CONFIG_MODULE_REL_CRCS") removed this complexity. Now, the __kcrctab and __kcrctab_gpl sections directly contain the CRC values in all cases. The genksyms tool outputs unsigned 32-bit CRC values, so u32 is preferred over s32. No functional changes are intended. Regardless of this change, the CRC value is assigned to the u32 variable 'crcval' before the comparison, as seen in kernel/module/version.c: crcval = *crc; It was previously mandatory (but now optional) in order to avoid sign extension because the following line previously compared 'unsigned long' and 's32': if (versions[i].crc == crcval) return 1; versions[i].crc is still 'unsigned long' for backward compatibility. 
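As a hypothetical illustration of the sign-extension point above (not taken from the patch):

	/* 'stored' mirrors versions[i].crc, which stays 'unsigned long'. */
	static bool crc_matches(unsigned long stored, const u32 *crc)
	{
		u32 crcval = *crc;	/* optional now; mandatory when *crc was s32,
					 * since a CRC like 0x80000000 would otherwise
					 * be sign-extended and never match */

		return stored == crcval;
	}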
Signed-off-by: Masahiro Yamada Reviewed-by: Petr Pavlu --- include/linux/module.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 94acbacdcdf1..903ef8fe4c04 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -430,7 +430,7 @@ struct module { /* Exported symbols */ const struct kernel_symbol *syms; - const s32 *crcs; + const u32 *crcs; unsigned int num_syms; #ifdef CONFIG_ARCH_USES_CFI_TRAPS @@ -448,7 +448,7 @@ struct module { /* GPL-only exported symbols. */ unsigned int num_gpl_syms; const struct kernel_symbol *gpl_syms; - const s32 *gpl_crcs; + const u32 *gpl_crcs; bool using_gplonly_symbols; #ifdef CONFIG_MODULE_SIG -- cgit v1.2.3 From 896be785015c0e0ba73442f73b8d4d9f5ccfc54c Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Wed, 4 Sep 2024 11:17:23 -0300 Subject: bus: fsl-mc: constify the struct device_type usage Since commit aed65af1cc2f ("drivers: make device_type const"), the driver core can properly handle constant struct device_type. Move all the device_type variables used in the bus to be constant structures as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Link: https://lore.kernel.org/r/20240904-class_cleanup-fsl-mc-bus-v2-1-83fa25cbdc68@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl/mc.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index c90ec889bfc2..99f30c7d6208 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -438,21 +438,21 @@ struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, extern const struct bus_type fsl_mc_bus_type; -extern struct device_type fsl_mc_bus_dprc_type; -extern struct device_type fsl_mc_bus_dpni_type; -extern struct device_type fsl_mc_bus_dpio_type; -extern struct device_type fsl_mc_bus_dpsw_type; -extern struct device_type fsl_mc_bus_dpbp_type; -extern struct device_type fsl_mc_bus_dpcon_type; -extern struct device_type fsl_mc_bus_dpmcp_type; -extern struct device_type fsl_mc_bus_dpmac_type; -extern struct device_type fsl_mc_bus_dprtc_type; -extern struct device_type fsl_mc_bus_dpseci_type; -extern struct device_type fsl_mc_bus_dpdmux_type; -extern struct device_type fsl_mc_bus_dpdcei_type; -extern struct device_type fsl_mc_bus_dpaiop_type; -extern struct device_type fsl_mc_bus_dpci_type; -extern struct device_type fsl_mc_bus_dpdmai_type; +extern const struct device_type fsl_mc_bus_dprc_type; +extern const struct device_type fsl_mc_bus_dpni_type; +extern const struct device_type fsl_mc_bus_dpio_type; +extern const struct device_type fsl_mc_bus_dpsw_type; +extern const struct device_type fsl_mc_bus_dpbp_type; +extern const struct device_type fsl_mc_bus_dpcon_type; +extern const struct device_type fsl_mc_bus_dpmcp_type; +extern const struct device_type fsl_mc_bus_dpmac_type; +extern const struct device_type fsl_mc_bus_dprtc_type; +extern const struct device_type fsl_mc_bus_dpseci_type; +extern const struct device_type fsl_mc_bus_dpdmux_type; +extern const struct device_type fsl_mc_bus_dpdcei_type; +extern const struct device_type fsl_mc_bus_dpaiop_type; +extern const struct device_type fsl_mc_bus_dpci_type; +extern const struct device_type fsl_mc_bus_dpdmai_type; static inline bool is_fsl_mc_bus_dprc(const struct fsl_mc_device *mc_dev) { 
-- cgit v1.2.3 From ab017a15fdb2222002cdc6bdf86699fb21a0721a Mon Sep 17 00:00:00 2001
From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:05 +0800
Subject: driver core: Rename declaration parameter name for API device_find_child() cluster

For the APIs: device_find_child() device_for_each_child() device_for_each_child_reverse()
their declarations use the parameter name 'dev', but their definitions change the name to 'parent'. Rename the declaration parameter to 'parent' so that declaration and definition use the same name.

Reviewed-by: Fan Ni Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu
Link: https://lore.kernel.org/r/20250105-class_fix-v6-4-3a2f1768d4d4@quicinc.com
Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h index 0e0bc9bfe0d1..a9d928398895 100644
--- a/include/linux/device.h +++ b/include/linux/device.h
@@ -1074,14 +1074,14 @@ void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T))
-int device_for_each_child(struct device *dev, void *data,
+int device_for_each_child(struct device *parent, void *data,
 int (*fn)(struct device *dev, void *data));
-int device_for_each_child_reverse(struct device *dev, void *data,
+int device_for_each_child_reverse(struct device *parent, void *data,
 int (*fn)(struct device *dev, void *data));
 int device_for_each_child_reverse_from(struct device *parent, struct device *from, const void *data, int (*fn)(struct device *, const void *));
-struct device *device_find_child(struct device *dev, const void *data,
+struct device *device_find_child(struct device *parent, const void *data,
 device_match_t match);
 struct device *device_find_child_by_name(struct device *parent, const char *name);
-- cgit v1.2.3 From 523c6b3ed7702a638e0f8fd02708a7ed4f938269 Mon Sep 17 00:00:00 2001
From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:07 +0800
Subject: driver core: Correct API device_for_each_child_reverse_from() prototype

For the API device_for_each_child_reverse_from(..., const void *data, int (*fn)(struct device *dev, const void *data))
- The type of @data is a const pointer, which means the caller's data @*data must not be modified, but that is usually not appropriate for such a non-finding device iterating API.
- The types of both @data and @fn are not consistent with all the other for_each device iterating APIs device_for_each_child(_reverse)(), bus_for_each_dev() and (driver|class)_for_each_device().

Correct its prototype by removing const from the parameter types, then adapt the various existing usages.

A dedicated typedef device_iter_t will be introduced later as the @fn() type for the various for_each device iterating APIs.
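For illustration, a hypothetical caller (not part of this patch) showing what the corrected prototype permits: the callback may now update the caller's data, which the const-qualified version forbade:

	static int foo_count_child(struct device *dev, void *data)
	{
		unsigned int *count = data;

		(*count)++;	/* legal now that @data is not const */
		return 0;	/* 0 means continue iterating */
	}

	/* ... somewhere in a driver: */
	unsigned int count = 0;

	device_for_each_child_reverse_from(parent, from, &count, foo_count_child);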
Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu
Link: https://lore.kernel.org/r/20250105-class_fix-v6-6-3a2f1768d4d4@quicinc.com
Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h index a9d928398895..025bac08fca7 100644
--- a/include/linux/device.h +++ b/include/linux/device.h
@@ -1079,8 +1079,8 @@ int device_for_each_child(struct device *parent, void *data, int (*fn)(struct device *dev, void *data));
 int device_for_each_child_reverse(struct device *parent, void *data, int (*fn)(struct device *dev, void *data));
 int device_for_each_child_reverse_from(struct device *parent,
- struct device *from, const void *data,
- int (*fn)(struct device *, const void *));
+ struct device *from, void *data,
+ int (*fn)(struct device *, void *));
 struct device *device_find_child(struct device *parent, const void *data, device_match_t match);
 struct device *device_find_child_by_name(struct device *parent,
-- cgit v1.2.3 From 767b74e0d1fc7890a94d1770acf05a442474bd87 Mon Sep 17 00:00:00 2001
From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:08 +0800
Subject: driver core: Introduce device_iter_t for device iterating APIs

There are several for_each APIs that have a parameter of the type below:

	int (*fn)(struct device *dev, void *data)

They iterate over various device lists and call @fn() for each device with caller-provided data @*data, and they usually need to modify @*data. Give the type a dedicated typedef, with the advantages shown below:

	typedef int (*device_iter_t)(struct device *dev, void *data)

- Shorter API declarations and definitions
- Prevents future for_each APIs from using a mismatched parameter type

So introduce device_iter_t and apply it to the various existing APIs below: bus_for_each_dev() (class|driver)_for_each_device() device_for_each_child(_reverse|_reverse_from)().
Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250105-class_fix-v6-7-3a2f1768d4d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 +++--- include/linux/device/bus.h | 7 +++++-- include/linux/device/class.h | 4 ++-- include/linux/device/driver.h | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 025bac08fca7..36d1a1607712 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1075,12 +1075,12 @@ void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) int device_for_each_child(struct device *parent, void *data, - int (*fn)(struct device *dev, void *data)); + device_iter_t fn); int device_for_each_child_reverse(struct device *parent, void *data, - int (*fn)(struct device *dev, void *data)); + device_iter_t fn); int device_for_each_child_reverse_from(struct device *parent, struct device *from, void *data, - int (*fn)(struct device *, void *)); + device_iter_t fn); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); struct device *device_find_child_by_name(struct device *parent, diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index bc3fd74bb763..3d3517da41a1 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -139,9 +139,12 @@ int device_match_acpi_dev(struct device *dev, const void *adev); int device_match_acpi_handle(struct device *dev, const void *handle); int device_match_any(struct device *dev, const void *unused); +/* Device iterating function type for various driver core for_each APIs */ +typedef int (*device_iter_t)(struct device *dev, void *data); + /* iterator helpers for buses */ -int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, - int (*fn)(struct device *dev, void *data)); +int bus_for_each_dev(const struct bus_type *bus, struct device *start, + void *data, device_iter_t fn); struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match); /** diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 518c9c83d64b..aa67d4736816 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -92,8 +92,8 @@ void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, struct device *class_dev_iter_next(struct class_dev_iter *iter); void class_dev_iter_exit(struct class_dev_iter *iter); -int class_for_each_device(const struct class *class, const struct device *start, void *data, - int (*fn)(struct device *dev, void *data)); +int class_for_each_device(const struct class *class, const struct device *start, + void *data, device_iter_t fn); struct device *class_find_device(const struct class *class, const struct device *start, const void *data, device_match_t match); diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 5c04b8e3833b..cd8e0f0a634b 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -154,7 +154,7 @@ void driver_remove_file(const struct device_driver *driver, int driver_set_override(struct device *dev, const char **override, const char *s, size_t len); int __must_check driver_for_each_device(struct device_driver *drv, struct device *start, - void *data, int (*fn)(struct device *dev, void *)); + void *data, device_iter_t fn); struct device *driver_find_device(const 
struct device_driver *drv, struct device *start, const void *data, device_match_t match); -- cgit v1.2.3 From 51796f5e2960130fe53e9a71d07152622d5e024c Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 5 Jan 2025 16:34:09 +0800 Subject: driver core: Move two simple APIs for finding child device to header The following two APIs are for finding child device, and both only have one line code in function body. device_find_child_by_name() device_find_any_child() Move them to header as static inline function. Reviewed-by: Jonathan Cameron Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250105-class_fix-v6-8-3a2f1768d4d4@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 36d1a1607712..1e9aded9a086 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1083,9 +1083,35 @@ int device_for_each_child_reverse_from(struct device *parent, device_iter_t fn); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); -struct device *device_find_child_by_name(struct device *parent, - const char *name); -struct device *device_find_any_child(struct device *parent); +/** + * device_find_child_by_name - device iterator for locating a child device. + * @parent: parent struct device + * @name: name of the child device + * + * This is similar to the device_find_child() function above, but it + * returns a reference to a device that has the name @name. + * + * NOTE: you will need to drop the reference with put_device() after use. + */ +static inline struct device *device_find_child_by_name(struct device *parent, + const char *name) +{ + return device_find_child(parent, name, device_match_name); +} + +/** + * device_find_any_child - device iterator for locating a child device, if any. + * @parent: parent struct device + * + * This is similar to the device_find_child() function above, but it + * returns a reference to a child device, if any. + * + * NOTE: you will need to drop the reference with put_device() after use. + */ +static inline struct device *device_find_any_child(struct device *parent) +{ + return device_find_child(parent, NULL, device_match_any); +} int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, -- cgit v1.2.3 From 9c96821b44f893fb63f021a28625d3b32c68e8b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:09 +0100 Subject: block: fix docs for freezing of queue limits updates queue_limits_commit_update is the function that needs to operate on a frozen queue, not queue_limits_start_update. Update the kerneldoc comments to reflect that. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250110054726.1499538-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5d40af2ef971..e781d4e6f92d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -944,8 +944,7 @@ static inline unsigned int blk_boundary_sectors_left(sector_t offset, * the caller can modify. 
The caller must call queue_limits_commit_update() * to finish the update. * - * Context: process context. The caller must have frozen the queue or ensured - * that there is outstanding I/O by other means. + * Context: process context. */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) -- cgit v1.2.3 From aa427d7b73b196f657d6d2cf0e94eff6b883fdef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:10 +0100 Subject: block: add a queue_limits_commit_update_frozen helper Add a helper that freezes the queue, updates the queue limits and unfreezes the queue and convert all open coded versions of that to the new helper. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e781d4e6f92d..13d353351c37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -952,6 +952,8 @@ queue_limits_start_update(struct request_queue *q) mutex_lock(&q->limits_lock); return q->limits; } +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim); int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); -- cgit v1.2.3 From 827ed8b1590d4d29dae837283d606709ffeebe37 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 19 Dec 2024 22:48:17 +0100 Subject: drivers: core: remove device_link argument from class_compat_[create|remove]_link After 7e722083fcc3 ("i2c: Remove I2C_COMPAT config symbol and related code") there's no caller left passing a non-null device_link argument. So remove this argument to simplify the code. Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/db49131d-fd79-4f23-93f2-0ab541a345fa@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index aa67d4736816..45ee3a634999 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -82,10 +82,8 @@ bool class_is_registered(const struct class *class); struct class_compat; struct class_compat *class_compat_register(const char *name); void class_compat_unregister(struct class_compat *cls); -int class_compat_create_link(struct class_compat *cls, struct device *dev, - struct device *device_link); -void class_compat_remove_link(struct class_compat *cls, struct device *dev, - struct device *device_link); +int class_compat_create_link(struct class_compat *cls, struct device *dev); +void class_compat_remove_link(struct class_compat *cls, struct device *dev); void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, const struct device *start, const struct device_type *type); -- cgit v1.2.3 From f1725160fd28a2e65e47166637aa44856a1a7f89 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 7 Jan 2025 13:25:10 +0100 Subject: devres: add devm_remove_action_nowarn() devm_remove_action() warns if the action to remove does not exist (anymore). 
The Rust devres abstraction, however, has a use-case to call devm_remove_action() at a point where it can't be guaranteed that the corresponding action hasn't been released yet. In particular, an instance of `Devres` may be dropped after the action has been released. So far, `Devres` worked around this by keeping the inner type alive. Hence, add devm_remove_action_nowarn(), which returns an error code if the action has been removed already. A subsequent patch uses devm_remove_action_nowarn() to remove the action when `Devres` is dropped. Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20250107122609.8135-1-dakr@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 1e9aded9a086..80a5b3268986 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -399,7 +399,23 @@ void __iomem *devm_of_iomap(struct device *dev, #endif /* allows to add/remove a custom action to devres stack */ -void devm_remove_action(struct device *dev, void (*action)(void *), void *data); +int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data); + +/** + * devm_remove_action() - removes previously added custom action + * @dev: Device that owns the action + * @action: Function implementing the action + * @data: Pointer to data passed to @action implementation + * + * Removes instance of @action previously added by devm_add_action(). + * Both action and data should match one of the existing entries. + */ +static inline +void devm_remove_action(struct device *dev, void (*action)(void *), void *data) +{ + WARN_ON(devm_remove_action_nowarn(dev, action, data)); +} + void devm_release_action(struct device *dev, void (*action)(void *), void *data); int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name); -- cgit v1.2.3 From 910ef438e93cd2ceb70c72caea418710d648feef Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 7 Jan 2025 22:33:00 +0106 Subject: serial: 8250: Provide flag for IER toggling for RS485 For RS485 mode, if SER_RS485_RX_DURING_TX is not available, the console ->write() callback needs to enable/disable Tx. It does this by calling the ->rs485_start_tx() and ->rs485_stop_tx() callbacks. However, some of these callbacks also disable/enable interrupts and makes power management calls. This causes 2 problems for console writing: 1. A console write can occur in contexts that are illegal for pm_runtime_*(). It is not even necessary for console writing to use pm_runtime_*() because a console already does this in serial8250_console_setup() and serial8250_console_exit(). 2. The console ->write() callback already handles disabling/enabling the interrupts by properly restoring the previous IER value. Add an argument @toggle_ier to the ->rs485_start_tx() and ->rs485_stop_tx() callbacks to specify if they may disable/enable receive interrupts while using pm_runtime_*(). Console writing will not allow the toggling. For all call sites other than console writing there is no functional change. 
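A hypothetical driver callback honouring the new argument might look like this (the foo_* helper is illustrative, not taken from an existing driver):

	static void foo_rs485_stop_tx(struct uart_8250_port *up, bool toggle_ier)
	{
		foo_disable_de_pin(up);		/* drop the RS485 driver-enable pin */

		if (toggle_ier) {
			/* Normal (non-console) path: re-enable receive interrupts. */
			up->ier |= UART_IER_RLSI | UART_IER_RDI;
			serial_port_out(&up->port, UART_IER, up->ier);
		}
		/* Console writes pass toggle_ier=false and restore IER themselves. */
	}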
Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250107212702.169493-5-john.ogness@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index e0717c8393d7..144de7a7948d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -161,8 +161,8 @@ struct uart_8250_port { void (*dl_write)(struct uart_8250_port *up, u32 value); struct uart_8250_em485 *em485; - void (*rs485_start_tx)(struct uart_8250_port *); - void (*rs485_stop_tx)(struct uart_8250_port *); + void (*rs485_start_tx)(struct uart_8250_port *up, bool toggle_ier); + void (*rs485_stop_tx)(struct uart_8250_port *up, bool toggle_ier); /* Serial port overrun backoff */ struct delayed_work overrun_backoff; -- cgit v1.2.3 From b63e6f60eab45b16a1bf734fef9035a4c4187cd5 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 7 Jan 2025 22:33:01 +0106 Subject: serial: 8250: Switch to nbcon console Implement the necessary callbacks to switch the 8250 console driver to perform as an nbcon console. Add implementations for the nbcon console callbacks: ->write_atomic() ->write_thread() ->device_lock() ->device_unlock() and add CON_NBCON to the initial @flags. All register access in the callbacks are within unsafe sections. The ->write_atomic() and ->write_thread() callbacks allow safe handover/takeover per byte and add a preceding newline if they take over from another context mid-line. For the ->write_atomic() callback, a new irq_work is used to defer modem control since it may be called from a context that does not allow waking up tasks. Note: A new __serial8250_clear_IER() is introduced for direct clearing of UART_IER. This will allow to restore the lockdep check to serial8250_clear_IER() in a follow-up commit. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Tested-by: Petr Mladek Link: https://lore.kernel.org/r/20250107212702.169493-6-john.ogness@linutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 144de7a7948d..57875c37023a 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -150,8 +150,17 @@ struct uart_8250_port { #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS u16 lsr_saved_flags; u16 lsr_save_mask; + + /* + * Track when a console line has been fully written to the + * hardware, i.e. true when the most recent byte written to + * UART_TX by the console was '\n'. 
+ */ + bool console_line_ended; + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + struct irq_work modem_status_work; struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -202,8 +211,8 @@ void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); -void serial8250_console_write(struct uart_8250_port *up, const char *s, - unsigned int count); +void serial8250_console_write(struct uart_8250_port *up, + struct nbcon_write_context *wctxt, bool in_atomic); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -- cgit v1.2.3 From e364374369b365351ad8ad69a10b5f7861f24bcd Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Thu, 9 Jan 2025 20:38:07 +0100 Subject: VMCI: fix reference to ioctl-number.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There has never been an ioctl-number.h — this must have been a typo for ioctl-number.txt (which later become ioctl-number.rst). At the time this comment was written, the note didn't actually end up appearing anywhere, but I fixed the omission from ioctl-number.rst in 0a8e4dc1d353 ("Documentation: ioctl: document 0x07 ioctl code"). Fixes: 20259849bb1a ("VMCI: Some header and config files.") Signed-off-by: Alyssa Ross Link: https://lore.kernel.org/r/re3xng4uwull2cu53xnu5dtv3wlstfiv3v7rmbwtw2qbvj5mo3@q45iujse5ovc Signed-off-by: Greg Kroah-Hartman --- include/linux/vmw_vmci_defs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/vmw_vmci_defs.h b/include/linux/vmw_vmci_defs.h index c2df94696593..60c9eacd2cf3 100644 --- a/include/linux/vmw_vmci_defs.h +++ b/include/linux/vmw_vmci_defs.h @@ -431,11 +431,11 @@ enum { ((((_p)[0] & 0xFF) << 24) | (((_p)[1] & 0xFF) << 16) | ((_p)[2])) /* - * The VMCI IOCTLs. We use identity code 7, as noted in ioctl-number.h, and - * we start at sequence 9f. This gives us the same values that our shipping - * products use, starting at 1951, provided we leave out the direction and - * structure size. Note that VMMon occupies the block following us, starting - * at 2001. + * The VMCI IOCTLs. We use identity code 7, as noted in ioctl-number.rst, + * and we start at sequence 9f. This gives us the same values that our + * shipping products use, starting at 1951, provided we leave out the + * direction and structure size. Note that VMMon occupies the block + * following us, starting at 2001. */ #define IOCTL_VMCI_VERSION _IO(7, 0x9f) /* 1951 */ #define IOCTL_VMCI_INIT_CONTEXT _IO(7, 0xa0) -- cgit v1.2.3 From 226d6cb3cb799aae46d0dd19a521133997d9db11 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:22 +0100 Subject: spi: spi-mem: Estimate the time taken by operations In the SPI-NAND layer, we currently make list of operation variants from the fastest one to the slowest and there is a bit of logic in the core to go over them and pick the first one that is supported by the controller, ie. the fastest one among the supported ops. This kind of logic only works if all operations run at the same frequency, but as soon as we introduce per operation max frequencies it is not longer as obvious which operation will be faster, especially since it also depends on the PCB/controller frequency limitation. 
One way to make this choice more clever is to go over all the variants and for each of them derive an indicator which will help derive the theoretical best. In this case, we derive a theoretical duration for the entire operation and we take the smallest one. Add a helper that parses the spi-mem operation and returns this value. Signed-off-by: Miquel Raynal Link: https://patch.msgid.link/20250110-winbond-6-11-rc1-quad-support-v3-20-7ab4bd56cf6e@bootlin.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 306c05dd1378..c4830dfaff3d 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -424,6 +424,7 @@ bool spi_mem_default_supports_op(struct spi_mem *mem, int spi_mem_adjust_op_size(struct spi_mem *mem, struct spi_mem_op *op); void spi_mem_adjust_op_freq(struct spi_mem *mem, struct spi_mem_op *op); +u64 spi_mem_calc_op_duration(struct spi_mem_op *op); bool spi_mem_supports_op(struct spi_mem *mem, const struct spi_mem_op *op); -- cgit v1.2.3 From d7476f24c9aa93d02ef3fd8d587a6114387b7667 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 3 Jan 2025 20:45:38 +0000 Subject: export: Add __gendwarfksyms_ptr_ references to exported symbols With gendwarfksyms, we need each TU where the EXPORT_SYMBOL() macro is used to also contain DWARF type information for the symbols it exports. However, as a TU can also export external symbols and compilers may choose not to emit debugging information for symbols not defined in the current TU, the missing types will result in missing symbol versions. Stand-alone assembly code also doesn't contain type information for exported symbols, so we need to compile a temporary object file with asm-prototypes.h instead, and similarly need to ensure the DWARF in the temporary object file contains the necessary types. To always emit type information for external exports, add explicit __gendwarfksyms_ptr_ references to them in EXPORT_SYMBOL(). gendwarfksyms will use the type information for __gendwarfksyms_ptr_* if needed. Discard the pointers from the final binary to avoid further bloat. Signed-off-by: Sami Tolvanen Signed-off-by: Masahiro Yamada --- include/linux/export.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/export.h b/include/linux/export.h index 2633df4d31e6..a8c23d945634 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -52,9 +52,24 @@ #else +#ifdef CONFIG_GENDWARFKSYMS +/* + * With CONFIG_GENDWARFKSYMS, ensure the compiler emits debugging + * information for all exported symbols, including those defined in + * different TUs, by adding a __gendwarfksyms_ptr_ pointer + * that's discarded during the final link. 
+ */ +#define __GENDWARFKSYMS_EXPORT(sym) \ + static typeof(sym) *__gendwarfksyms_ptr_##sym __used \ + __section(".discard.gendwarfksyms") = &sym; +#else +#define __GENDWARFKSYMS_EXPORT(sym) +#endif + #define __EXPORT_SYMBOL(sym, license, ns) \ extern typeof(sym) sym; \ __ADDRESSABLE(sym) \ + __GENDWARFKSYMS_EXPORT(sym) \ asm(__stringify(___EXPORT_SYMBOL(sym, license, ns))) #endif -- cgit v1.2.3 From 7579790915387396e26041ceafbc07348658edef Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Fri, 10 Jan 2025 12:33:25 +0100 Subject: ALSA: hda: Fix compilation of snd_hdac_adsp_xxx() helpers The snd_hdac_adsp_xxx() wrap snd_hdac_reg_xxx() helpers to simplify register access for AudioDSP drivers e.g.: the avs-driver. Byte- and word-variants of said helps do not expand to bare readx/writex() operations but functions instead and, due to pointer type incompatibility, cause compilation to fail. As the macros are utilized by the avs-driver alone, relocate the code introduced with commit c19bd02e9029 ("ALSA: hda: Add helper macros for DSP capable devices") into the avs/ directory and update it to operate on 'adev' i.e.: the avs-driver-context directly to fix the issue. Fixes: c19bd02e9029 ("ALSA: hda: Add helper macros for DSP capable devices") Signed-off-by: Cezary Rojewski Acked-by: Mark Brown Link: https://patch.msgid.link/20250110113326.3809897-2-cezary.rojewski@intel.com Signed-off-by: Takashi Iwai --- include/sound/hdaudio_ext.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) (limited to 'include') diff --git a/include/sound/hdaudio_ext.h b/include/sound/hdaudio_ext.h index 957295364a5e..4c7a40e149a5 100644 --- a/include/sound/hdaudio_ext.h +++ b/include/sound/hdaudio_ext.h @@ -2,8 +2,6 @@ #ifndef __SOUND_HDAUDIO_EXT_H #define __SOUND_HDAUDIO_EXT_H -#include -#include #include int snd_hdac_ext_bus_init(struct hdac_bus *bus, struct device *dev, @@ -119,49 +117,6 @@ int snd_hdac_ext_bus_link_put(struct hdac_bus *bus, struct hdac_ext_link *hlink) void snd_hdac_ext_bus_link_power(struct hdac_device *codec, bool enable); -#define snd_hdac_adsp_writeb(chip, reg, value) \ - snd_hdac_reg_writeb(chip, (chip)->dsp_ba + (reg), value) -#define snd_hdac_adsp_readb(chip, reg) \ - snd_hdac_reg_readb(chip, (chip)->dsp_ba + (reg)) -#define snd_hdac_adsp_writew(chip, reg, value) \ - snd_hdac_reg_writew(chip, (chip)->dsp_ba + (reg), value) -#define snd_hdac_adsp_readw(chip, reg) \ - snd_hdac_reg_readw(chip, (chip)->dsp_ba + (reg)) -#define snd_hdac_adsp_writel(chip, reg, value) \ - snd_hdac_reg_writel(chip, (chip)->dsp_ba + (reg), value) -#define snd_hdac_adsp_readl(chip, reg) \ - snd_hdac_reg_readl(chip, (chip)->dsp_ba + (reg)) -#define snd_hdac_adsp_writeq(chip, reg, value) \ - snd_hdac_reg_writeq(chip, (chip)->dsp_ba + (reg), value) -#define snd_hdac_adsp_readq(chip, reg) \ - snd_hdac_reg_readq(chip, (chip)->dsp_ba + (reg)) - -#define snd_hdac_adsp_updateb(chip, reg, mask, val) \ - snd_hdac_adsp_writeb(chip, reg, \ - (snd_hdac_adsp_readb(chip, reg) & ~(mask)) | (val)) -#define snd_hdac_adsp_updatew(chip, reg, mask, val) \ - snd_hdac_adsp_writew(chip, reg, \ - (snd_hdac_adsp_readw(chip, reg) & ~(mask)) | (val)) -#define snd_hdac_adsp_updatel(chip, reg, mask, val) \ - snd_hdac_adsp_writel(chip, reg, \ - (snd_hdac_adsp_readl(chip, reg) & ~(mask)) | (val)) -#define snd_hdac_adsp_updateq(chip, reg, mask, val) \ - snd_hdac_adsp_writeq(chip, reg, \ - (snd_hdac_adsp_readq(chip, reg) & ~(mask)) | (val)) - -#define snd_hdac_adsp_readb_poll(chip, reg, val, cond, delay_us, timeout_us) 
\ - readb_poll_timeout((chip)->dsp_ba + (reg), val, cond, \ - delay_us, timeout_us) -#define snd_hdac_adsp_readw_poll(chip, reg, val, cond, delay_us, timeout_us) \ - readw_poll_timeout((chip)->dsp_ba + (reg), val, cond, \ - delay_us, timeout_us) -#define snd_hdac_adsp_readl_poll(chip, reg, val, cond, delay_us, timeout_us) \ - readl_poll_timeout((chip)->dsp_ba + (reg), val, cond, \ - delay_us, timeout_us) -#define snd_hdac_adsp_readq_poll(chip, reg, val, cond, delay_us, timeout_us) \ - readq_poll_timeout((chip)->dsp_ba + (reg), val, cond, \ - delay_us, timeout_us) - struct hdac_ext_device; /* ops common to all codec drivers */ -- cgit v1.2.3 From 815940bb7db7b37d058d76a575827287eebe39ce Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Fri, 3 Jan 2025 10:02:04 +0200 Subject: scsi: Revert "scsi: ufs: core: Probe for EXT_IID support" This reverts commit 6e1d850acff9477ae4c18a73c19ef52841ac2010. Although added a while ago, to date no one make use of ext_iid, specifically incorporates it in the upiu header. Therefore, remove it as it is currently unused and not serving any purpose. Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20250103080204.63951-1-avri.altman@wdc.com Cc: Can Guo Cc: Asutosh Das Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- include/ufs/ufs.h | 5 ----- include/ufs/ufshcd.h | 2 -- include/ufs/ufshci.h | 5 ----- 3 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index e594abe5d05f..89672ad8c3bb 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -180,7 +180,6 @@ enum attr_idn { QUERY_ATTR_IDN_AVAIL_WB_BUFF_SIZE = 0x1D, QUERY_ATTR_IDN_WB_BUFF_LIFE_TIME_EST = 0x1E, QUERY_ATTR_IDN_CURR_WB_BUFF_SIZE = 0x1F, - QUERY_ATTR_IDN_EXT_IID_EN = 0x2A, QUERY_ATTR_IDN_TIMESTAMP = 0x30 }; @@ -391,7 +390,6 @@ enum { UFS_DEV_EXT_TEMP_NOTIF = BIT(6), UFS_DEV_HPB_SUPPORT = BIT(7), UFS_DEV_WRITE_BOOSTER_SUP = BIT(8), - UFS_DEV_EXT_IID_SUP = BIT(16), }; #define UFS_DEV_HPB_SUPPORT_VERSION 0x310 @@ -585,9 +583,6 @@ struct ufs_dev_info { bool b_advanced_rpmb_en; - /* UFS EXT_IID Enable */ - bool b_ext_iid_en; - /* UFS RTC */ enum ufs_rtc_time rtc_type; time64_t rtc_time_baseline; diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index da0fa5c65081..650ff238cd74 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -952,7 +952,6 @@ enum ufshcd_mcq_opr { * @nr_queues: number of Queues of different queue types * @complete_put: whether or not to call ufshcd_rpm_put() from inside * ufshcd_resume_complete() - * @ext_iid_sup: is EXT_IID is supported by UFSHC * @mcq_sup: is mcq supported by UFSHC * @mcq_enabled: is mcq ready to accept requests * @res: array of resource info of MCQ registers @@ -1118,7 +1117,6 @@ struct ufs_hba { unsigned int nr_hw_queues; unsigned int nr_queues[HCTX_MAX_TYPES]; bool complete_put; - bool ext_iid_sup; bool scsi_host_added; bool mcq_sup; bool lsdb_sup; diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h index 27364c4a6ef9..612500a7088f 100644 --- a/include/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -82,11 +82,6 @@ enum { MASK_MCQ_SUPPORT = 0x40000000, }; -/* MCQ capability mask */ -enum { - MASK_EXT_IID_SUPPORT = 0x00000400, -}; - enum { REG_SQATTR = 0x0, REG_SQLBA = 0x4, -- cgit v1.2.3 From 94d57442e56d2ad2ca20d096040b8ae6f216a921 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 5 Dec 2024 11:51:09 +0530 Subject: io_uring: expose read/write attribute capability After commit 9a213d3b80c0, we can pass additional attributes along with read/write. 
However, userspace doesn't know that. Add a new feature flag IORING_FEAT_RW_ATTR, to notify the userspace that the kernel has this ability. Signed-off-by: Anuj Gupta Reviewed-by: Li Zetao Reviewed-by: Martin K. Petersen Tested-by: Martin K. Petersen Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20241205062109.1788-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 38f0d6b10eaf..e11c82638527 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -577,6 +577,7 @@ struct io_uring_params { #define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_MIN_TIMEOUT (1U << 15) +#define IORING_FEAT_RW_ATTR (1U << 16) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From cf337105ad38564d7855151889a7315da73119d0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 8 Jan 2025 16:47:33 +0000 Subject: net: phy: add configuration of rx clock stop mode Add a function to allow configuration of the PCS's clock stop enable bit, used to configure whether the xMII receive clock can be stopped during LPI mode. Reviewed-by: Andrew Lunn Tested-by: Choong Yong Liang Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1tVZDR-0002Jl-Ry@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 5bc71d59910c..4875465653ca 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2096,6 +2096,7 @@ int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); +int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); -- cgit v1.2.3 From 5ef44b3cb43bda4009ba7fe3d54e0d258ae4aee7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 8 Jan 2025 16:34:36 -0800 Subject: xsk: Bring back busy polling support Commit 86e25f40aa1e ("net: napi: Add napi_config") moved napi->napi_id assignment to a later point in time (napi_hash_add_with_id). This breaks __xdp_rxq_info_reg which copies napi_id at an earlier time and now stores 0 napi_id. It also makes sk_mark_napi_id_once_xdp and __sk_mark_napi_id_once useless because they now work against 0 napi_id. Since sk_busy_loop requires valid napi_id to busy-poll on, there is no way to busy-poll AF_XDP sockets anymore. Bring back the ability to busy-poll on XSK by resolving socket's napi_id at bind time. This relies on relatively recent netif_queue_set_napi, but (assume) at this point most popular drivers should have been converted. This also removes per-tx/rx cycles which used to check and/or set the napi_id value. Confirmed by running a busy-polling AF_XDP socket (github.com/fomichev/xskrtt) on mlx5 and looking at BusyPollRxPackets from /proc/net/netstat. 
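For context, a sketch of the userspace opt-in this fix makes work again (values are illustrative, error handling omitted):

	int on = 1, usecs = 20, budget = 64;

	setsockopt(xsk_fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &on, sizeof(on));
	setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
	setsockopt(xsk_fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget));
	/* With this fix the AF_XDP socket resolves the queue's napi_id at
	 * bind time again, so poll()/recvfrom() on xsk_fd can busy-poll. */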
Fixes: 86e25f40aa1e ("net: napi: Add napi_config") Signed-off-by: Stanislav Fomichev Acked-by: Magnus Karlsson Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250109003436.2829560-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 8 -------- include/net/xdp.h | 1 - include/net/xdp_sock_drv.h | 14 -------------- 3 files changed, 23 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c858270141bc..c39a426ebf52 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -174,12 +174,4 @@ static inline void sk_mark_napi_id_once(struct sock *sk, #endif } -static inline void sk_mark_napi_id_once_xdp(struct sock *sk, - const struct xdp_buff *xdp) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); -#endif -} - #endif /* _LINUX_NET_BUSY_POLL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c91..b5b10f2b88e5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -62,7 +62,6 @@ struct xdp_rxq_info { u32 queue_index; u32 reg_state; struct xdp_mem_info mem; - unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd9160..7a7316d9c0da 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -59,15 +59,6 @@ static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, xp_fill_cb(pool, desc); } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) -{ -#ifdef CONFIG_NET_RX_BUSY_POLL - return pool->heads[0].xdp.rxq->napi_id; -#else - return 0; -#endif -} - static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { @@ -306,11 +297,6 @@ static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, { } -static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) -{ - return 0; -} - static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { -- cgit v1.2.3 From 21520e74ba454c549f4f732d014f180f8c0c041c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 9 Jan 2025 16:49:24 -0800 Subject: net: hide the definition of dev_get_by_napi_id() There are no module callers of dev_get_by_napi_id(), and commit d1cacd747768 ("netdev: prevent accessing NAPI instances from another namespace") proves that getting NAPI by id needs to be done with care. So hide dev_get_by_napi_id(). 
Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250110004924.3212260-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1812564b5204..aeb4a6cff171 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3252,7 +3252,6 @@ struct net_device *netdev_get_by_index(struct net *net, int ifindex, struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); -struct net_device *dev_get_by_napi_id(unsigned int napi_id); void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 30e77e0fbec6940ecc5c79ffe0f076c54cf5a8d9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:34 +0900 Subject: nvme: Move opcode string helper functions declarations Move the declaration of all helper functions converting NVMe command opcodes and status codes into strings from drivers/nvme/host/nvme.h into include/linux/nvme.h, together with the commands definitions. This allows NVMe target drivers to call these functions without having to include a host header file. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 13377dde4527..a5a4ee56efcf 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1896,6 +1896,46 @@ static inline bool nvme_is_fabrics(const struct nvme_command *cmd) return cmd->common.opcode == nvme_fabrics_command; } +#ifdef CONFIG_NVME_VERBOSE_ERRORS +const char *nvme_get_error_status_str(u16 status); +const char *nvme_get_opcode_str(u8 opcode); +const char *nvme_get_admin_opcode_str(u8 opcode); +const char *nvme_get_fabrics_opcode_str(u8 opcode); +#else /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const char *nvme_get_error_status_str(u16 status) +{ + return "I/O Error"; +} +static inline const char *nvme_get_opcode_str(u8 opcode) +{ + return "I/O Cmd"; +} +static inline const char *nvme_get_admin_opcode_str(u8 opcode) +{ + return "Admin Cmd"; +} + +static inline const char *nvme_get_fabrics_opcode_str(u8 opcode) +{ + return "Fabrics Cmd"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ + +static inline const char *nvme_opcode_str(int qid, u8 opcode) +{ + return qid ? nvme_get_opcode_str(opcode) : + nvme_get_admin_opcode_str(opcode); +} + +static inline const char *nvme_fabrics_opcode_str( + int qid, const struct nvme_command *cmd) +{ + if (nvme_is_fabrics(cmd)) + return nvme_get_fabrics_opcode_str(cmd->fabrics.fctype); + + return nvme_opcode_str(qid, cmd->common.opcode); +} + struct nvme_error_slot { __le64 error_count; __le16 sqid; -- cgit v1.2.3 From 200adac75888182c09027e9b7852507dabd87034 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:39 +0900 Subject: nvme: Add PCI transport type Define the transport type NVMF_TRTYPE_PCI for PCI endpoint targets. This transport type is defined using the value 0 which is reserved in the NVMe base specifications v2.1 (Figure 294). 
Given that struct nvmet_port are zeroed out on creation, to avoid having this transsport type becoming the new default, nvmet_referral_make() and nvmet_ports_make() are modified to initialize a port discovery address transport type field (disc_addr.trtype) to NVMF_TRTYPE_MAX. Any port using this transport type is also skipped and not reported in the discovery log page (nvmet_execute_disc_get_log_page()). The helper function nvmet_is_pci_ctrl() is also introduced to check if a target controller uses the PCI transport. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a5a4ee56efcf..42fc00dc494e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -64,6 +64,7 @@ enum { /* Transport Type codes for Discovery Log Page entry TRTYPE field */ enum { + NVMF_TRTYPE_PCI = 0, /* PCI */ NVMF_TRTYPE_RDMA = 1, /* RDMA */ NVMF_TRTYPE_FC = 2, /* Fibre Channel */ NVMF_TRTYPE_TCP = 3, /* TCP/IP */ -- cgit v1.2.3 From 2f2b20fad973d00169d24f5338eb1bf0a42e8218 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:46 +0900 Subject: nvmet: Implement host identifier set feature support The NVMe specifications mandate support for the host identifier set_features for controllers that also supports reservations. Satisfy this requirement by implementing handling of the NVME_FEAT_HOST_ID feature for the nvme_set_features command. This implementation is for now effective only for PCI target controllers. For other controller types, the set features command is failed with a NVME_SC_CMD_SEQ_ERROR status as before. As noted in the code, 128 bits host identifiers are supported since the NVMe base specifications version 2.1 indicate in section 5.1.25.1.28.1 that "The controller may support a 64-bit Host Identifier...". The RHII (Reservations and Host Identifier Interaction) bit of the controller attribute (ctratt) field of the identify controller data is also set to indicate that a host ID of "0" is supported but that the host ID must be a non-zero value to use reservations. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 42fc00dc494e..fe3b60818fdc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -276,6 +276,7 @@ enum nvme_ctrl_attr { NVME_CTRL_ATTR_HID_128_BIT = (1 << 0), NVME_CTRL_ATTR_TBKAS = (1 << 6), NVME_CTRL_ATTR_ELBAS = (1 << 15), + NVME_CTRL_ATTR_RHII = (1 << 18), }; struct nvme_id_ctrl { -- cgit v1.2.3 From 0f52b4db4f91320569311b97a1a14a18fb8ff256 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:04 +0100 Subject: rcu/kvfree: Initialize kvfree_rcu() separately Introduce a separate initialization of kvfree_rcu() functionality. For such purpose a kfree_rcu_batch_init() is renamed to a kvfree_rcu_init() and it is invoked from the main.c right after rcu_init() is done. 
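A simplified sketch of the resulting ordering in main.c (surrounding calls omitted):

	rcu_init();
	kvfree_rcu_init();	/* new: separate kvfree_rcu() setup, right after RCU */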
Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..acb0095b4dbe 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,6 +118,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) /* Internal to kernel */ void rcu_init(void); +void __init kvfree_rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -- cgit v1.2.3 From bbe658d6580251d1832d408fa8a71ec254dc4416 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 12 Dec 2024 19:02:08 +0100 Subject: mm/slab: Move kvfree_rcu() into SLAB Move kvfree_rcu() functionality to the slab_common.c file. The reason to have kvfree_rcu() functionality as part of SLAB is that there is a clear trend and need of closer integration. One of the recent example is creating a barrier function for SLAB caches. Another reason is to prevent of having several implementations of RCU machinery for reclaiming objects after a GP. As future steps, it can be more integrated(easier) with SLAB internals. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Hyeonggon Yoo Tested-by: Hyeonggon Yoo Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 1 - include/linux/slab.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index acb0095b4dbe..48e5c03df1dd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -118,7 +118,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) /* Internal to kernel */ void rcu_init(void); -void __init kvfree_rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); diff --git a/include/linux/slab.h b/include/linux/slab.h index 10a971c2bde3..09eedaecf120 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1099,5 +1099,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s); size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); +void __init kvfree_rcu_init(void); #endif /* _LINUX_SLAB_H */ -- cgit v1.2.3 From 875aec2357cd22d412226e4134d1106248b0eb9d Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 13 Dec 2024 10:08:23 -0800 Subject: kunit: platform: Resolve 'struct completion' warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the header is included in a test without certain other headers, it produces compiler warnings like: In file included from [...] ../include/kunit/platform_device.h:15:57: warning: ‘struct completion’ declared inside parameter list will not be visible outside of this definition or declaration 15 | struct completion *x); | ^~~~~~~~~~ Add a 'struct completion' forward declaration to resolve this. 
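A hypothetical test source that used to trigger the warning (illustrative, not from the patch):

	#include <kunit/platform_device.h>	/* no <linux/completion.h> pulled in first */
	#include <kunit/test.h>

	/* Without the forward declaration, the 'struct completion *' parameter
	 * in platform_device.h names a type the compiler has not seen yet,
	 * producing the "declared inside parameter list" warning quoted above. */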
Signed-off-by: Brian Norris Reviewed-by: David Gow Signed-off-by: Shuah Khan --- include/kunit/platform_device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kunit/platform_device.h b/include/kunit/platform_device.h index 0fc0999d2420..f8236a8536f7 100644 --- a/include/kunit/platform_device.h +++ b/include/kunit/platform_device.h @@ -2,6 +2,7 @@ #ifndef _KUNIT_PLATFORM_DRIVER_H #define _KUNIT_PLATFORM_DRIVER_H +struct completion; struct kunit; struct platform_device; struct platform_driver; -- cgit v1.2.3 From 387bef82d0b4afd4c7430b52c4971649a5cf3b06 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 9 Jan 2025 22:42:28 +0200 Subject: net/mlx5: Update mlx5_ifc to support FEC for 200G per lane link modes Add FEC admin and override related fields in PPLM, and the bit in PCAM to indicate those fields are supported. Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-2-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 43b3cb4bf8d1..c3da1581853c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10150,7 +10150,21 @@ struct mlx5_ifc_pplm_reg_bits { u8 fec_override_admin_200g_2x[0x10]; u8 fec_override_admin_100g_1x[0x10]; - u8 reserved_at_260[0x20]; + u8 reserved_at_260[0x60]; + + u8 fec_override_cap_1600g_8x[0x10]; + u8 fec_override_cap_800g_4x[0x10]; + + u8 fec_override_cap_400g_2x[0x10]; + u8 fec_override_cap_200g_1x[0x10]; + + u8 fec_override_admin_1600g_8x[0x10]; + u8 fec_override_admin_800g_4x[0x10]; + + u8 fec_override_admin_400g_2x[0x10]; + u8 fec_override_admin_200g_1x[0x10]; + + u8 reserved_at_340[0x80]; }; struct mlx5_ifc_ppcnt_reg_bits { @@ -10524,7 +10538,9 @@ struct mlx5_ifc_mtutc_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x48]; + u8 reserved_at_0[0x1d]; + u8 fec_200G_per_lane_in_pplm[0x1]; + u8 reserved_at_1e[0x2a]; u8 fec_100G_per_lane_in_pplm[0x1]; u8 reserved_at_49[0x1f]; u8 fec_50G_per_lane_in_pplm[0x1]; -- cgit v1.2.3 From e2685ef5f56295249bf98bc6603d3c092fe0ce56 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 9 Jan 2025 22:42:29 +0200 Subject: net/mlx5: Add support for MRTCQ register Management Real Time Clock Query (MRTCQ) register is used to query hardware clock identity. 
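A hedged, driver-side sketch of how the new register could be queried with the existing access-register helper (mlx5_core_access_reg() and the MLX5_ST_SZ_DW()/MLX5_GET64() macros are existing mlx5 infrastructure; the wrapper function below is hypothetical):

    /* Hypothetical helper: read the real time clock identity via MRTCQ. */
    static int mlx5_query_rt_clock_identity(struct mlx5_core_dev *mdev, u64 *identity)
    {
            u32 in[MLX5_ST_SZ_DW(mrtcq_reg)] = {};
            u32 out[MLX5_ST_SZ_DW(mrtcq_reg)] = {};
            int err;

            err = mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out),
                                       MLX5_REG_MRTCQ, 0, 0 /* read */);
            if (err)
                    return err;

            *identity = MLX5_GET64(mrtcq_reg, out, rt_clock_identity);
            return 0;
    }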
Signed-off-by: Jianbo Liu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-3-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f6fe29bc4be 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -160,6 +160,7 @@ enum { MLX5_REG_MIRC = 0x9162, MLX5_REG_MTPTM = 0x9180, MLX5_REG_MTCTR = 0x9181, + MLX5_REG_MRTCQ = 0x9182, MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, MLX5_REG_DTOR = 0xC00E, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c3da1581853c..221146278ac8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10680,7 +10680,8 @@ struct mlx5_ifc_mcam_access_reg_bits3 { u8 regs_63_to_32[0x20]; - u8 regs_31_to_2[0x1e]; + u8 regs_31_to_3[0x1d]; + u8 mrtcq[0x1]; u8 mtctr[0x1]; u8 mtptm[0x1]; }; @@ -13171,4 +13172,12 @@ struct mlx5_ifc_msees_reg_bits { u8 reserved_at_80[0x180]; }; +struct mlx5_ifc_mrtcq_reg_bits { + u8 reserved_at_0[0x40]; + + u8 rt_clock_identity[0x40]; + + u8 reserved_at_80[0x180]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From df75ad562a6f9ae6add42d56e228aa973b421421 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 9 Jan 2025 22:42:30 +0200 Subject: net/mlx5: SHAMPO: Introduce new SHAMPO specific HCA caps Read and cache SHAMPO specific caps for header data split capabilities. Will be used in downstream patch. Signed-off-by: Saeed Mahameed Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-4-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 4 ++++ include/linux/mlx5/mlx5_ifc.h | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index cc647992f3d1..0c48b20f818a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1245,6 +1245,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, MLX5_CAP_CRYPTO = 0x1a, + MLX5_CAP_SHAMPO = 0x1d, MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, @@ -1470,6 +1471,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_MACSEC(mdev, cap)\ MLX5_GET(macsec_cap, (mdev)->caps.hca[MLX5_CAP_MACSEC]->cur, cap) +#define MLX5_CAP_SHAMPO(mdev, cap) \ + MLX5_GET(shampo_cap, mdev->caps.hca[MLX5_CAP_SHAMPO]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 221146278ac8..d7c91f152735 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2327,7 +2327,9 @@ struct mlx5_ifc_wq_bits { u8 headers_mkey[0x20]; u8 shampo_enable[0x1]; - u8 reserved_at_1e1[0x4]; + u8 reserved_at_1e1[0x1]; + u8 shampo_mode[0x2]; + u8 reserved_at_1e4[0x1]; u8 log_reservation_size[0x3]; u8 reserved_at_1e8[0x5]; u8 log_max_num_of_packets_per_reservation[0x3]; @@ -3699,6 +3701,22 @@ struct mlx5_ifc_crypto_cap_bits { u8 reserved_at_80[0x780]; }; +struct mlx5_ifc_shampo_cap_bits { + u8 reserved_at_0[0x3]; + u8 shampo_log_max_reservation_size[0x5]; + u8 reserved_at_8[0x3]; + u8 
shampo_log_min_reservation_size[0x5]; + u8 shampo_min_mss_size[0x10]; + + u8 shampo_header_split[0x1]; + u8 shampo_header_split_data_merge[0x1]; + u8 reserved_at_22[0x1]; + u8 shampo_log_max_headers_entry_size[0x5]; + u8 reserved_at_28[0x18]; + + u8 reserved_at_40[0x7c0]; +}; + union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap; struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2; -- cgit v1.2.3 From 6ca00ec47b70acb7a06cf5c79f6bec6074cef008 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Thu, 9 Jan 2025 22:42:31 +0200 Subject: net/mlx5: Add nic_cap_reg and vhca_icm_ctrl registers Add nic_cap_reg and vhca_icm_ctrl registers interfaces for exposing ICM consumption. Signed-off-by: Akiva Goldberger Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109204231.1809851-5-tariqt@nvidia.com Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8f6fe29bc4be..b957391529b3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -163,7 +163,9 @@ enum { MLX5_REG_MRTCQ = 0x9182, MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, + MLX5_REG_NIC_CAP = 0xC00D, MLX5_REG_DTOR = 0xC00E, + MLX5_REG_VHCA_ICM_CTRL = 0xC010, }; enum mlx5_qpts_trust_state { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d7c91f152735..2a40b1fd50e8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1830,7 +1830,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 regexp_params[0x1]; u8 uar_sz[0x6]; u8 port_selection_cap[0x1]; - u8 reserved_at_251[0x1]; + u8 nic_cap_reg[0x1]; u8 umem_uid_0[0x1]; u8 reserved_at_253[0x5]; u8 log_pg_sz[0x8]; @@ -3327,6 +3327,14 @@ struct mlx5_ifc_dropped_packet_logged_bits { u8 reserved_at_0[0xe0]; }; +struct mlx5_ifc_nic_cap_reg_bits { + u8 reserved_at_0[0x1a]; + u8 vhca_icm_ctrl[0x1]; + u8 reserved_at_1b[0x5]; + + u8 reserved_at_20[0x60]; +}; + struct mlx5_ifc_default_timeout_bits { u8 to_multiplier[0x3]; u8 reserved_at_3[0x9]; @@ -3363,6 +3371,18 @@ struct mlx5_ifc_dtor_reg_bits { u8 reserved_at_1c0[0x20]; }; +struct mlx5_ifc_vhca_icm_ctrl_reg_bits { + u8 vhca_id_valid[0x1]; + u8 reserved_at_1[0xf]; + u8 vhca_id[0x10]; + + u8 reserved_at_20[0xa0]; + + u8 cur_alloc_icm[0x20]; + + u8 reserved_at_e0[0x120]; +}; + enum { MLX5_CQ_ERROR_SYNDROME_CQ_OVERRUN = 0x1, MLX5_CQ_ERROR_SYNDROME_CQ_ACCESS_VIOLATION_ERROR = 0x2, -- cgit v1.2.3 From bdf46443f350dd5d226fd528a5a5954ff762f591 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 Jan 2025 16:59:34 +0100 Subject: ALSA: rawmidi: Expose the tied device number in info ioctl The UMP legacy rawmidi is derived from the UMP rawmidi, but currently there is no way to know which device is involved in other side. This patch extends the rawmidi info ioctl to show the tied device number. As default it stores -1, indicating that no tied device. 
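From user space, the new field can be read back through the rawmidi info ioctl; a minimal hedged sketch (device path and open mode are assumptions, error handling trimmed):

    /* Hypothetical user-space check of the tied device number. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sound/asound.h>

    int main(void)
    {
            struct snd_rawmidi_info info = { 0 };
            int fd = open("/dev/snd/midiC0D0", O_RDONLY);   /* assumed node */

            if (fd < 0 || ioctl(fd, SNDRV_RAWMIDI_IOCTL_INFO, &info) < 0)
                    return 1;
            if (info.tied_device == SNDRV_RAWMIDI_DEVICE_UNKNOWN)
                    printf("no tied device\n");
            else
                    printf("tied to rawmidi device %d\n", info.tied_device);
            return 0;
    }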
Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250110155943.31578-2-tiwai@suse.de --- include/sound/rawmidi.h | 1 + include/uapi/sound/asound.h | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h index f31cabf0158c..7f1fec786b72 100644 --- a/include/sound/rawmidi.h +++ b/include/sound/rawmidi.h @@ -118,6 +118,7 @@ struct snd_rawmidi { struct list_head list; unsigned int device; /* device number */ unsigned int info_flags; /* SNDRV_RAWMIDI_INFO_XXXX */ + unsigned int tied_device; char id[64]; char name[80]; diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 4cd513215bcd..1fcff031b5e3 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -729,6 +729,8 @@ enum { #define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 #define SNDRV_RAWMIDI_INFO_UMP 0x00000008 +#define SNDRV_RAWMIDI_DEVICE_UNKNOWN -1 + struct snd_rawmidi_info { unsigned int device; /* RO/WR (control): device number */ unsigned int subdevice; /* RO/WR (control): subdevice number */ @@ -740,7 +742,8 @@ struct snd_rawmidi_info { unsigned char subname[32]; /* name of active or selected subdevice */ unsigned int subdevices_count; unsigned int subdevices_avail; - unsigned char reserved[64]; /* reserved for future use */ + int tied_device; /* R: tied rawmidi device (UMP/legacy) */ + unsigned char reserved[60]; /* reserved for future use */ }; #define SNDRV_RAWMIDI_MODE_FRAMING_MASK (7<<0) -- cgit v1.2.3 From b8fefed73a952a33521ad416fb39683abd87454b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 Jan 2025 16:59:35 +0100 Subject: ALSA: rawmidi: Show substream activity in info ioctl The UMP legacy rawmidi may turn on/off the substream dynamically depending on the UMP Function Block information. So far, there was no direct way to know whether the substream is disabled (inactive) or not; at most one can take a look at the substream name string or try to open and get -ENODEV. This patch extends the rawmidi info ioctl to show the current inactive state of the given substream. When the selected substream is inactive, info flags field contains the new bit flag SNDRV_RAWMIDI_INFO_STREAM_INACTIVE. 
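Continuing the snd_rawmidi_info sketch above, checking the new flag after the same SNDRV_RAWMIDI_IOCTL_INFO call would look roughly like this (illustrative fragment only):

    if (info.flags & SNDRV_RAWMIDI_INFO_STREAM_INACTIVE)
            printf("selected substream is currently inactive\n");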
Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250110155943.31578-3-tiwai@suse.de --- include/sound/rawmidi.h | 1 + include/uapi/sound/asound.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h index 7f1fec786b72..6f2e95298fc7 100644 --- a/include/sound/rawmidi.h +++ b/include/sound/rawmidi.h @@ -89,6 +89,7 @@ struct snd_rawmidi_substream { unsigned int framing; /* whether to frame input data */ unsigned int clock_type; /* clock source to use for input framing */ int use_count; /* use counter (for output) */ + bool inactive; /* inactive substream (for UMP legacy) */ size_t bytes; spinlock_t lock; struct snd_rawmidi *rmidi; diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 1fcff031b5e3..9f3b32602ac1 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -728,6 +728,7 @@ enum { #define SNDRV_RAWMIDI_INFO_INPUT 0x00000002 #define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 #define SNDRV_RAWMIDI_INFO_UMP 0x00000008 +#define SNDRV_RAWMIDI_INFO_STREAM_INACTIVE 0x00000010 #define SNDRV_RAWMIDI_DEVICE_UNKNOWN -1 -- cgit v1.2.3 From 7bb49d2e8b52adace88249d9dece41e36af3bba0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 Jan 2025 16:59:36 +0100 Subject: ALSA: rawmidi: Bump protocol version to 2.0.5 Bump the protocol version to 2.0.5, as we extended the rawmidi ABI for the new tied_device info and the substream inactive flag. Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250110155943.31578-4-tiwai@suse.de --- include/uapi/sound/asound.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 9f3b32602ac1..000b36c0e2b9 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -716,7 +716,7 @@ enum { * Raw MIDI section - /dev/snd/midi?? */ -#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 4) +#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 5) enum { SNDRV_RAWMIDI_STREAM_OUTPUT = 0, -- cgit v1.2.3 From 3ab4a3199c782eae9b683b952f3ec5a766e4a096 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 Jan 2025 16:59:41 +0100 Subject: ALSA: seq: Notify UMP EP and FB changes So far we notify the sequencer client and port changes upon UMP FB changes, but those aren't really corresponding to the UMP updates. e.g. when a FB info gets updated, it's not notified but done only when some of sequencer port attribute is changed. This is no ideal behavior. This patch adds the two new sequencer event types for notifying the UMP EP and FB changes via the announce port. The new event takes snd_seq_ev_ump_notify type data, which is compatible with snd_seq_addr (where the port number is replaced with the block number). The events are sent when the EP and FB info gets updated explicitly via ioctl, or the backend UMP receives the corresponding UMP messages. The sequencer protocol version is bumped to 1.0.5 along with it. 
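On the receiving side, a client could handle the new announce events roughly as follows (a hedged sketch using the raw UAPI types; real applications typically go through alsa-lib, which is assumed to grow matching definitions):

    /* Hypothetical handler for events delivered from the announce port. */
    static void handle_announce(const struct snd_seq_event *ev)
    {
            switch (ev->type) {
            case SNDRV_SEQ_EVENT_UMP_EP_CHANGE:
                    /* EP info of client ev->data.ump_notify.client changed */
                    break;
            case SNDRV_SEQ_EVENT_UMP_BLOCK_CHANGE:
                    /* block ev->data.ump_notify.block of that client changed */
                    break;
            }
    }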
Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250110155943.31578-9-tiwai@suse.de --- include/sound/ump.h | 1 + include/uapi/sound/asequencer.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/ump.h b/include/sound/ump.h index 532c2c3ea28e..73f97f88e2ed 100644 --- a/include/sound/ump.h +++ b/include/sound/ump.h @@ -83,6 +83,7 @@ struct snd_ump_ops { struct snd_seq_ump_ops { void (*input_receive)(struct snd_ump_endpoint *ump, const u32 *data, int words); + int (*notify_ep_change)(struct snd_ump_endpoint *ump); int (*notify_fb_change)(struct snd_ump_endpoint *ump, struct snd_ump_block *fb); int (*switch_protocol)(struct snd_ump_endpoint *ump); diff --git a/include/uapi/sound/asequencer.h b/include/uapi/sound/asequencer.h index bc30c1f2a109..a5c41f771e05 100644 --- a/include/uapi/sound/asequencer.h +++ b/include/uapi/sound/asequencer.h @@ -10,7 +10,7 @@ #include /** version of the sequencer */ -#define SNDRV_SEQ_VERSION SNDRV_PROTOCOL_VERSION(1, 0, 4) +#define SNDRV_SEQ_VERSION SNDRV_PROTOCOL_VERSION(1, 0, 5) /** * definition of sequencer event types @@ -92,6 +92,9 @@ #define SNDRV_SEQ_EVENT_PORT_SUBSCRIBED 66 /* ports connected */ #define SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED 67 /* ports disconnected */ +#define SNDRV_SEQ_EVENT_UMP_EP_CHANGE 68 /* UMP EP info has changed */ +#define SNDRV_SEQ_EVENT_UMP_BLOCK_CHANGE 69 /* UMP block info has changed */ + /* 70-89: synthesizer events - obsoleted */ /** user-defined events with fixed length @@ -253,6 +256,12 @@ struct snd_seq_ev_quote { struct snd_seq_event *event; /* quoted event */ } __packed; + /* UMP info change notify */ +struct snd_seq_ev_ump_notify { + unsigned char client; /**< Client number */ + unsigned char block; /**< Block number (optional) */ +}; + union snd_seq_event_data { /* event data... */ struct snd_seq_ev_note note; struct snd_seq_ev_ctrl control; @@ -265,6 +274,7 @@ union snd_seq_event_data { /* event data... */ struct snd_seq_connect connect; struct snd_seq_result result; struct snd_seq_ev_quote quote; + struct snd_seq_ev_ump_notify ump_notify; }; /* sequencer event */ -- cgit v1.2.3 From 0d2c022ffa7ccbd5bed5b18eefbda7ef2df4f81f Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 19 Dec 2024 23:03:37 +0100 Subject: i3c: fix kdoc parameter description for module_i3c_i2c_driver() A typo mentioned I3C when it should have been I2C. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20241219220338.10315-1-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 0a8a44ac2f02..b674f64d0822 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -283,7 +283,7 @@ static inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, * module_i3c_i2c_driver() - Register a module providing an I3C and an I2C * driver * @__i3cdrv: the I3C driver to register - * @__i2cdrv: the I3C driver to register + * @__i2cdrv: the I2C driver to register * * Provide generic init/exit functions that simply register/unregister an I3C * and an I2C driver. 
-- cgit v1.2.3 From c320592f3f2a1e6a4e69e5db8e76fc66934a0a78 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 7 Jan 2025 10:01:59 +0100 Subject: bitops: add generic parity calculation for u8 There are multiple open coded implementations for getting the parity of a byte in the kernel, even using different approaches. Take the pretty efficient version from SPD5118 driver and make it generally available by putting it into the bitops header. As long as there is just one parity calculation helper, the creation of a distinct 'parity.h' header was discarded. Also, the usage of hweight8() for architectures having a popcnt instruction is postponed until a use case within hot paths is desired. The motivation for this patch is the frequent use of odd parity in the I3C specification and to simplify drivers there. Changes compared to the original SPD5118 version are the addition of kernel documentation, switching the return type from bool to int, and renaming the argument of the function. Signed-off-by: Wolfram Sang Tested-by: Guenter Roeck Reviewed-by: Geert Uytterhoeven Acked-by: Yury Norov Reviewed-by: Kuan-Wei Chiu Tested-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20250107090204.6593-2-wsa+renesas@sang-engineering.com Signed-off-by: Alexandre Belloni --- include/linux/bitops.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index ba35bbf07798..c1cb53cf2f0f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -229,6 +229,37 @@ static inline int get_count_order_long(unsigned long l) return (int)fls_long(--l); } +/** + * parity8 - get the parity of an u8 value + * @value: the value to be examined + * + * Determine the parity of the u8 argument. + * + * Returns: + * 0 for even parity, 1 for odd parity + * + * Note: This function informs you about the current parity. Example to bail + * out when parity is odd: + * + * if (parity8(val) == 1) + * return -EBADMSG; + * + * If you need to calculate a parity bit, you need to draw the conclusion from + * this result yourself. Example to enforce odd parity, parity bit is bit 7: + * + * if (parity8(val) == 0) + * val ^= BIT(7); + */ +static inline int parity8(u8 val) +{ + /* + * One explanation of this algorithm: + * https://funloop.org/codex/problem/parity/README.html + */ + val ^= val >> 4; + return (0x6996 >> (val & 0xf)) & 1; +} + /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word -- cgit v1.2.3 From 9ab96b524dce598c041388a599e3a227c7a7926c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 6 Jan 2025 11:31:17 +0800 Subject: hugetlb: fix NULL pointer dereference in trace_hugetlbfs_alloc_inode hugetlb_file_setup() will pass a NULL @dir to hugetlbfs_get_inode(), so we will access a NULL pointer for @dir. Fix it and set __entry->dr to 0 if @dir is NULL. Because ->i_ino cannot be 0 (see get_next_ino()), there is no confusing if user sees a 0 inode number. 
Link: https://lkml.kernel.org/r/20250106033118.4640-1-songmuchun@bytedance.com Fixes: 318580ad7f28 ("hugetlbfs: support tracepoint") Signed-off-by: Muchun Song Reported-by: Cheung Wall Closes: https://lore.kernel.org/linux-mm/02858D60-43C1-4863-A84F-3C76A8AF1F15@linux.dev/T/# Reviewed-by: Hongbo Li Cc: cheung wall Cc: Christian Brauner Cc: Signed-off-by: Andrew Morton --- include/trace/events/hugetlbfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h index 8331c904a9ba..59605dfaeeb4 100644 --- a/include/trace/events/hugetlbfs.h +++ b/include/trace/events/hugetlbfs.h @@ -23,7 +23,7 @@ TRACE_EVENT(hugetlbfs_alloc_inode, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->dir = dir->i_ino; + __entry->dir = dir ? dir->i_ino : 0; __entry->mode = mode; ), -- cgit v1.2.3 From 4bcf29741145e73440323e3e9af8b1a6f4961183 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 7 Jan 2025 16:34:57 +0100 Subject: module: fix writing of livepatch relocations in ROX text A livepatch module can contain a special relocation section .klp.rela.. to apply its relocations at the appropriate time and to additionally access local and unexported symbols. When points to another module, such relocations are processed separately from the regular module relocation process. For instance, only when the target actually becomes loaded. With CONFIG_STRICT_MODULE_RWX, when the livepatch core decides to apply these relocations, their processing results in the following bug: [ 25.827238] BUG: unable to handle page fault for address: 00000000000012ba [ 25.827819] #PF: supervisor read access in kernel mode [ 25.828153] #PF: error_code(0x0000) - not-present page [ 25.828588] PGD 0 P4D 0 [ 25.829063] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI [ 25.829742] CPU: 2 UID: 0 PID: 452 Comm: insmod Tainted: G O K 6.13.0-rc4-00078-g059dd502b263 #7820 [ 25.830417] Tainted: [O]=OOT_MODULE, [K]=LIVEPATCH [ 25.830768] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-20220807_005459-localhost 04/01/2014 [ 25.831651] RIP: 0010:memcmp+0x24/0x60 [ 25.832190] Code: [...] 
[ 25.833378] RSP: 0018:ffffa40b403a3ae8 EFLAGS: 00000246 [ 25.833637] RAX: 0000000000000000 RBX: ffff93bc81d8e700 RCX: ffffffffc0202000 [ 25.834072] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 00000000000012ba [ 25.834548] RBP: ffffa40b403a3b68 R08: ffffa40b403a3b30 R09: 0000004a00000002 [ 25.835088] R10: ffffffffffffd222 R11: f000000000000000 R12: 0000000000000000 [ 25.835666] R13: ffffffffc02032ba R14: ffffffffc007d1e0 R15: 0000000000000004 [ 25.836139] FS: 00007fecef8c3080(0000) GS:ffff93bc8f900000(0000) knlGS:0000000000000000 [ 25.836519] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 25.836977] CR2: 00000000000012ba CR3: 0000000002f24000 CR4: 00000000000006f0 [ 25.837442] Call Trace: [ 25.838297] [ 25.841083] __write_relocate_add.constprop.0+0xc7/0x2b0 [ 25.841701] apply_relocate_add+0x75/0xa0 [ 25.841973] klp_write_section_relocs+0x10e/0x140 [ 25.842304] klp_write_object_relocs+0x70/0xa0 [ 25.842682] klp_init_object_loaded+0x21/0xf0 [ 25.842972] klp_enable_patch+0x43d/0x900 [ 25.843572] do_one_initcall+0x4c/0x220 [ 25.844186] do_init_module+0x6a/0x260 [ 25.844423] init_module_from_file+0x9c/0xe0 [ 25.844702] idempotent_init_module+0x172/0x270 [ 25.845008] __x64_sys_finit_module+0x69/0xc0 [ 25.845253] do_syscall_64+0x9e/0x1a0 [ 25.845498] entry_SYSCALL_64_after_hwframe+0x77/0x7f [ 25.846056] RIP: 0033:0x7fecef9eb25d [ 25.846444] Code: [...] [ 25.847563] RSP: 002b:00007ffd0c5d6de8 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 25.848082] RAX: ffffffffffffffda RBX: 000055b03f05e470 RCX: 00007fecef9eb25d [ 25.848456] RDX: 0000000000000000 RSI: 000055b001e74e52 RDI: 0000000000000003 [ 25.848969] RBP: 00007ffd0c5d6ea0 R08: 0000000000000040 R09: 0000000000004100 [ 25.849411] R10: 00007fecefac7b20 R11: 0000000000000246 R12: 000055b001e74e52 [ 25.849905] R13: 0000000000000000 R14: 000055b03f05e440 R15: 0000000000000000 [ 25.850336] [ 25.850553] Modules linked in: deku(OK+) uinput [ 25.851408] CR2: 00000000000012ba [ 25.852085] ---[ end trace 0000000000000000 ]--- The problem is that the .klp.rela.. relocations are processed after the module was already formed and mod->rw_copy was reset. However, the code in __write_relocate_add() calls module_writable_address() which translates the target address 'loc' still to 'loc + (mem->rw_copy - mem->base)', with mem->rw_copy now being 0. Fix the problem by returning directly 'loc' in module_writable_address() when the module is already formed. Function __write_relocate_add() knows to use text_poke() in such a case. 
Link: https://lkml.kernel.org/r/20250107153507.14733-1-petr.pavlu@suse.com Fixes: 0c133b1e78cd ("module: prepare to handle ROX allocations for text") Signed-off-by: Petr Pavlu Reported-by: Marek Maslanka Closes: https://lore.kernel.org/linux-modules/CAGcaFA2hdThQV6mjD_1_U+GNHThv84+MQvMWLgEuX+LVbAyDxg@mail.gmail.com/ Reviewed-by: Petr Mladek Tested-by: Petr Mladek Cc: Joe Lawrence Cc: Josh Poimboeuf Cc: Luis Chamberlain Cc: Mike Rapoport (Microsoft) Cc: Petr Mladek Signed-off-by: Andrew Morton --- include/linux/module.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 94acbacdcdf1..b3a643435357 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -773,7 +773,8 @@ void *__module_writable_address(struct module *mod, void *loc); static inline void *module_writable_address(struct module *mod, void *loc) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod) + if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || + mod->state != MODULE_STATE_UNFORMED) return loc; return __module_writable_address(mod, loc); } -- cgit v1.2.3 From 0cef0bb836e3cfe00f08f9606c72abd72fe78ca3 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 7 Jan 2025 14:47:52 +0000 Subject: mm: clear uffd-wp PTE/PMD state on mremap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set. Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths. Link: https://lkml.kernel.org/r/20250107144755.1871363-2-ryan.roberts@arm.com Co-developed-by: Mikołaj Lenczewski Signed-off-by: Mikołaj Lenczewski Signed-off-by: Ryan Roberts Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.com/ Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl") Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Mark Rutland Cc: Muchun Song Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index cb40f1a1d081..75342022d144 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, vma_is_shmem(vma); } +static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +{ + struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; + + return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0; +} + extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); extern void dup_userfaultfd_complete(struct list_head *); void dup_userfaultfd_fail(struct list_head *); @@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) return false; } +static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) -- cgit v1.2.3 From ec01e9d001fef6278df1900df4207c70166095b4 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 30 Nov 2024 02:12:19 +0800 Subject: lib min_heap: improve type safety in min_heap macros by using container_of Patch series "lib min_heap: Improve min_heap safety, testing, and documentation". Improve the min heap implementation by enhancing type safety with container_of, reducing the attack vector by replacing test function calls with inline variants, and adding a brief API introduction in min_heap.h. It also includes author information in Documentation/core-api/min_heap.rst. This patch (of 4): The current implementation of min_heap macros uses explicit casting to min_heap_char *, which prevents the compiler from detecting incorrect pointer types. This can lead to errors if non-min_heap pointers are passed inadvertently. To enhance safety, replace all explicit casts to min_heap_char * with the use of container_of(&(_heap)->nr, min_heap_char, nr). This approach ensures that the _heap parameter is indeed a min_heap_char-compatible structure, allowing the compiler to catch improper usages. Link: https://lkml.kernel.org/r/20241129181222.646855-1-visitorckw@gmail.com Link: https://lore.kernel.org/lkml/CAMuHMdVO5DPuD9HYWBFqKDHphx7+0BEhreUxtVC40A=8p6VAhQ@mail.gmail.com Link: https://lkml.kernel.org/r/20241129181222.646855-2-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Suggested-by: Geert Uytterhoeven Cc: Ching-Chun (Jim) Huang Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 61 +++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index e781727c8916..456cfbc1b8f5 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -218,7 +218,7 @@ void __min_heap_init_inline(min_heap_char *heap, void *data, int size) } #define min_heap_init_inline(_heap, _data, _size) \ - __min_heap_init_inline((min_heap_char *)_heap, _data, _size) + __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) /* Get the minimum element from the heap. 
*/ static __always_inline @@ -228,7 +228,8 @@ void *__min_heap_peek_inline(struct min_heap_char *heap) } #define min_heap_peek_inline(_heap) \ - (__minheap_cast(_heap) __min_heap_peek_inline((min_heap_char *)_heap)) + (__minheap_cast(_heap) \ + __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr))) /* Check if the heap is full. */ static __always_inline @@ -238,7 +239,7 @@ bool __min_heap_full_inline(min_heap_char *heap) } #define min_heap_full_inline(_heap) \ - __min_heap_full_inline((min_heap_char *)_heap) + __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr)) /* Sift the element at pos down the heap. */ static __always_inline @@ -277,8 +278,8 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size, } #define min_heap_sift_down_inline(_heap, _pos, _func, _args) \ - __min_heap_sift_down_inline((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) /* Sift up ith element from the heap, O(log2(nr)). */ static __always_inline @@ -304,8 +305,8 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx } #define min_heap_sift_up_inline(_heap, _idx, _func, _args) \ - __min_heap_sift_up_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ - _func, _args) + __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) /* Floyd's approach to heapification that is O(nr). */ static __always_inline @@ -319,7 +320,8 @@ void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size, } #define min_heapify_all_inline(_heap, _func, _args) \ - __min_heapify_all_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) /* Remove minimum element from the heap, O(log2(nr)). */ static __always_inline @@ -340,7 +342,8 @@ bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size, } #define min_heap_pop_inline(_heap, _func, _args) \ - __min_heap_pop_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) /* * Remove the minimum element and then push the given element. The @@ -356,8 +359,8 @@ void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t } #define min_heap_pop_push_inline(_heap, _element, _func, _args) \ - __min_heap_pop_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) /* Push an element on to the heap, O(log2(nr)). */ static __always_inline @@ -382,8 +385,8 @@ bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t ele } #define min_heap_push_inline(_heap, _element, _func, _args) \ - __min_heap_push_inline((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) /* Remove ith element from the heap, O(log2(nr)). 
*/ static __always_inline @@ -411,8 +414,8 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx, } #define min_heap_del_inline(_heap, _idx, _func, _args) \ - __min_heap_del_inline((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, \ - _func, _args) + __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) void __min_heap_init(min_heap_char *heap, void *data, int size); void *__min_heap_peek(struct min_heap_char *heap); @@ -433,25 +436,31 @@ bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx, const struct min_heap_callbacks *func, void *args); #define min_heap_init(_heap, _data, _size) \ - __min_heap_init((min_heap_char *)_heap, _data, _size) + __min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size) #define min_heap_peek(_heap) \ - (__minheap_cast(_heap) __min_heap_peek((min_heap_char *)_heap)) + (__minheap_cast(_heap) __min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr))) #define min_heap_full(_heap) \ - __min_heap_full((min_heap_char *)_heap) + __min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr)) #define min_heap_sift_down(_heap, _pos, _func, _args) \ - __min_heap_sift_down((min_heap_char *)_heap, _pos, __minheap_obj_size(_heap), _func, _args) + __min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_sift_up(_heap, _idx, _func, _args) \ - __min_heap_sift_up((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + __min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) #define min_heapify_all(_heap, _func, _args) \ - __min_heapify_all((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_pop(_heap, _func, _args) \ - __min_heap_pop((min_heap_char *)_heap, __minheap_obj_size(_heap), _func, _args) + __min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_pop_push(_heap, _element, _func, _args) \ - __min_heap_pop_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), \ - _func, _args) + __min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_push(_heap, _element, _func, _args) \ - __min_heap_push((min_heap_char *)_heap, _element, __minheap_obj_size(_heap), _func, _args) + __min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element, \ + __minheap_obj_size(_heap), _func, _args) #define min_heap_del(_heap, _idx, _func, _args) \ - __min_heap_del((min_heap_char *)_heap, __minheap_obj_size(_heap), _idx, _func, _args) + __min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr), \ + __minheap_obj_size(_heap), _idx, _func, _args) #endif /* _LINUX_MIN_HEAP_H */ -- cgit v1.2.3 From 2ad0546deb0214e385dbf33bb7a5e26c0dda3ad1 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 30 Nov 2024 02:12:21 +0800 Subject: lib min_heap: add brief introduction to Min Heap API A short description of the Min Heap API is added to the min_heap.h, explaining its purpose for managing min-heaps and emphasizing the use of macro wrappers instead of direct function calls. For more details, users are directed to the documentation at Documentation/core-api/min_heap.rst. 
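For orientation, typical use of the wrappers looks roughly like this (an illustrative kernel-side sketch, not taken from an in-tree user; the element type and callbacks are made up for the example):

    DEFINE_MIN_HEAP(int, min_heap_int);

    static bool less_than(const void *lhs, const void *rhs, void *args)
    {
            return *(const int *)lhs < *(const int *)rhs;
    }

    static void swap_ints(void *lhs, void *rhs, void *args)
    {
            int tmp = *(int *)lhs;

            *(int *)lhs = *(int *)rhs;
            *(int *)rhs = tmp;
    }

    static const struct min_heap_callbacks cb = { .less = less_than, .swp = swap_ints };

    static void min_heap_demo(void)
    {
            int storage[8], vals[] = { 5, 2, 9 };
            struct min_heap_int heap;
            int i;

            min_heap_init(&heap, storage, ARRAY_SIZE(storage));
            for (i = 0; i < ARRAY_SIZE(vals); i++)
                    min_heap_push(&heap, &vals[i], &cb, NULL);
            /* *min_heap_peek(&heap) is now 2, the smallest element */
            min_heap_pop(&heap, &cb, NULL);
    }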
Link: https://lkml.kernel.org/r/20241129181222.646855-4-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Geert Uytterhoeven Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/min_heap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 456cfbc1b8f5..55bfe670bbb9 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -6,6 +6,17 @@ #include #include +/* + * The Min Heap API provides utilities for managing min-heaps, a binary tree + * structure where each node's value is less than or equal to its children's + * values, ensuring the smallest element is at the root. + * + * Users should avoid directly calling functions prefixed with __min_heap_*(). + * Instead, use the provided macro wrappers. + * + * For further details and examples, refer to Documentation/core-api/min_heap.rst. + */ + /** * Data structure to hold a min-heap. * @nr: Number of elements currently in the heap. -- cgit v1.2.3 From 658eb5ab916ddc92f294dbce8e3d449470be9f86 Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Tue, 3 Dec 2024 16:48:48 +0800 Subject: delayacct: add delay max to record delay peak Introduce the use cases of delay max, which can help quickly detect potential abnormal delays in the system and record the types and specific details of delay spikes. Problem ======== Delay accounting can track the average delay of processes to show system workload. However, when a process experiences a significant delay, maybe a delay spike, which adversely affects performance, getdelays can only display the average system delay over a period of time. Yet, average delay is unhelpful for diagnosing delay peak. It is not even possible to determine which type of delay has spiked, as this information might be masked by the average delay. Solution ========= the 'delay max' can display delay peak since the system's startup, which can record potential abnormal delays over time, including the type of delay and the maximum delay. This is helpful for quickly identifying crash caused by delay. 
Use case ========= bash# ./getdelays -d -p 244 print delayacct stats ON PID 244 CPU count real total virtual total delay total delay average delay max 68 192000000 213676651 705643 0.010ms 0.306381ms IO count delay total delay average delay max 0 0 0.000ms 0.000000ms SWAP count delay total delay average delay max 0 0 0.000ms 0.000000ms RECLAIM count delay total delay average delay max 0 0 0.000ms 0.000000ms THRASHING count delay total delay average delay max 0 0 0.000ms 0.000000ms COMPACT count delay total delay average delay max 0 0 0.000ms 0.000000ms WPCOPY count delay total delay average delay max 235 15648284 0.067ms 0.263842ms IRQ count delay total delay average delay max 0 0 0.000ms 0.000000ms [wang.yaxin@zte.com.cn: update docs and fix some spelling errors] Link: https://lkml.kernel.org/r/20241213192700771XKZ8H30OtHSeziGqRVMs0@zte.com.cn Link: https://lkml.kernel.org/r/20241203164848805CS62CQPQWG9GLdQj2_BxS@zte.com.cn Co-developed-by: Wang Yong Signed-off-by: Wang Yong Co-developed-by: xu xin Signed-off-by: xu xin Co-developed-by: Wang Yaxin Signed-off-by: Wang Yaxin Signed-off-by: Kun Jiang Cc: Balbir Singh Cc: David Hildenbrand Cc: Fan Yu Cc: Peilin He Cc: tuqiang Cc: Yang Yang Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 7 +++++++ include/linux/sched.h | 3 +++ include/uapi/linux/taskstats.h | 9 +++++++++ 3 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6639f48dac36..56fbfa2c2ac5 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -29,25 +29,32 @@ struct task_delay_info { * XXX_delay contains the accumulated delay time in nanoseconds. */ u64 blkio_start; + u64 blkio_delay_max; u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_start; + u64 swapin_delay_max; u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ u32 swapin_count; /* total count of swapin */ u64 freepages_start; + u64 freepages_delay_max; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; + u64 thrashing_delay_max; u64 thrashing_delay; /* wait for thrashing page */ u64 compact_start; + u64 compact_delay_max; u64 compact_delay; /* wait for memory compact */ u64 wpcopy_start; + u64 wpcopy_delay_max; u64 wpcopy_delay; /* wait for write-protect copy */ + u64 irq_delay_max; u64 irq_delay; /* wait for IRQ/SOFTIRQ */ u32 freepages_count; /* total count of memory reclaim */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 64934e0830af..a0ae3923b41d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -398,6 +398,9 @@ struct sched_info { /* Time spent waiting on a runqueue: */ unsigned long long run_delay; + /* Max time spent waiting on a runqueue: */ + unsigned long long max_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? 
*/ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index b50b2eb257a0..e0d1c6fc9f3b 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -72,6 +72,7 @@ struct taskstats { */ __u64 cpu_count __attribute__((aligned(8))); __u64 cpu_delay_total; + __u64 cpu_delay_max; /* Following four fields atomically updated using task->delays->lock */ @@ -80,10 +81,12 @@ struct taskstats { */ __u64 blkio_count; __u64 blkio_delay_total; + __u64 blkio_delay_max; /* Delay waiting for page fault I/O (swap in only) */ __u64 swapin_count; __u64 swapin_delay_total; + __u64 swapin_delay_max; /* cpu "wall-clock" running time * On some architectures, value will adjust for cpu time stolen @@ -166,10 +169,12 @@ struct taskstats { /* Delay waiting for memory reclaim */ __u64 freepages_count; __u64 freepages_delay_total; + __u64 freepages_delay_max; /* Delay waiting for thrashing page */ __u64 thrashing_count; __u64 thrashing_delay_total; + __u64 thrashing_delay_max; /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ @@ -177,6 +182,7 @@ struct taskstats { /* v11: Delay waiting for memory compact */ __u64 compact_count; __u64 compact_delay_total; + __u64 compact_delay_max; /* v12 begin */ __u32 ac_tgid; /* thread group ID */ @@ -198,10 +204,13 @@ struct taskstats { /* v13: Delay waiting for write-protect copy */ __u64 wpcopy_count; __u64 wpcopy_delay_total; + __u64 wpcopy_delay_max; /* v14: Delay waiting for IRQ/SOFTIRQ */ __u64 irq_count; __u64 irq_delay_total; + __u64 irq_delay_max; + /* v15: add Delay max */ }; -- cgit v1.2.3 From 7a77edf45a05615775d1e423a7309b9f06e866ec Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 5 Dec 2024 14:20:44 +0100 Subject: include: update references to include/asm- "include/asm-" was replaced by "arch//include/asm" a long time ago. Link: https://lkml.kernel.org/r/541258219b0441fa1da890e2f8458a7ac18c2ef9.1733404444.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Cc: Andy Whitcroft Cc: Arnd Bergmann Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Oleg Nesterov Cc: Rasmus Villemoes Cc: Yury Norov Signed-off-by: Andrew Morton --- include/asm-generic/syscall.h | 2 +- include/linux/bitmap.h | 2 +- include/linux/types.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 5a80fe728dc8..182b039ce5fa 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc. All rights reserved. * * This file is a stub providing documentation for what functions - * asm-ARCH/syscall.h files need to define. Most arch definitions + * arch/ARCH/include/asm/syscall.h files need to define. Most arch definitions * will be simple inlines. * * All of these functions expect to be called with no locks, diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 262b6596eca5..2026953e2c4e 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -23,7 +23,7 @@ struct device; * * Function implementations generic to all architectures are in * lib/bitmap.c. Functions implementations that are architecture - * specific are in various include/asm-/bitops.h headers + * specific are in various arch//include/asm/bitops.h headers * and other arch/ specific files. * * See lib/bitmap.c for more details. 
diff --git a/include/linux/types.h b/include/linux/types.h index 2d7b9ae8714c..1c509ce8f7f6 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -43,7 +43,7 @@ typedef unsigned long uintptr_t; typedef long intptr_t; #ifdef CONFIG_HAVE_UID16 -/* This is defined by include/asm-{arch}/posix_types.h */ +/* This is defined by arch/{arch}/include/asm/posix_types.h */ typedef __kernel_old_uid_t old_uid_t; typedef __kernel_old_gid_t old_gid_t; #endif /* CONFIG_UID16 */ -- cgit v1.2.3 From 78346c34d20f571d6495aa50c735653c730b94ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Karol=20Pi=C4=85tkowski?= Date: Fri, 20 Dec 2024 18:12:12 +0000 Subject: kasan: fix typo in kasan_poison_new_object documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix presumed copy-paste typo of kasan_poison_new_object documentation referring to kasan_unpoison_new_object. No functional changes. Link: https://lkml.kernel.org/r/20241220181205.9663-1-dominik.karol.piatkowski@protonmail.com Fixes: 1ce9a0523938 ("kasan: rename and document kasan_(un)poison_object_data") ta") Signed-off-by: Dominik Karol Piątkowski Reviewed-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..56465af31044 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -153,7 +153,7 @@ static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void __kasan_poison_new_object(struct kmem_cache *cache, void *object); /** - * kasan_unpoison_new_object - Repoison a new slab object. + * kasan_poison_new_object - Repoison a new slab object. * @cache: Cache the object belong to. * @object: Pointer to the object. * -- cgit v1.2.3 From f65c64f311ee2f1ddc1eb395ed8b20e6b9d14e85 Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Fri, 20 Dec 2024 17:31:05 +0800 Subject: delayacct: add delay min to record delay peak Delay accounting can now calculate the average delay of processes, detect the overall system load, and also record the 'delay max' to identify potential abnormal delays. However, 'delay min' can help us identify another useful delay peak. By comparing the difference between 'delay max' and 'delay min', we can understand the optimization space for latency, providing a reference for the optimization of latency performance. 
Use case ========= bash-4.4# ./getdelays -d -t 242 print delayacct stats ON TGID 242 CPU count real total virtual total delay total delay average delay max delay min 39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms IO count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms SWAP count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms RECLAIM count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms THRASHING count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms COMPACT count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms WPCOPY count delay total delay average delay max delay min 156 11215873 0.072ms 0.207403ms 0.033913ms IRQ count delay total delay average delay max delay min 0 0 0.000ms 0.000000ms 0.000000ms Link: https://lkml.kernel.org/r/20241220173105906EOdsPhzjMLYNJJBqgz1ga@zte.com.cn Co-developed-by: Wang Yong Signed-off-by: Wang Yong Co-developed-by: xu xin Signed-off-by: xu xin Signed-off-by: Wang Yaxin Co-developed-by: Kun Jiang Signed-off-by: Kun Jiang Cc: Balbir Singh Cc: David Hildenbrand Cc: Fan Yu Cc: Peilin He Cc: tuqiang Cc: ye xingchen Cc: Yunkai Zhang Signed-off-by: Andrew Morton --- include/linux/delayacct.h | 7 +++++++ include/linux/sched.h | 3 +++ include/uapi/linux/taskstats.h | 8 ++++++++ 3 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 56fbfa2c2ac5..800dcc360db2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -30,9 +30,11 @@ struct task_delay_info { */ u64 blkio_start; u64 blkio_delay_max; + u64 blkio_delay_min; u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_start; u64 swapin_delay_max; + u64 swapin_delay_min; u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ @@ -40,21 +42,26 @@ struct task_delay_info { u64 freepages_start; u64 freepages_delay_max; + u64 freepages_delay_min; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; u64 thrashing_delay_max; + u64 thrashing_delay_min; u64 thrashing_delay; /* wait for thrashing page */ u64 compact_start; u64 compact_delay_max; + u64 compact_delay_min; u64 compact_delay; /* wait for memory compact */ u64 wpcopy_start; u64 wpcopy_delay_max; + u64 wpcopy_delay_min; u64 wpcopy_delay; /* wait for write-protect copy */ u64 irq_delay_max; + u64 irq_delay_min; u64 irq_delay; /* wait for IRQ/SOFTIRQ */ u32 freepages_count; /* total count of memory reclaim */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a0ae3923b41d..155012467b21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -401,6 +401,9 @@ struct sched_info { /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; + /* Min time spent waiting on a runqueue: */ + unsigned long long min_run_delay; + /* Timestamps: */ /* When did we last run on a CPU? 
*/ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index e0d1c6fc9f3b..934e20ef7f79 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -73,6 +73,7 @@ struct taskstats { __u64 cpu_count __attribute__((aligned(8))); __u64 cpu_delay_total; __u64 cpu_delay_max; + __u64 cpu_delay_min; /* Following four fields atomically updated using task->delays->lock */ @@ -82,11 +83,13 @@ struct taskstats { __u64 blkio_count; __u64 blkio_delay_total; __u64 blkio_delay_max; + __u64 blkio_delay_min; /* Delay waiting for page fault I/O (swap in only) */ __u64 swapin_count; __u64 swapin_delay_total; __u64 swapin_delay_max; + __u64 swapin_delay_min; /* cpu "wall-clock" running time * On some architectures, value will adjust for cpu time stolen @@ -170,11 +173,13 @@ struct taskstats { __u64 freepages_count; __u64 freepages_delay_total; __u64 freepages_delay_max; + __u64 freepages_delay_min; /* Delay waiting for thrashing page */ __u64 thrashing_count; __u64 thrashing_delay_total; __u64 thrashing_delay_max; + __u64 thrashing_delay_min; /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ @@ -183,6 +188,7 @@ struct taskstats { __u64 compact_count; __u64 compact_delay_total; __u64 compact_delay_max; + __u64 compact_delay_min; /* v12 begin */ __u32 ac_tgid; /* thread group ID */ @@ -205,11 +211,13 @@ struct taskstats { __u64 wpcopy_count; __u64 wpcopy_delay_total; __u64 wpcopy_delay_max; + __u64 wpcopy_delay_min; /* v14: Delay waiting for IRQ/SOFTIRQ */ __u64 irq_count; __u64 irq_delay_total; __u64 irq_delay_max; + __u64 irq_delay_min; /* v15: add Delay max */ }; -- cgit v1.2.3 From 26a6cc10f19a058c24cbe3be2a4a10048e66d9c9 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 12 Jan 2025 18:33:52 +0800 Subject: usb: phy: Remove API devm_usb_put_phy() Static devm_usb_phy_match() is only called by API devm_usb_put_phy(), and the API has no caller now. Remove the API and the static function. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250112-remove_api-v1-1-49cc8f792ac9@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/phy.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index e4de6bc1f69b..0fa9885a1038 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -223,7 +223,6 @@ extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); -extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); @@ -259,10 +258,6 @@ static inline void usb_put_phy(struct usb_phy *x) { } -static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) -{ -} - static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } -- cgit v1.2.3 From 687a7c8a722743526b4fe45fc75c6999990480ab Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Tue, 24 Dec 2024 19:27:30 +0200 Subject: wifi: mac80211: change disassoc sequence a bit Currently, the sequence goes like this (among others): 1. flush all stations (including the AP ones) -> this will tell the drivers to remove the stations 2. notify the driver the vif is not associated. 
Which means that in between 1 and 2, the state is that the vif is associated, but there is no AP station, which makes no sense, and may be problematic for some drivers (for example iwlwifi) Change the sequence to: 1. flush the TDLS stations 2. move the AP station to IEEE80211_STA_NONE 3. notify the driver about the vif being unassociated 4. flush the AP station In order to not break other drivers, add a vif flag to indicate whether the driver wants to new sequence or not. If the flag is not set, then things will be done in the old sequence. Signed-off-by: Miri Korenblit Reviewed-by: Johannes Berg Link: https://patch.msgid.link/20241224192322.996ad1be6cb3.I7815d33415aa1d65c0120b54be7a15a45388f807@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9320d4bc22ee..769c1fe30e34 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1855,6 +1855,9 @@ struct ieee80211_channel_switch { * operation on this interface and request a channel context without * the AP definition. Use this e.g. because the device is able to * handle OFDMA (downlink and trigger for uplink) on a per-AP basis. + * @IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC: indicates that the AP sta should + * be removed only after setting the vif as unassociated, and not the + * opposite. Only relevant for STA vifs. */ enum ieee80211_vif_flags { IEEE80211_VIF_BEACON_FILTER = BIT(0), @@ -1863,6 +1866,7 @@ enum ieee80211_vif_flags { IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), IEEE80211_VIF_EML_ACTIVE = BIT(4), IEEE80211_VIF_IGNORE_OFDMA_WIDER_BW = BIT(5), + IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC = BIT(6), }; -- cgit v1.2.3 From 453a73c3069a268c3c4dd00695fc2a95f7880438 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 14 Nov 2024 17:04:29 +0900 Subject: btrfs: zoned: reclaim unused zone by zone resetting On the zoned mode, once used and freed region is still not reusable after the freeing. The underlying zone needs to be reset before reusing. Btrfs resets a zone when it removes a block group, and then new block group is allocated on the zones to reuse the zones. But, it is sometime too late to catch up with a write side. This commit introduces a new space-info reclaim method ZONE_RESET. That will pick a block group from the unused list and reset its zone to reuse the zone_unusable space. It is faster than removing the block group and re-creating a new block group on the same zones. For the first implementation, the ZONE_RESET is only applied to a block group whose region is fully zone_unusable. Reclaiming partial zone_unusable block group could be implemented later. Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 4df93ca9b7a8..549ab3b41961 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -100,7 +100,8 @@ struct find_free_extent_ctl; EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ - EMe(COMMIT_TRANS, "COMMIT_TRANS") + EM( COMMIT_TRANS, "COMMIT_TRANS") \ + EMe(RESET_ZONES, "RESET_ZONES") /* * First define the enums in the above macros to be exported to userspace via -- cgit v1.2.3 From 90dde9a13c0020ce140bc8d27c1f4c48a070cc97 Mon Sep 17 00:00:00 2001 From: "Roger L. 
Beckermeyer III" Date: Wed, 18 Dec 2024 08:28:50 +1030 Subject: rbtree: add rb_find_add_cached() to rbtree.h Adds rb_find_add_cached() as a helper function for use with red-black trees. Used in btrfs to reduce boilerplate code. And since it's a new helper, the cmp() function will require both parameter to be const rb_node pointers. Suggested-by: Josef Bacik Signed-off-by: Roger L. Beckermeyer III Acked-by: Peter Zijlstra (Intel) Reviewed-by: Qu Wenruo Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/rbtree.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 7c173aa64e1e..8d2ba3749866 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -210,6 +210,43 @@ rb_add(struct rb_node *node, struct rb_root *tree, rb_insert_color(node, tree); } +/** + * rb_find_add_cached() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, + int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) +{ + bool leftmost = true; + struct rb_node **link = &tree->rb_root.rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) { + link = &parent->rb_left; + } else if (c > 0) { + link = &parent->rb_right; + leftmost = false; + } else { + return parent; + } + } + + rb_link_node(node, parent, link); + rb_insert_color_cached(node, tree, leftmost); + return NULL; +} + /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert -- cgit v1.2.3 From 24410f499e808884cc91239dc16013e5bee8779a Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 6 Jan 2025 05:49:45 +0000 Subject: ASoC: soc-core: Enable to use extra format on each DAI Current ASoC is using dai_link->dai_fmt to set DAI format for both CPU/Codec. But because it is using same settings, and SND_SOC_DAIFMT_CLOCK_PROVIDER is flipped for CPU, we can't set both CPU/Codec as clock consumer, for example. To solve this issue, this patch enable to use extra format for each DAI which can keep compatibility with legacy system, 1. SND_SOC_DAIFMT_FORMAT_MASK 2. SND_SOC_DAIFMT_CLOCK 3. SND_SOC_DAIFMT_INV 4. 
SND_SOC_DAIFMT_CLOCK_PROVIDER Legacy dai_fmt includes 1, 2, 3, 4 New idea dai_fmt includes 1, 2, 3 ext_fmt includes 4 Signed-off-by: Kuninori Morimoto Tested-by: Stephen Gordon Link: https://patch.msgid.link/87r05go5ja.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 1e09ff084247..fcdb5adfcd5e 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -681,6 +681,17 @@ struct snd_soc_dai_link_component { struct device_node *of_node; const char *dai_name; const struct of_phandle_args *dai_args; + + /* + * Extra format = SND_SOC_DAIFMT_Bx_Fx + * + * [Note] it is Bx_Fx base, not CBx_CFx + * + * It will be used with dai_link->dai_fmt + * see + * snd_soc_runtime_set_dai_fmt() + */ + unsigned int ext_fmt; }; /* -- cgit v1.2.3 From da7f40c05c16ea35afef41f64a22689b2d974b14 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Jan 2025 07:05:21 +0200 Subject: wifi: mac80211: add some support for RX OMI power saving In order to save power, it can be desirable to change the RX operating mode using OMI to reduce the bandwidth. As the handshake must be done in the HTC+ field, it cannot be done by mac80211 directly, so expose functions to the driver to request and finalize the necessary updates. Note that RX OMI really only changes what the peer (AP) will transmit to us, but in order to use it to actually save some power (by reducing the listen bandwidth) we also update rate scaling and then the channel context's mindef accordingly. The updates are split into two in order to sequence them correctly, when reducing bandwidth first reduce the rate scaling and thus TX, then send OMI, then reduce the listen bandwidth (chandef); when increasing bandwidth this is the other way around. This also requires tracking in different variables which part is applicable already. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250101070249.2c1a1934bd73.I4e90fd503504e37f9eac5bdae62e3f07e7071275@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 769c1fe30e34..d1540a605da5 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2340,6 +2340,8 @@ enum ieee80211_sta_rx_bandwidth { IEEE80211_STA_RX_BW_320, }; +#define IEEE80211_STA_RX_BW_MAX IEEE80211_STA_RX_BW_320 + /** * struct ieee80211_sta_rates - station rate selection table * @@ -7738,6 +7740,50 @@ ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) } } +/** + * ieee80211_prepare_rx_omi_bw - prepare for sending BW RX OMI + * @link_sta: the link STA the OMI is going to be sent to + * @bw: the bandwidth requested + * + * When the driver decides to do RX OMI to change bandwidth with a STA + * it calls this function to prepare, then sends the OMI, and finally + * calls ieee80211_finalize_rx_omi_bw(). + * + * Note that the (link) STA rate control is updated accordingly as well, + * but the chanctx might not be updated if there are other users. + * If the intention is to reduce the listen bandwidth, the driver must + * ensure there are no TDLS stations nor other uses of the chanctx. + * + * Also note that in order to sequence correctly, narrowing bandwidth + * will only happen in ieee80211_finalize_rx_omi_bw(), whereas widening + * again (e.g. 
going back to normal) will happen here. + * + * Note that we treat this symmetrically, so if the driver calls this + * and tells the peer to only send with a lower bandwidth, we assume + * that the driver also wants to only send at that lower bandwidth, to + * allow narrowing of the chanctx request for this station/interface. + * + * Finally, the driver must ensure that if the function returned %true, + * ieee80211_finalize_rx_omi_bw() is also called, even for example in + * case of HW restart. + * + * Context: Must be called with wiphy mutex held, and will call back + * into the driver, so ensure no driver locks are held. + * + * Return: %true if changes are going to be made, %false otherwise + */ +bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *link_sta, + enum ieee80211_sta_rx_bandwidth bw); + +/** + * ieee80211_finalize_rx_omi_bw - finalize BW RX OMI update + * @link_sta: the link STA the OMI was sent to + * + * See ieee80211_prepare_rx_omi_bw(). Context is the same here + * as well. + */ +void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *link_sta); + /* for older drivers - let's not document these ... */ int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, struct ieee80211_chanctx_conf *ctx); -- cgit v1.2.3 From dfd5b5b5b725aa033e88b02adc3980a3fcf361d0 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Wed, 1 Jan 2025 07:05:23 +0200 Subject: wifi: mac80211: clarify key idx documentation ieee80211_key_conf::keyidx is in the range 0-7, and not 0-3. Make this clear in the documentation. Signed-off-by: Miri Korenblit Reviewed-by: Johannes Berg Link: https://patch.msgid.link/20250101070249.4e414710fba7.Ib739c40dd5aa6ed148c3151220eb38d8a9e238de@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d1540a605da5..0d7ddaa952ff 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2220,7 +2220,7 @@ enum ieee80211_key_flags { * @tx_pn: PN used for TX keys, may be used by the driver as well if it * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. - * @keyidx: the key index (0-3) + * @keyidx: the key index (0-7) * @keylen: key material length * @key: key material. For ALG_TKIP the key is encoded as a 256-bit (32 byte) * data block: -- cgit v1.2.3 From 2bf502251b3ba0734aad81317d62e13389b89a5d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 1 Jan 2025 07:05:27 +0200 Subject: wifi: cfg80211: check extended MLD capa/ops in assoc Additionally check that the extended MLD capa/ops for the MLD are consistent, i.e. the same value is reported by all affiliated APs/links.
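As an illustration of the consistency check this patch adds, here is a minimal hedged sketch using the helper introduced in the diff below; it is not the cfg80211 code, mle_a/mle_b are assumed to be basic multi-link elements already validated with ieee80211_mle_type_ok(), and the wrapper name is made up for illustration:

static bool mld_ext_capa_ops_consistent(const u8 *mle_a, const u8 *mle_b)
{
	/* 0 means the field is absent, so two absent fields also match */
	return ieee80211_mle_get_ext_mld_capa_op(mle_a) ==
	       ieee80211_mle_get_ext_mld_capa_op(mle_b);
}

A mismatch between affiliated links would then be treated as an inconsistent multi-link advertisement.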
Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250101070249.e29f42c7ae21.Ib2cdce608321ad154e4b13103cc315c3e3cb6b2b@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 05dedc45505c..9c0e2617fe8f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4961,6 +4961,7 @@ struct ieee80211_multi_link_elem { #define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 #define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 #define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 +#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP 0x0400 #define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff #define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 @@ -5226,6 +5227,47 @@ static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) return get_unaligned_le16(common); } +/** + * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities + * and operations. + * @data: pointer to the multi-link element + * Return: the extended MLD capabilities and operations field value from + * the multi-link element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + + return get_unaligned_le16(common); +} + /** * ieee80211_mle_get_mld_id - returns the MLD ID * @data: pointer to the multi-link element @@ -5298,6 +5340,8 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) common += 2; if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP) + common += 2; break; case IEEE80211_ML_CONTROL_TYPE_PREQ: common += sizeof(struct ieee80211_mle_preq_common_info); -- cgit v1.2.3 From 98934687f8a871ea2bf90be6590daddd1a130cdd Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 1 Jan 2025 07:05:33 +0200 Subject: wifi: mac80211: skip all known membership selectors The GLK and EPD Selectors are also not rates, so add a new macro for the minimum value of a selector and test against that instead of the entire list. Also fix the typo in the EPD selector define. 
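To illustrate how the new minimum-selector macro from the diff below is meant to be used, a hedged sketch follows; the wrapper function is hypothetical rather than mac80211 code, and 0x80 is the usual basic-rate flag of a Supported Rates entry:

static bool supp_rate_entry_is_selector(u8 rate)
{
	/* selectors sit at the top of the 7-bit rate value space, so any
	 * entry at or above the lowest defined selector is not a rate */
	return (rate & 0x7f) >= BSS_MEMBERSHIP_SELECTOR_MIN;
}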
Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250101070249.2c19a2dc53db.If187b7d93d8b43a6c70e422c837b7636538fb358@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 9c0e2617fe8f..745c3b125d97 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1542,11 +1542,13 @@ struct ieee80211_mgmt { #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 #define BSS_MEMBERSHIP_SELECTOR_VHT_PHY 126 #define BSS_MEMBERSHIP_SELECTOR_GLK 125 -#define BSS_MEMBERSHIP_SELECTOR_EPS 124 +#define BSS_MEMBERSHIP_SELECTOR_EPD 124 #define BSS_MEMBERSHIP_SELECTOR_SAE_H2E 123 #define BSS_MEMBERSHIP_SELECTOR_HE_PHY 122 #define BSS_MEMBERSHIP_SELECTOR_EHT_PHY 121 +#define BSS_MEMBERSHIP_SELECTOR_MIN BSS_MEMBERSHIP_SELECTOR_EHT_PHY + /* mgmt header + 1 byte category code */ #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) -- cgit v1.2.3 From f6d2e5abf154da59ccb3bcac23438f2230c8948a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 1 Jan 2025 07:05:35 +0200 Subject: wifi: nl80211: permit userspace to pass supported selectors Currently the SAE_H2E selector already exists, which needs to be implemented by the SME. As new such selectors might be added in the future, add a feature to permit userspace to report a selector as supported. If not given, the kernel should assume that userspace does support SAE_H2E. Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250101070249.fe67b871cc39.Ieb98390328927e998e612345a58b6dbc00b0e3a2@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++++ include/uapi/linux/nl80211.h | 9 +++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7790af534b7f..8a38d66be2c9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3023,6 +3023,10 @@ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) * * @bss: The BSS to authenticate with, the callee must obtain a reference * to it if it needs to keep it. + * @supported_selectors: List of selectors that should be assumed to be + * supported by the station. + * SAE_H2E must be assumed supported if set to %NULL. + * @supported_selectors_len: Length of supported_selectors in octets. * @auth_type: Authentication type (algorithm) * @ie: Extra IEs to add to Authentication frame or %NULL * @ie_len: Length of ie buffer in octets @@ -3045,6 +3049,8 @@ struct cfg80211_auth_request { struct cfg80211_bss *bss; const u8 *ie; size_t ie_len; + const u8 *supported_selectors; + u8 supported_selectors_len; enum nl80211_auth_type auth_type; const u8 *key; u8 key_len; @@ -3124,6 +3130,10 @@ enum cfg80211_assoc_req_flags { * included in the Current AP address field of the Reassociation Request * frame. * @flags: See &enum cfg80211_assoc_req_flags + * @supported_selectors: supported selectors in IEEE 802.11 format + * (or %NULL for no change). + * If %NULL, then support for SAE_H2E should be assumed. + * @supported_selectors_len: Length of supported_selectors in octets. * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. 
@@ -3150,6 +3160,8 @@ struct cfg80211_assoc_request { struct cfg80211_crypto_settings crypto; bool use_mfp; u32 flags; + const u8 *supported_selectors; + u8 supported_selectors_len; struct ieee80211_ht_cap ht_capa; struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa, vht_capa_mask; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6d11437596b9..ee1cf19803ea 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2871,6 +2871,12 @@ enum nl80211_commands { * @NL80211_ATTR_VIF_RADIO_MASK: Bitmask of allowed radios (u32). * A value of 0 means all radios. * + * @NL80211_ATTR_SUPPORTED_SELECTORS: supported selectors, array of + * supported selectors as defined by IEEE 802.11 7.3.2.2 but without the + * length restriction (at most %NL80211_MAX_SUPP_SELECTORS). + * This can be used to provide a list of selectors that are implemented + * by the supplicant. If not given, support for SAE_H2E is assumed. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3421,6 +3427,8 @@ enum nl80211_attrs { NL80211_ATTR_VIF_RADIO_MASK, + NL80211_ATTR_SUPPORTED_SELECTORS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3465,6 +3473,7 @@ enum nl80211_attrs { #define NL80211_WIPHY_NAME_MAXLEN 64 #define NL80211_MAX_SUPP_RATES 32 +#define NL80211_MAX_SUPP_SELECTORS 128 #define NL80211_MAX_SUPP_HT_RATES 77 #define NL80211_MAX_SUPP_REG_RULES 128 #define NL80211_TKIP_DATA_OFFSET_ENCR_KEY 0 -- cgit v1.2.3 From fa2a71a3b9ed1a333f1bed30ffe758cc150a399a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:19:53 +0200 Subject: wifi: ieee80211: Add some missing MLO related definitions As a preparation to support ML reconfiguration request and response, add additional ML reconfiguration definitions required to support the flow. See Section 9.4.2.321.4 in Draft P802.11be_D6.0. 
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.4970ca10ebfd.Ibe7f6108cd0e04b8c739a8e35a4f485f664a17e6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 745c3b125d97..ee6bebfd041d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3885,6 +3885,16 @@ enum ieee80211_protected_eht_actioncode { WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5, + WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6, + WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12, }; /* Security key length */ @@ -5021,6 +5031,8 @@ struct ieee80211_multi_link_elem { #define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 #define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 #define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 +#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT 0x2000 +#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT 0x4000 struct ieee80211_mle_basic_common_info { u8 len; @@ -5036,6 +5048,9 @@ struct ieee80211_mle_preq_common_info { } __packed; #define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 +#define IEEE80211_MLC_RECONF_PRES_EML_CAPA 0x0020 +#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP 0x0040 +#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP 0x0080 /* no fixed fields in RECONF */ @@ -5354,6 +5369,12 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) case IEEE80211_ML_CONTROL_TYPE_RECONF: if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) common += ETH_ALEN; + if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP) + common += 2; break; case IEEE80211_ML_CONTROL_TYPE_TDLS: common += sizeof(struct ieee80211_mle_tdls_common_info); @@ -5504,8 +5525,13 @@ ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta #define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE 0x0010 #define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 #define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT 0x0040 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_UPDATE_TYPE 0x0780 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE 0x0780 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM 0 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK 2 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK 3 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS 4 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 /** * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link -- cgit v1.2.3 From 65c1c041798484da54cbad5fb5833b81694c43cf Mon Sep 17 
00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:19:55 +0200 Subject: wifi: cfg80211: Add support for dynamic addition/removal of links Add support for requesting dynamic addition/removal of links to the current MLO association. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.cef23352f2a2.I79c849974c494cb1cbf9e1b22a5d2d37395ff5ac@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 45 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 10 ++++++++++ 2 files changed, 55 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8a38d66be2c9..d4b4cabac029 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4596,6 +4596,15 @@ struct mgmt_frame_regs { * @set_ttlm: set the TID to link mapping. * @get_radio_mask: get bitmask of radios in use. * (invoked with the wiphy mutex held) + * @assoc_ml_reconf: Request a non-AP MLO connection to perform ML + * reconfiguration, i.e., add and/or remove links to/from the + * association using ML reconfiguration action frames. Successfully added + * links will be added to the set of valid links. Successfully removed + * links will be removed from the set of valid links. The driver must + * indicate removed links by calling cfg80211_links_removed() and added + * links by calling cfg80211_mlo_reconf_add_done(). When calling + * cfg80211_mlo_reconf_add_done() the bss pointer must be given for each + * link for which MLO reconfiguration 'add' operation was requested. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4959,6 +4968,9 @@ struct cfg80211_ops { int (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_ttlm_params *params); u32 (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev); + int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_assoc_link *add_links, + u16 rem_links); }; /* @@ -9716,6 +9728,39 @@ static inline int cfg80211_color_change_notify(struct net_device *dev, */ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); +/** + * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data + * @buf: MLO Reconfiguration Response frame (header + body) + * @len: length of the frame data + * @added_links: BIT mask of links successfully added to the association + * @links: per-link information indexed by link ID + * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of + * the pointer moves to cfg80211 in the call to + * cfg80211_mlo_reconf_add_done(). + * + * The BSS pointer must be set for each link for which 'add' operation was + * requested in the assoc_ml_reconf callback. + */ +struct cfg80211_mlo_reconf_done_data { + const u8 *buf; + size_t len; + u16 added_links; + struct { + struct cfg80211_bss *bss; + } links[IEEE80211_MLD_MAX_NUM_LINKS]; +}; + +/** + * cfg80211_mlo_reconf_add_done - Notify about MLO reconfiguration result + * @dev: network device. + * @data: MLO reconfiguration done data, &struct cfg80211_mlo_reconf_done_data + * + * Inform cfg80211 and the userspace that processing of ML reconfiguration + * request to add links to the association is done. 
+ */ +void cfg80211_mlo_reconf_add_done(struct net_device *dev, + struct cfg80211_mlo_reconf_done_data *data); + /** * cfg80211_schedule_channels_check - schedule regulatory check if needed * @wdev: the wireless device to check diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ee1cf19803ea..f900d7cc42bc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1329,6 +1329,9 @@ * %NL80211_ATTR_MLO_TTLM_ULINK attributes are used to specify the * TID to Link mapping for downlink/uplink traffic. * + * @NL80211_CMD_ASSOC_MLO_RECONF: For a non-AP MLD station, request to + * add/remove links to/from the association. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1586,6 +1589,8 @@ enum nl80211_commands { NL80211_CMD_SET_TID_TO_LINK_MAPPING, + NL80211_CMD_ASSOC_MLO_RECONF, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2877,6 +2882,9 @@ * This can be used to provide a list of selectors that are implemented * by the supplicant. If not given, support for SAE_H2E is assumed. * + * @NL80211_ATTR_MLO_RECONF_REM_LINKS: (u16) A bitmask of the links requested + * to be removed from the MLO association. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3429,6 +3437,8 @@ enum nl80211_attrs { NL80211_ATTR_SUPPORTED_SELECTORS, + NL80211_ATTR_MLO_RECONF_REM_LINKS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 36e05b0b83903e2d85b3675d10ac8b5eced54377 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:19:58 +0200 Subject: wifi: mac80211: Support dynamic link addition and removal Add support for adding and removing station links: - Adding links is done asynchronously, i.e., first an ML reconfiguration action frame is sent to the AP requesting to add links, and only when the AP replies, links which were added successfully by the AP are added locally. - Removing links is done synchronously, i.e., the links are removed before sending the ML reconfiguration action frame to the AP (to avoid using these links after the AP MLD removed them but before the station got the ML reconfiguration response). In case the AP replies with a status indicating that a link removal was not successful, disconnect (as this should not happen and is an indication that something might be wrong on the AP MLD).
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.ec0492a8dd21.I2869686642bbc0f86c40f284ebf7e6f644b551ab@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ee6bebfd041d..b5c5b5c39d9a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1532,6 +1532,17 @@ struct ieee80211_mgmt { struct { u8 action_code; } __packed ttlm_tear_down; + struct { + u8 action_code; + u8 dialog_token; + u8 variable[]; + } __packed ml_reconf_req; + struct { + u8 action_code; + u8 dialog_token; + u8 count; + u8 variable[]; + } __packed ml_reconf_resp; } u; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ -- cgit v1.2.3 From 904c277342936b75ae55999d87abacd4c1ab1fd3 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:19:59 +0200 Subject: wifi: cfg80211: Add support for controlling EPCS Add support for configuring Emergency Preparedness Communication Services (EPCS) for station mode. Signed-off-by: Ilan Peer Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.ea54ac94445c.I11d750188bc0871e13e86146a3b5cc048d853e69@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++++++++ include/uapi/linux/nl80211.h | 9 +++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d4b4cabac029..363d7dd2255a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4594,6 +4594,7 @@ struct mgmt_frame_regs { * * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. * @set_ttlm: set the TID to link mapping. + * @set_epcs: Enable/Disable EPCS for station mode. * @get_radio_mask: get bitmask of radios in use. * (invoked with the wiphy mutex held) * @assoc_ml_reconf: Request a non-AP MLO connection to perform ML @@ -4971,6 +4972,8 @@ struct cfg80211_ops { int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_assoc_link *add_links, u16 rem_links); + int (*set_epcs)(struct wiphy *wiphy, struct net_device *dev, + bool val); }; /* @@ -9771,6 +9774,13 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, */ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); +/** + * cfg80211_epcs_changed - Notify about a change in EPCS state + * @netdev: the wireless device whose EPCS state changed + * @enabled: set to true if EPCS was enabled, otherwise set to false. + */ +void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f900d7cc42bc..f6c1b181c886 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1332,6 +1332,10 @@ * @NL80211_CMD_ASSOC_MLO_RECONF: For a non-AP MLD station, request to * add/remove links to/from the association. * + * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to + * control EPCS configuration. Used to notify userland on the current state + * of EPCS. 
+ * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1590,6 +1594,7 @@ enum nl80211_commands { NL80211_CMD_SET_TID_TO_LINK_MAPPING, NL80211_CMD_ASSOC_MLO_RECONF, + NL80211_CMD_EPCS_CFG, /* add new commands above here */ @@ -2885,6 +2890,9 @@ enum nl80211_commands { * @NL80211_ATTR_MLO_RECONF_REM_LINKS: (u16) A bitmask of the links requested * to be removed from the MLO association. * + * @NL80211_ATTR_EPCS: Flag attribute indicating that EPCS is enabled for a + * station interface. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3438,6 +3446,7 @@ enum nl80211_attrs { NL80211_ATTR_SUPPORTED_SELECTORS, NL80211_ATTR_MLO_RECONF_REM_LINKS, + NL80211_ATTR_EPCS, /* add attributes here, update the policy in nl80211.c */ -- cgit v1.2.3 From 19aa842dcbb5860509b7e1b7745dbae0b791f6c4 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 2 Jan 2025 16:20:00 +0200 Subject: wifi: mac80211: Fix common size calculation for ML element When the ML type is EPCS the control bitmap is reserved, the length is always 7 and is captured by the 1st octet after the control. Fixes: 0f48b8b88aa9 ("wifi: ieee80211: add definitions for multi-link element") Signed-off-by: Ilan Peer Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.5790376754a7.I381208cbb72b1be2a88239509294099e9337e254@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b5c5b5c39d9a..16741e542e81 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -5084,28 +5084,24 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) { const struct ieee80211_multi_link_elem *mle = (const void *)data; u16 control = le16_to_cpu(mle->control); - u8 common = 0; switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: case IEEE80211_ML_CONTROL_TYPE_PREQ: case IEEE80211_ML_CONTROL_TYPE_TDLS: case IEEE80211_ML_CONTROL_TYPE_RECONF: + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: /* * The length is the first octet pointed by mle->variable so no * need to add anything */ break; - case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) - common += ETH_ALEN; - return common; default: WARN_ON(1); return 0; } - return sizeof(*mle) + common + mle->variable[0]; + return sizeof(*mle) + mle->variable[0]; } /** @@ -5392,8 +5388,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) check_common_len = true; break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) - common += ETH_ALEN; + common = ETH_ALEN + 1; break; default: /* we don't know this type */ -- cgit v1.2.3 From 6bd9a087c8035626e7bfb6b678c9e036b8b26038 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 2 Jan 2025 16:20:05 +0200 Subject: wifi: mac80211: set key link ID to the deflink one When in non-MLO mode, the key ID was set to -1 even for keys that are not pairwise. Change the link ID to be the link ID of the deflink in this case so that drivers do not need to special cases for this. 
Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250102161730.0c066f084677.I4a5c288465e75119edb6a0df90dddf6f30d14a02@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 0d7ddaa952ff..c3ed2fcff8b7 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2229,7 +2229,7 @@ enum ieee80211_key_flags { * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type - * @link_id: the link ID for MLO, or -1 for non-MLO or pairwise keys + * @link_id: the link ID, 0 for non-MLO, or -1 for pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; -- cgit v1.2.3 From 127186cfb184eaccdfe948e6da66940cfa03efc5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 2 Jan 2025 19:28:41 +0800 Subject: md: reintroduce md-linear The md-linear personality was removed by commit 849d18e27be9 ("md: Remove deprecated CONFIG_MD_LINEAR") because it had been marked as deprecated for a long time. However, md-linear is widely used for underlying disks of different sizes; sadly, we didn't know this until now, and it is truly useful to create partitions, assemble multiple RAID arrays, and then append one to the other. People have to use dm-linear for this case now; however, they would prefer to minimize the number of involved modules. Fixes: 849d18e27be9 ("md: Remove deprecated CONFIG_MD_LINEAR") Cc: stable@vger.kernel.org Signed-off-by: Yu Kuai Acked-by: Coly Li Acked-by: Mike Snitzer Link: https://lore.kernel.org/r/20250102112841.1227111-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- include/uapi/linux/raid/md_p.h | 2 +- include/uapi/linux/raid/md_u.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 5a43c23f53bf..ff47b6f0ba0f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -233,7 +233,7 @@ struct mdp_superblock_1 { char set_name[32]; /* set and interpreted by user-space */ __le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ - __le32 level; /* 0,1,4,5 */ + __le32 level; /* 0,1,4,5, -1 (linear) */ __le32 layout; /* only for raid5 and raid10 currently */ __le64 size; /* used size of component devices, in 512byte sectors */ diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 7be89a4906e7..a893010735fb 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -103,6 +103,8 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; +#define LEVEL_LINEAR (-1) + /* we need a value for 'no level specified' and 0 * means 'raid0', so we need something else. This is * for internal use only -- cgit v1.2.3 From 5e31e3477f1661ebbbec1cbf141f91ad3cffafc3 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 11 Jan 2025 09:17:51 +0000 Subject: cxl/events: Update Common Event Record to CXL spec rev 3.1 CXL spec 3.1 section 8.2.9.2.1 Table 8-42, Common Event Record format has been updated with Maintenance Operation Subclass information. Add updates for the above spec change in the CXL events record and CXL common trace event implementations.
Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Ira Weiny Signed-off-by: Shiju Jose Reviewed-by: Fan Ni Link: https://patch.msgid.link/20250111091756.1682-2-shiju.jose@huawei.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 0bea1afbd747..e1d485ad376b 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -18,7 +18,8 @@ struct cxl_event_record_hdr { __le16 related_handle; __le64 timestamp; u8 maint_op_class; - u8 reserved[15]; + u8 maint_op_sub_class; + u8 reserved[14]; } __packed; struct cxl_event_media_hdr { -- cgit v1.2.3 From ae8341313058f76f3ce4169c780af4cf5cb05e1e Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 11 Jan 2025 09:17:53 +0000 Subject: cxl/events: Update General Media Event Record to CXL spec rev 3.1 CXL spec rev 3.1 section 8.2.9.2.1.1 Table 8-45, General Media Event Record has updated with following new fields and new types for Memory Event Type and Transaction Type fields. 1. Advanced Programmable Corrected Memory Error Threshold Event Flags 2. Corrected Memory Error Count at Event 3. Memory Event Sub-Type The format of component identifier has changed (CXL spec 3.1 section 8.2.9.2.1 Table 8-44). Update the general media event record and general media trace event for the above spec changes. The new fields are inserted in logical places. Example trace log of cxl_general_media trace event, cxl_general_media: memdev=mem0 host=0000:0f:00.0 serial=3 log=Fatal : \ time=156831237413 uuid=fbcd0a77-c260-417f-85a9-088b1621eba6 len=128 \ flags='0x1' handle=1 related_handle=0 maint_op_class=2 \ maint_op_sub_class=4 : dpa=30d40 dpa_flags='' \ descriptor='UNCORRECTABLE_EVENT|THRESHOLD_EVENT|POISON_LIST_OVERFLOW' \ type='TE State Violation' sub_type='Media Link Command Training Error' \ transaction_type='Host Inject Poison' channel=3 rank=33 device=5 \ validity_flags='CHANNEL|RANK|DEVICE|COMPONENT|COMPONENT PLDM FORMAT' \ comp_id=03 74 c5 08 9a 1a 0b fc d2 7e 2f 31 9b 3c 81 4d \ comp_id_pldm_valid_flags='PLDM Entity ID | Resource ID' \ pldm_entity_id=74 c5 08 9a 1a 0b pldm_resource_id=fc d2 7e 2f \ hpa=ffffffffffffffff region= \ region_uuid=00000000-0000-0000-0000-000000000000 \ cme_threshold_ev_flags='Corrected Memory Errors in Multiple Media \ Components|Exceeded Programmable Threshold' cme_count=120 Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Reviewed-by: Ira Weiny Signed-off-by: Shiju Jose Link: https://patch.msgid.link/20250111091756.1682-4-shiju.jose@huawei.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index e1d485ad376b..2b07adf39010 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -45,14 +45,17 @@ struct cxl_event_generic { /* * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + * CXL rev 3.1 Section 8.2.9.2.1.1; Table 8-45 */ #define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 struct cxl_event_gen_media { struct cxl_event_media_hdr media_hdr; u8 device[3]; u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; - u8 reserved[46]; + u8 cme_threshold_ev_flags; + u8 cme_count[3]; + u8 sub_type; + u8 reserved[41]; } __packed; /* -- cgit v1.2.3 From 24ec41f7c7b22d668443c1581b3b8f8aceed18f9 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 11 Jan 2025 09:17:54 +0000 Subject: cxl/events: Update DRAM Event Record to CXL 
spec rev 3.1 CXL spec 3.1 section 8.2.9.2.1.2 Table 8-46, DRAM Event Record has updated with following new fields and new types for Memory Event Type, Transaction Type and Validity Flags fields. 1. Component Identifier 2. Sub-channel 3. Advanced Programmable Corrected Memory Error Threshold Event Flags 4. Corrected Memory Error Count at Event 5. Memory Event Sub-Type Update DRAM events record and DRAM trace event for the above spec changes. The new fields are inserted in logical places. Includes trivial consistency of white space improvements. Example trace print of cxl_dram trace event, cxl_dram: memdev=mem0 host=0000:0f:00.0 serial=3 log=Informational : \ time=54799339519 uuid=601dcbb3-9c06-4eab-b8af-4e9bfb5c9624 len=128 \ flags='0x1' handle=1 related_handle=0 maint_op_class=1 \ maint_op_sub_class=3 : dpa=18680 dpa_flags='' \ descriptor='UNCORRECTABLE_EVENT|THRESHOLD_EVENT' type='Data Path Error' \ sub_type='Media Link CRC Error' transaction_type='Internal Media Scrub' \ channel=3 rank=17 nibble_mask=3b00b2 bank_group=7 bank=11 row=2 \ column=77 cor_mask=21 00 00 00 00 00 00 00 2c 00 00 00 00 00 00 00 37 00 \ 00 00 00 00 00 00 42 00 00 00 00 00 00 00 validity_flags='CHANNEL|RANK|NIBBLE|\ BANK GROUP|BANK|ROW|COLUMN|CORRECTION MASK|COMPONENT|COMPONENT PLDM FORMAT' \ comp_id=01 74 c5 08 9a 1a 0b fc d2 7e 2f 31 9b 3c 81 4d \ comp_id_pldm_valid_flags='PLDM Entity ID' pldm_entity_id=74 c5 08 9a 1a 0b \ pldm_resource_id=0x00 hpa=ffffffffffffffff region= \ region_uuid=00000000-0000-0000-0000-000000000000 sub_channel=5 \ cme_threshold_ev_flags='Corrected Memory Errors in Multiple Media Components|\ Exceeded Programmable Threshold' cvme_count=148 Reviewed-by: Jonathan Cameron Reviewed-by: Davidlohr Bueso Signed-off-by: Shiju Jose Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20250111091756.1682-5-shiju.jose@huawei.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 2b07adf39010..10815414f376 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -60,7 +60,7 @@ struct cxl_event_gen_media { /* * DRAM Event Record - DER - * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44 + * CXL rev 3.1 section 8.2.9.2.1.2; Table 8-46 */ #define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 struct cxl_event_dram { @@ -71,7 +71,12 @@ struct cxl_event_dram { u8 row[3]; u8 column[2]; u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE]; - u8 reserved[0x17]; + u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; + u8 sub_channel; + u8 cme_threshold_ev_flags; + u8 cvme_count[3]; + u8 sub_type; + u8 reserved; } __packed; /* -- cgit v1.2.3 From 4c6e20eb564e0d77497f452ab1ae1b40b1de3ab7 Mon Sep 17 00:00:00 2001 From: Shiju Jose Date: Sat, 11 Jan 2025 09:17:55 +0000 Subject: cxl/events: Update Memory Module Event Record to CXL spec rev 3.1 CXL spec 3.1 section 8.2.9.2.1.3 Table 8-47, Memory Module Event Record has updated with following new fields and new info for Device Event Type and Device Health Information fields. 1. Validity Flags 2. Component Identifier 3. Device Event Sub-Type Update the Memory Module event record and Memory Module trace event for the above spec changes. The new fields are inserted in logical places. 
Example trace print of cxl_memory_module trace event, cxl_memory_module: memdev=mem3 host=0000:0f:00.0 serial=3 log=Fatal : \ time=371709344709 uuid=fe927475-dd59-4339-a586-79bab113b774 len=128 \ flags='0x1' handle=2 related_handle=0 maint_op_class=0 \ maint_op_sub_class=0 : event_type='Temperature Change' \ event_sub_type='Unsupported Config Data' \ health_status='MAINTENANCE_NEEDED|REPLACEMENT_NEEDED' \ media_status='All Data Loss in Event of Power Loss' as_life_used=0x3 \ as_dev_temp=Normal as_cor_vol_err_cnt=Normal as_cor_per_err_cnt=Normal \ life_used=8 device_temp=3 dirty_shutdown_cnt=33 cor_vol_err_cnt=25 \ cor_per_err_cnt=45 validity_flags='COMPONENT|COMPONENT PLDM FORMAT' \ comp_id=03 74 c5 08 9a 1a 0b fc d2 7e 2f 31 9b 3c 81 4d \ comp_id_pldm_valid_flags='Resource ID' \ pldm_entity_id=0x00 pldm_resource_id=fc d2 7e 2f Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Signed-off-by: Shiju Jose Link: https://patch.msgid.link/20250111091756.1682-6-shiju.jose@huawei.com Signed-off-by: Dave Jiang --- include/cxl/event.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/cxl/event.h b/include/cxl/event.h index 10815414f376..04edd44bd26f 100644 --- a/include/cxl/event.h +++ b/include/cxl/event.h @@ -81,7 +81,7 @@ struct cxl_event_dram { /* * Get Health Info Record - * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100 + * CXL rev 3.1 section 8.2.9.9.3.1; Table 8-133 */ struct cxl_get_health_info { u8 health_status; @@ -96,13 +96,16 @@ struct cxl_get_health_info { /* * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + * CXL rev 3.1 section 8.2.9.2.1.3; Table 8-47 */ struct cxl_event_mem_module { struct cxl_event_record_hdr hdr; u8 event_type; struct cxl_get_health_info info; - u8 reserved[0x3d]; + u8 validity_flags[2]; + u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; + u8 event_sub_type; + u8 reserved[0x2a]; } __packed; union cxl_event { -- cgit v1.2.3 From e0537c9f828dc9500e5ffc9211099c495b72f402 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Tue, 19 Nov 2024 13:43:22 -0800 Subject: SUNRPC: only put task on cl_tasks list after the RPC call slot is reserved. Under heavy write load, we've seen the cl_tasks list grows to millions of entries. Even though the list is extremely long, the system still runs fine until the user wants to get the information of all active RPC tasks by doing: When this happens, tasks_start acquires the cl_lock to walk the cl_tasks list, returning one entry at a time to the caller. The cl_lock is held until all tasks on this list have been processed. While the cl_lock is held, completed RPC tasks have to spin wait in rpc_task_release_client for the cl_lock. If there are millions of entries in the cl_tasks list it will take a long time before tasks_stop is called and the cl_lock is released. The spin wait tasks can use up all the available CPUs in the system, preventing other jobs to run, this causes the system to temporarily lock up. This patch fixes this problem by delaying inserting the RPC task on the cl_tasks list until the RPC call slot is reserved. This limits the length of the cl_tasks to the number of call slots available in the system. 
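A minimal sketch of the new ordering described above; the helper name is hypothetical, and the actual patch hooks this into the existing call-slot reservation path, so the real list handling differs in detail:

static void rpc_clnt_add_task_after_slot(struct rpc_clnt *clnt,
					 struct rpc_task *task)
{
	/* the task reaches cl_tasks only once its call slot is reserved,
	 * so the list can never grow past the number of call slots */
	spin_lock(&clnt->cl_lock);
	list_add_tail(&task->tk_task, &clnt->cl_tasks);
	spin_unlock(&clnt->cl_lock);
}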
Signed-off-by: Dai Ngo Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 5321585c778f..fec976e58174 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -93,6 +93,7 @@ struct rpc_clnt { const struct cred *cl_cred; unsigned int cl_max_connect; /* max number of transports not to the same IP */ struct super_block *pipefs_sb; + atomic_t cl_task_count; }; /* -- cgit v1.2.3 From 02d3b7557ce28c373ea1e925ae16ab5988284313 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 14 Jan 2025 00:10:03 +0100 Subject: dt-bindings: clock: drop NUM_CLOCKS define for EN7581 Drop NUM_CLOCKS define for EN7581 include. This is not a binding and should not be placed here. Value is derived internally in the user driver. Signed-off-by: Christian Marangi Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250113231030.6735-3-ansuelsmth@gmail.com Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/en7523-clk.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/en7523-clk.h b/include/dt-bindings/clock/en7523-clk.h index 717d23a5e5ae..28e56745ccff 100644 --- a/include/dt-bindings/clock/en7523-clk.h +++ b/include/dt-bindings/clock/en7523-clk.h @@ -12,6 +12,4 @@ #define EN7523_CLK_CRYPTO 6 #define EN7523_CLK_PCIE 7 -#define EN7523_NUM_CLOCKS 8 - #endif /* _DT_BINDINGS_CLOCK_AIROHA_EN7523_H_ */ -- cgit v1.2.3 From 82108ad3285f58f314ad41398f44017c7dbe44de Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 14 Jan 2025 00:10:04 +0100 Subject: dt-bindings: clock: add ID for eMMC for EN7581 Add ID for eMMC for EN7581. This is to control clock selection of eMMC between 200MHz and 150MHz. Signed-off-by: Christian Marangi Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20250113231030.6735-4-ansuelsmth@gmail.com Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/en7523-clk.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/en7523-clk.h b/include/dt-bindings/clock/en7523-clk.h index 28e56745ccff..edfa64045f52 100644 --- a/include/dt-bindings/clock/en7523-clk.h +++ b/include/dt-bindings/clock/en7523-clk.h @@ -12,4 +12,6 @@ #define EN7523_CLK_CRYPTO 6 #define EN7523_CLK_PCIE 7 +#define EN7581_CLK_EMMC 8 + #endif /* _DT_BINDINGS_CLOCK_AIROHA_EN7523_H_ */ -- cgit v1.2.3 From bb3914101f704a8282f65238d6b021d216efc608 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 9 Jan 2025 13:42:05 -0600 Subject: device property: Split property reading bool and presence test ops The fwnode/device property API currently implement (fwnode|device)_property_read_bool() with (fwnode|device)_property_present(). That does not allow having different behavior depending on the backend. Specifically, the usage of (fwnode|device)_property_read_bool() on non-boolean properties is deprecated on DT. In order to add a warning on this deprecated use, these 2 APIs need separate ops for the backend. Acked-by: Greg Kroah-Hartman Reviewed-by: Krzysztof Kozlowski Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20250109-dt-type-warnings-v1-1-0150e32e716c@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/fwnode.h | 3 +++ include/linux/of.h | 4 +++- include/linux/property.h | 15 +++------------ 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 0d79070c5a70..0731994b9d7c 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -112,6 +112,7 @@ struct fwnode_reference_args { * @device_is_available: Return true if the device is available. * @device_get_match_data: Return the device driver match data. * @property_present: Return true if a property is present. + * @property_read_bool: Return a boolean property value. * @property_read_int_array: Read an array of integer properties. Return zero on * success, a negative error code otherwise. * @property_read_string_array: Read an array of string properties. Return zero @@ -141,6 +142,8 @@ struct fwnode_operations { (*device_get_dma_attr)(const struct fwnode_handle *fwnode); bool (*property_present)(const struct fwnode_handle *fwnode, const char *propname); + bool (*property_read_bool)(const struct fwnode_handle *fwnode, + const char *propname); int (*property_read_int_array)(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, diff --git a/include/linux/of.h b/include/linux/of.h index f921786cb8ac..1cb4eb7fc2ed 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1271,7 +1271,9 @@ static inline bool of_property_read_bool(const struct device_node *np, */ static inline bool of_property_present(const struct device_node *np, const char *propname) { - return of_property_read_bool(np, propname); + struct property *prop = of_find_property(np, propname, NULL); + + return prop ? 
true : false; } /** diff --git a/include/linux/property.h b/include/linux/property.h index 61fc20e5f81f..e214ecd241eb 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -37,6 +37,7 @@ struct fwnode_handle *__dev_fwnode(struct device *dev); struct device *: __dev_fwnode)(dev) bool device_property_present(const struct device *dev, const char *propname); +bool device_property_read_bool(const struct device *dev, const char *propname); int device_property_read_u8_array(const struct device *dev, const char *propname, u8 *val, size_t nval); int device_property_read_u16_array(const struct device *dev, const char *propname, @@ -54,6 +55,8 @@ int device_property_match_string(const struct device *dev, bool fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname); +bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, + const char *propname); int fwnode_property_read_u8_array(const struct fwnode_handle *fwnode, const char *propname, u8 *val, size_t nval); @@ -207,12 +210,6 @@ int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name); unsigned int device_get_child_node_count(const struct device *dev); -static inline bool device_property_read_bool(const struct device *dev, - const char *propname) -{ - return device_property_present(dev, propname); -} - static inline int device_property_read_u8(const struct device *dev, const char *propname, u8 *val) { @@ -263,12 +260,6 @@ static inline int device_property_string_array_count(const struct device *dev, return device_property_read_string_array(dev, propname, NULL, 0); } -static inline bool fwnode_property_read_bool(const struct fwnode_handle *fwnode, - const char *propname) -{ - return fwnode_property_present(fwnode, propname); -} - static inline int fwnode_property_read_u8(const struct fwnode_handle *fwnode, const char *propname, u8 *val) { -- cgit v1.2.3 From c141ecc3cecd764799e17c8251026336cab86800 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Thu, 9 Jan 2025 13:42:06 -0600 Subject: of: Warn when of_property_read_bool() is used on non-boolean properties The use of of_property_read_bool() for non-boolean properties is deprecated. The primary use of it was to test property presence, but that has been replaced in favor of of_property_present(). With those uses now fixed, add a warning to discourage new ones. 
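A small usage sketch of the distinction drawn by the two patches above (the property names and callees are illustrative only): of_property_present() is for testing that a property exists, while of_property_read_bool() is reserved for properties that are genuinely boolean, i.e. present-or-absent with no value.

static void parse_node(struct device_node *np)
{
	/* presence test for a property that carries a value */
	if (of_property_present(np, "clocks"))
		setup_clocks(np);	/* hypothetical helper */

	/* boolean read for a property defined as an empty/boolean one */
	if (of_property_read_bool(np, "wakeup-source"))
		enable_wakeup(np);	/* hypothetical helper */
}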
Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250109-dt-type-warnings-v1-2-0150e32e716c@kernel.org Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 1cb4eb7fc2ed..0cdd58ff0a41 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -311,6 +311,7 @@ extern struct device_node *of_find_node_with_property( extern struct property *of_find_property(const struct device_node *np, const char *name, int *lenp); +extern bool of_property_read_bool(const struct device_node *np, const char *propname); extern int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size); extern int of_property_read_u32_index(const struct device_node *np, @@ -615,6 +616,12 @@ static inline struct device_node *of_find_compatible_node( return NULL; } +static inline bool of_property_read_bool(const struct device_node *np, + const char *propname) +{ + return false; +} + static inline int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size) { @@ -1242,24 +1249,6 @@ static inline int of_property_read_string_index(const struct device_node *np, return rc < 0 ? rc : 0; } -/** - * of_property_read_bool - Find a property - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * - * Search for a boolean property in a device node. Usage on non-boolean - * property types is deprecated. - * - * Return: true if the property exists false otherwise. - */ -static inline bool of_property_read_bool(const struct device_node *np, - const char *propname) -{ - const struct property *prop = of_find_property(np, propname, NULL); - - return prop ? true : false; -} - /** * of_property_present - Test if a property is present in a node * @np: device node to search for the property. -- cgit v1.2.3 From f835bdae716751fa20451508150e5fdd5f5b2be3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 12 Jan 2025 16:34:55 -0800 Subject: net: remove init_dummy_netdev() init_dummy_netdev() can initialize statically declared or embedded net_devices. Such netdevs did not come from alloc_netdev_mqs(). After recent work by Breno, there are the only two cases where we have do that. Switch those cases to alloc_netdev_mqs() and delete init_dummy_netdev(). Dealing with static netdevs is not worth the maintenance burden. 
Reviewed-by: Alexander Lobakin Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250113003456.3904110-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index aeb4a6cff171..dd8f6f8991fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3238,7 +3238,6 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); -void init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, -- cgit v1.2.3 From aecd9d1020e3c6d29ecc9efccbcee7863e83c517 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 9 Jan 2025 18:05:36 +0200 Subject: net/mlx5: fs, add HWS packet reformat API function Add packet reformat alloc and dealloc API functions to provide packet reformat actions for steering rules. Add HWS action pools for each of the following packet reformat types: - decapl3: decapsulate l3 tunnel to l2 - encapl2: encapsulate l2 to tunnel l2 - encapl3: encapsulate l2 to tunnel l3 - insert_hdr: insert header In addition, cache the remove header action for removing the vlan header, as this is currently the only use case of the remove header action in the driver. Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250109160546.1733647-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 370f533da107..bb99a35fc6a2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7025,6 +7025,7 @@ struct mlx5_ifc_alloc_packet_reformat_context_out_bits { enum { MLX5_REFORMAT_CONTEXT_ANCHOR_MAC_START = 0x1, + MLX5_REFORMAT_CONTEXT_ANCHOR_VLAN_START = 0x2, MLX5_REFORMAT_CONTEXT_ANCHOR_IP_START = 0x7, MLX5_REFORMAT_CONTEXT_ANCHOR_TCP_UDP_START = 0x9, }; -- cgit v1.2.3 From 061b27e37238d374d8a6954b22d9c5d07c5db574 Mon Sep 17 00:00:00 2001 From: Yang Shen Date: Fri, 3 Jan 2025 18:21:38 +0800 Subject: crypto: hisilicon/qm - support new function communication On the HiSilicon accelerator drivers, the PF/VFs driver can send messages to the VFs/PF by writing hardware registers, and the VFs/PF driver receives messages from the PF/VFs by reading hardware registers. To support this feature, a new version id is added, and different communication mechanisms are used based on the version id.
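The description above is terse, so a purely illustrative sketch of the version-keyed dispatch it implies may help; the foo_qm_send_*() helpers are invented for this example and do not correspond to the real hisilicon/qm code, which should be consulted for the actual mechanism:

  #include <linux/hisi_acc_qm.h>

  /* Hypothetical backends for the two communication paths. */
  int foo_qm_send_msg_by_register(struct hisi_qm *qm, u64 msg, u32 fun_num);
  int foo_qm_send_msg_by_mailbox(struct hisi_qm *qm, u64 msg, u32 fun_num);

  /* Assumes qm->ver holds the qm_hw_ver value probed from the device. */
  static int foo_qm_send_msg(struct hisi_qm *qm, u64 msg, u32 fun_num)
  {
          int ret;

          mutex_lock(&qm->ifc_lock);
          if (qm->ver >= QM_HW_V4)
                  ret = foo_qm_send_msg_by_register(qm, msg, fun_num);
          else
                  ret = foo_qm_send_msg_by_mailbox(qm, msg, fun_num);
          mutex_unlock(&qm->ifc_lock);

          return ret;
  }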
Signed-off-by: Yang Shen Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c1dafbabbd6b..99fcf65d575f 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -124,6 +124,7 @@ enum qm_hw_ver { QM_HW_V1 = 0x20, QM_HW_V2 = 0x21, QM_HW_V3 = 0x30, + QM_HW_V4 = 0x50, }; enum qm_fun_type { @@ -397,6 +398,8 @@ struct hisi_qm { struct mutex mailbox_lock; + struct mutex ifc_lock; + const struct hisi_qm_hw_ops *ops; struct qm_debug debug; -- cgit v1.2.3 From e71778c95a804ba641e1407b3a5e49bcae791f3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 5 Jan 2025 11:34:09 -0800 Subject: crypto: skcipher - document skcipher_walk_done() and rename some vars skcipher_walk_done() has an unusual calling convention, and some of its local variables have unclear names. Document it and rename variables to make it a bit clearer what is going on. No change in behavior. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/internal/skcipher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index 08d1e8c63afc..4f49621d3eb6 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -196,7 +196,7 @@ void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count); int lskcipher_register_instance(struct crypto_template *tmpl, struct lskcipher_instance *inst); -int skcipher_walk_done(struct skcipher_walk *walk, int err); +int skcipher_walk_done(struct skcipher_walk *walk, int res); int skcipher_walk_virt(struct skcipher_walk *walk, struct skcipher_request *req, bool atomic); -- cgit v1.2.3 From bfc1d1782984903457b2707d05a35b24ce5fbea1 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 26 Nov 2024 09:56:54 -0600 Subject: mm: migrate: remove unused argument vma from migrate_misplaced_folio() Commit ee86814b0562 ("mm/migrate: move NUMA hinting fault folio isolation + checks under PTL") removed the code that had used the vma argument in migrate_misplaced_folio. Since the vma argument was no longer used in migrate_misplaced_folio, this patch removes it. 
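A schematic caller-side view of the change (loosely modeled on the NUMA hinting fault path; locking and statistics are omitted, and the foo_*() wrapper is hypothetical):

  #include <linux/migrate.h>

  static void foo_try_numa_migrate(struct folio *folio, struct vm_area_struct *vma,
                                   int target_nid)
  {
          /* The vma is still needed here, for the isolation checks... */
          if (migrate_misplaced_folio_prepare(folio, vma, target_nid))
                  return;         /* -EAGAIN: cannot migrate right now */

          /* ...but no longer here, which is what this patch removes. */
          migrate_misplaced_folio(folio, target_nid);
  }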
Link: https://lkml.kernel.org/r/20241126155655.466186-1-donettom@linux.ibm.com Signed-off-by: Donet Tom Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Acked-by: David Hildenbrand Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 002e49b2ebd9..29919faea2f1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page) #ifdef CONFIG_NUMA_BALANCING int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node); -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node); +int migrate_misplaced_folio(struct folio *folio, int node); #else static inline int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -static inline int migrate_misplaced_folio(struct folio *folio, - struct vm_area_struct *vma, int node) +static inline int migrate_misplaced_folio(struct folio *folio, int node) { return -EAGAIN; /* can't migrate now */ } -- cgit v1.2.3 From 38558b2460d7881a3de3bdc31a23fa7034384d00 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:34 +0000 Subject: mm: make alloc_pages_mpol() static All callers outside mempolicy.c now use folio_alloc_mpol() thanks to Kefeng's cleanups, so we can remove this as a visible symbol. And also remove the alloc_hooks for alloc_pages_mpol(), since all users in mempolicy.c are using the nonprof version. Link: https://lkml.kernel.org/r/20241125210149.2976098-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b0fe9f62d15b..c96d5d7f7b89 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -300,8 +300,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); @@ -312,11 +310,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid) -{ - return alloc_pages_noprof(gfp, order); -} static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); @@ -331,7 +324,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) 
alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) -- cgit v1.2.3 From 9023691d75f29fde884f6e243bcdad6a9dbadb19 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 25 Nov 2024 09:16:17 -0800 Subject: mm: mmap_lock: optimize mmap_lock tracepoints We are starting to deploy mmap_lock tracepoint monitoring across our fleet and the early results showed that these tracepoints are consuming significant amount of CPUs in kernfs_path_from_node when enabled. It seems like the kernel is trying to resolve the cgroup path in the fast path of the locking code path when the tracepoints are enabled. In addition for some application their metrics are regressing when monitoring is enabled. The cgroup path resolution can be slow and should not be done in the fast path. Most userspace tools, like bpftrace, provides functionality to get the cgroup path from cgroup id, so let's just trace the cgroup id and the users can use better tools to get the path in the slow path. Link: https://lkml.kernel.org/r/20241125171617.113892-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Yosry Ahmed Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Axel Rasmussen Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Steven Rostedt Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ++++++++++++++++++++++ include/trace/events/mmap_lock.h | 32 +++++++++++++++----------------- 2 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5502aa8e138e..b28180269e75 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1046,6 +1046,23 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, void split_page_memcg(struct page *head, int old_order, int new_order); +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + u64 id; + + if (mem_cgroup_disabled()) + return 0; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (!memcg) + memcg = root_mem_cgroup; + id = cgroup_id(memcg->css.cgroup); + rcu_read_unlock(); + return id; +} + #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 @@ -1466,6 +1483,11 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) static inline void split_page_memcg(struct page *head, int old_order, int new_order) { } + +static inline u64 cgroup_id_from_mm(struct mm_struct *mm) +{ + return 0; +} #endif /* CONFIG_MEMCG */ /* diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index bc2e3ad787b3..cf9f9faf8914 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MMAP_LOCK_H +#include #include #include @@ -12,64 +13,61 @@ struct mm_struct; DECLARE_EVENT_CLASS(mmap_lock, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + TP_PROTO(struct mm_struct *mm, bool write), - TP_ARGS(mm, memcg_path, write), + TP_ARGS(mm, write), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; ), 
TP_printk( - "mm=%p memcg_path=%s write=%s", - __entry->mm, - __get_str(memcg_path), + "mm=%p memcg_id=%llu write=%s", + __entry->mm, __entry->memcg_id, __entry->write ? "true" : "false" ) ); #define DEFINE_MMAP_LOCK_EVENT(name) \ DEFINE_EVENT(mmap_lock, name, \ - TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ - bool write), \ - TP_ARGS(mm, memcg_path, write)) + TP_PROTO(struct mm_struct *mm, bool write), \ + TP_ARGS(mm, write)) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); TRACE_EVENT(mmap_lock_acquire_returned, - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, - bool success), + TP_PROTO(struct mm_struct *mm, bool write, bool success), - TP_ARGS(mm, memcg_path, write, success), + TP_ARGS(mm, write, success), TP_STRUCT__entry( __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) + __field(u64, memcg_id) __field(bool, write) __field(bool, success) ), TP_fast_assign( __entry->mm = mm; - __assign_str(memcg_path); + __entry->memcg_id = cgroup_id_from_mm(mm); __entry->write = write; __entry->success = success; ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s", + "mm=%p memcg_id=%llu write=%s success=%s", __entry->mm, - __get_str(memcg_path), + __entry->memcg_id, __entry->write ? "true" : "false", __entry->success ? "true" : "false" ) -- cgit v1.2.3 From 20f3ab257211594c110c43e71c31bd25ba31e851 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 22 Nov 2024 15:36:52 +0800 Subject: mm: pgtable: make ptep_clear() non-atomic In the generic ptep_get_and_clear() implementation, it is just a simple combination of ptep_get() and pte_clear(). But for some architectures (such as x86 and arm64, etc), the hardware will modify the A/D bits of the page table entry, so the ptep_get_and_clear() needs to be overwritten and implemented as an atomic operation to avoid contention, which has a performance cost. The commit d283d422c6c4 ("x86: mm: add x86_64 support for page table check") adds the ptep_clear() on the x86, and makes it call ptep_get_and_clear() when CONFIG_PAGE_TABLE_CHECK is enabled. The page table check feature does not actually care about the A/D bits, so only ptep_get() + pte_clear() should be called. But considering that the page table check is a debug option, this should not have much of an impact. But then the commit de8c8e52836d ("mm: page_table_check: add hooks to public helpers") changed ptep_clear() to unconditionally call ptep_get_and_clear(), so that the CONFIG_PAGE_TABLE_CHECK check can be put into the page table check stubs (in include/linux/page_table_check.h). This also cause performance loss to the kernel without CONFIG_PAGE_TABLE_CHECK enabled, which doesn't make sense. Currently ptep_clear() is only used in debug code and in khugepaged collapse paths, which are fairly expensive. So the cost of an extra atomic RMW operation does not matter. But this may be used for other paths in the future. After all, for the present pte entry, we need to call ptep_clear() instead of pte_clear() to ensure that PAGE_TABLE_CHECK works properly. So to be more precise, just calling ptep_get() and pte_clear() in the ptep_clear(). 
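To make the cost argument concrete, here is a sketch of the two flavours being contrasted; the non-atomic form below mirrors the common generic pattern, while the atomic variant is only paraphrased in the comment:

  #include <linux/pgtable.h>

  /* Non-atomic form: a plain read followed by a plain clear. */
  static inline pte_t foo_get_and_clear_nonatomic(struct mm_struct *mm,
                                                  unsigned long addr, pte_t *ptep)
  {
          pte_t pte = ptep_get(ptep);

          pte_clear(mm, addr, ptep);
          return pte;
  }

  /*
   * Architectures whose hardware may set the Accessed/Dirty bits concurrently
   * (x86, arm64, ...) instead swap the entry out atomically, roughly
   * "old = xchg(&ptep->pte, 0)", so that no hardware update is lost.  The page
   * table check does not care about those bits, so ptep_clear() can use the
   * cheaper non-atomic sequence plus page_table_check_pte_clear().
   */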
Link: https://lkml.kernel.org/r/20241122073652.54030-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Pasha Tatashin Reviewed-by: Jann Horn Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Lorenzo Stoakes Cc: Peter Xu Cc: Ryan Roberts Cc: Tong Tiangen Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index adef9d6e9b1b..94d267d02372 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -533,7 +533,14 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep_get_and_clear(mm, addr, ptep); + pte_t pte = ptep_get(ptep); + + pte_clear(mm, addr, ptep); + /* + * No need for ptep_get_and_clear(): page table check doesn't care about + * any bits that could have been set by HW concurrently. + */ + page_table_check_pte_clear(mm, pte); } #ifdef CONFIG_GUP_GET_PXX_LOW_HIGH -- cgit v1.2.3 From d40797d6720e861196e848f3615bb09dae5be7ce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Nov 2024 16:54:51 +0100 Subject: kasan: make kasan_record_aux_stack_noalloc() the default behaviour kasan_record_aux_stack_noalloc() was introduced to record a stack trace without allocating memory in the process. It has been added to callers which were invoked while a raw_spinlock_t was held. More and more callers were identified and changed over time. Is it a good thing to have this while functions try their best to do a locklessly setup? The only downside of having kasan_record_aux_stack() not allocate any memory is that we end up without a stacktrace if stackdepot runs out of memory and at the same stacktrace was not recorded before To quote Marco Elver from https://lore.kernel.org/all/CANpmjNPmQYJ7pv1N3cuU8cP18u7PP_uoZD8YxwZd4jtbof9nVQ@mail.gmail.com/ | I'd be in favor, it simplifies things. And stack depot should be | able to replenish its pool sufficiently in the "non-aux" cases | i.e. regular allocations. Worst case we fail to record some | aux stacks, but I think that's only really bad if there's a bug | around one of these allocations. In general the probabilities | of this being a regression are extremely small [...] Make the kasan_record_aux_stack_noalloc() behaviour default as kasan_record_aux_stack(). [bigeasy@linutronix.de: dressed the diff as patch] Link: https://lkml.kernel.org/r/20241122155451.Mb2pmeyJ@linutronix.de Fixes: 7cb3007ce2da ("kasan: generic: introduce kasan_record_aux_stack_noalloc()") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Reported-by: syzbot+39f85d612b7c20d8db48@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67275485.050a0220.3c8d68.0a37.GAE@google.com Reviewed-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Waiman Long Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ben Segall Cc: Boqun Feng Cc: Christoph Lameter Cc: David Rientjes Cc: Dietmar Eggemann Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Ingo Molnar Cc: Jann Horn Cc: Joel Fernandes (Google) Cc: Joonsoo Kim Cc: Josh Triplett Cc: Juri Lelli Cc: Cc: Lai Jiangshan Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Neeraj Upadhyay Cc: Paul E. 
McKenney Cc: Pekka Enberg Cc: Roman Gushchin Cc: Steven Rostedt Cc: syzkaller-bugs@googlegroups.com Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Zqiang Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 -- include/linux/task_work.h | 3 --- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6bbfc8aa42e8..1c1b3d39e7b6 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -491,7 +491,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); -void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ @@ -509,7 +508,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} -static inline void kasan_record_aux_stack_noalloc(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ diff --git a/include/linux/task_work.h b/include/linux/task_work.h index 2964171856e0..0646804860ff 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -19,9 +19,6 @@ enum task_work_notify_mode { TWA_SIGNAL, TWA_SIGNAL_NO_IPI, TWA_NMI_CURRENT, - - TWA_FLAGS = 0xff00, - TWAF_NO_ALLOC = 0x0100, }; static inline bool task_work_pending(struct task_struct *task) -- cgit v1.2.3 From da243c5479add600bdd58c910c9fae3355b4f026 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 28 Nov 2024 15:40:39 +0800 Subject: mm: factor out the order calculation into a new helper Patch series "Support large folios for tmpfs", v3. Traditionally, tmpfs only supported PMD-sized large folios. However nowadays with other file systems supporting any sized large folios, and extending anonymous to support mTHP, we should not restrict tmpfs to allocating only PMD-sized large folios, making it more special. Instead, we should allow tmpfs can allocate any sized large folios. Considering that tmpfs already has the 'huge=' option to control the PMD-sized large folios allocation, we can extend the 'huge=' option to allow any sized large folios. The semantics of the 'huge=' mount option are: huge=never: no any sized large folios huge=always: any sized large folios huge=within_size: like 'always' but respect the i_size huge=advise: like 'always' if requested with madvise() Note: for tmpfs mmap() faults, due to the lack of a write size hint, still allocate the PMD-sized large folios if huge=always/within_size/advise is set. Moreover, the 'deny' and 'force' testing options controlled by '/sys/kernel/mm/transparent_hugepage/shmem_enabled', still retain the same semantics. The 'deny' can disable any sized large folios for tmpfs, while the 'force' can enable PMD sized large folios for tmpfs. This patch (of 6): Factor out the order calculation into a new helper, which can be reused by shmem in the following patch. 
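A few worked values, assuming 4 KiB pages (PAGE_SHIFT == 12), show that the helper is just ilog2() rounded down and clamped at zero; the foo_*() wrapper is hypothetical and only restates what fgf_set_order() now does internally:

  #include <linux/pagemap.h>

  /*
   * filemap_get_order(4096)  -> ilog2 = 12 -> order 0
   * filemap_get_order(8192)  -> ilog2 = 13 -> order 1
   * filemap_get_order(12288) -> ilog2 = 13 -> order 1   (rounds down)
   * filemap_get_order(65536) -> ilog2 = 16 -> order 4
   * filemap_get_order(200)   -> ilog2 = 7  -> order 0   (clamped)
   */
  static inline fgf_t foo_fgf_order_for_write(size_t write_size)
  {
          /* Encode the folio order in bits 26+ of the fgf_t flags. */
          return (__force fgf_t)(filemap_get_order(write_size) << 26);
  }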
Link: https://lkml.kernel.org/r/cover.1732779148.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/5505f9ea50942820c1924d1803bfdd3a524e54f6.1732779148.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Suggested-by: Matthew Wilcox Reviewed-by: Barry Song Reviewed-by: David Hildenbrand Reviewed-by: Daniel Gomez Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf0865a38ae..d796c8a33647 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -727,6 +727,16 @@ typedef unsigned int __bitwise fgf_t; #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) +static inline unsigned int filemap_get_order(size_t size) +{ + unsigned int shift = ilog2(size); + + if (shift <= PAGE_SHIFT) + return 0; + + return shift - PAGE_SHIFT; +} + /** * fgf_set_order - Encode a length in the fgf_t flags. * @size: The suggested size of the folio to create. @@ -740,11 +750,11 @@ typedef unsigned int __bitwise fgf_t; */ static inline fgf_t fgf_set_order(size_t size) { - unsigned int shift = ilog2(size); + unsigned int order = filemap_get_order(size); - if (shift <= PAGE_SHIFT) + if (!order) return 0; - return (__force fgf_t)((shift - PAGE_SHIFT) << 26); + return (__force fgf_t)(order << 26); } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -- cgit v1.2.3 From da80f4ffb0dbee2419ac04f23aad0533658f1523 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 29 Nov 2024 14:58:25 +0000 Subject: list_lru: expand list_lru_add() docs with info about sublists The documentation for list_lru_add() and list_lru_del() has not been updated since lru lists were originally introduced by commit a38e40824844 ("list: add a new LRU list type"). Back then, list_lru stored all of the items in a single list, but the implementation has since been expanded to use many sublists internally. Thus, update the docs to mention that the requirements about not using the item with several lists at the same time also applies not using different sublists. Also mention that list_lru items are reparented when the memcg is deleted as discussed on the LKML [1]. Also fix incorrect use of 'Return value:' which should be 'Return:'. Link: https://lore.kernel.org/all/Z0eXrllVhRI9Ag5b@dread.disaster.area/ [1] Link: https://lkml.kernel.org/r/20241129-list_lru_memcg_docs-v2-1-e285ff1c481b@google.com Signed-off-by: Alice Ryhl Reviewed-by: Dave Chinner Acked-by: Muchun Song Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 05c166811f6b..fe739d35a864 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -91,13 +91,24 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing - * nothing. Therefore the caller does not need to keep state about whether or - * not the element already belongs in the list and is allowed to lazy update - * it. 
Note however that this is valid for *a* list, not *this* list. If - * the caller organize itself in a way that elements can be in more than - * one type of list, it is up to the caller to fully remove the item from - * the previous list (with list_lru_del() for instance) before moving it - * to @lru. + * nothing. This means that it is not necessary to keep state about whether or + * not the element already belongs in the list. That said, this logic only + * works if the item is in *this* list. If the item might be in some other + * list, then you cannot rely on this check and you must remove it from the + * other list before trying to insert it. + * + * The lru list consists of many sublists internally; the @nid and @memcg + * parameters are used to determine which sublist to insert the item into. + * It's important to use the right value of @nid and @memcg when deleting the + * item, since it might otherwise get deleted from the wrong sublist. + * + * This also applies when attempting to insert the item multiple times - if + * the item is currently in one sublist and you call list_lru_add() again, you + * must pass the right @nid and @memcg parameters so that the same sublist is + * used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). * * Return: true if the list was updated, false otherwise */ @@ -113,7 +124,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); @@ -125,8 +136,19 @@ bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list - * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del(). + * manipulation. + * + * The comments in list_lru_add() about an element already being in a list are + * also valid for list_lru_del(), that is, you can delete an item that has + * already been removed or never been added. However, if the item is in a + * list, it must be in *this* list, and you must pass the right value of @nid + * and @memcg so that the right sublist is used. + * + * You must ensure that the memcg is not freed during this call (e.g., with + * rcu or by taking a css refcnt). When a memcg is deleted, list_lru entries + * are automatically moved to the parent memcg. This is done in a race-free + * way, so during deletion of an memcg both the old and new memcg will resolve + * to the same sublist internally. * * Return: true if the list was updated, false otherwise */ @@ -142,7 +164,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, * memcg of the sublist is determined by @item list_head. This assumption is * valid for slab objects LRU such as dentries, inodes, etc. * - * Return value: true if the list was updated, false otherwise. + * Return: true if the list was updated, false otherwise. */ bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); -- cgit v1.2.3 From 1168b2bec7660f5146de7b14c14b52417e900d18 Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Sat, 16 Nov 2024 15:14:46 +0000 Subject: filemap: remove unused folio_add_wait_queue folio_add_wait_queue() has been unused since 2021's commit 850cba069c26 ("cachefiles: Delete the cachefiles driver pending rewrite") Remove it. Link: https://lkml.kernel.org/r/20241116151446.95555-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Reviewed-by: David Hildenbrand Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d796c8a33647..fc2e1319c7bb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1280,11 +1280,6 @@ void folio_end_private_2(struct folio *folio); void folio_wait_private_2(struct folio *folio); int folio_wait_private_2_killable(struct folio *folio); -/* - * Add an arbitrary waiter to a page's wait queue - */ -void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter); - /* * Fault in userspace address range. */ -- cgit v1.2.3 From 21641bd9a7a7ce0360106a5a8e5b89a4fc74529d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 4 Nov 2024 11:23:18 -0300 Subject: lazy tlb: fix hotplug exit race with MMU_LAZY_TLB_SHOOTDOWN CPU unplug first calls __cpu_disable(), and that's where powerpc calls cleanup_cpu_mmu_context(), which clears this CPU from mm_cpumask() of all mms in the system. However this CPU may still be using a lazy tlb mm, and its mm_cpumask bit will be cleared from it. The CPU does not switch away from the lazy tlb mm until arch_cpu_idle_dead() calls idle_task_exit(). If that user mm exits in this window, it will not be subject to the lazy tlb mm shootdown and may be freed while in use as a lazy mm by the CPU that is being unplugged. cleanup_cpu_mmu_context() could be moved later, but it looks better to move the lazy tlb mm switching earlier. The problem with doing the lazy mm switching in idle_task_exit() is explained in commit bf2c59fce4074 ("sched/core: Fix illegal RCU from offline CPUs"), which added a wart to switch away from the mm but leave it set in active_mm to be cleaned up later. So instead, switch away from the lazy tlb mm at sched_cpu_wait_empty(), which is the last hotplug state before teardown (CPUHP_AP_SCHED_WAIT_EMPTY). This CPU will never switch to a user thread from this point, so it has no chance to pick up a new lazy tlb mm. This removes the lazy tlb mm handling wart in CPU unplug. With this, idle_task_exit() is not needed anymore and can be cleaned up. This leaves the prototype alone, to be cleaned after this change. herton: took the suggestions from https://lore.kernel.org/all/87jzvyprsw.ffs@tglx/ and made adjustments on the initial patch proposed by Nicholas. Link: https://lkml.kernel.org/r/20230524060455.147699-1-npiggin@gmail.com Link: https://lore.kernel.org/all/20230525205253.E2FAEC433EF@smtp.kernel.org/ Link: https://lkml.kernel.org/r/20241104142318.3295663-1-herton@redhat.com Fixes: 2655421ae69f ("lazy tlb: shoot lazies, non-refcounting lazy tlb mm reference handling scheme") Signed-off-by: Nicholas Piggin Signed-off-by: Herton R. 
Krzesinski Suggested-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/sched/hotplug.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 412cdaba33eb..17e04859b9a4 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -18,10 +18,6 @@ extern int sched_cpu_dying(unsigned int cpu); # define sched_cpu_dying NULL #endif -#ifdef CONFIG_HOTPLUG_CPU -extern void idle_task_exit(void); -#else static inline void idle_task_exit(void) {} -#endif #endif /* _LINUX_SCHED_HOTPLUG_H */ -- cgit v1.2.3 From 7a5714991872f0a4805cc6004a5bff19a71d0459 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Dec 2024 18:05:10 +0000 Subject: mm: abstract get_arg_page() stack expansion and mmap read lock Right now fs/exec.c invokes expand_downwards(), an otherwise internal implementation detail of the VMA logic in order to ensure that an arg page can be obtained by get_user_pages_remote(). In order to be able to move the stack expansion logic into mm/vma.c to make it available to userland testing we need to find an alternative approach here. We do so by providing the mmap_read_lock_maybe_expand() function which also helpfully documents what get_arg_page() is doing here and adds an additional check against VM_GROWSDOWN to make explicit that the stack expansion logic is only invoked when the VMA is indeed a downward-growing stack. This allows expand_downwards() to become a static function. Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be currently user-visible in any way, that is place within an rmap or VMA tree. It must be a newly allocated VMA. This is the case when exec invokes this function. Link: https://lkml.kernel.org/r/5295d1c70c58e6aa63d14be68d4e1de9fa1c8e6d.1733248985.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: Eric W. Biederman Cc: Jan Kara Cc: Jann Horn Cc: Kees Cook Cc: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b1c3db9cf355..2e5ef71b8629 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3324,6 +3324,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3437,9 +3439,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, -- cgit v1.2.3 From b9e40605daa94ae1817ceb5ce9e9b34093c6d850 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 3 Dec 2024 10:47:28 +0100 Subject: mm/page_isolation: don't pass gfp flags to start_isolate_page_range() The parameter is unused, so let's stop passing it. Link: https://lkml.kernel.org/r/20241203094732.200195-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: Oscar Salvador Reviewed-by: Vishal Moola (Oracle) Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 73dc2c1841ec..898bb788243b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page, int migratetype); int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - int migratetype, int flags, gfp_t gfp_flags); + int migratetype, int flags); void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, int migratetype); -- cgit v1.2.3 From 735fad44b5a86edf0fe65a8e8d43595bd1cf1d58 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:46 +0800 Subject: mm: zap_install_uffd_wp_if_needed: return whether uffd-wp pte has been re-installed In some cases, we'll replace the none pte with an uffd-wp swap special pte marker when necessary. Let's expose this information to the caller through the return value, so that subsequent commits can use this information to detect whether the PTE page is empty. Link: https://lkml.kernel.org/r/9d4516554724eda87d6576468042a1741c475413.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1b6a917fffa4..34e5097182a0 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -564,9 +564,9 @@ static inline pte_marker copy_pte_marker( * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * - * This function is a no-op if PTE_MARKER_UFFD_WP is not enabled. + * Returns true if an uffd-wp pte was installed, false otherwise. */ -static inline void +static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { @@ -583,7 +583,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, * with a swap pte. There's no way of leaking the bit. 
*/ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) - return; + return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) @@ -596,10 +596,13 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; - if (unlikely(arm_uffd_pte)) + if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + return true; + } #endif + return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) -- cgit v1.2.3 From 6375e95f381e3dc85065b6f74263a61522736203 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:49 +0800 Subject: mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED) Now in order to pursue high performance, applications mostly use some high-performance user-mode memory allocators, such as jemalloc or tcmalloc. These memory allocators use madvise(MADV_DONTNEED or MADV_FREE) to release physical memory, but neither MADV_DONTNEED nor MADV_FREE will release page table memory, which may cause huge page table memory usage. The following are a memory usage snapshot of one process which actually happened on our server: VIRT: 55t RES: 590g VmPTE: 110g In this case, most of the page table entries are empty. For such a PTE page where all entries are empty, we can actually free it back to the system for others to use. As a first step, this commit aims to synchronously free the empty PTE pages in madvise(MADV_DONTNEED) case. We will detect and free empty PTE pages in zap_pte_range(), and will add zap_details.reclaim_pt to exclude cases other than madvise(MADV_DONTNEED). Once an empty PTE is detected, we first try to hold the pmd lock within the pte lock. If successful, we clear the pmd entry directly (fast path). Otherwise, we wait until the pte lock is released, then re-hold the pmd and pte locks and loop PTRS_PER_PTE times to check pte_none() to re-detect whether the PTE page is empty and free it (slow path). For other cases such as madvise(MADV_FREE), consider scanning and freeing empty PTE pages asynchronously in the future. 
The following code snippet can show the effect of optimization: mmap 50G while (1) { for (; i < 1024 * 25; i++) { touch 2M memory madvise MADV_DONTNEED 2M } } As we can see, the memory usage of VmPTE is reduced: before after VIRT 50.0 GB 50.0 GB RES 3.1 MB 3.1 MB VmPTE 102640 KB 240 KB [zhengqi.arch@bytedance.com: fix uninitialized symbol 'ptl'] Link: https://lkml.kernel.org/r/20241206112348.51570-1-zhengqi.arch@bytedance.com Link: https://lore.kernel.org/linux-mm/224e6a4e-43b5-4080-bdd8-b0a6fb2f0853@stanley.mountain/ Link: https://lkml.kernel.org/r/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e5ef71b8629..9372bc058b43 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2320,6 +2320,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; -- cgit v1.2.3 From 718b13861d2256ac95d65b892953282a63faf240 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:50 +0800 Subject: x86: mm: free page table pages by RCU instead of semi RCU Now, if CONFIG_MMU_GATHER_RCU_TABLE_FREE is selected, the page table pages will be freed by semi RCU, that is: - batch table freeing: asynchronous free by RCU - single table freeing: IPI + synchronous free In this way, the page table can be lockless traversed by disabling IRQ in paths such as fast GUP. But this is not enough to free the empty PTE page table pages in paths other that munmap and exit_mmap path, because IPI cannot be synchronized with rcu_read_lock() in pte_offset_map{_lock}(). In preparation for supporting empty PTE page table pages reclaimation, let single table also be freed by RCU like batch table freeing. Then we can also use pte_offset_map() etc to prevent PTE page from being freed. Like pte_free_defer(), we can also safely use ptdesc->pt_rcu_head to free the page table pages: - The pt_rcu_head is unioned with pt_list and pmd_huge_pte. - For pt_list, it is used to manage the PGD page in x86. Fortunately tlb_remove_table() will not be used for free PGD pages, so it is safe to use pt_rcu_head. - For pmd_huge_pte, it is used for THPs, so it is safe. 
After applying this patch, if CONFIG_PT_RECLAIM is enabled, the function call of free_pte() is as follows: free_pte pte_free_tlb __pte_free_tlb ___pte_free_tlb paravirt_tlb_remove_table tlb_remove_table [!CONFIG_PARAVIRT, Xen PV, Hyper-V, KVM] [no-free-memory slowpath:] tlb_table_invalidate tlb_remove_table_one __tlb_remove_table_one [frees via RCU] [fastpath:] tlb_table_flush tlb_remove_table_free [frees via RCU] native_tlb_remove_table [CONFIG_PARAVIRT on native] tlb_remove_table [see above] Link: https://lkml.kernel.org/r/0287d442a973150b0e1019cc406e6322d148277a.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..7490d84af310 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -438,7 +438,9 @@ FOLIO_MATCH(compound_head, _head_2a); * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. - * @pt_list: List of used page tables. Used for s390 and x86. + * @pt_list: List of used page tables. Used for s390 gmap shadow pages + * (which are not linked into the user page tables) and x86 + * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. -- cgit v1.2.3 From 67c8b11bd58aee4644c9a6e495d0c234771e9175 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Mon, 2 Dec 2024 20:47:30 +0800 Subject: mm: add per-order mTHP swap-in fallback/fallback_charge counters Currently, large folio swap-in is supported, but we lack a method to analyze their success ratio. Similar to anon_fault_fallback, we introduce per-order mTHP swpin_fallback and swpin_fallback_charge counters for calculating their success ratio. The new counters are located at: /sys/kernel/mm/transparent_hugepage/hugepages-/stats/ swpin_fallback swpin_fallback_charge Link: https://lkml.kernel.org/r/20241202124730.2407037-1-haowenchao22@gmail.com Signed-off-by: Wenchao Hao Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Xu Cc: Ryan Roberts Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b94c2e8ee918..93e509b6c00e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, + MTHP_STAT_SWPIN_FALLBACK, + MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, -- cgit v1.2.3 From dba4761a3e40433a8d9e434d515ecbae19b3dcb1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin() to opens a read critical section of the given seqcount_t if the counter is even. 
This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Suggested-by: Peter Zijlstra Cc: Christian Brauner Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization -- cgit v1.2.3 From e5e7fb278e5924f29ceab42bbbb891cde528f7cc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in-line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also change to be unsigned to match the type of mm_lock_seq.sequence. Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. 
McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++++------ include/linux/mm_types.h | 7 ++++-- include/linux/mmap_lock.h | 55 +++++++++++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9372bc058b43..a3a50c37603e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7490d84af310..5f1b2dc788e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,7 +729,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -923,6 +923,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -931,7 +934,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. 
*/ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,39 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) +static inline void mm_lock_seqcount_init(struct mm_struct *mm) { - mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). - */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); } + #else -static inline void vma_end_write_all(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +111,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +121,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. 
+ */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); -- cgit v1.2.3 From 6f030e32e4499942a223677169d006085d8c57ce Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. [akpm@linux-foundation.org: use read_seqcount_retry() in mmap_lock_speculate_retry(), per Wei Yang] Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. + */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { -- cgit v1.2.3 From fa00b8ef1803fe133b4897c25227aa0d298dd093 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:28:46 +0000 Subject: mm: perform all memfd seal checks in a single place We no longer actually need to perform these checks in the f_op->mmap() hook any longer. 
We already moved the operation which clears VM_MAYWRITE on a read-only mapping of a write-sealed memfd in order to work around the restrictions imposed by commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). There is no reason for us not to simply go ahead and additionally check to see if any pre-existing seals are in place here rather than defer this to the f_op->mmap() hook. By doing this we remove more logic from shmem_mmap() which doesn't belong there, as well as doing the same for hugetlbfs_file_mmap(). We also remove dubious shared logic in mm.h which simply does not belong there either. It makes sense to do these checks at the earliest opportunity, we know these are shmem (or hugetlbfs) mappings whose relevant VMA flags will not change from the invoking do_mmap() so there is simply no need to wait. This also means the implementation of further memfd seal flags can be done within mm/memfd.c and also have the opportunity to modify VMA flags as necessary early in the mapping logic. [lorenzo.stoakes@oracle.com: fix typos in !memfd inline stub] Link: https://lkml.kernel.org/r/7dee6c5d-480b-4c24-b98e-6fa47dbd8a23@lucifer.local Link: https://lkml.kernel.org/r/20241206212846.210835-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Isaac J. Manjarres Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Muchun Song Cc: Vlastimil Babka Cc: Jeff Xu Signed-off-by: Andrew Morton --- include/linux/memfd.h | 23 +++++++++++---------- include/linux/mm.h | 55 --------------------------------------------------- 2 files changed, 11 insertions(+), 67 deletions(-) (limited to 'include') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. + */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a3a50c37603e..e7c54b9aac6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4102,61 +4102,6 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. 
- * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From 991135774c0e05a4734e6d32aa03b00355e4cac9 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 11 Dec 2024 12:39:50 -0800 Subject: memcg/hugetlb: introduce mem_cgroup_charge_hugetlb This patch introduces mem_cgroup_charge_hugetlb which combines the logic of mem_cgroup_hugetlb_try_charge / mem_cgroup_hugetlb_commit_charge and removes the need for mem_cgroup_hugetlb_cancel_charge. It also reduces the footprint of memcg in hugetlb code and consolidates all memcg related error paths into one. Link: https://lkml.kernel.org/r/20241211203951.764733-3-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b28180269e75..387470bed399 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -649,6 +649,8 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, long nr_pages); +int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); @@ -1169,6 +1171,11 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, return 0; } +static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { -- cgit v1.2.3 From 1d8f136a421f26747e58c01281cba5bffae8d289 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Wed, 11 Dec 2024 12:39:51 -0800 Subject: memcg/hugetlb: remove memcg hugetlb try-commit-cancel protocol This patch fully removes the mem_cgroup_{try, commit, cancel}_charge functions, as well as their hugetlb variants. 
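For illustration, a minimal sketch of how an allocation path can use the consolidated mem_cgroup_charge_hugetlb() helper introduced by the previous patch in place of the removed try/commit/cancel sequence. The surrounding function and the dequeue/free helpers are hypothetical placeholders, not the actual hugetlb code:

  #include <linux/err.h>
  #include <linux/memcontrol.h>

  /*
   * Sketch only: charge a freshly allocated hugetlb folio in one step.
   * dequeue_hugetlb_folio_sketch()/free_hugetlb_folio_sketch() stand in
   * for whatever allocator the real caller uses.
   */
  static struct folio *hugetlb_alloc_charged_sketch(gfp_t gfp)
  {
          struct folio *folio = dequeue_hugetlb_folio_sketch();

          if (!folio)
                  return ERR_PTR(-ENOSPC);

          /* One call charges and commits; nonzero means the charge failed. */
          if (mem_cgroup_charge_hugetlb(folio, gfp)) {
                  free_hugetlb_folio_sketch(folio);
                  return ERR_PTR(-ENOMEM);
          }

          return folio;
  }

With the old protocol the same path needed a try-charge, a later commit, and a cancel on every intermediate error branch; the single helper leaves only one memcg error path to maintain.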
Link: https://lkml.kernel.org/r/20241211203951.764733-4-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Acked-by: Shakeel Butt Reviewed-by: Nhat Pham Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 387470bed399..6e74b8254d9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } -void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); - int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -646,9 +644,6 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, - long nr_pages); - int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp); int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, @@ -679,7 +674,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) __mem_cgroup_uncharge_folios(folios); } -void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_replace_folio(struct folio *old, struct folio *new); void mem_cgroup_migrate(struct folio *old, struct folio *new); @@ -1154,23 +1148,12 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } -static inline void mem_cgroup_commit_charge(struct folio *folio, - struct mem_cgroup *memcg) -{ -} - static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { return 0; } -static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, - gfp_t gfp, long nr_pages) -{ - return 0; -} - static inline int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp) { return 0; @@ -1194,11 +1177,6 @@ static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios) { } -static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ -} - static inline void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { -- cgit v1.2.3 From ccd582059a132f2bdc3486766ac57c24c465f471 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 12 Dec 2024 18:10:00 +0800 Subject: mm/early_ioremap: add null pointer checks to prevent NULL-pointer dereference The early_ioremap interface can fail and return NULL in certain cases. To prevent NULL-pointer dereference crashes, fixed issues in the acpi_extlog and copy_early_mem interfaces, improving robustness when handling early memory. Link: https://lkml.kernel.org/r/20241212101004.1544070-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Julian Stecklina Cc: Kevin Loughlin Cc: Len Brown Cc: Rafael J. Wysocki Cc: "Rafael J. 
Wysocki" Cc: Thomas Gleixner Cc: Xin Li (Intel) Signed-off-by: Andrew Morton --- include/asm-generic/early_ioremap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index 9d0479f50f97..5db59a1efb65 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -35,7 +35,7 @@ extern void early_ioremap_reset(void); /* * Early copy from unmapped memory to kernel mapped memory. */ -extern void copy_from_early_mem(void *dest, phys_addr_t src, +extern int copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size); #else -- cgit v1.2.3 From 144d52dd8fc83a082a275e1b663e7454d2b616a4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 19 Dec 2024 10:27:08 +0100 Subject: x86/efistub: Drop long obsolete UGA support UGA is the EFI graphical output protocol that preceded GOP, and has been long obsolete. Drop support for it from the x86 implementation of the EFI stub - other architectures never bothered to implement it (save for ia64) Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index e5815867aba9..053c57e61869 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -363,7 +363,6 @@ void efi_native_runtime_setup(void); #define ACPI_20_TABLE_GUID EFI_GUID(0x8868e871, 0xe4f1, 0x11d3, 0xbc, 0x22, 0x00, 0x80, 0xc7, 0x3c, 0x88, 0x81) #define SMBIOS_TABLE_GUID EFI_GUID(0xeb9d2d31, 0x2d88, 0x11d3, 0x9a, 0x16, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d) #define SMBIOS3_TABLE_GUID EFI_GUID(0xf2fd1544, 0x9794, 0x4a2c, 0x99, 0x2e, 0xe5, 0xbb, 0xcf, 0x20, 0xe3, 0x94) -#define UGA_IO_PROTOCOL_GUID EFI_GUID(0x61a4d49e, 0x6f68, 0x4f1b, 0xb9, 0x22, 0xa8, 0x6e, 0xed, 0x0b, 0x07, 0xa2) #define EFI_GLOBAL_VARIABLE_GUID EFI_GUID(0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d, 0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c) #define UV_SYSTEM_TABLE_GUID EFI_GUID(0x3b13a7d4, 0x633e, 0x11dd, 0x93, 0xec, 0xda, 0x25, 0x56, 0xd8, 0x95, 0x93) #define LINUX_EFI_CRASH_GUID EFI_GUID(0xcfc8fc79, 0xbe2e, 0x4ddc, 0x97, 0xf0, 0x9f, 0x98, 0xbf, 0xe2, 0x98, 0xa0) @@ -373,7 +372,6 @@ void efi_native_runtime_setup(void); #define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c) #define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID EFI_GUID(0x05c99a21, 0xc70f, 0x4ad2, 0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e) #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a) -#define EFI_UGA_PROTOCOL_GUID EFI_GUID(0x982c298b, 0xf4fa, 0x41cb, 0xb8, 0x38, 0x77, 0xaa, 0x68, 0x8f, 0xb8, 0x39) #define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a) #define EFI_FILE_INFO_ID EFI_GUID(0x09576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_SYSTEM_RESOURCE_TABLE_GUID EFI_GUID(0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80) @@ -1286,8 +1284,6 @@ struct linux_efi_memreserve { void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size); -char *efi_systab_show_arch(char *str); - /* * The LINUX_EFI_MOK_VARIABLE_TABLE_GUID config table can be provided * to the kernel by an EFI boot loader. 
The table contains a packed -- cgit v1.2.3 From b7a2c1fe6b55364e61b4b54b991eb43a47bb1104 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 10 Jan 2025 07:05:12 +0100 Subject: net: ethtool: plumb PHY stats to PHY drivers Introduce support for standardized PHY statistics reporting in ethtool by extending the PHYLIB framework. Add the functions phy_ethtool_get_phy_stats() and phy_ethtool_get_link_ext_stats() to provide a consistent interface for retrieving PHY-level and link-specific statistics. These functions are used within the ethtool implementation to avoid direct access to the phy_device structure outside of the PHYLIB framework. A new structure, ethtool_phy_stats, is introduced to standardize PHY statistics such as packet counts, byte counts, and error counters. Drivers are updated to include callbacks for retrieving PHY and link-specific statistics, ensuring values are explicitly set only for supported fields, initialized with ETHTOOL_STAT_NOT_SET to avoid ambiguity. Signed-off-by: Jakub Kicinski Signed-off-by: Oleksij Rempel Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 23 +++++++++++++++++++++++ include/linux/phy.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/phylib_stubs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index f711bfd75c4d..4bf70cfec826 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -412,6 +412,29 @@ struct ethtool_eth_phy_stats { ); }; +/** + * struct ethtool_phy_stats - PHY-level statistics counters + * @rx_packets: Total successfully received frames + * @rx_bytes: Total successfully received bytes + * @rx_errors: Total received frames with errors (e.g., CRC errors) + * @tx_packets: Total successfully transmitted frames + * @tx_bytes: Total successfully transmitted bytes + * @tx_errors: Total transmitted frames with errors + * + * This structure provides a standardized interface for reporting + * PHY-level statistics counters. It is designed to expose statistics + * commonly provided by PHYs but not explicitly defined in the IEEE + * 802.3 standard. + */ +struct ethtool_phy_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_errors; +}; + /* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed * via a more targeted API. */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 4875465653ca..81d1612e7d35 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1144,6 +1144,35 @@ struct phy_driver { int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ + /** + * @get_phy_stats: Retrieve PHY statistics. + * @dev: The PHY device for which the statistics are retrieved. + * @eth_stats: structure where Ethernet PHY stats will be stored. + * @stats: structure where additional PHY-specific stats will be stored. + * + * Retrieves the supported PHY statistics and populates the provided + * structures. The input structures are pre-initialized with + * `ETHTOOL_STAT_NOT_SET`, and the driver must only modify members + * corresponding to supported statistics. Unmodified members will remain + * set to `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. + */ + void (*get_phy_stats)(struct phy_device *dev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats); + + /** + * @get_link_stats: Retrieve link statistics. 
+ * @dev: The PHY device for which the statistics are retrieved. + * @link_stats: structure where link-specific stats will be stored. + * + * Retrieves link-related statistics for the given PHY device. The input + * structure is pre-initialized with `ETHTOOL_STAT_NOT_SET`, and the + * driver must only modify members corresponding to supported + * statistics. Unmodified members will remain set to + * `ETHTOOL_STAT_NOT_SET` and will not be returned to userspace. + */ + void (*get_link_stats)(struct phy_device *dev, + struct ethtool_link_ext_stats *link_stats); /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ @@ -2124,6 +2153,13 @@ int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); + +void __phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats); +void __phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats); + int phy_ethtool_get_plca_cfg(struct phy_device *phydev, struct phy_plca_cfg *plca_cfg); int phy_ethtool_set_plca_cfg(struct phy_device *phydev, diff --git a/include/linux/phylib_stubs.h b/include/linux/phylib_stubs.h index 1279f48c8a70..9d2d6090c86d 100644 --- a/include/linux/phylib_stubs.h +++ b/include/linux/phylib_stubs.h @@ -5,6 +5,9 @@ #include +struct ethtool_eth_phy_stats; +struct ethtool_link_ext_stats; +struct ethtool_phy_stats; struct kernel_hwtstamp_config; struct netlink_ext_ack; struct phy_device; @@ -19,6 +22,11 @@ struct phylib_stubs { int (*hwtstamp_set)(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); + void (*get_phy_stats)(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats); + void (*get_link_ext_stats)(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats); }; static inline int phy_hwtstamp_get(struct phy_device *phydev, @@ -50,6 +58,29 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev, return phylib_stubs->hwtstamp_set(phydev, config, extack); } +static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats) +{ + ASSERT_RTNL(); + + if (!phylib_stubs) + return; + + phylib_stubs->get_phy_stats(phydev, phy_stats, phydev_stats); +} + +static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ + ASSERT_RTNL(); + + if (!phylib_stubs) + return; + + phylib_stubs->get_link_ext_stats(phydev, link_stats); +} + #else static inline int phy_hwtstamp_get(struct phy_device *phydev, @@ -65,4 +96,15 @@ static inline int phy_hwtstamp_set(struct phy_device *phydev, return -EOPNOTSUPP; } +static inline void phy_ethtool_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *phy_stats, + struct ethtool_phy_stats *phydev_stats) +{ +} + +static inline void phy_ethtool_get_link_ext_stats(struct phy_device *phydev, + struct ethtool_link_ext_stats *link_stats) +{ +} + #endif -- cgit v1.2.3 From 6167c0b6e8d7ddb6b3e5efffcac34a85f7872997 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 10 Jan 2025 07:05:13 +0100 Subject: net: ethtool: add support for structured PHY statistics 
Introduce a new way to report PHY statistics in a structured and standardized format using the netlink API. This new method does not replace the old driver-specific stats, which can still be accessed with `ethtool -S `. The structured stats are available with `ethtool -S --all-groups`. This new method makes it easier to diagnose problems by organizing stats in a consistent and documented way. Signed-off-by: Jakub Kicinski Signed-off-by: Oleksij Rempel Signed-off-by: Paolo Abeni --- include/uapi/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7e1b3820f91f..d1089b88efc7 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -681,6 +681,7 @@ enum ethtool_link_ext_substate_module { * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * @ETH_SS_STATS_RMON: names of RMON statistics + * @ETH_SS_STATS_PHY: names of PHY(dev) statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -706,6 +707,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_MAC, ETH_SS_STATS_ETH_CTRL, ETH_SS_STATS_RMON, + ETH_SS_STATS_PHY, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9c909ce733a5..9ff72cfb2e98 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -99,6 +99,7 @@ enum { ETHTOOL_STATS_ETH_MAC, ETHTOOL_STATS_ETH_CTRL, ETHTOOL_STATS_RMON, + ETHTOOL_STATS_PHY, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -193,6 +194,19 @@ enum { ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) }; +enum { + /* Basic packet counters if PHY has separate counters from the MAC */ + ETHTOOL_A_STATS_PHY_RX_PKTS, + ETHTOOL_A_STATS_PHY_RX_BYTES, + ETHTOOL_A_STATS_PHY_RX_ERRORS, + ETHTOOL_A_STATS_PHY_TX_PKTS, + ETHTOOL_A_STATS_PHY_TX_BYTES, + ETHTOOL_A_STATS_PHY_TX_ERRORS, + + /* add new constants above here */ + __ETHTOOL_A_STATS_PHY_CNT, + ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) +}; /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" -- cgit v1.2.3 From f2bc1c2655728ac00c35cfb992bdb3243ca17e7e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 10 Jan 2025 07:05:15 +0100 Subject: net: phy: introduce optional polling interface for PHY statistics Add an optional polling interface for PHY statistics to simplify driver implementation. Signed-off-by: Oleksij Rempel Signed-off-by: Paolo Abeni --- include/linux/phy.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 81d1612e7d35..afaae74d0949 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1173,6 +1173,24 @@ struct phy_driver { */ void (*get_link_stats)(struct phy_device *dev, struct ethtool_link_ext_stats *link_stats); + + /** + * @update_stats: Trigger periodic statistics updates. + * @dev: The PHY device for which statistics updates are triggered. + * + * Periodically gathers statistics from the PHY device to update locally + * maintained 64-bit counters. This is necessary for PHYs that implement + * reduced-width counters (e.g., 16-bit or 32-bit) which can overflow + * more frequently compared to 64-bit counters. 
By invoking this + * callback, drivers can fetch the current counter values, handle + * overflow detection, and accumulate the results into local 64-bit + * counters for accurate reporting through the `get_phy_stats` and + * `get_link_stats` interfaces. + * + * Return: 0 on success or a negative error code on failure. + */ + int (*update_stats)(struct phy_device *dev); + /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ @@ -1663,6 +1681,9 @@ static inline bool phy_polling_mode(struct phy_device *phydev) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; + if (phydev->drv->update_stats) + return true; + return phydev->irq == PHY_POLL; } -- cgit v1.2.3 From 04508d20b017326e116c6e8ef953839507c73b6d Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 10 Jan 2025 13:58:50 +0530 Subject: net: ti: icssg-prueth: Add Multicast Filtering support for VLAN in MAC mode Add multicast filtering support for VLAN interfaces in dual EMAC mode for ICSSG driver. The driver uses vlan_for_each() API to get the list of available vlans. The driver then sync mc addr of vlan interface with a locally mainatined list emac->vlan_mcast_list[vid] using __hw_addr_sync_multiple() API. __hw_addr_sync_multiple() is used instead of __hw_addr_sync() to sync vdev->mc with local list because the sync_cnt for addresses in vdev->mc will already be set by the vlan_dev_set_rx_mode() [net/8021q/vlan_dev.c] and __hw_addr_sync() only syncs when the sync_cnt == 0. Whereas __hw_addr_sync_multiple() can sync addresses even if sync_cnt is not 0. Export __hw_addr_sync_multiple() so that driver can use it. Once the local list is synced, driver calls __hw_addr_sync_dev() with the local list, vdev, sync and unsync callbacks. __hw_addr_sync_dev() is used with the local maintained list as the list to synchronize instead of using __dev_mc_sync() on vdev because __dev_mc_sync() on vdev will call __hw_addr_sync_dev() on vdev->mc and sync_cnt for addresses in vdev->mc will already be set by the vlan_dev_set_rx_mode() [net/8021q/vlan_dev.c] and __hw_addr_sync_dev() only syncs if the sync_cnt of addresses in the list (vdev->mc in this case) is 0. Whereas __hw_addr_sync_dev() on local list will work fine as the sync_cnt for addresses in the local list will still be 0. Based on change in addresses in the local list, sync / unsync callbacks are invoked. In the sync / unsync API in driver, based on whether the ndev is vlan or not, driver passes appropriate vid to FDB helper functions. 
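For reference, a condensed sketch of the sync flow described above. The emac structure, the vlan_mcast_list array and the emac_mc_sync()/emac_mc_unsync() FDB callbacks follow the naming used in this message but are illustrative placeholders rather than the exact driver code:

  #include <linux/netdevice.h>
  #include <linux/if_vlan.h>

  /* Invoked for every VLAN upper device via vlan_for_each(). */
  static int emac_vlan_mcast_sync_sketch(struct net_device *vdev, int vid,
                                         void *arg)
  {
          struct prueth_emac *emac = arg;
          struct netdev_hw_addr_list *mc = &emac->vlan_mcast_list[vid];

          /*
           * vdev->mc entries already have sync_cnt set by
           * vlan_dev_set_rx_mode(), so __hw_addr_sync() would skip them;
           * __hw_addr_sync_multiple() syncs them into the local list anyway.
           */
          __hw_addr_sync_multiple(mc, &vdev->mc, vdev->addr_len);

          /*
           * Addresses in the local list still have sync_cnt == 0, so
           * __hw_addr_sync_dev() works as expected and invokes the
           * sync/unsync callbacks, which add or remove FDB entries using
           * the vid of this VLAN device.
           */
          return __hw_addr_sync_dev(mc, vdev, emac_mc_sync, emac_mc_unsync);
  }

  static void emac_set_rx_mode_sketch(struct net_device *ndev)
  {
          struct prueth_emac *emac = netdev_priv(ndev);

          vlan_for_each(ndev, emac_vlan_mcast_sync_sketch, emac);
  }
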
Signed-off-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd8f6f8991fe..bced03fb349e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4687,6 +4687,9 @@ int devm_register_netdev(struct device *dev, struct net_device *ndev); /* General hardware address lists handling functions */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); +int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len); int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, -- cgit v1.2.3 From 9c10dd8eed74de9e8adeb820939f8745cd566d4a Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Fri, 10 Jan 2025 13:58:51 +0530 Subject: net: hsr: Create and export hsr_get_port_ndev() Create an API to get the net_device to the slave port of HSR device. The API will take hsr net_device and enum hsr_port_type for which we want the net_device as arguments. This API can be used by client drivers who support HSR and want to get the net_devcie of slave ports from the hsr device. Export this API for the same. This API needs the enum hsr_port_type to be accessible by the drivers using hsr. Move the enum hsr_port_type from net/hsr/hsr_main.h to include/linux/if_hsr.h for the same. Signed-off-by: MD Danish Anwar Signed-off-by: Paolo Abeni --- include/linux/if_hsr.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h index 0404f5bf4f30..d7941fd88032 100644 --- a/include/linux/if_hsr.h +++ b/include/linux/if_hsr.h @@ -13,6 +13,15 @@ enum hsr_version { PRP_V1, }; +enum hsr_port_type { + HSR_PT_NONE = 0, /* Must be 0, used by framereg */ + HSR_PT_SLAVE_A, + HSR_PT_SLAVE_B, + HSR_PT_INTERLINK, + HSR_PT_MASTER, + HSR_PT_PORTS, /* This must be the last item in the enum */ +}; + /* HSR Tag. * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, @@ -32,6 +41,8 @@ struct hsr_tag { #if IS_ENABLED(CONFIG_HSR) extern bool is_hsr_master(struct net_device *dev); extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver); +struct net_device *hsr_get_port_ndev(struct net_device *ndev, + enum hsr_port_type pt); #else static inline bool is_hsr_master(struct net_device *dev) { @@ -42,6 +53,12 @@ static inline int hsr_get_version(struct net_device *dev, { return -EINVAL; } + +static inline struct net_device *hsr_get_port_ndev(struct net_device *ndev, + enum hsr_port_type pt) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_HSR */ #endif /*_LINUX_IF_HSR_H_*/ -- cgit v1.2.3 From d06905d686107c8343ff71aa4f3c881cc0a9a7b9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 9 Jan 2025 13:21:10 +0100 Subject: i2c: add core-managed per-client directory in debugfs More and more I2C client drivers use debugfs entries and currently they need to manage a subdir for their files on their own. This means inconsistent naming for these subdirs and they are scattered all over the debugfs-tree as well. Not to mention the duplicated code. Let the I2C core provide and maintain a proper directory per client. 
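As a rough sketch of what this enables on the client-driver side (the foo driver, its private struct and the exported field are hypothetical), a probe routine can simply hang its files off the core-managed directory instead of creating and tearing down a subdir of its own:

  #include <linux/debugfs.h>
  #include <linux/i2c.h>

  struct foo_priv {
          u32 last_status;        /* example value exported via debugfs */
  };

  static int foo_probe(struct i2c_client *client)
  {
          struct foo_priv *priv;

          priv = devm_kzalloc(&client->dev, sizeof(*priv), GFP_KERNEL);
          if (!priv)
                  return -ENOMEM;
          i2c_set_clientdata(client, priv);

          /*
           * client->debugfs is created and removed by the I2C core, so the
           * driver neither names nor cleans up a directory of its own.
           */
          debugfs_create_u32("last_status", 0444, client->debugfs,
                             &priv->last_status);
          return 0;
  }
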
Note: It was considered to save the additional pointer in 'struct i2c_client' and only provide a subdir when requested via a helper function. When sketching this approach, more and more corner cases appeared, though, so the current solution with its simple and unabiguous code was chosen. Signed-off-by: Wolfram Sang Reviewed-by: Guenter Roeck --- include/linux/i2c.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 66fb3d6cf686..36de788dc7fe 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -347,6 +347,7 @@ struct i2c_client { i2c_slave_cb_t slave_cb; /* callback for slave mode */ #endif void *devres_group_id; /* ID of probe devres group */ + struct dentry *debugfs; /* per-client debugfs dir */ }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) -- cgit v1.2.3 From a9ff94477836cb43d94efbd9a851213944800177 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 14 Jan 2025 12:54:52 +0100 Subject: ARM: 9433/2: implement cacheinfo support On ARMv7 / v7m machines read CTR and CLIDR registers to provide information regarding the cache topology. Earlier machines should describe full cache topology in the device tree. Note, this follows the ARM64 cacheinfo support and provides only minimal support required to bootstrap cache info. All useful properties should be decribed in Device Tree. Reviewed-by: Linus Walleij Signed-off-by: Dmitry Baryshkov Signed-off-by: Russell King (Oracle) --- include/linux/cacheinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..1e7061549fc7 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -147,7 +147,7 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) return ci ? ci->id : -1; } -#ifdef CONFIG_ARM64 +#if defined(CONFIG_ARM64) || defined(CONFIG_ARM) #define use_arch_cache_info() (true) #else #define use_arch_cache_info() (false) -- cgit v1.2.3 From 514dcf78afe6b99fba5e885be295356856dd424b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:20 +0100 Subject: net: pse-pd: Remove unused pse_ethtool_get_pw_limit function declaration Removed the unused pse_ethtool_get_pw_limit() function declaration from pse.h. This function was declared but never implemented or used, making the declaration unnecessary. 
Reviewed-by: Kalesh AP Reviewed-by: Andrew Lunn Acked-by: Oleksij Rempel Reviewed-by: Kyle Swenson Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 591a53e082e6..85a08c349256 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -184,8 +184,6 @@ int pse_ethtool_set_config(struct pse_control *psec, int pse_ethtool_set_pw_limit(struct pse_control *psec, struct netlink_ext_ack *extack, const unsigned int pw_limit); -int pse_ethtool_get_pw_limit(struct pse_control *psec, - struct netlink_ext_ack *extack); bool pse_has_podl(struct pse_control *psec); bool pse_has_c33(struct pse_control *psec); @@ -222,12 +220,6 @@ static inline int pse_ethtool_set_pw_limit(struct pse_control *psec, return -EOPNOTSUPP; } -static inline int pse_ethtool_get_pw_limit(struct pse_control *psec, - struct netlink_ext_ack *extack) -{ - return -EOPNOTSUPP; -} - static inline bool pse_has_podl(struct pse_control *psec) { return false; -- cgit v1.2.3 From 6e56a6d47a7fad705a1a1d088237b0858c01a770 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:22 +0100 Subject: net: pse-pd: Add power limit check Checking only the current limit is not sufficient. According to the standard, voltage can reach up to 57V and current up to 1.92A, which exceeds the power limit described in the standard (99.9W). Add a power limit check to prevent this. Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 85a08c349256..bc5addccbf32 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -11,6 +11,8 @@ /* Maximum current in uA according to IEEE 802.3-2022 Table 145-1 */ #define MAX_PI_CURRENT 1920000 +/* Maximum power in mW according to IEEE 802.3-2022 Table 145-16 */ +#define MAX_PI_PW 99900 struct phy_device; struct pse_controller_dev; -- cgit v1.2.3 From e0a5e2bba38aa61a900934b45d6e846e0a6d7524 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:26 +0100 Subject: net: pse-pd: Use power limit at driver side instead of current limit The regulator framework uses current limits, but the PSE standard and known PSE controllers rely on power limits. Instead of converting current to power within each driver, perform the conversion in the PSE core. This avoids redundancy in driver implementation and aligns better with the standard, simplifying driver development. Remove at the same time the _pse_ethtool_get_status() function which is not needed anymore. Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index bc5addccbf32..a721651cd1e0 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -77,12 +77,8 @@ struct pse_control_status { * @pi_disable: Configure the PSE PI as disabled. * @pi_get_voltage: Return voltage similarly to get_voltage regulator * callback. - * @pi_get_current_limit: Get the configured current limit similarly to - * get_current_limit regulator callback. 
- * @pi_set_current_limit: Configure the current limit similarly to - * set_current_limit regulator callback. - * Should not return an error in case of MAX_PI_CURRENT - * current value set. + * @pi_get_pw_limit: Get the configured power limit of the PSE PI. + * @pi_set_pw_limit: Configure the power limit of the PSE PI. */ struct pse_controller_ops { int (*ethtool_get_status)(struct pse_controller_dev *pcdev, @@ -93,10 +89,10 @@ struct pse_controller_ops { int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); int (*pi_get_voltage)(struct pse_controller_dev *pcdev, int id); - int (*pi_get_current_limit)(struct pse_controller_dev *pcdev, - int id); - int (*pi_set_current_limit)(struct pse_controller_dev *pcdev, - int id, int max_uA); + int (*pi_get_pw_limit)(struct pse_controller_dev *pcdev, + int id); + int (*pi_set_pw_limit)(struct pse_controller_dev *pcdev, + int id, int max_mW); }; struct module; -- cgit v1.2.3 From 3e9dbfec499807767d03592ebdf19d9c15fd495b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:27 +0100 Subject: net: pse-pd: Split ethtool_get_status into multiple callbacks The ethtool_get_status callback currently handles all status and PSE information within a single function. This approach has two key drawbacks: 1. If the core requires some information for purposes other than ethtool_get_status, redundant code will be needed to fetch the same data from the driver (like is_enabled). 2. Drivers currently have access to all information passed to ethtool. New variables will soon be added to ethtool status, such as PSE ID, power domain IDs, and budget evaluation strategies, which are meant to be managed solely by the core. Drivers should not have the ability to modify these variables. To resolve these issues, ethtool_get_status has been split into multiple callbacks, with each handling a specific piece of information required by ethtool or the core. Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 87 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index a721651cd1e0..3c544aff58b9 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -31,7 +31,52 @@ struct pse_control_config { }; /** - * struct pse_control_status - PSE control/channel status. + * struct pse_admin_state - PSE operational state + * + * @podl_admin_state: operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @c33_admin_state: operational state of the PSE + * functions. IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState + */ +struct pse_admin_state { + enum ethtool_podl_pse_admin_state podl_admin_state; + enum ethtool_c33_pse_admin_state c33_admin_state; +}; + +/** + * struct pse_pw_status - PSE power detection status + * + * @podl_pw_status: power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @c33_pw_status: power detection status of the PSE. 
+ * IEEE 802.3-2022 30.9.1.1.5 aPSEPowerDetectionStatus: + */ +struct pse_pw_status { + enum ethtool_podl_pse_pw_d_status podl_pw_status; + enum ethtool_c33_pse_pw_d_status c33_pw_status; +}; + +/** + * struct pse_ext_state_info - PSE extended state information + * + * @c33_ext_state_info: extended state information of the PSE + */ +struct pse_ext_state_info { + struct ethtool_c33_pse_ext_state_info c33_ext_state_info; +}; + +/** + * struct pse_pw_limit_ranges - PSE power limit configuration range + * + * @c33_pw_limit_ranges: supported power limit configuration range. The driver + * is in charge of the memory allocation. + */ +struct pse_pw_limit_ranges { + struct ethtool_c33_pse_pw_limit_range *c33_pw_limit_ranges; +}; + +/** + * struct ethtool_pse_control_status - PSE control/channel status. * * @podl_admin_state: operational state of the PoDL PSE * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState @@ -49,11 +94,11 @@ struct pse_control_config { * @c33_avail_pw_limit: available power limit of the PSE in mW * IEEE 802.3-2022 145.2.5.4 pse_avail_pwr * @c33_pw_limit_ranges: supported power limit configuration range. The driver - * is in charge of the memory allocation. + * is in charge of the memory allocation * @c33_pw_limit_nb_ranges: number of supported power limit configuration * ranges */ -struct pse_control_status { +struct ethtool_pse_control_status { enum ethtool_podl_pse_admin_state podl_admin_state; enum ethtool_podl_pse_pw_d_status podl_pw_status; enum ethtool_c33_pse_admin_state c33_admin_state; @@ -69,22 +114,37 @@ struct pse_control_status { /** * struct pse_controller_ops - PSE controller driver callbacks * - * @ethtool_get_status: get PSE control status for ethtool interface * @setup_pi_matrix: setup PI matrix of the PSE controller + * @pi_get_admin_state: Get the operational state of the PSE PI. This ops + * is mandatory. + * @pi_get_pw_status: Get the power detection status of the PSE PI. This + * ops is mandatory. + * @pi_get_ext_state: Get the extended state of the PSE PI. + * @pi_get_pw_class: Get the power class of the PSE PI. + * @pi_get_actual_pw: Get actual power of the PSE PI in mW. * @pi_is_enabled: Return 1 if the PSE PI is enabled, 0 if not. * May also return negative errno. * @pi_enable: Configure the PSE PI as enabled. * @pi_disable: Configure the PSE PI as disabled. * @pi_get_voltage: Return voltage similarly to get_voltage regulator - * callback. - * @pi_get_pw_limit: Get the configured power limit of the PSE PI. - * @pi_set_pw_limit: Configure the power limit of the PSE PI. + * callback in uV. + * @pi_get_pw_limit: Get the configured power limit of the PSE PI in mW. + * @pi_set_pw_limit: Configure the power limit of the PSE PI in mW. + * @pi_get_pw_limit_ranges: Get the supported power limit configuration + * range. The driver is in charge of the memory + * allocation and should return the number of + * ranges. 
*/ struct pse_controller_ops { - int (*ethtool_get_status)(struct pse_controller_dev *pcdev, - unsigned long id, struct netlink_ext_ack *extack, - struct pse_control_status *status); int (*setup_pi_matrix)(struct pse_controller_dev *pcdev); + int (*pi_get_admin_state)(struct pse_controller_dev *pcdev, int id, + struct pse_admin_state *admin_state); + int (*pi_get_pw_status)(struct pse_controller_dev *pcdev, int id, + struct pse_pw_status *pw_status); + int (*pi_get_ext_state)(struct pse_controller_dev *pcdev, int id, + struct pse_ext_state_info *ext_state_info); + int (*pi_get_pw_class)(struct pse_controller_dev *pcdev, int id); + int (*pi_get_actual_pw)(struct pse_controller_dev *pcdev, int id); int (*pi_is_enabled)(struct pse_controller_dev *pcdev, int id); int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); @@ -93,12 +153,15 @@ struct pse_controller_ops { int id); int (*pi_set_pw_limit)(struct pse_controller_dev *pcdev, int id, int max_mW); + int (*pi_get_pw_limit_ranges)(struct pse_controller_dev *pcdev, int id, + struct pse_pw_limit_ranges *pw_limit_ranges); }; struct module; struct device_node; struct of_phandle_args; struct pse_control; +struct ethtool_pse_control_status; /* PSE PI pairset pinout can either be Alternative A or Alternative B */ enum pse_pi_pairset_pinout { @@ -175,7 +238,7 @@ void pse_control_put(struct pse_control *psec); int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, - struct pse_control_status *status); + struct ethtool_pse_control_status *status); int pse_ethtool_set_config(struct pse_control *psec, struct netlink_ext_ack *extack, const struct pse_control_config *config); @@ -199,7 +262,7 @@ static inline void pse_control_put(struct pse_control *psec) static inline int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, - struct pse_control_status *status) + struct ethtool_pse_control_status *status) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 4640a1f0d8f2246f34d6e74330d7e7d2cf75605b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:28 +0100 Subject: net: pse-pd: Remove is_enabled callback from drivers The is_enabled callback is now redundant as the admin_state can be obtained directly from the driver and provides the same information. To simplify functionality, the core will handle this internally, making the is_enabled callback unnecessary at the driver level. Remove the callback from all drivers. Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/pse-pd/pse.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 3c544aff58b9..b5ae3dcee550 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -122,8 +122,6 @@ struct ethtool_pse_control_status { * @pi_get_ext_state: Get the extended state of the PSE PI. * @pi_get_pw_class: Get the power class of the PSE PI. * @pi_get_actual_pw: Get actual power of the PSE PI in mW. - * @pi_is_enabled: Return 1 if the PSE PI is enabled, 0 if not. - * May also return negative errno. * @pi_enable: Configure the PSE PI as enabled. * @pi_disable: Configure the PSE PI as disabled. 
* @pi_get_voltage: Return voltage similarly to get_voltage regulator @@ -145,7 +143,6 @@ struct pse_controller_ops { struct pse_ext_state_info *ext_state_info); int (*pi_get_pw_class)(struct pse_controller_dev *pcdev, int id); int (*pi_get_actual_pw)(struct pse_controller_dev *pcdev, int id); - int (*pi_is_enabled)(struct pse_controller_dev *pcdev, int id); int (*pi_enable)(struct pse_controller_dev *pcdev, int id); int (*pi_disable)(struct pse_controller_dev *pcdev, int id); int (*pi_get_voltage)(struct pse_controller_dev *pcdev, int id); -- cgit v1.2.3 From 5385f1e1923ca8131eb143567d509b101a344e06 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 10 Jan 2025 10:40:31 +0100 Subject: net: pse-pd: Clean ethtool header of PSE structures Remove PSE-specific structures from the ethtool header to improve code modularity, maintain independent headers, and reduce incremental build time. Signed-off-by: Kory Maincent Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 20 -------------------- include/linux/pse-pd/pse.h | 22 +++++++++++++++++++++- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4bf70cfec826..20a86bd5f4e3 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1326,24 +1326,4 @@ struct ethtool_forced_speed_map { void ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size); - -/* C33 PSE extended state and substate. */ -struct ethtool_c33_pse_ext_state_info { - enum ethtool_c33_pse_ext_state c33_pse_ext_state; - union { - enum ethtool_c33_pse_ext_substate_error_condition error_condition; - enum ethtool_c33_pse_ext_substate_mr_pse_enable mr_pse_enable; - enum ethtool_c33_pse_ext_substate_option_detect_ted option_detect_ted; - enum ethtool_c33_pse_ext_substate_option_vport_lim option_vport_lim; - enum ethtool_c33_pse_ext_substate_ovld_detected ovld_detected; - enum ethtool_c33_pse_ext_substate_power_not_available power_not_available; - enum ethtool_c33_pse_ext_substate_short_detected short_detected; - u32 __c33_pse_ext_substate; - }; -}; - -struct ethtool_c33_pse_pw_limit_range { - u32 min; - u32 max; -}; #endif /* _LINUX_ETHTOOL_H */ diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index b5ae3dcee550..c773eeb92d04 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -5,7 +5,6 @@ #ifndef _LINUX_PSE_CONTROLLER_H #define _LINUX_PSE_CONTROLLER_H -#include #include #include @@ -16,6 +15,27 @@ struct phy_device; struct pse_controller_dev; +struct netlink_ext_ack; + +/* C33 PSE extended state and substate. */ +struct ethtool_c33_pse_ext_state_info { + enum ethtool_c33_pse_ext_state c33_pse_ext_state; + union { + enum ethtool_c33_pse_ext_substate_error_condition error_condition; + enum ethtool_c33_pse_ext_substate_mr_pse_enable mr_pse_enable; + enum ethtool_c33_pse_ext_substate_option_detect_ted option_detect_ted; + enum ethtool_c33_pse_ext_substate_option_vport_lim option_vport_lim; + enum ethtool_c33_pse_ext_substate_ovld_detected ovld_detected; + enum ethtool_c33_pse_ext_substate_power_not_available power_not_available; + enum ethtool_c33_pse_ext_substate_short_detected short_detected; + u32 __c33_pse_ext_substate; + }; +}; + +struct ethtool_c33_pse_pw_limit_range { + u32 min; + u32 max; +}; /** * struct pse_control_config - PSE control/channel configuration. 
-- cgit v1.2.3 From 384669921779806105c56751abff41fa0127f93a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 14 Jan 2025 11:47:01 +0100 Subject: ALSA: rawmidi: Make tied_device=0 as default / unknown In the original change, rawmidi_info.tied_device showed -1 for the unknown or untied device. But this would require the user-space to check the protocol version and judge the value conditionally, which is rather error-prone. Instead, set the tied_device = 0 to be default as unknown, and indicate the real device with the offset 1, for achieving more backward compatibility. Suggested-by: Jaroslav Kysela Link: https://patch.msgid.link/20250114104711.19197-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- include/sound/rawmidi.h | 9 +++++++++ include/uapi/sound/asound.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/rawmidi.h b/include/sound/rawmidi.h index 6f2e95298fc7..6916f7133597 100644 --- a/include/sound/rawmidi.h +++ b/include/sound/rawmidi.h @@ -191,4 +191,13 @@ long snd_rawmidi_kernel_read(struct snd_rawmidi_substream *substream, long snd_rawmidi_kernel_write(struct snd_rawmidi_substream *substream, const unsigned char *buf, long count); +/* set up the tied devices */ +static inline void snd_rawmidi_tie_devices(struct snd_rawmidi *r1, + struct snd_rawmidi *r2) +{ + /* tied_device field keeps the device+1 (so that 0 being unknown) */ + r1->tied_device = r2->device + 1; + r2->tied_device = r1->device + 1; +} + #endif /* __SOUND_RAWMIDI_H */ diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 000b36c0e2b9..5a049eeaecce 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -730,7 +730,7 @@ enum { #define SNDRV_RAWMIDI_INFO_UMP 0x00000008 #define SNDRV_RAWMIDI_INFO_STREAM_INACTIVE 0x00000010 -#define SNDRV_RAWMIDI_DEVICE_UNKNOWN -1 +#define SNDRV_RAWMIDI_DEVICE_UNKNOWN 0 struct snd_rawmidi_info { unsigned int device; /* RO/WR (control): device number */ -- cgit v1.2.3 From f4757d84abf523ea831dba0c136db4050d55c99f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jan 2025 17:19:22 +0100 Subject: ACPI: PRM: Fix missing guid_t declaration in linux/prmt.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seen the following build error: ./include/linux/prmt.h:5:27: error: unknown type name ‘guid_t’ 5 | int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); | ^~~~~~ The include file uses guid_t but it is not declared. Include linux/uuid.h to fix this. Signed-off-by: Robert Richter Reviewed-by: Yazen Ghannam Link: https://patch.msgid.link/20250107161923.3387552-1-rrichter@amd.com Signed-off-by: Rafael J. Wysocki --- include/linux/prmt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/prmt.h b/include/linux/prmt.h index 9c094294403f..c53ab287e932 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#include + #ifdef CONFIG_ACPI_PRMT void init_prmt(void); int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); -- cgit v1.2.3 From ee9c69388e3bad6c595fe38f34aa1126d2d07a11 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 14:49:07 +0000 Subject: kobject: Remove unused functions kobj_ns_initial() and kobj_ns_netlink() were adde din 2010 by commit bc451f205823 ("kobj: Add basic infrastructure for dealing with namespaces.") but have remained unused. Remove them. Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20250112144907.270272-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject_ns.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index be707748e7ce..150fe2ae1b6b 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -52,8 +52,6 @@ const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj); bool kobj_ns_current_may_mount(enum kobj_ns_type type); void *kobj_ns_grab_current(enum kobj_ns_type type); -const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk); -const void *kobj_ns_initial(enum kobj_ns_type type); void kobj_ns_drop(enum kobj_ns_type type, void *ns); #endif /* _LINUX_KOBJECT_NS_H */ -- cgit v1.2.3 From 124c4c32e9f3b4d89f5be3342897adc8db5c27b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jan 2025 13:55:57 +0000 Subject: tcp: add TCP_RFC7323_PAWS_ACK drop reason XPS can cause reorders because of the relaxed OOO conditions for pure ACK packets. For hosts not using RFS, what can happpen is that ACK packets are sent on behalf of the cpu processing NIC interrupts, selecting TX queue A for ACK packet P1. Then a subsequent sendmsg() can run on another cpu. TX queue selection uses the socket hash and can choose another queue B for packets P2 (with payload). If queue A is more congested than queue B, the ACK packet P1 could be sent on the wire after P2. A linux receiver when processing P1 (after P2) currently increments LINUX_MIB_PAWSESTABREJECTED (TcpExtPAWSEstab) and use TCP_RFC7323_PAWS drop reason. It might also send a DUPACK if not rate limited. In order to better understand this pattern, this patch adds a new drop_reason : TCP_RFC7323_PAWS_ACK. For old ACKS like these, we no longer increment LINUX_MIB_PAWSESTABREJECTED and no longer sends a DUPACK, keeping credit for other more interesting DUPACK. perf record -e skb:kfree_skb -a perf script ... swapper 0 [148] 27475.438637: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [208] 27475.438706: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [208] 27475.438908: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [148] 27475.439010: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [148] 27475.439214: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK swapper 0 [208] 27475.439286: skb:kfree_skb: ... location=tcp_validate_incoming+0x4f0 reason: TCP_RFC7323_PAWS_ACK ... 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250113135558.3180360-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 3a6602f37978..28555109f9bd 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -36,6 +36,7 @@ FN(TCP_OVERWINDOW) \ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ + FN(TCP_RFC7323_PAWS_ACK) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ @@ -259,6 +260,10 @@ enum skb_drop_reason { * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED */ SKB_DROP_REASON_TCP_RFC7323_PAWS, + /** + * @SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK: PAWS check, old ACK packet. + */ + SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ -- cgit v1.2.3 From d16b34479064fcb8dbb66a72308b5034be819161 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Jan 2025 13:55:58 +0000 Subject: tcp: add LINUX_MIB_PAWS_OLD_ACK SNMP counter Prior patch in the series added TCP_RFC7323_PAWS_ACK drop reason. This patch adds the corresponding SNMP counter, for folks using nstat instead of tracing for TCP diagnostics. nstat -az | grep PAWSOldAck Suggested-by: Neal Cardwell Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Jason Xing Tested-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250113135558.3180360-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 28555109f9bd..ed864934e20b 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -262,6 +262,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_RFC7323_PAWS, /** * @SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK: PAWS check, old ACK packet. + * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 2e75674e7d4f..848c7784e684 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -186,6 +186,7 @@ enum LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ + LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ -- cgit v1.2.3 From 3feec68563dda59517f83d19123aa287a1dfd068 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:53 -0500 Subject: nfs/localio: add direct IO enablement with sync and async IO support This commit simply adds the required O_DIRECT plumbing. It doesn't address the fact that NFS doesn't ensure all writes are page aligned (nor device logical block size aligned as required by O_DIRECT). Because NFS will read-modify-write for IO that isn't aligned, LOCALIO will not use O_DIRECT semantics by default if/when an application requests the use of O_DIRECT. 
Allow the use of O_DIRECT semantics by: 1: Adding a flag to the nfs_pgio_header struct to allow the NFS O_DIRECT layer to signal that O_DIRECT was used by the application 2: Adding a 'localio_O_DIRECT_semantics' NFS module parameter that, when enabled, will cause LOCALIO to use O_DIRECT semantics (this may cause IO to fail if applications do not properly align their IO). This commit is derived from code developed by Weston Andros Adamson. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..80766ff0a47c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1637,6 +1637,7 @@ enum { NFS_IOHDR_RESEND_PNFS, NFS_IOHDR_RESEND_MDS, NFS_IOHDR_UNSTABLE_WRITES, + NFS_IOHDR_ODIRECT, }; struct nfs_io_completion; -- cgit v1.2.3 From a61466315d7afca032342183a57e62d5e3a3157c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:54 -0500 Subject: nfsd: add nfsd_file_{get,put} to 'nfs_to' nfsd_localio_operations In a later commit LOCALIO must call both nfsd_file_get and nfsd_file_put to manage extra nfsd_file references. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 9202f4b24343..ab6a2a53f505 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -56,6 +56,8 @@ struct nfsd_localio_operations { const struct nfs_fh *, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file *); + struct nfsd_file *(*nfsd_file_get)(struct nfsd_file *); + void (*nfsd_file_put)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -- cgit v1.2.3 From b49f049a22227df701bfbca083d6cc859496e615 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:55 -0500 Subject: nfs_common: rename functions that invalidate LOCALIO nfs_clients Rename nfs_uuid_invalidate_one_client to nfs_localio_disable_client. Rename nfs_uuid_invalidate_clients to nfs_localio_invalidate_clients.
Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index ab6a2a53f505..a05d1043f2b0 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -37,8 +37,9 @@ bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); -void nfs_uuid_invalidate_clients(struct list_head *list); -void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); + +void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid); +void nfs_localio_invalidate_clients(struct list_head *list); /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * -- cgit v1.2.3 From 4ee7ba40007357a48447a8cbc667480acf9a006a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:56 -0500 Subject: nfs_common: move localio_lock to new lock member of nfs_uuid_t Remove cl_localio_lock from 'struct nfs_client' in favor of adding a lock to the nfs_uuid_t struct (which is embedded in each nfs_client). Push nfs_local_{enable,disable} implementation down to nfs_common. Those methods now call nfs_localio_{enable,disable}_client. This allows implementing nfs_localio_invalidate_clients in terms of nfs_localio_disable_client. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 - include/linux/nfslocalio.h | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b804346a9741..239d86ef166c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -132,7 +132,6 @@ struct nfs_client { struct timespec64 cl_nfssvc_boot; seqlock_t cl_boot_lock; nfs_uuid_t cl_uuid; - spinlock_t cl_localio_lock; #endif /* CONFIG_NFS_LOCALIO */ }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index a05d1043f2b0..4d5583873f41 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,6 +6,7 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H + /* nfsd_file structure is purposely kept opaque to NFS client */ struct nfsd_file; @@ -19,6 +20,8 @@ struct nfsd_file; #include #include +struct nfs_client; + /* * Useful to allow a client to negotiate if localio * possible with its server. 
@@ -27,6 +30,8 @@ struct nfsd_file; */ typedef struct { uuid_t uuid; + /* sadly this struct is just over a cacheline, avoid bouncing */ + spinlock_t ____cacheline_aligned lock; struct list_head list; struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ @@ -38,7 +43,8 @@ void nfs_uuid_end(nfs_uuid_t *); void nfs_uuid_is_local(const uuid_t *, struct list_head *, struct net *, struct auth_domain *, struct module *); -void nfs_localio_disable_client(nfs_uuid_t *nfs_uuid); +void nfs_localio_enable_client(struct nfs_client *clp); +void nfs_localio_disable_client(struct nfs_client *clp); void nfs_localio_invalidate_clients(struct list_head *list); /* localio needs to map filehandle -> struct nfsd_file */ -- cgit v1.2.3 From 86e00412254a717ffd5d38dc5ec0ee1cce6281b3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:57 -0500 Subject: nfs: cache all open LOCALIO nfsd_file(s) in client This commit switches from leaning heavily on NFSD's filecache (in terms of GC'd nfsd_files) back to caching nfsd_files in the client. A later commit will add the callback mechanism needed to allow NFSD to force the NFS client to cleanup all cached nfsd_files. Add nfs_fh_localio_init() and 'struct nfs_fh_localio' to cache opened nfsd_file(s) (both a RO and RW nfsd_file is able to be opened and cached for a given nfs_fh). Update nfs_local_open_fh() to cache the nfsd_file once it is opened using __nfs_local_open_fh(). Introduce nfs_close_local_fh() to clear the cached open nfsd_files and call nfs_to_nfsd_file_put_local(). Refcounting is such that: - nfs_local_open_fh() is paired with nfs_close_local_fh(). - __nfs_local_open_fh() is paired with nfs_to_nfsd_file_put_local(). - nfs_local_file_get() is paired with nfs_local_file_put(). 
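A minimal sketch of the new pairing, using only the helpers visible in this diff; the allocation and teardown sites shown are assumptions for illustration, the real NFS client does this inside its open-context lifecycle (alloc_nfs_open_context() and friends).

#include <linux/slab.h>
#include <linux/nfs_fs.h>
#include <linux/nfslocalio.h>

/* Sketch: each open context embeds a struct nfs_file_localio ('nfl'). */
static struct nfs_open_context *example_alloc_ctx(void)
{
	struct nfs_open_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;
	/* no cached RO/RW nfsd_file yet; list and nfs_uuid cleared */
	nfs_localio_file_init(&ctx->nfl);
	return ctx;
}

/* Sketch: teardown pairs with the earlier open, per the rules above. */
static void example_free_ctx(struct nfs_open_context *ctx)
{
	nfs_close_local_fh(&ctx->nfl);	/* drops any cached nfsd_file refs */
	kfree(ctx);
}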
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 22 ++++++++++++++++++++-- include/linux/nfslocalio.h | 16 +++++++++------- 2 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 039898d70954..67ae2c3f41d2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -77,6 +77,23 @@ struct nfs_lock_context { struct rcu_head rcu_head; }; +struct nfs_file_localio { + struct nfsd_file __rcu *ro_file; + struct nfsd_file __rcu *rw_file; + struct list_head list; + void __rcu *nfs_uuid; /* opaque pointer to 'nfs_uuid_t' */ +}; + +static inline void nfs_localio_file_init(struct nfs_file_localio *nfl) +{ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + nfl->ro_file = NULL; + nfl->rw_file = NULL; + INIT_LIST_HEAD(&nfl->list); + nfl->nfs_uuid = NULL; +#endif +} + struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; @@ -87,15 +104,16 @@ struct nfs_open_context { struct nfs4_state *state; fmode_t mode; + int error; unsigned long flags; #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) - int error; - struct list_head list; struct nfs4_threshold *mdsthreshold; + struct list_head list; struct rcu_head rcu_head; + struct nfs_file_localio nfl; }; struct nfs_open_dir_context { diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 4d5583873f41..7cfc6720ed26 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,10 +6,6 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H - -/* nfsd_file structure is purposely kept opaque to NFS client */ -struct nfsd_file; - #if IS_ENABLED(CONFIG_NFS_LOCALIO) #include @@ -21,6 +17,7 @@ struct nfsd_file; #include struct nfs_client; +struct nfs_file_localio; /* * Useful to allow a client to negotiate if localio @@ -52,6 +49,7 @@ extern struct nfsd_file * nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, const struct cred *, const struct nfs_fh *, const fmode_t) __must_hold(rcu); +void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { bool (*nfsd_serv_try_get)(struct net *); @@ -73,7 +71,8 @@ extern const struct nfsd_localio_operations *nfs_to; struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, struct rpc_clnt *, const struct cred *, - const struct nfs_fh *, const fmode_t); + const struct nfs_fh *, struct nfs_file_localio *, + const fmode_t); static inline void nfs_to_nfsd_net_put(struct net *net) { @@ -100,12 +99,15 @@ static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) } #else /* CONFIG_NFS_LOCALIO */ -static inline void nfsd_localio_ops_init(void) + +struct nfs_file_localio; +static inline void nfs_close_local_fh(struct nfs_file_localio *nfl) { } -static inline void nfs_to_nfsd_file_put_local(struct nfsd_file *localio) +static inline void nfsd_localio_ops_init(void) { } + #endif /* CONFIG_NFS_LOCALIO */ #endif /* __LINUX_NFSLOCALIO_H */ -- cgit v1.2.3 From b33f7dec3a67216123312c7bb752b8f6faa1c465 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:40:59 -0500 Subject: nfsd: rename nfsd_serv_ prefixed methods and variables with nfsd_net_ Also update Documentation/filesystems/nfs/localio.rst accordingly and reduce the technical documentation debt that was previously captured in that document. 
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 7cfc6720ed26..aa2b5c6561ab 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -52,8 +52,8 @@ nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, void nfs_close_local_fh(struct nfs_file_localio *); struct nfsd_localio_operations { - bool (*nfsd_serv_try_get)(struct net *); - void (*nfsd_serv_put)(struct net *); + bool (*nfsd_net_try_get)(struct net *); + void (*nfsd_net_put)(struct net *); struct nfsd_file *(*nfsd_open_local_fh)(struct net *, struct auth_domain *, struct rpc_clnt *, @@ -77,12 +77,12 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, static inline void nfs_to_nfsd_net_put(struct net *net) { /* - * Once reference to nfsd_serv is dropped, NFSD could be - * unloaded, so ensure safe return from nfsd_file_put_local() - * by always taking RCU. + * Once reference to net (and associated nfsd_serv) is dropped, NFSD + * could be unloaded, so ensure safe return from nfsd_net_put() by + * always taking RCU. */ rcu_read_lock(); - nfs_to->nfsd_serv_put(net); + nfs_to->nfsd_net_put(net); rcu_read_unlock(); } -- cgit v1.2.3 From 085804110aa13eac7f763d8d5cfe3a8220e35222 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:02 -0500 Subject: nfs_common: track all open nfsd_files per LOCALIO nfs_client This tracking enables __nfsd_file_cache_purge() to call nfs_localio_invalidate_clients(), upon shutdown or export change, to nfs_close_local_fh() all open nfsd_files that are still cached by the LOCALIO nfs clients associated with nfsd_net that is being shutdown. Now that the client must track all open nfsd_files there was more work than necessary being done with the global nfs_uuids_lock contended. This manifested in various RCU issues, e.g.: hrtimer: interrupt took 47969440 ns rcu: INFO: rcu_sched detected stalls on CPUs/tasks: Use nfs_uuid->lock to protect all nfs_uuid_t members, instead of nfs_uuids_lock, once nfs_uuid_is_local() adds the client to nn->local_clients. Also add 'local_clients_lock' to 'struct nfsd_net' to protect nn->local_clients. And store a pointer to spinlock in the 'list_lock' member of nfs_uuid_t so nfs_localio_disable_client() can use it to avoid taking the global nfs_uuids_lock. In combination, these split out locks eliminate the use of the single nfslocalio.c global nfs_uuids_lock in the IO paths (open and close). Also refactored associated fs/nfs_common/nfslocalio.c methods' locking to reduce work performed with spinlocks held in general. 
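The intent of the lock split can be sketched as follows; this is an illustrative helper, not the actual fs/nfs_common/nfslocalio.c code. Per-client members are serialized by the embedded nfs_uuid->lock, while unlinking from nfsd's local_clients list only takes the spinlock stored in ->list_lock.

#include <linux/nfslocalio.h>

static void example_disable_client(nfs_uuid_t *nfs_uuid)
{
	/* per-client state: net/dom references, the ->files list, ... */
	spin_lock(&nfs_uuid->lock);
	/* ... drop or splice per-client state here ... */
	spin_unlock(&nfs_uuid->lock);

	/* list membership: only nn->local_clients_lock (reached through
	 * ->list_lock) is needed here, never the global nfs_uuids_lock. */
	if (nfs_uuid->list_lock) {
		spin_lock(nfs_uuid->list_lock);
		list_del_init(&nfs_uuid->list);
		spin_unlock(nfs_uuid->list_lock);
	}
}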
Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index aa2b5c6561ab..c68a529230c1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -30,19 +30,23 @@ typedef struct { /* sadly this struct is just over a cacheline, avoid bouncing */ spinlock_t ____cacheline_aligned lock; struct list_head list; + spinlock_t *list_lock; /* nn->local_clients_lock */ struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ + /* Local files to close when net is shut down or exports change */ + struct list_head files; } nfs_uuid_t; void nfs_uuid_init(nfs_uuid_t *); bool nfs_uuid_begin(nfs_uuid_t *); void nfs_uuid_end(nfs_uuid_t *); -void nfs_uuid_is_local(const uuid_t *, struct list_head *, +void nfs_uuid_is_local(const uuid_t *, struct list_head *, spinlock_t *, struct net *, struct auth_domain *, struct module *); void nfs_localio_enable_client(struct nfs_client *clp); void nfs_localio_disable_client(struct nfs_client *clp); -void nfs_localio_invalidate_clients(struct list_head *list); +void nfs_localio_invalidate_clients(struct list_head *nn_local_clients, + spinlock_t *nn_local_clients_lock); /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * -- cgit v1.2.3 From 779a395189c692eec0246e7df63e2a3c0f0c8508 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:04 -0500 Subject: nfs/localio: remove redundant code and simplify LOCALIO enablement Remove nfs_local_enable and nfs_local_disable, instead use nfs_localio_enable_client and nfs_localio_disable_client. Discontinue use of the NFS_CS_LOCAL_IO bit in the nfs_client struct's cl_flags to reflect that LOCALIO is enabled; instead just test if the net member of the nfs_uuid_t struct is set. Also remove NFS_CS_LOCAL_IO. Lastly, remove trace_nfs_local_enable and trace_nfs_local_disable because comparable traces are available from nfs_localio.ko. 
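With the flag gone, an "is LOCALIO enabled" test reduces to checking whether nfsd's net has been recorded in the client's nfs_uuid_t, roughly as below; the helper name is illustrative and the cl_uuid member only exists under CONFIG_NFS_LOCALIO.

#include <linux/nfs_fs_sb.h>

static bool example_nfs_client_is_local(struct nfs_client *clp)
{
	/* set by nfs_localio_enable_client(), cleared on disable */
	return rcu_access_pointer(clp->cl_uuid.net) != NULL;
}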
Suggested-by: NeilBrown Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 - include/linux/nfslocalio.h | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 239d86ef166c..ed66df1093e8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -50,7 +50,6 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ -#define NFS_CS_LOCAL_IO 10 /* - client is local */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index c68a529230c1..05817d6ef3d1 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -111,6 +111,10 @@ static inline void nfs_close_local_fh(struct nfs_file_localio *nfl) static inline void nfsd_localio_ops_init(void) { } +struct nfs_client; +static inline void nfs_localio_disable_client(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_LOCALIO */ -- cgit v1.2.3 From 76d4cb6345da0f2cd505e552157258325bcc8bcd Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:05 -0500 Subject: nfs: probe for LOCALIO when v4 client reconnects to server Introduce nfs_local_probe_async() for the NFS client to initiate if/when it reconnects with the server. For NFSv4 it is a simple matter to call nfs_local_probe_async() from nfs4_do_reclaim (during NFSv4 grace). Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ed66df1093e8..f00bfcee7120 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -131,6 +131,7 @@ struct nfs_client { struct timespec64 cl_nfssvc_boot; seqlock_t cl_boot_lock; nfs_uuid_t cl_uuid; + struct work_struct cl_local_probe_work; #endif /* CONFIG_NFS_LOCALIO */ }; -- cgit v1.2.3 From 4a489220aa8c9daf5f02396c28cebade9f9ab563 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Nov 2024 20:41:06 -0500 Subject: nfs: probe for LOCALIO when v3 client reconnects to server Re-enabling NFSv3 LOCALIO is made more complex (than NFSv4) because v3 is stateless. As such, the heuristic used to identify a LOCALIO probe point is more ad hoc by nature: if/when NFSv3 client IO begins to complete again in terms of normal RPC-based NFSv3 server IO, attempt nfs_local_probe_async(). Care is taken to throttle the frequency of nfs_local_probe_async(), otherwise there could be a flood of repeat calls to nfs_local_probe_async(). The throttle is admin controlled using a new module parameter for nfsv3, e.g.: echo 512 > /sys/module/nfsv3/parameters/nfs3_localio_probe_throttle Probe for NFSv3 LOCALIO every N IO requests (512 in this case). Must be a power of 2; defaults to 0 (probing disabled). On systems that expect to use LOCALIO with NFSv3, the admin should configure the 'nfs3_localio_probe_throttle' module parameter.
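A sketch of the power-of-2 throttle using the new nfs3_localio_probe_count member follows; the function, parameter wiring and call site are assumptions, only the member and the module parameter name come from this series.

#include <linux/nfs_fs_sb.h>

static unsigned int nfs3_localio_probe_throttle;	/* module_param, 0 = off */
void nfs_local_probe_async(struct nfs_client *clp);	/* NFS client internal */

static void example_nfs3_maybe_probe_localio(struct nfs_client *clp)
{
	nfs_uuid_t *uuid = &clp->cl_uuid;

	if (!nfs3_localio_probe_throttle)
		return;

	/* probe on every Nth completed request, N being a power of two */
	if ((uuid->nfs3_localio_probe_count++ &
	     (nfs3_localio_probe_throttle - 1)) == 0)
		nfs_local_probe_async(clp);
}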
This commit backfills module parameter documentation in localio.rst Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 05817d6ef3d1..9aa8a43843d7 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -27,7 +27,8 @@ struct nfs_file_localio; */ typedef struct { uuid_t uuid; - /* sadly this struct is just over a cacheline, avoid bouncing */ + unsigned nfs3_localio_probe_count; + /* this struct is over a cacheline, avoid bouncing */ spinlock_t ____cacheline_aligned lock; struct list_head list; spinlock_t *list_lock; /* nn->local_clients_lock */ -- cgit v1.2.3 From 013525583fdd79b6b83547b937535dfd406d3cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:33 +0200 Subject: PCI: Don't expose pcie_read_tlp_log() outside PCI subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcie_read_tlp_log() was exposed by the commit 0a5a46a6a61b ("PCI/AER: Generalize TLP Header Log reading") with the intent that drivers could use it, but the PCI maintainer later decided that drivers should be encouraged to use PCI core diagnostic logging of generic AER registers rather than building their own. Drivers that currently implement their own diagnostic logging include ixgbe (ixgbe_io_error_detected()) and iwlwifi (iwl_trans_pcie_dump_regs()). Remove the unwanted EXPORT of pcie_read_tlp_log() and remove it from include/linux/aer.h. Link: https://lore.kernel.org/r/20250114170840.1633-2-ilpo.jarvinen@linux.intel.com Link: https://lore.kernel.org/all/20240322193011.GA701027@bhelgaas/ Signed-off-by: Ilpo Järvinen [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Yazen Ghannam --- include/linux/aer.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/aer.h b/include/linux/aer.h index 4b97f38f3fcf..190a0a2061cd 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -37,8 +37,6 @@ struct aer_capability_regs { u16 uncor_err_source; }; -int pcie_read_tlp_log(struct pci_dev *dev, int where, struct pcie_tlp_log *log); - #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); int pcie_aer_is_native(struct pci_dev *dev); -- cgit v1.2.3 From ede5d5dbef6f3a1f25ee4e9d1999750827aa0640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:35 +0200 Subject: PCI: Add defines for TLP Header/Prefix log sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add defines for AER and DPC capabilities TLP Header Logging register sizes (PCIe r6.2, sec 7.8.4 / 7.9.14) and replace literals with them. Link: https://lore.kernel.org/r/20250114170840.1633-4-ilpo.jarvinen@linux.intel.com Suggested-by: Yazen Ghannam Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas --- include/linux/aer.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/aer.h b/include/linux/aer.h index 190a0a2061cd..4ef6515c3205 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -16,10 +16,17 @@ #define AER_CORRECTABLE 2 #define DPC_FATAL 3 +/* + * AER and DPC capabilities TLP Logging register sizes (PCIe r6.2, sec 7.8.4 + * & 7.9.14). 
+ */ +#define PCIE_STD_NUM_TLP_HEADERLOG 4 +#define PCIE_STD_MAX_TLP_PREFIXLOG 4 + struct pci_dev; struct pcie_tlp_log { - u32 dw[4]; + u32 dw[PCIE_STD_NUM_TLP_HEADERLOG]; }; struct aer_capability_regs { -- cgit v1.2.3 From e5321ae10e1323359a5067a26dfe98b5f44cc5e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:38 +0200 Subject: PCI: Store number of supported End-End TLP Prefixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eetlp_prefix_path in the struct pci_dev tells if End-End TLP Prefixes are supported by the path or not, and the value is only calculated if CONFIG_PCI_PASID is set. The Max End-End TLP Prefixes field in the Device Capabilities Register 2 also tells how many (1-4) End-End TLP Prefixes are supported (PCIe r6.2 sec 7.5.3.15). The number of supported End-End Prefixes is useful for reading correct number of DWORDs from TLP Prefix Log register in AER capability (PCIe r6.2 sec 7.8.4.12). Replace eetlp_prefix_path with eetlp_prefix_max and determine the number of supported End-End Prefixes regardless of CONFIG_PCI_PASID so that an upcoming commit generalizing TLP Prefix Log register reading does not have to read extra DWORDs for End-End Prefixes that never will be there. The value stored into eetlp_prefix_max is directly derived from device's Max End-End TLP Prefixes and does not consider limitations imposed by bridges or the Root Port beyond supported/not supported flags. This is intentional for two reasons: 1) PCIe r6.2 spec sections 2.2.10.4 & 6.2.4.4 indicate that a TLP is malformed only if the number of prefixes exceed the number of Max End-End TLP Prefixes, which seems to be the case even if the device could never receive that many prefixes due to smaller maximum imposed by a bridge or the Root Port. If TLP parsing is later added, this distinction is significant in interpreting what is logged by the TLP Prefix Log registers and the value matching to the Malformed TLP threshold is going to be more useful. 2) TLP Prefix handling happens autonomously on a low layer and the value in eetlp_prefix_max is not programmed anywhere by the kernel (i.e., there is no limiter OS can control to prevent sending more than N TLP Prefixes). 
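A sketch of how the new 2-bit field could be decoded from Device Capabilities 2 is shown below; this is an illustrative helper, not the actual probe code, and it assumes the spec encoding in which a raw value of 0 means the maximum of 4 prefixes, which is why the bitfield in struct pci_dev needs 3 bits.

#include <linux/bitfield.h>
#include <uapi/linux/pci_regs.h>

static unsigned int example_eetlp_prefix_max(u32 devcap2)
{
	if (!(devcap2 & PCI_EXP_DEVCAP2_EE_PREFIX))
		return 0;				/* not supported */

	/* bits 23:22; 1..3 are literal, 0 encodes 4 */
	return FIELD_GET(PCI_EXP_DEVCAP2_EE_PREFIX_MAX, devcap2) ?: 4;
}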
Link: https://lore.kernel.org/r/20250114170840.1633-7-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Yazen Ghannam --- include/linux/pci.h | 2 +- include/uapi/linux/pci_regs.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..21be5a1edf1a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -407,7 +407,7 @@ struct pci_dev { supported from root to here */ #endif unsigned int pasid_no_tlp:1; /* PASID works without TLP Prefix */ - unsigned int eetlp_prefix_path:1; /* End-to-End TLP Prefix */ + unsigned int eetlp_prefix_max:3; /* Max # of End-End TLP Prefixes, 0=not supported */ pci_channel_state_t error_state; /* Current connectivity state */ struct device dev; /* Generic device interface */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 1601c7ed5fab..14a6306c4ce1 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -665,6 +665,7 @@ #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ #define PCI_EXP_DEVCAP2_OBFF_WAKE 0x00080000 /* Re-use WAKE# for OBFF */ #define PCI_EXP_DEVCAP2_EE_PREFIX 0x00200000 /* End-End TLP Prefix */ +#define PCI_EXP_DEVCAP2_EE_PREFIX_MAX 0x00c00000 /* Max End-End TLP Prefixes */ #define PCI_EXP_DEVCTL2 0x28 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f /* Completion Timeout Value */ #define PCI_EXP_DEVCTL2_COMP_TMOUT_DIS 0x0010 /* Completion Timeout Disable */ -- cgit v1.2.3 From b6be5ba8f1c6b28c2daa039fd9e9df32f62852bd Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 13:13:18 +0000 Subject: socket: Remove unused kernel_sendmsg_locked The last use of kernel_sendmsg_locked() was removed in 2023 by commit dc97391e6610 ("sock: Remove ->sendpage*() in favour of sendmsg(MSG_SPLICE_PAGES)"). Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Kalesh AP Reviewed-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250112131318.63753-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/linux/net.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index b75bc534c1b3..0ff950eecc6b 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -343,8 +343,6 @@ static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); -int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); -- cgit v1.2.3 From f81a6d12bf8b262f4c8ce5e856a4d399d97612ee Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jan 2025 16:20:18 -0800 Subject: KVM: Open code kvm_set_memory_region() into its sole caller (ioctl() API) Open code kvm_set_memory_region() into its sole caller in preparation for adding a dedicated API for setting internal memslots. Opportunistically use the fancy new guard(mutex) to avoid a local 'r' variable.
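The guard(mutex) idiom referred to above looks roughly like this; the caller name and body are illustrative, the point is that the scope-based guard releases kvm->slots_lock on every return path, so no local 'r' is needed just to funnel returns through a single unlock.

#include <linux/cleanup.h>
#include <linux/kvm_host.h>

static int example_vm_ioctl_set_memory_region(struct kvm *kvm,
		const struct kvm_userspace_memory_region2 *mem)
{
	guard(mutex)(&kvm->slots_lock);	/* dropped automatically on return */

	return __kvm_set_memory_region(kvm, mem);
}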
Cc: Tao Su Reviewed-by: Xiaoyao Li Reviewed-by: Claudio Imbrenda Acked-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250111002022.1230573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 401439bb21e3..7443de24b1d9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1192,8 +1192,6 @@ enum kvm_mr_change { KVM_MR_FLAGS_ONLY, }; -int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -- cgit v1.2.3 From 156bffdb2b49fc0c869bf160a57378886f5fa92d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jan 2025 16:20:20 -0800 Subject: KVM: Add a dedicated API for setting KVM-internal memslots Add a dedicated API for setting internal memslots, and have it explicitly disallow setting userspace memslots. Setting a userspace memslots without a direct command from userspace would result in all manner of issues. No functional change intended. Cc: Tao Su Cc: Claudio Imbrenda Cc: Christian Borntraeger Reviewed-by: Xiaoyao Li Reviewed-by: Claudio Imbrenda Acked-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250111002022.1230573-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7443de24b1d9..8707d25a2e5b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1192,8 +1192,8 @@ enum kvm_mr_change { KVM_MR_FLAGS_ONLY, }; -int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region2 *mem); +int kvm_set_internal_memslot(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, -- cgit v1.2.3 From 344315e93dbc9c4733d5c9fd605ecfc46ef97180 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 10 Jan 2025 16:20:21 -0800 Subject: KVM: x86: Drop double-underscores from __kvm_set_memory_region() Now that there's no outer wrapper for __kvm_set_memory_region() and it's static, drop its double-underscore prefix. No functional change intended. 
Cc: Tao Su Reviewed-by: Xiaoyao Li Reviewed-by: Claudio Imbrenda Acked-by: Christoph Schlameuss Link: https://lore.kernel.org/r/20250111002022.1230573-5-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8707d25a2e5b..dcb59d6e8acb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1183,7 +1183,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn * -- just change its flags * * Since flags can be changed by some of these operations, the following - * differentiation is the best we can do for __kvm_set_memory_region(): + * differentiation is the best we can do for kvm_set_memory_region(): */ enum kvm_mr_change { KVM_MR_CREATE, -- cgit v1.2.3 From a648eb3a3f79e9736a59b28783700c2c691db419 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Dec 2024 11:34:14 +0100 Subject: genirq: Provide IRQCHIP_MOVE_DEFERRED The logic of GENERIC_PENDING_IRQ is backwards for historical reasons. Most interrupt controllers allow to move the interrupt from arbitrary contexts. If GENERIC_PENDING_IRQ is enabled by an architecture to support a chip, which requires the affinity change to happen in interrupt context, all other chips have to be marked with IRQF_MOVE_PCNTXT. That's tedious and there is no real good reason for the extra flags in the irq descriptor and the irq data status fields. In fact the decision whether interrupts can be moved in arbitrary context or not is a property of the interrupt chip. To simplify adoption for RISC-V provide a new mechanism which is enabled via a config switch and allows to add a flag to irq_chip::flags to request that interrupt affinity changes are deferred. Setting the top level chip of an interrupt evaluates the flag and maps it into the existing logic. The config switch and the various PCNTXT flags are temporary until x86 is converted over to this scheme. This intermediate step also allows trivial backporting of the mechanism to plug the affinity change race of various RISC-V interrupt controllers. 
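Adoption is a one-line change in the chip definition; a hypothetical driver whose hardware can only retarget an interrupt from within the handler would opt in as below (the chip and callbacks are illustrative, only the flag comes from this patch).

#include <linux/irq.h>

static void example_mask(struct irq_data *d) { }
static void example_unmask(struct irq_data *d) { }

static struct irq_chip example_chip = {
	.name		= "example",
	.irq_mask	= example_mask,
	.irq_unmask	= example_unmask,
	/* affinity changes are deferred to the next interrupt on this
	 * chip; the core maps the flag onto the GENERIC_PENDING_IRQ
	 * logic when this chip is installed as the top-level chip. */
	.flags		= IRQCHIP_MOVE_DEFERRED,
};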
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241210103335.500314436@linutronix.de --- include/linux/irq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 25f51bf3c351..6e021548fa0a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -567,6 +567,7 @@ struct irq_chip { * in the suspend path if they are in disabled state * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup * IRQCHIP_IMMUTABLE: Don't ever change anything in this chip + * IRQCHIP_MOVE_DEFERRED: Move the interrupt in actual interrupt context */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -581,6 +582,7 @@ enum { IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), IRQCHIP_IMMUTABLE = (1 << 11), + IRQCHIP_MOVE_DEFERRED = (1 << 12), }; #include -- cgit v1.2.3 From 763d1ebec843b5048761e094281281abbd9a75f0 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Sat, 28 Dec 2024 20:15:08 +0800 Subject: vdso: Correct typo in PAGE_SHIFT comment Signed-off-by: Haiyue Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241228121518.80812-1-haiyuewa@163.com --- include/vdso/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/vdso/page.h b/include/vdso/page.h index 710ae2414e68..bc47186c07fc 100644 --- a/include/vdso/page.h +++ b/include/vdso/page.h @@ -8,7 +8,7 @@ * PAGE_SHIFT determines the page size. * * Note: This definition is required because PAGE_SHIFT is used - * in several places throuout the codebase. + * in several places throughout the codebase. */ #define PAGE_SHIFT CONFIG_PAGE_SHIFT -- cgit v1.2.3 From f1359f46f1f1305340970b5073240126fe87254f Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 7 Jan 2025 12:06:39 +0200 Subject: drm/bridge: fix documentation for the hdmi_audio_prepare() callback Fix c&p error and change linuxdoc comment for the hdmi_audio_prepare() callback from drm_bridge_funcs to mention the callback name instead of the original prepare() callback. Fixes: 0beba3f9d366 ("drm/bridge: connector: add support for HDMI codec framework") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20250106174645.463927e0@canb.auug.org.au/ Acked-by: Maxime Ripard Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20250107-drm-bridge-fix-docs-v1-1-84e539e6f348@linaro.org Signed-off-by: Dmitry Baryshkov --- include/drm/drm_bridge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h index 4b84faf14e36..496dbbd2ad7e 100644 --- a/include/drm/drm_bridge.h +++ b/include/drm/drm_bridge.h @@ -691,7 +691,7 @@ struct drm_bridge_funcs { struct drm_bridge *bridge); /** - * @prepare: + * @hdmi_audio_prepare: * Configures HDMI-encoder for audio stream. Can be called multiple * times for each setup. Mandatory if HDMI audio is enabled in the * bridge's configuration. -- cgit v1.2.3 From 12c92098932b4bbf38396e9aed0a343d35437a21 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:49 +0000 Subject: debugfs: allow to store an additional opaque pointer at file creation Set by debugfs_create_file_aux(name, mode, parent, data, aux, fops). Plain debugfs_create_file() has it set to NULL. Accessed by debugfs_get_aux(file). Convenience macros for numeric opaque data - debugfs_create_file_aux_num and debugfs_get_aux_num, resp. 
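A hypothetical driver using the numeric convenience macros could look like the sketch below (all names are illustrative): the per-file index travels in the new aux slot while the data pointer keeps carrying the device.

#include <linux/debugfs.h>
#include <linux/seq_file.h>

#define EXAMPLE_NR_CHAN 4

struct example_dev {
	u64 chan_packets[EXAMPLE_NR_CHAN];
};

static int chan_stats_show(struct seq_file *s, void *unused)
{
	struct example_dev *dev = s->private;			/* data */
	unsigned long chan = debugfs_get_aux_num(s->file);	/* aux */

	seq_printf(s, "channel %lu: %llu packets\n", chan,
		   (unsigned long long)dev->chan_packets[chan]);
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(chan_stats);

static void example_add_debugfs(struct example_dev *dev, struct dentry *dir)
{
	char name[16];

	for (unsigned long i = 0; i < EXAMPLE_NR_CHAN; i++) {
		snprintf(name, sizeof(name), "chan%lu", i);
		debugfs_create_file_aux_num(name, 0444, dir, dev, i,
					    &chan_stats_fops);
	}
}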
Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-5-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 59444b495d49..7c97417d73b5 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -79,9 +79,11 @@ struct debugfs_short_fops { struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct file_operations *fops); struct dentry *debugfs_create_file_short(const char *name, umode_t mode, struct dentry *parent, void *data, + const void *aux, const struct debugfs_short_fops *fops); /** @@ -126,7 +128,15 @@ struct dentry *debugfs_create_file_short(const char *name, umode_t mode, const struct debugfs_short_fops *: debugfs_create_file_short, \ struct file_operations *: debugfs_create_file_full, \ struct debugfs_short_fops *: debugfs_create_file_short) \ - (name, mode, parent, data, fops) + (name, mode, parent, data, NULL, fops) + +#define debugfs_create_file_aux(name, mode, parent, data, aux, fops) \ + _Generic(fops, \ + const struct file_operations *: debugfs_create_file_full, \ + const struct debugfs_short_fops *: debugfs_create_file_short, \ + struct file_operations *: debugfs_create_file_full, \ + struct debugfs_short_fops *: debugfs_create_file_short) \ + (name, mode, parent, data, aux, fops) struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, struct dentry *parent, void *data, @@ -153,6 +163,7 @@ void debugfs_remove(struct dentry *dentry); void debugfs_lookup_and_remove(const char *name, struct dentry *parent); const struct file_operations *debugfs_real_fops(const struct file *filp); +const void *debugfs_get_aux(const struct file *file); int debugfs_file_get(struct dentry *dentry); void debugfs_file_put(struct dentry *dentry); @@ -259,6 +270,14 @@ static inline struct dentry *debugfs_lookup(const char *name, return ERR_PTR(-ENODEV); } +static inline struct dentry *debugfs_create_file_aux(const char *name, + umode_t mode, struct dentry *parent, + void *data, void *aux, + const void *fops) +{ + return ERR_PTR(-ENODEV); +} + static inline struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const void *fops) @@ -312,6 +331,7 @@ static inline void debugfs_lookup_and_remove(const char *name, { } const struct file_operations *debugfs_real_fops(const struct file *filp); +void *debugfs_get_aux(const struct file *file); static inline int debugfs_file_get(struct dentry *dentry) { @@ -452,6 +472,11 @@ static inline ssize_t debugfs_read_file_str(struct file *file, #endif +#define debugfs_create_file_aux_num(name, mode, parent, data, n, fops) \ + debugfs_create_file_aux(name, mode, parent, data, \ + (void *)(unsigned long)n, fops) +#define debugfs_get_aux_num(f) (unsigned long)debugfs_get_aux(f) + /** * debugfs_create_xul - create a debugfs file that is used to read and write an * unsigned long value, formatted in hexadecimal -- cgit v1.2.3 From d1433c7ba289319983ec0086dd22524721a797ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:06:50 +0000 Subject: debugfs: take debugfs_short_fops definition out of ifdef Signed-off-by: Al Viro Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20250112080705.141166-6-viro@zeniv.linux.org.uk Signed-off-by: Greg 
Kroah-Hartman --- include/linux/debugfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 7c97417d73b5..68e9c6cbd835 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -67,16 +67,16 @@ static const struct file_operations __fops = { \ typedef struct vfsmount *(*debugfs_automount_t)(struct dentry *, void *); -#if defined(CONFIG_DEBUG_FS) - -struct dentry *debugfs_lookup(const char *name, struct dentry *parent); - struct debugfs_short_fops { ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); loff_t (*llseek) (struct file *, loff_t, int); }; +#if defined(CONFIG_DEBUG_FS) + +struct dentry *debugfs_lookup(const char *name, struct dentry *parent); + struct dentry *debugfs_create_file_full(const char *name, umode_t mode, struct dentry *parent, void *data, const void *aux, -- cgit v1.2.3 From f7862dfef6612b87b2ad8352c4d73886f09456d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jan 2025 08:07:05 +0000 Subject: saner replacement for debugfs_rename() Existing primitive has several problems: 1) calling conventions are clumsy - it returns a dentry reference that is either identical to its second argument or is an ERR_PTR(-E...); in both cases no refcount changes happen. Inconvenient for users and bug-prone; it would be better to have it return 0 on success and -E... on failure. 2) it allows cross-directory moves; however, no such caller have ever materialized and considering the way debugfs is used, it's unlikely to happen in the future. What's more, any such caller would have fun issues to deal with wrt interplay with recursive removal. It also makes the calling conventions clumsier... 3) tautological rename fails; the callers have no race-free way to deal with that. 4) new name must have been formed by the caller; quite a few callers have it done by sprintf/kasprintf/etc., ending up with considerable boilerplate. Proposed replacement: int debugfs_change_name(dentry, fmt, ...). All callers convert to that easily, and it's simpler internally. IMO debugfs_rename() should go; if we ever get a real-world use case for cross-directory moves in debugfs, we can always look into the right way to handle that. Signed-off-by: Al Viro Link: https://lore.kernel.org/r/20250112080705.141166-21-viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 68e9c6cbd835..fa2568b4380d 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -175,8 +175,7 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf, ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); -struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, const char *new_name); +int debugfs_change_name(struct dentry *dentry, const char *fmt, ...) 
__printf(2, 3); void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value); @@ -361,10 +360,10 @@ static inline ssize_t debugfs_attr_write_signed(struct file *file, return -ENODEV; } -static inline struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, - struct dentry *new_dir, char *new_name) +static inline int __printf(2, 3) debugfs_change_name(struct dentry *dentry, + const char *fmt, ...) { - return ERR_PTR(-ENODEV); + return -ENODEV; } static inline void debugfs_create_u8(const char *name, umode_t mode, -- cgit v1.2.3 From 04e97fa7dd7e3eda754712f92df2136acd1d9088 Mon Sep 17 00:00:00 2001 From: Laurentiu Mihalcea Date: Tue, 14 Jan 2025 13:43:14 -0500 Subject: ASoC: simple-card-utils: fix priv->dai_props indexing As of commit cb18cd26039f ("ASoC: soc-core: do rtd->id trick at snd_soc_add_pcm_runtime()") the ID stored in the PCM runtime data can no longer be safely used to index the priv->dai_props array. This is because the ID may be modified during snd_soc_add_pcm_runtime(), thus resulting in an ID that's no longer a valid array index. To fix this, use the position of the dai_link stored inside the PCM runtime data relative to the start of the dai_link array as index into the priv->dai_props array. Signed-off-by: Laurentiu Mihalcea Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20250114184314.3583-2-laurentiumihalcea111@gmail.com Signed-off-by: Mark Brown --- include/sound/simple_card_utils.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/simple_card_utils.h b/include/sound/simple_card_utils.h index 3360d9eab068..36a485571142 100644 --- a/include/sound/simple_card_utils.h +++ b/include/sound/simple_card_utils.h @@ -89,6 +89,13 @@ struct simple_util_priv { #define simple_props_to_dai_codec(props, i) ((props)->codec_dai + i) #define simple_props_to_codec_conf(props, i) ((props)->codec_conf + i) +/* has the same effect as simple_priv_to_props(). Preferred over + * simple_priv_to_props() when dealing with PCM runtime data as + * the ID stored in rtd->id may not be a valid array index. + */ +#define runtime_simple_priv_to_props(priv, rtd) \ + ((priv)->dai_props + ((rtd)->dai_link - (priv)->dai_link)) + #define for_each_prop_dlc_cpus(props, i, cpu) \ for ((i) = 0; \ ((i) < (props)->num.cpus) && \ -- cgit v1.2.3 From d4e91adfc261de1454538ee6437e6014a2d1fca1 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Tue, 14 Jan 2025 22:56:16 +0100 Subject: ASoC: soc-dai: add snd_soc_dai_prepare() and use it internally Add a new snd_soc_dai_prepare() which can be used (in an upcoming patch) by soc-dapm.c. Use this new function internally in snd_soc_pcm_dai_prepare() to avoid duplicating code. 
Suggested-by: Jerome Brunet Reviewed-by: Charles Keepax Reviewed-by: Jerome Brunet Signed-off-by: Martin Blumenstingl Acked-by: Kuninori Morimoto Link: https://patch.msgid.link/20250114215617.336105-2-martin.blumenstingl@googlemail.com Signed-off-by: Mark Brown --- include/sound/soc-dai.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/soc-dai.h b/include/sound/soc-dai.h index aab57c19f62b..a11501752637 100644 --- a/include/sound/soc-dai.h +++ b/include/sound/soc-dai.h @@ -193,6 +193,9 @@ int snd_soc_dai_set_channel_map(struct snd_soc_dai *dai, int snd_soc_dai_set_tristate(struct snd_soc_dai *dai, int tristate); +int snd_soc_dai_prepare(struct snd_soc_dai *dai, + struct snd_pcm_substream *substream); + /* Digital Audio Interface mute */ int snd_soc_dai_digital_mute(struct snd_soc_dai *dai, int mute, int direction); -- cgit v1.2.3 From cec8c359f87c0f7c9cf63b570c0ce968b5ef62a4 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Mon, 13 Jan 2025 23:13:14 +0100 Subject: Input: i8042 - Add support for platform filter contexts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the platform filter cannot access any driver-specific state which forces drivers installing a i8042 filter to have at least some kind of global pointer for their filter. Allow callers of i8042_install_filter() to submit a context pointer which is then passed to the i8042 filter. This frees drivers from the responsibility of having to manage this global pointer themself. Also introduce a separate type for the i8042 filter (i8042_filter_t) so that the function definitions can stay compact. Tested on a Dell Inspiron 3505. Reviewed-by: Ilpo Järvinen Acked-by: Dmitry Torokhov Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250113221314.435812-1-W_Armin@gmx.de Signed-off-by: Ilpo Järvinen --- include/linux/i8042.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/i8042.h b/include/linux/i8042.h index 95b07f8b77fe..00037c13abc8 100644 --- a/include/linux/i8042.h +++ b/include/linux/i8042.h @@ -54,15 +54,29 @@ struct serio; +/** + * typedef i8042_filter_t - i8042 filter callback + * @data: Data received by the i8042 controller + * @str: Status register of the i8042 controller + * @serio: Serio of the i8042 controller + * @context: Context pointer associated with this callback + * + * This represents a i8042 filter callback which can be used with i8042_install_filter() + * and i8042_remove_filter() to filter the i8042 input for platform-specific key codes. + * + * Context: Interrupt context. + * Returns: true if the data should be filtered out, false if otherwise. 
+ */ +typedef bool (*i8042_filter_t)(unsigned char data, unsigned char str, struct serio *serio, + void *context); + #if defined(CONFIG_SERIO_I8042) || defined(CONFIG_SERIO_I8042_MODULE) void i8042_lock_chip(void); void i8042_unlock_chip(void); int i8042_command(unsigned char *param, int command); -int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)); -int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)); +int i8042_install_filter(i8042_filter_t filter, void *context); +int i8042_remove_filter(i8042_filter_t filter); #else @@ -79,14 +93,12 @@ static inline int i8042_command(unsigned char *param, int command) return -ENODEV; } -static inline int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)) +static inline int i8042_install_filter(i8042_filter_t filter, void *context) { return -ENODEV; } -static inline int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, - struct serio *serio)) +static inline int i8042_remove_filter(i8042_filter_t filter) { return -ENODEV; } -- cgit v1.2.3 From 1bebc7869c99d466f819dd2cffaef0edf7d7a035 Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Thu, 14 Nov 2024 18:39:29 +0100 Subject: Input: allocate keycode for phone linking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The F11 key on the new Lenovo Thinkpad T14 Gen 5, T16 Gen 3, and P14s Gen 5 laptops includes a symbol showing a smartphone and a laptop chained together. According to the user manual, it starts the Microsoft Phone Link software used to connect to Android/iOS devices and relay messages/calls or sync data. As there are no suitable keycodes for this action, introduce a new one. Signed-off-by: Illia Ostapyshyn Acked-by: Dmitry Torokhov Link: https://lore.kernel.org/r/20241114173930.44983-2-illia@yshyn.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/uapi/linux/input-event-codes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index a4206723f503..5a199f3d4a26 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -519,6 +519,7 @@ #define KEY_NOTIFICATION_CENTER 0x1bc /* Show/hide the notification center */ #define KEY_PICKUP_PHONE 0x1bd /* Answer incoming call */ #define KEY_HANGUP_PHONE 0x1be /* Decline incoming call */ +#define KEY_LINK_PHONE 0x1bf /* AL Phone Syncing */ #define KEY_DEL_EOL 0x1c0 #define KEY_DEL_EOS 0x1c1 -- cgit v1.2.3 From e209e5ccc5ac4b2637159ea19be95a1b16846f31 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:11 -0500 Subject: Bluetooth: MGMT: Mark LL Privacy as stable This marks LL Privacy as stable by removing its experimental UUID and move its functionality to Device Flag (HCI_CONN_FLAG_ADDRESS_RESOLUTION) which can be set by MGMT Device Set Flags so userspace retain control of the feature. 
Link: https://github.com/bluez/bluez/issues/1028 Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 - include/net/bluetooth/hci_core.h | 11 ++++------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 6203bd8663b7..0d51970d809f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -438,7 +438,6 @@ enum { HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, - HCI_ENABLE_LL_PRIVACY, HCI_CMD_PENDING, HCI_FORCE_NO_MITM, HCI_QUALITY_REPORT, diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ca22ead85dbe..734cd50cdc46 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -157,8 +157,9 @@ struct bdaddr_list_with_irk { /* Bitmask of connection flags */ enum hci_conn_flags { - HCI_CONN_FLAG_REMOTE_WAKEUP = 1, - HCI_CONN_FLAG_DEVICE_PRIVACY = 2, + HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0), + HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1), + HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2), }; typedef u8 hci_conn_flags_t; @@ -1919,11 +1920,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) -/* Use LL Privacy based address resolution if supported */ -#define use_ll_privacy(dev) (ll_privacy_capable(dev) && \ - hci_dev_test_flag(dev, HCI_ENABLE_LL_PRIVACY)) - -#define privacy_mode_capable(dev) (use_ll_privacy(dev) && \ +#define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ (hdev->commands[39] & 0x04)) #define read_key_size_capable(dev) \ -- cgit v1.2.3 From b05ce8896091233fb0e4bf506862abaebdd8945b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 16 Dec 2024 01:26:36 +0000 Subject: Bluetooth: hci: Remove deadcode hci_bdaddr_list_del_with_flags() was added in 2020's commit 8baaa4038edb ("Bluetooth: Add bdaddr_list_with_flags for classic whitelist") but has remained unused. hci_remove_ext_adv_instance() was added in 2020's commit eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") but has remained unused. Remove them. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 -- include/net/bluetooth/hci_sync.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 734cd50cdc46..84b522a10019 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1760,8 +1760,6 @@ int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type); int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type); -int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, - u8 type); void hci_bdaddr_list_clear(struct list_head *list); struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index f3052cb252ef..7e2cf0cca939 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -140,7 +140,6 @@ int hci_update_scan(struct hci_dev *hdev); int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul); int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, struct sock *sk); -int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance); struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool ext, struct sock *sk); -- cgit v1.2.3 From f07d478090b0a03dda46fb45b5c6e089a8408351 Mon Sep 17 00:00:00 2001 From: Hsin-chen Chuang Date: Wed, 8 Jan 2025 20:24:43 +0800 Subject: Bluetooth: Get rid of cmd_timeout and use the reset callback The hdev->reset is never used now and the hdev->cmd_timeout actually does reset. This patch changes the call path from hdev->cmd_timeout -> vendor_cmd_timeout -> btusb_reset -> hdev->reset , to hdev->reset -> vendor_reset -> btusb_reset Which makes it clear when we export the hdev->reset to a wider usage e.g. allowing reset from sysfs. This patch doesn't introduce any behavior change. Signed-off-by: Hsin-chen Chuang Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 84b522a10019..f756fac95488 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -633,7 +633,6 @@ struct hci_dev { int (*post_init)(struct hci_dev *hdev); int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); - void (*cmd_timeout)(struct hci_dev *hdev); void (*reset)(struct hci_dev *hdev); bool (*wakeup)(struct hci_dev *hdev); int (*set_quality_report)(struct hci_dev *hdev, bool enable); -- cgit v1.2.3 From 31691914c392675bdc65d1e72dd8d129a1f0014f Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Mon, 28 Oct 2024 21:54:52 +0000 Subject: kunit: Introduce autorun option The new option controls tests run on boot or module load. With the new debugfs "run" dentry allowing to run tests on demand, an ability to disable automatic tests run becomes a useful option in case of intrusive tests. The option is set to true by default to preserve the existent behavior. It can be overridden by either the corresponding module option or by the corresponding config build option. 
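A sketch of how the flag is threaded through suite registration is shown below; it is simplified, the real call sites live in lib/kunit/ and the module parameter is presumably kunit.autorun. Suites remain registered and visible in debugfs either way, but are only executed at load time when autorun is left enabled.

#include <kunit/test.h>

static int example_register_suites(struct kunit_suite * const *suites,
				   int num_suites)
{
	/* attach the suites; run them now only if autorun is enabled */
	return __kunit_test_suites_init(suites, num_suites, kunit_autorun());
}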
Link: https://lore.kernel.org/r/173015245931.4747.16419517391658830640.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net Signed-off-by: Stanislav Kinsburskii Reviewed-by: Rae Moar Acked-by: David Gow Signed-off-by: Shuah Khan --- include/kunit/test.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 34b71e42fb10..58dbab60f853 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -312,6 +312,7 @@ static inline void kunit_set_failure(struct kunit *test) } bool kunit_enabled(void); +bool kunit_autorun(void); const char *kunit_action(void); const char *kunit_filter_glob(void); char *kunit_filter(void); @@ -334,7 +335,8 @@ kunit_filter_suites(const struct kunit_suite_set *suite_set, int *err); void kunit_free_suite_set(struct kunit_suite_set suite_set); -int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_suites); +int __kunit_test_suites_init(struct kunit_suite * const * const suites, int num_suites, + bool run_tests); void __kunit_test_suites_exit(struct kunit_suite **suites, int num_suites); -- cgit v1.2.3 From 6564862d646e7d630929ba1ff330740bb215bdac Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 9 Jan 2025 11:39:59 +0000 Subject: block: Ensure start sector is aligned for stacking atomic writes For stacking atomic writes, ensure that the start sector is aligned with the device atomic write unit min and any boundary. Otherwise, we may permit misaligned atomic writes. Rework bdev_can_atomic_write() into a common helper to reuse the alignment check. There, use atomic_write_hw_unit_min, which is more appropriate than atomic_write_unit_min. Fixes: d7f36dc446e89 ("block: Support atomic writes limits for stacked devices") Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20250109114000.2299896-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13d353351c37..7ac153e4423a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1706,6 +1706,15 @@ struct io_comp_batch { void (*complete)(struct io_comp_batch *); }; +static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, + struct queue_limits *limits) +{ + unsigned int alignment = max(limits->atomic_write_hw_unit_min, + limits->atomic_write_hw_boundary); + + return IS_ALIGNED(sector, alignment >> SECTOR_SHIFT); +} + static inline bool bdev_can_atomic_write(struct block_device *bdev) { struct request_queue *bd_queue = bdev->bd_queue; @@ -1714,15 +1723,9 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev) if (!limits->atomic_write_unit_min) return false; - if (bdev_is_partition(bdev)) { - sector_t bd_start_sect = bdev->bd_start_sect; - unsigned int alignment = - max(limits->atomic_write_unit_min, - limits->atomic_write_hw_boundary); - - if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) - return false; - } + if (bdev_is_partition(bdev)) + return blk_atomic_write_start_sect_aligned(bdev->bd_start_sect, + limits); return true; } -- cgit v1.2.3 From 0d602fcfb9d2fb9d6340bc4e82047df41c7aa214 Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Wed, 13 Nov 2024 02:00:55 +0300 Subject: dt-bindings: reset: add bindings for A1 SoC audio reset controller This reset controller is part of audio clock controller and handled by auxiliary reset driver. Introduced defines supposed to be used together with upcoming device tree nodes for audio clock controller fo A1 SoC. Signed-off-by: Jan Dakinevich Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20241112230056.1406222-2-jan.dakinevich@salutedevices.com Signed-off-by: Philipp Zabel --- .../reset/amlogic,meson-a1-audio-reset.h | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/dt-bindings/reset/amlogic,meson-a1-audio-reset.h (limited to 'include') diff --git a/include/dt-bindings/reset/amlogic,meson-a1-audio-reset.h b/include/dt-bindings/reset/amlogic,meson-a1-audio-reset.h new file mode 100644 index 000000000000..7693552f1507 --- /dev/null +++ b/include/dt-bindings/reset/amlogic,meson-a1-audio-reset.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Copyright (c) 2024, SaluteDevices. All Rights Reserved. 
+ * + * Author: Jan Dakinevich + */ + +#ifndef _DT_BINDINGS_AMLOGIC_MESON_A1_AUDIO_RESET_H +#define _DT_BINDINGS_AMLOGIC_MESON_A1_AUDIO_RESET_H + +#define AUD_RESET_DDRARB 0 +#define AUD_RESET_TDMIN_A 1 +#define AUD_RESET_TDMIN_B 2 +#define AUD_RESET_TDMIN_LB 3 +#define AUD_RESET_LOOPBACK 4 +#define AUD_RESET_TDMOUT_A 5 +#define AUD_RESET_TDMOUT_B 6 +#define AUD_RESET_FRDDR_A 7 +#define AUD_RESET_FRDDR_B 8 +#define AUD_RESET_TODDR_A 9 +#define AUD_RESET_TODDR_B 10 +#define AUD_RESET_SPDIFIN 11 +#define AUD_RESET_RESAMPLE 12 +#define AUD_RESET_EQDRC 13 +#define AUD_RESET_LOCKER 14 +#define AUD_RESET_TOACODEC 30 +#define AUD_RESET_CLKTREE 31 + +#define AUD_VAD_RESET_DDRARB 0 +#define AUD_VAD_RESET_PDM 1 +#define AUD_VAD_RESET_TDMIN_VAD 2 +#define AUD_VAD_RESET_TODDR_VAD 3 +#define AUD_VAD_RESET_TOVAD 4 +#define AUD_VAD_RESET_CLKTREE 5 + +#endif /* _DT_BINDINGS_AMLOGIC_MESON_A1_AUDIO_RESET_H */ -- cgit v1.2.3 From 72bb8275a3b0784a817e0371c49c0110d68bb7fb Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 9 Dec 2024 17:04:35 +0100 Subject: reset: amlogic: aux: drop aux registration helper Having the aux registration helper along with the registered driver is not great dependency wise. It does not allow the registering driver to be properly decoupled from the registered auxiliary driver. Drop the registration helper from the amlogic auxiliary reset driver. This will be handled in the registering clock driver to start with while a more generic solution is worked on. Suggested-by: Arnd Bergmann Signed-off-by: Jerome Brunet Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20241209-meson-rst-aux-rework-v1-2-d2afb69cc72e@baylibre.com Signed-off-by: Philipp Zabel --- include/soc/amlogic/reset-meson-aux.h | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 include/soc/amlogic/reset-meson-aux.h (limited to 'include') diff --git a/include/soc/amlogic/reset-meson-aux.h b/include/soc/amlogic/reset-meson-aux.h deleted file mode 100644 index d8a15d48c984..000000000000 --- a/include/soc/amlogic/reset-meson-aux.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SOC_RESET_MESON_AUX_H -#define __SOC_RESET_MESON_AUX_H - -#include - -struct device; -struct regmap; - -#if IS_ENABLED(CONFIG_RESET_MESON_AUX) -int devm_meson_rst_aux_register(struct device *dev, - struct regmap *map, - const char *adev_name); -#else -static inline int devm_meson_rst_aux_register(struct device *dev, - struct regmap *map, - const char *adev_name) -{ - return 0; -} -#endif - -#endif /* __SOC_RESET_MESON_AUX_H */ -- cgit v1.2.3 From a3751212a8eeece59d2018c455000f30ed7e5bb7 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 14 Jan 2025 15:37:08 -0500 Subject: PCI: Add enable_device() and disable_device() callbacks for bridges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PCI host bridges require special handling when enabling or disabling PCI devices. For example, the i.MX95 platform has a lookup table to map Requester IDs to StreamIDs, which the SMMU and MSI controller use to identify the source of DMA accesses. Without this mapping, DMA accesses may target unintended memory, which would corrupt memory or read the wrong data. Add a host bridge enable_device() hook the imx6 driver can use to configure the Requester ID to StreamID mapping. The hardware table isn't big enough to map all possible Requester IDs, so this hook may fail if no table space is available. In that case, return failure from pci_enable_device(). 
It might make more sense to make pci_set_master() decline to enable bus mastering and return failure, but it currently doesn't have a way to return failure. Link: https://lore.kernel.org/r/20250114-imx95_lut-v9-1-39f58dbed03a@nxp.com Tested-by: Marc Zyngier Signed-off-by: Frank Li Signed-off-by: Bjorn Helgaas [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Reviewed-by: Marc Zyngier Acked-by: Manivannan Sadhasivam --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..bcbef004dd56 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -595,6 +595,8 @@ struct pci_host_bridge { u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); + int (*enable_device)(struct pci_host_bridge *bridge, struct pci_dev *dev); + void (*disable_device)(struct pci_host_bridge *bridge, struct pci_dev *dev); void *release_data; unsigned int ignore_reset_delay:1; /* For entire hierarchy */ unsigned int no_ext_tags:1; /* No Extended Tags */ -- cgit v1.2.3 From 042087247835dad1ec5e39052abf022fd13c6326 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:23 +0100 Subject: mtd: spinand: Create distinct fast and slow read from cache variants So far, the SPINAND_PAGE_READ_FROM_CACHE_OP macro was taking a first argument, "fast", which was inducing the possibility to support higher bus frequencies than with the normal (slower) read from cache alternative. In practice, without frequency change on the bus, this was likely without effect, besides perhaps allowing another variant of the same command, that could run at the default highest speed. If we want to support this fully, we need to add a frequency parameter to the slowest command. But before we do that, let's drop the "fast" boolean from the macro and duplicate it, this will further help supporting having different frequencies allowed for each variant. The change is also of course propagated to all users. It has the nice effect to have all macros aligned on the same pattern. Reviewed-by: Tudor Ambarus Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index cbbcd44ac225..0d80fa9400d5 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -62,14 +62,26 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PAGE_READ_FROM_CACHE_OP(fast, addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 0x0b : 0x03, 1), \ +#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(fast, addr, ndummy, buf, len) \ - SPI_MEM_OP(SPI_MEM_OP_CMD(fast ? 
0x0b : 0x03, 1), \ +#define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ + SPI_MEM_OP_ADDR(2, addr, 1), \ + SPI_MEM_OP_DUMMY(ndummy, 1), \ + SPI_MEM_OP_DATA_IN(len, buf, 1)) + +#define SPINAND_PAGE_READ_FROM_CACHE_OP_3A(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ + SPI_MEM_OP_ADDR(3, addr, 1), \ + SPI_MEM_OP_DUMMY(ndummy, 1), \ + SPI_MEM_OP_DATA_IN(len, buf, 1)) + +#define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP_3A(addr, ndummy, buf, len) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) -- cgit v1.2.3 From 7ce0d16d5802bfde4209e52ee8ad644ca1eab423 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:24 +0100 Subject: mtd: spinand: Add an optional frequency to read from cache macros While the SPINAND_PAGE_READ_FROM_CACHE_FAST_OP macro is supposed to be able to run at the flash highest supported frequency, it is not the case of the regular read from cache, which may be limited in terms of maximum frequency. Add an optional argument to this macro, which will be used to set the maximum frequency, if any. Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 0d80fa9400d5..2f5cce23b3ec 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -62,11 +62,12 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) -#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_OP(addr, ndummy, buf, len, ...) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x03, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ SPI_MEM_OP_DUMMY(ndummy, 1), \ - SPI_MEM_OP_DATA_IN(len, buf, 1)) + SPI_MEM_OP_DATA_IN(len, buf, 1), \ + __VA_OPT__(SPI_MEM_OP_MAX_FREQ(__VA_ARGS__))) #define SPINAND_PAGE_READ_FROM_CACHE_FAST_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x0b, 1), \ -- cgit v1.2.3 From 8586bc8d95488dfaadbc1af89ba59900d2c39119 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Jan 2025 15:45:26 +0100 Subject: mtd: spinand: Add support for read DTR operations Advanced SPI-NAND chips are capable of reading data much faster by leveraging DTR support. This support extends to dual and quad configurations. 
Create macros defining all possible read from cache DTR variants: - SPINAND_PAGE_READ_FROM_CACHE_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_X2_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_X4_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_DUALIO_DTR_OP - SPINAND_PAGE_READ_FROM_CACHE_QUADIO_DTR_OP Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 2f5cce23b3ec..0da8a1c7740e 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -87,6 +87,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 1)) +#define SPINAND_PAGE_READ_FROM_CACHE_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x0d, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 1), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_X2_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x3b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ @@ -99,6 +106,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 2)) +#define SPINAND_PAGE_READ_FROM_CACHE_X2_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x3d, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_X4_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x6b, 1), \ SPI_MEM_OP_ADDR(2, addr, 1), \ @@ -111,6 +125,13 @@ SPI_MEM_OP_DUMMY(ndummy, 1), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) +#define SPINAND_PAGE_READ_FROM_CACHE_X4_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0x6d, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 1), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 1), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ SPI_MEM_OP_ADDR(2, addr, 2), \ @@ -123,6 +144,13 @@ SPI_MEM_OP_DUMMY(ndummy, 2), \ SPI_MEM_OP_DATA_IN(len, buf, 2)) +#define SPINAND_PAGE_READ_FROM_CACHE_DUALIO_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xbd, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 2), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 2), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(2, addr, 4), \ @@ -135,6 +163,13 @@ SPI_MEM_OP_DUMMY(ndummy, 4), \ SPI_MEM_OP_DATA_IN(len, buf, 4)) +#define SPINAND_PAGE_READ_FROM_CACHE_QUADIO_DTR_OP(addr, ndummy, buf, len, freq) \ + SPI_MEM_OP(SPI_MEM_OP_CMD(0xed, 1), \ + SPI_MEM_DTR_OP_ADDR(2, addr, 4), \ + SPI_MEM_DTR_OP_DUMMY(ndummy, 4), \ + SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ + SPI_MEM_OP_MAX_FREQ(freq)) + #define SPINAND_PROG_EXEC_OP(addr) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0x10, 1), \ SPI_MEM_OP_ADDR(3, addr, 1), \ -- cgit v1.2.3 From 2d2a46cf23788a19e5450c6f9c86ab17f596c708 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 16:01:32 +0000 Subject: timekeeping: Remove unused ktime_get_fast_timestamps() ktime_get_fast_timestamps() was added in 2020 by commit e2d977c9f1ab ("timekeeping: Provide multi-timestamp accessor to NMI safe timekeeper") but has remained unused. Remove it. [ tglx: Fold the inline as David suggested in the submission ] Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250112160132.450209-1-linux@treblig.org --- include/linux/timekeeping.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 0e035f675efe..542773650200 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -263,18 +263,6 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/** - * struct ktime_timestamps - Simultaneous mono/boot/real timestamps - * @mono: Monotonic timestamp - * @boot: Boottime timestamp - * @real: Realtime timestamp - */ -struct ktime_timestamps { - u64 mono; - u64 boot; - u64 real; -}; - /** * struct system_time_snapshot - simultaneous raw/real time capture with * counter value @@ -345,9 +333,6 @@ extern int get_device_system_crosststamp( */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); -/* NMI safe mono/boot/realtime timestamps */ -extern void ktime_get_fast_timestamps(struct ktime_timestamps *snap); - /* * Persistent clock related interfaces */ -- cgit v1.2.3 From f94a18249b7f9131f3ca8eacf07f21050747ebd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Dec 2024 11:34:17 +0100 Subject: genirq: Remove IRQ_MOVE_PCNTXT and related code Now that x86 is converted over to use the IRQCHIP_MOVE_DEFERRED flags, remove IRQ*_MOVE_PCNTXT and related code. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20241210103335.626707225@linutronix.de --- include/linux/irq.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index 6e021548fa0a..8daa17f0107a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -64,7 +64,6 @@ enum irqchip_irq_state; * IRQ_NOAUTOEN - Interrupt is not automatically enabled in * request/setup_irq() * IRQ_NO_BALANCING - Interrupt cannot be balanced (affinity set) - * IRQ_MOVE_PCNTXT - Interrupt can be migrated from process context * IRQ_NESTED_THREAD - Interrupt nests into another thread * IRQ_PER_CPU_DEVID - Dev_id is a per-cpu variable * IRQ_IS_POLLED - Always polled by another interrupt. 
Exclude @@ -93,7 +92,6 @@ enum { IRQ_NOREQUEST = (1 << 11), IRQ_NOAUTOEN = (1 << 12), IRQ_NO_BALANCING = (1 << 13), - IRQ_MOVE_PCNTXT = (1 << 14), IRQ_NESTED_THREAD = (1 << 15), IRQ_NOTHREAD = (1 << 16), IRQ_PER_CPU_DEVID = (1 << 17), @@ -105,7 +103,7 @@ enum { #define IRQF_MODIFY_MASK \ (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ - IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ + IRQ_NOAUTOEN | IRQ_LEVEL | IRQ_NO_BALANCING | \ IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_HIDDEN) @@ -201,8 +199,6 @@ struct irq_data { * IRQD_LEVEL - Interrupt is level triggered * IRQD_WAKEUP_STATE - Interrupt is configured for wakeup * from suspend - * IRQD_MOVE_PCNTXT - Interrupt can be moved in process - * context * IRQD_IRQ_DISABLED - Disabled state of the interrupt * IRQD_IRQ_MASKED - Masked state of the interrupt * IRQD_IRQ_INPROGRESS - In progress state of the interrupt @@ -233,7 +229,6 @@ enum { IRQD_AFFINITY_SET = BIT(12), IRQD_LEVEL = BIT(13), IRQD_WAKEUP_STATE = BIT(14), - IRQD_MOVE_PCNTXT = BIT(15), IRQD_IRQ_DISABLED = BIT(16), IRQD_IRQ_MASKED = BIT(17), IRQD_IRQ_INPROGRESS = BIT(18), @@ -338,11 +333,6 @@ static inline bool irqd_is_wakeup_set(struct irq_data *d) return __irqd_to_state(d) & IRQD_WAKEUP_STATE; } -static inline bool irqd_can_move_in_process_context(struct irq_data *d) -{ - return __irqd_to_state(d) & IRQD_MOVE_PCNTXT; -} - static inline bool irqd_irq_disabled(struct irq_data *d) { return __irqd_to_state(d) & IRQD_IRQ_DISABLED; -- cgit v1.2.3 From 477ac7b0a7984e12e8db07fe54ad64443c5f8928 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Dec 2024 15:01:44 +0000 Subject: PCI: host-generic: Allow {en,dis}able_device() to be provided via pci_ecam_ops In order to let host controller drivers using the host-generic infrastructure use the {en,dis}able_device() callbacks that can be used to configure sideband RID mapping hardware, provide these two callbacks as part of the pci_ecam_ops structure. Link: https://lore.kernel.org/r/20241204150145.800408-2-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Bjorn Helgaas Reviewed-by: Frank Li Reviewed-by: Manivannan Sadhasivam --- include/linux/pci-ecam.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 3a4860bd2758..3a10f8cfc3ad 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -45,6 +45,10 @@ struct pci_ecam_ops { unsigned int bus_shift; struct pci_ops pci_ops; int (*init)(struct pci_config_window *); + int (*enable_device)(struct pci_host_bridge *, + struct pci_dev *); + void (*disable_device)(struct pci_host_bridge *, + struct pci_dev *); }; /* -- cgit v1.2.3 From 27c3f0e61f19d2306527406cad233d5f5915ca1e Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 15 Jan 2025 08:39:18 +0100 Subject: i2c: add kdoc for the new debugfs entry of clients When adding the new debugfs entry, its kdoc equivalent was forgotten. Add it now. 
Fixes: d06905d68610 ("i2c: add core-managed per-client directory in debugfs") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/r/20250115163146.6c48f066@canb.auug.org.au Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 36de788dc7fe..c31fd1dba3bd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -317,6 +317,8 @@ struct i2c_driver { * calls it to pass on slave events to the slave driver. * @devres_group_id: id of the devres group that will be created for resources * acquired when probing this device. + * @debugfs: pointer to the debugfs subdirectory which the I2C core created + * for this client. * * An i2c_client identifies a single device (i.e. chip) connected to an * i2c bus. The behaviour exposed to Linux is defined by the driver -- cgit v1.2.3 From c6739623c91bb3d6e9b20e05afbe69a2664f2d70 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 13 Jan 2025 09:22:29 +0000 Subject: net: phylink: pass neg_mode into .pcs_get_state() method Pass the current neg_mode into the .pcs_get_state() method. Update all users of phylink PCS. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1tXGeT-000Et3-4L@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 4b7a20620b49..0bbcb4898e93 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -446,7 +446,7 @@ struct phylink_pcs_ops { phy_interface_t interface); int (*pcs_post_config)(struct phylink_pcs *pcs, phy_interface_t interface); - void (*pcs_get_state)(struct phylink_pcs *pcs, + void (*pcs_get_state)(struct phylink_pcs *pcs, unsigned int neg_mode, struct phylink_link_state *state); int (*pcs_config)(struct phylink_pcs *pcs, unsigned int neg_mode, phy_interface_t interface, @@ -505,6 +505,7 @@ void pcs_disable(struct phylink_pcs *pcs); /** * pcs_get_state() - Read the current inband link state from the hardware * @pcs: a pointer to a &struct phylink_pcs. + * @neg_mode: link negotiation mode (PHYLINK_PCS_NEG_xxx) * @state: a pointer to a &struct phylink_link_state. * * Read the current inband link state from the MAC PCS, reporting the @@ -513,8 +514,11 @@ void pcs_disable(struct phylink_pcs *pcs); * negotiation completion state in @state->an_complete, and link up state * in @state->link. If possible, @state->lp_advertising should also be * populated. + * + * Note that the @neg_mode parameter is always the PHYLINK_PCS_NEG_xxx + * state, not MLO_AN_xxx. */ -void pcs_get_state(struct phylink_pcs *pcs, +void pcs_get_state(struct phylink_pcs *pcs, unsigned int neg_mode, struct phylink_link_state *state); /** -- cgit v1.2.3 From 7e3cb4e874ab0dcf8b10e43e5068824bf0adcb4c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 13 Jan 2025 09:22:34 +0000 Subject: net: phylink: pass neg_mode into c22 state decoder Pass the current neg_mode into phylink_mii_c22_pcs_get_state() and phylink_mii_c22_pcs_decode_state(). Update all users of phylink PCS that use these functions. 
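For illustration only (the foo_pcs wrapper below is invented, not taken from these patches), a PCS driver built on the clause-22 helpers would now forward neg_mode straight through its callback:

/* Hypothetical PCS driver callback showing the new signatures. */
static void foo_pcs_get_state(struct phylink_pcs *pcs, unsigned int neg_mode,
			      struct phylink_link_state *state)
{
	struct foo_pcs *fp = container_of(pcs, struct foo_pcs, pcs);

	/* neg_mode is a PHYLINK_PCS_NEG_xxx value, not MLO_AN_xxx */
	phylink_mii_c22_pcs_get_state(fp->mdio, neg_mode, state);
}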
Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1tXGeY-000Et9-8g@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 0bbcb4898e93..f19b7108c840 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -693,8 +693,9 @@ static inline int phylink_get_link_timer_ns(phy_interface_t interface) } void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, - u16 bmsr, u16 lpa); + unsigned int neg_mode, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, + unsigned int neg_mode, struct phylink_link_state *state); int phylink_mii_c22_pcs_encode_advertisement(phy_interface_t interface, const unsigned long *advertising); -- cgit v1.2.3 From df998c22321dde3f70cd3cf8c183dfd6bf64c759 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Wed, 8 Jan 2025 17:13:45 +0300 Subject: power: supply: add undervoltage health status property Add POWER_SUPPLY_HEALTH_UNDERVOLTAGE status for power supply to report under voltage lockout failures. Signed-off-by: Dzmitry Sankouski Link: https://lore.kernel.org/r/20250108-starqltechn_integration_upstream-v14-1-f6e84ec20d96@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index c3ce9f2b17d4..6ed53b292162 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -60,6 +60,7 @@ enum { POWER_SUPPLY_HEALTH_OVERHEAT, POWER_SUPPLY_HEALTH_DEAD, POWER_SUPPLY_HEALTH_OVERVOLTAGE, + POWER_SUPPLY_HEALTH_UNDERVOLTAGE, POWER_SUPPLY_HEALTH_UNSPEC_FAILURE, POWER_SUPPLY_HEALTH_COLD, POWER_SUPPLY_HEALTH_WATCHDOG_TIMER_EXPIRE, -- cgit v1.2.3 From 197258f0ef685ddbd534254dc79f49faa47dc93d Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:43 +0000 Subject: net: ethtool: add hds_config member in ethtool_netdev_state When tcp-data-split is UNKNOWN mode, drivers arbitrarily handle it. For example, bnxt_en driver automatically enables if at least one of LRO/GRO/JUMBO is enabled. If tcp-data-split is UNKNOWN and LRO is enabled, a driver returns ENABLES of tcp-data-split, not UNKNOWN. So, `ethtool -g eth0` shows tcp-data-split is enabled. The problem is in the setting situation. In the ethnl_set_rings(), it first calls get_ringparam() to get the current driver's config. At that moment, if driver's tcp-data-split config is UNKNOWN, it returns ENABLE if LRO/GRO/JUMBO is enabled. Then, it sets values from the user and driver's current config to kernel_ethtool_ringparam. Last it calls .set_ringparam(). The driver, especially bnxt_en driver receives ETHTOOL_TCP_DATA_SPLIT_ENABLED. But it can't distinguish whether it is set by the user or just the current config. When user updates ring parameter, the new hds_config value is updated and current hds_config value is stored to old_hdsconfig. Driver's .set_ringparam() callback can distinguish a passed tcp-data-split value is came from user explicitly. If .set_ringparam() is failed, hds_config is rollbacked immediately. 
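A minimal sketch of the driver-side benefit (not from the patch; the helper is hypothetical and it assumes the ethtool state is reachable via dev->ethtool): because ->hds_config records what userspace explicitly asked for, a .set_ringparam() implementation can tell a user-forced setting from a driver default:

static bool example_hds_forced_on(struct net_device *dev)
{
	/* True only when ethtool explicitly requested tcp-data-split on,
	 * not when the driver merely reports it as currently enabled.
	 */
	return dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED;
}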
Suggested-by: Jakub Kicinski Reviewed-by: Jakub Kicinski Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-2-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ include/linux/netdevice.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 20a86bd5f4e3..d79bd201c1c8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1157,12 +1157,14 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. + * @hds_config: HDS value from userspace. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; + u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bced03fb349e..3e6336775baf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4082,6 +4082,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); +u8 dev_xdp_sb_prog_count(struct net_device *dev); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); u32 dev_get_min_mp_channel_count(const struct net_device *dev); -- cgit v1.2.3 From eec8359f0797ef87c6ef6cbed6de08b02073b833 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Tue, 14 Jan 2025 14:28:44 +0000 Subject: net: ethtool: add support for configuring hds-thresh The hds-thresh option configures the threshold value of the header-data-split. If a received packet size is larger than this threshold value, a packet will be split into header and payload. The header indicates TCP and UDP header, but it depends on driver spec. The bnxt_en driver supports HDS(Header-Data-Split) configuration at FW level, affecting TCP and UDP too. So, If hds-thresh is set, it affects UDP and TCP packets. Example: # ethtool -G hds-thresh # ethtool -G enp14s0f0np0 tcp-data-split on hds-thresh 256 # ethtool -g enp14s0f0np0 Ring parameters for enp14s0f0np0: Pre-set maximums: ... HDS thresh: 1023 Current hardware settings: ... TCP data split: on HDS thresh: 256 The default/min/max values are not defined in the ethtool so the drivers should define themself. The 0 value means that all TCP/UDP packets' header and payload will be split. 
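Driver-side wiring, sketched here with a made-up example_priv and limit (not code from the patch): a driver advertises ETHTOOL_RING_USE_HDS_THRS in its ethtool_ops->supported_ring_params and reports the two new fields from .get_ringparam():

static void example_get_ringparam(struct net_device *dev,
				  struct ethtool_ringparam *ering,
				  struct kernel_ethtool_ringparam *kernel_ering,
				  struct netlink_ext_ack *extack)
{
	struct example_priv *priv = netdev_priv(dev);

	kernel_ering->hds_thresh = priv->hds_thresh;		/* current value */
	kernel_ering->hds_thresh_max = EXAMPLE_HDS_THRESH_MAX;	/* device limit */
}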
Tested-by: Stanislav Fomichev Signed-off-by: Taehee Yoo Link: https://patch.msgid.link/20250114142852.3364986-3-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 +++++++++ include/uapi/linux/ethtool_netlink_generated.h | 2 ++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d79bd201c1c8..e4136b0df892 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -78,6 +78,9 @@ enum { * @cqe_size: Size of TX/RX completion queue event * @tx_push_buf_len: Size of TX push buffer * @tx_push_buf_max_len: Maximum allowed size of TX push buffer + * @hds_thresh: Packet size threshold for header data split (HDS) + * @hds_thresh_max: Maximum supported setting for @hds_threshold + * */ struct kernel_ethtool_ringparam { u32 rx_buf_len; @@ -87,6 +90,8 @@ struct kernel_ethtool_ringparam { u32 cqe_size; u32 tx_push_buf_len; u32 tx_push_buf_max_len; + u32 hds_thresh; + u32 hds_thresh_max; }; /** @@ -97,6 +102,7 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split + * @ETHTOOL_RING_USE_HDS_THRS: capture for setting header-data-split-thresh */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), @@ -105,6 +111,7 @@ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), + ETHTOOL_RING_USE_HDS_THRS = BIT(6), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) @@ -1157,6 +1164,7 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. + * @hds_thresh: HDS Threshold value. * @hds_config: HDS value from userspace. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. @@ -1164,6 +1172,7 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; + u32 hds_thresh; u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 43993a2d68e5..2e17ff348f89 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -155,6 +155,8 @@ enum { ETHTOOL_A_RINGS_RX_PUSH, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, + ETHTOOL_A_RINGS_HDS_THRESH, + ETHTOOL_A_RINGS_HDS_THRESH_MAX, __ETHTOOL_A_RINGS_CNT, ETHTOOL_A_RINGS_MAX = (__ETHTOOL_A_RINGS_CNT - 1) -- cgit v1.2.3 From 3440fa34ad99d471f1085bc2f4dedeaebc310261 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 22:10:49 +0000 Subject: inet: ipmr: fix data-races Following fields of 'struct mr_mfc' can be updated concurrently (no lock protection) from ip_mr_forward() and ip6_mr_forward() - bytes - pkt - wrong_if - lastuse They also can be read from other functions. Convert bytes, pkt and wrong_if to atomic_long_t, and use READ_ONCE()/WRITE_ONCE() for lastuse. 
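The forwarding-path accounting then looks roughly like this (a sketch of the pattern, not a copy of the ipmr/ip6mr hunks):

static void example_account_forward(struct mr_mfc *c, unsigned int len)
{
	/* lockless updates: counters are atomic, lastuse uses WRITE_ONCE() */
	atomic_long_inc(&c->pkt);
	atomic_long_add(len, &c->bytes);
	WRITE_ONCE(c->lastuse, jiffies);
}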
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250114221049.1190631-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 9dd4bf157255..58a2401e4b55 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -146,9 +146,9 @@ struct mr_mfc { unsigned long last_assert; int minvif; int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; + atomic_long_t bytes; + atomic_long_t pkt; + atomic_long_t wrong_if; unsigned long lastuse; unsigned char ttls[MAXVIFS]; refcount_t refcount; -- cgit v1.2.3 From ebca39700f343c42fe37d9606980e8591801c46f Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Tue, 14 Jan 2025 19:19:46 +0100 Subject: dt-bindings: clock: convert stm32 rcc bindings to json-schema The patch converts st,stm32-rcc.txt to the JSON schema, but it does more than that. The old bindings, in fact, only covered the stm32f{4,7} platforms and not the stm32h7. Therefore, to avoid patch submission tests failing, it was necessary to add the corresponding compatible (i. e. st,stm32h743-rcc) and specify that, in this case, 3 are the clocks instead of the 2 required for the stm32f{4,7} platforms. Additionally, the old bindings made no mention of the st,syscfg property, which is used by both the stm32f{4,7} and the stm32h7 platforms. The patch also fixes the files referencing to the old st,stm32-rcc.txt. Signed-off-by: Dario Binacchi Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250114182021.670435-2-dario.binacchi@amarulasolutions.com Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/stm32fx-clock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/stm32fx-clock.h b/include/dt-bindings/clock/stm32fx-clock.h index e5dad050d518..b6ff9c68cb3f 100644 --- a/include/dt-bindings/clock/stm32fx-clock.h +++ b/include/dt-bindings/clock/stm32fx-clock.h @@ -10,7 +10,7 @@ * List of clocks which are not derived from system clock (SYSCLOCK) * * The index of these clocks is the secondary index of DT bindings - * (see Documentation/devicetree/bindings/clock/st,stm32-rcc.txt) + * (see Documentation/devicetree/bindings/clock/st,stm32-rcc.yaml) * * e.g: ; -- cgit v1.2.3 From cbc16bceea784210d585a42ac9f8f10ce62b300e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 8 Jan 2025 14:06:22 -0800 Subject: net: make page_pool_ref_netmem work with net iovs page_pool_ref_netmem() should work with either netmem representation, but currently it casts to a page with netmem_to_page(), which will fail with net iovs. Use netmem_get_pp_ref_count_ref() instead. 
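To make the effect concrete, here is an illustrative call site (not part of the patch; it assumes the frag's netmem may be a net_iov coming from a devmem binding):

static void example_hold_rx_frag(struct sk_buff *skb, int i)
{
	/* Safe for both page-backed and net_iov-backed buffers now that the
	 * helper no longer casts the netmem to a struct page.
	 */
	page_pool_ref_netmem(skb_frag_netmem(&skb_shinfo(skb)->frags[i]));
}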
Fixes: 8ab79ed50cf1 ("page_pool: devmem support") Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://lore.kernel.org/20250108220644.3528845-2-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 793e6fd78bc5..60a5347922be 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -294,7 +294,7 @@ static inline long page_pool_unref_page(struct page *page, long nr) static inline void page_pool_ref_netmem(netmem_ref netmem) { - atomic_long_inc(&netmem_to_page(netmem)->pp_ref_count); + atomic_long_inc(netmem_get_pp_ref_count_ref(netmem)); } static inline void page_pool_ref_page(struct page *page) -- cgit v1.2.3 From ebda2f0bbde540ff7da168d2837f8cfb14581e2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:09 -0800 Subject: net: add netdev_lock() / netdev_unlock() helpers Add helpers for locking the netdev instance, use it in drivers and the shaper code. This will make grepping for the lock usage much easier, as we extend the lock to cover more fields. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250115035319.559603-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3e6336775baf..6d440db35d5f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2444,8 +2444,12 @@ struct net_device { u32 napi_defer_hard_irqs; /** - * @lock: protects @net_shaper_hierarchy, feel free to use for other - * netdev-scope protection. Ordering: take after rtnl_lock. + * @lock: netdev-scope lock, protects a small selection of fields. + * Should always be taken using netdev_lock() / netdev_unlock() helpers. + * Drivers are free to use it for other protection. + * + * Protects: @net_shaper_hierarchy. + * Ordering: take after rtnl_lock. */ struct mutex lock; @@ -2671,6 +2675,21 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); +static inline void netdev_lock(struct net_device *dev) +{ + mutex_lock(&dev->lock); +} + +static inline void netdev_unlock(struct net_device *dev) +{ + mutex_unlock(&dev->lock); +} + +static inline void netdev_assert_locked(struct net_device *dev) +{ + lockdep_assert_held(&dev->lock); +} + static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { napi->irq = irq; -- cgit v1.2.3 From 5fda3f35349b6b7f22f5f5095a3821261d515075 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:10 -0800 Subject: net: make netdev_lock() protect netdev->reg_state Protect writes to netdev->reg_state with netdev_lock(). From now on holding netdev_lock() is sufficient to prevent the net_device from getting unregistered, so code which wants to hold just a single netdev around no longer needs to hold rtnl_lock. We do not protect the NETREG_UNREGISTERED -> NETREG_RELEASED transition. We'd need to move mutex_destroy(netdev->lock) to .release, but the real reason is that trying to stop the unregistration process mid-way would be unsafe / crazy. Taking references on such devices is not safe, either. 
So the intended semantics are to lock REGISTERED devices. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6d440db35d5f..007bcfa383c9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2448,7 +2448,7 @@ struct net_device { * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * - * Protects: @net_shaper_hierarchy. + * Protects: @reg_state, @net_shaper_hierarchy. * Ordering: take after rtnl_lock. */ struct mutex lock; -- cgit v1.2.3 From 5112457f3d8e41f987908266068af88ef9f3ab78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:12 -0800 Subject: net: add netdev->up protected by netdev_lock() Some uAPI (netdev netlink) hide net_device's sub-objects while the interface is down to ensure uniform behavior across drivers. To remove the rtnl_lock dependency from those uAPIs we need a way to safely tell if the device is down or up. Add an indication of whether device is open or closed, protected by netdev->lock. The semantics are the same as IFF_UP, but taking netdev_lock around every write to ->flags would be a lot of code churn. We don't want to blanket the entire open / close path by netdev_lock, because it will prevent us from applying it to specific structures - core helpers won't be able to take that lock from any function called by the drivers on open/close paths. So the state of the flag is "pessimistic", as in it may report false negatives, but never false positives. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 007bcfa383c9..cac81b0a166f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2443,12 +2443,24 @@ struct net_device { unsigned long gro_flush_timeout; u32 napi_defer_hard_irqs; + /** + * @up: copy of @state's IFF_UP, but safe to read with just @lock. + * May report false negatives while the device is being opened + * or closed (@lock does not protect .ndo_open, or .ndo_close). + */ + bool up; + /** * @lock: netdev-scope lock, protects a small selection of fields. * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * - * Protects: @reg_state, @net_shaper_hierarchy. + * Protects: + * @net_shaper_hierarchy, @reg_state + * + * Partially protects (writers must hold both @lock and rtnl_lock): + * @up + * * Ordering: take after rtnl_lock. */ struct mutex lock; -- cgit v1.2.3 From 1b23cdbd2bbc4b40e21c12ae86c2781e347ff0f8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:13 -0800 Subject: net: protect netdev->napi_list with netdev_lock() Hold netdev->lock when NAPIs are getting added or removed. This will allow safe access to NAPI instances of a net_device without rtnl_lock. Create a family of helpers which assume the lock is already taken. Switch iavf to them, as it makes extensive use of netdev->lock, already. 
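A sketch of the intended driver usage (example_priv, its queue array and example_poll are invented): take the instance lock once and use the _locked add variants inside it:

static void example_setup_napis(struct net_device *dev,
				struct example_priv *priv)
{
	int i;

	netdev_lock(dev);
	for (i = 0; i < priv->num_queues; i++)
		netif_napi_add_config_locked(dev, &priv->queues[i].napi,
					     example_poll, i);
	netdev_unlock(dev);
}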
Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 54 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cac81b0a166f..3130a8c807dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2456,7 +2456,7 @@ struct net_device { * Drivers are free to use it for other protection. * * Protects: - * @net_shaper_hierarchy, @reg_state + * @napi_list, @net_shaper_hierarchy, @reg_state * * Partially protects (writers must hold both @lock and rtnl_lock): * @up @@ -2712,8 +2712,19 @@ static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) */ #define NAPI_POLL_WEIGHT 64 -void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight); +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight); + +static inline void +netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + netdev_lock(dev); + netif_napi_add_weight_locked(dev, napi, poll, weight); + netdev_unlock(dev); +} /** * netif_napi_add() - initialize a NAPI context @@ -2731,6 +2742,13 @@ netif_napi_add(struct net_device *dev, struct napi_struct *napi, netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } +static inline void +netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int)) +{ + netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); +} + static inline void netif_napi_add_tx_weight(struct net_device *dev, struct napi_struct *napi, @@ -2741,6 +2759,15 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } +static inline void +netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int index) +{ + napi->index = index; + napi->config = &dev->napi_config[index]; + netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); +} + /** * netif_napi_add_config - initialize a NAPI context with persistent config * @dev: network device @@ -2752,9 +2779,9 @@ static inline void netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { - napi->index = index; - napi->config = &dev->napi_config[index]; - netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); + netdev_lock(dev); + netif_napi_add_config_locked(dev, napi, poll, index); + netdev_unlock(dev); } /** @@ -2774,6 +2801,8 @@ static inline void netif_napi_add_tx(struct net_device *dev, netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } +void __netif_napi_del_locked(struct napi_struct *napi); + /** * __netif_napi_del - remove a NAPI context * @napi: NAPI context @@ -2782,7 +2811,18 @@ static inline void netif_napi_add_tx(struct net_device *dev, * containing @napi. Drivers might want to call this helper to combine * all the needed RCU grace periods into a single one. 
*/ -void __netif_napi_del(struct napi_struct *napi); +static inline void __netif_napi_del(struct napi_struct *napi) +{ + netdev_lock(napi->dev); + __netif_napi_del_locked(napi); + netdev_unlock(napi->dev); +} + +static inline void netif_napi_del_locked(struct napi_struct *napi) +{ + __netif_napi_del_locked(napi); + synchronize_net(); +} /** * netif_napi_del - remove a NAPI context -- cgit v1.2.3 From 413f0271f3966e0c73d4937963f19335af19e628 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:14 -0800 Subject: net: protect NAPI enablement with netdev_lock() Wrap napi_enable() / napi_disable() with netdev_lock(). Provide the "already locked" flavor of the API. iavf needs the usual adjustment. A number of drivers call napi_enable() under a spin lock, so they have to be modified to take netdev_lock() first, then spin lock then call napi_enable_locked(). Protecting napi_enable() implies that napi->napi_id is protected by netdev_lock(). Acked-by: Francois Romieu # via-velocity Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3130a8c807dd..3941e4d0073e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -382,7 +382,7 @@ struct napi_struct { struct sk_buff *skb; struct list_head rx_list; /* Pending GRO_NORMAL skbs */ int rx_count; /* length of rx_list */ - unsigned int napi_id; + unsigned int napi_id; /* protected by netdev_lock */ struct hrtimer timer; struct task_struct *thread; unsigned long gro_flush_timeout; @@ -570,16 +570,11 @@ static inline bool napi_complete(struct napi_struct *n) int dev_set_threaded(struct net_device *dev, bool threaded); -/** - * napi_disable - prevent NAPI from scheduling - * @n: NAPI context - * - * Stop NAPI from being scheduled on this context. - * Waits till any outstanding processing completes. - */ void napi_disable(struct napi_struct *n); +void napi_disable_locked(struct napi_struct *n); void napi_enable(struct napi_struct *n); +void napi_enable_locked(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running -- cgit v1.2.3 From 1bb86cf8f44b1c1a320566558250b1f5121f6fd3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:16 -0800 Subject: net: protect threaded status of NAPI with netdev_lock() Now that NAPI instances can't come and go without holding netdev->lock we can trivially switch from rtnl_lock() to netdev_lock() for setting netdev->threaded via sysfs. Note that since we do not lock netdev_lock around sysfs calls in the core we don't have to "trylock" like we do with rtnl_lock. 
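Tying this back to the napi_enable() change above, the locking order for a driver that enables NAPI under its own spinlock becomes the following (sketch only; priv and its lock are placeholders):

static void example_enable_napi(struct net_device *dev,
				struct example_priv *priv)
{
	unsigned long flags;

	netdev_lock(dev);		/* must be taken before the driver lock */
	spin_lock_irqsave(&priv->lock, flags);
	napi_enable_locked(&priv->napi);
	spin_unlock_irqrestore(&priv->lock, flags);
	netdev_unlock(dev);
}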
Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3941e4d0073e..20e773bbd181 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -384,7 +384,7 @@ struct napi_struct { int rx_count; /* length of rx_list */ unsigned int napi_id; /* protected by netdev_lock */ struct hrtimer timer; - struct task_struct *thread; + struct task_struct *thread; /* protected by netdev_lock */ unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; @@ -2451,11 +2451,13 @@ struct net_device { * Drivers are free to use it for other protection. * * Protects: - * @napi_list, @net_shaper_hierarchy, @reg_state + * @napi_list, @net_shaper_hierarchy, @reg_state, @threaded * * Partially protects (writers must hold both @lock and rtnl_lock): * @up * + * Also protects some fields in struct napi_struct. + * * Ordering: take after rtnl_lock. */ struct mutex lock; @@ -2697,6 +2699,13 @@ static inline void netdev_assert_locked(struct net_device *dev) lockdep_assert_held(&dev->lock); } +static inline void netdev_assert_locked_or_invisible(struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_assert_locked(dev); +} + static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { napi->irq = irq; -- cgit v1.2.3 From 53ed30800d3fd36e1e9f7ba8014b150632f714b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:17 -0800 Subject: net: protect napi->irq with netdev_lock() Take netdev_lock() in netif_napi_set_irq(). All NAPI "control fields" are now protected by that lock (most of the other ones are set during napi add/del). The napi_hash_node is fully protected by the hash spin lock, but close enough for the kdoc... 
Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 20e773bbd181..a47ff20365f9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -388,6 +388,7 @@ struct napi_struct { unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; + /* all fields past this point are write-protected by netdev_lock */ /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2706,11 +2707,18 @@ static inline void netdev_assert_locked_or_invisible(struct net_device *dev) netdev_assert_locked(dev); } -static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +static inline void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) { napi->irq = irq; } +static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +{ + netdev_lock(napi->dev); + netif_napi_set_irq_locked(napi, irq); + netdev_unlock(napi->dev); +} + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ -- cgit v1.2.3 From e7ed2ba757bf86a4f90ae9c4080235fc9c74d8a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 14 Jan 2025 19:53:18 -0800 Subject: net: protect NAPI config fields with netdev_lock() Protect the following members of netdev and napi by netdev_lock: - defer_hard_irqs, - gro_flush_timeout, - irq_suspend_timeout. The first two are written via sysfs (which this patch switches to new lock), and netdev genl which holds both netdev and rtnl locks. irq_suspend_timeout is only written by netdev genl. Reviewed-by: Joe Damato Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115035319.559603-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a47ff20365f9..8308d9c75918 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -384,11 +384,11 @@ struct napi_struct { int rx_count; /* length of rx_list */ unsigned int napi_id; /* protected by netdev_lock */ struct hrtimer timer; - struct task_struct *thread; /* protected by netdev_lock */ + /* all fields past this point are write-protected by netdev_lock */ + struct task_struct *thread; unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; - /* all fields past this point are write-protected by netdev_lock */ /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; @@ -2452,7 +2452,8 @@ struct net_device { * Drivers are free to use it for other protection. * * Protects: - * @napi_list, @net_shaper_hierarchy, @reg_state, @threaded + * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, + * @net_shaper_hierarchy, @reg_state, @threaded * * Partially protects (writers must hold both @lock and rtnl_lock): * @up -- cgit v1.2.3 From 0734d7c3d93cdcb8a56ce914d3c661300f24434d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Jan 2025 20:55:27 +0000 Subject: net: expedite synchronize_net() for cleanup_net() cleanup_net() is the single thread responsible for netns dismantles, and a serious bottleneck. 
Before we can get per-netns RTNL, make sure all synchronize_net() called from this thread are using synchronize_rcu_expedited(). v3: deal with CONFIG_NET_NS=n Signed-off-by: Eric Dumazet Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250114205531.967841-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 5a2a0df8ad91..0f5eb9db0c62 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -210,6 +210,8 @@ void net_ns_barrier(void); struct ns_common *get_net_ns(struct ns_common *ns); struct net *get_net_ns_by_fd(int fd); +extern struct task_struct *cleanup_net_task; + #else /* CONFIG_NET_NS */ #include #include -- cgit v1.2.3 From 53078a736fbc60e5d3a1e14f4cd4214003815026 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 11 Jan 2025 14:01:53 +1300 Subject: HID: hid-asus: Disable OOBE mode on the ProArt P16 The new ASUS ProArt 16" laptop series come with their keyboards stuck in an Out-Of-Box-Experience mode. While in this mode most functions will not work such as LED control or Fn key combos. The correct init sequence is now done to disable this OOBE. This patch addresses only the ProArt series so far and it is unknown if there may be others, in which case a new quirk may be required. Signed-off-by: Luke D. Jones Co-developed-by: Connor Belli Signed-off-by: Connor Belli Tested-by: Jan Schmidt Signed-off-by: Jiri Kosina --- include/linux/platform_data/x86/asus-wmi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 365e119bebaa..783e2a336861 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -184,6 +184,11 @@ static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), }, }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), + }, + }, { .matches = { DMI_MATCH(DMI_BOARD_NAME, "GA403U"), -- cgit v1.2.3 From 4f3b63e8a8a28e3dcdcf3ff260f57a732a20b92b Mon Sep 17 00:00:00 2001 From: Sentaro Onizuka Date: Tue, 14 Jan 2025 00:14:00 +0900 Subject: fs: Fix return type of do_mount() from long to int Fix the return type of the do_mount() function from long to int to match its actual behavior. The function only returns int values, and all callers, including those in fs/namespace.c and arch/alpha/kernel/osf_sys.c, already treat the return value as int. This change improves type consistency across the filesystem code and aligns the function signature with its existing implementation and usage.
Signed-off-by: Sentaro Onizuka Link: https://lore.kernel.org/r/20250113151400.55512-1-sentaro@amazon.com Signed-off-by: Christian Brauner --- include/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mount.h b/include/linux/mount.h index 33f17b6e8732..a7b472faec2c 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -114,7 +114,7 @@ extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); -extern long do_mount(const char *, const char __user *, +int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); -- cgit v1.2.3 From 4b193fa75efffd90c054d1a7f2b5dbe29a461c14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:37 +0100 Subject: lockref: remove lockref_put_not_zero lockref_put_not_zero is not used anywhere, and unless I'm missing something didn't end up being used at all. Remove it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-2-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index c3a1f78bc884..e5aa0347f274 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -37,7 +37,6 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_not_zero(struct lockref *); extern int lockref_put_or_lock(struct lockref *); extern void lockref_mark_dead(struct lockref *); -- cgit v1.2.3 From 6d2868d5b6fca7534641440efe432cf268bd8e1b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:39 +0100 Subject: lockref: use bool for false/true returns Replace int used as bool with the actual bool type for return values that can only be true or false. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-4-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index e5aa0347f274..3d770e1bdbad 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -36,11 +36,11 @@ struct lockref { extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); -extern int lockref_get_not_zero(struct lockref *); -extern int lockref_put_or_lock(struct lockref *); +bool lockref_get_not_zero(struct lockref *lockref); +bool lockref_put_or_lock(struct lockref *lockref); extern void lockref_mark_dead(struct lockref *); -extern int lockref_get_not_dead(struct lockref *); +bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ static inline bool __lockref_is_dead(const struct lockref *l) -- cgit v1.2.3 From 25d8060418b4e83e109b20f3b3931301e254b8f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:40 +0100 Subject: lockref: drop superfluous externs Drop the superfluous externs from the remaining prototypes in lockref.h. 
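As a usage sketch of the lockref API being tidied here (hypothetical caller, not taken from these patches), the bool returns read naturally at call sites:

	/* Hypothetical: take a reference only if the count is non-zero. */
	if (!lockref_get_not_zero(&dentry->d_lockref))
		return NULL;
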
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-5-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 3d770e1bdbad..f821f46e9fb4 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,12 +34,12 @@ struct lockref { }; }; -extern void lockref_get(struct lockref *); -extern int lockref_put_return(struct lockref *); +void lockref_get(struct lockref *lockref); +int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); bool lockref_put_or_lock(struct lockref *lockref); -extern void lockref_mark_dead(struct lockref *); +void lockref_mark_dead(struct lockref *lockref); bool lockref_get_not_dead(struct lockref *lockref); /* Must be called under spinlock for reliable results */ -- cgit v1.2.3 From 63440d1c6dd1fc782db905319dbfb4db354e54b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Jan 2025 10:46:41 +0100 Subject: lockref: add a lockref_init helper Add a helper to initialize the lockref, that is initialize the spinlock and set a value. Having to open code them isn't a big deal, but having an initializer feels right for a proper primitive. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250115094702.504610-6-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/lockref.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index f821f46e9fb4..c39f119659ba 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -34,6 +34,17 @@ struct lockref { }; }; +/** + * lockref_init - Initialize a lockref + * @lockref: pointer to lockref structure + * @count: initial count + */ +static inline void lockref_init(struct lockref *lockref, unsigned int count) +{ + spin_lock_init(&lockref->lock); + lockref->count = count; +} + void lockref_get(struct lockref *lockref); int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); -- cgit v1.2.3 From 2f8dea1692eef2b7ba6a256246ed82c365fdc686 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Fri, 20 Dec 2024 22:44:21 +0900 Subject: hrtimers: Handle CPU state correctly on hotplug Consider a scenario where a CPU transitions from CPUHP_ONLINE to halfway through a CPU hotunplug down to CPUHP_HRTIMERS_PREPARE, and then back to CPUHP_ONLINE: Since hrtimers_prepare_cpu() does not run, cpu_base.hres_active remains set to 1 throughout. However, during a CPU unplug operation, the tick and the clockevents are shut down at CPUHP_AP_TICK_DYING. On return to the online state, for instance CFS incorrectly assumes that the hrtick is already active, and the chance of the clockevent device to transition to oneshot mode is also lost forever for the CPU, unless it goes back to a lower state than CPUHP_HRTIMERS_PREPARE once. This round-trip reveals another issue; cpu_base.online is not set to 1 after the transition, which appears as a WARN_ON_ONCE in enqueue_hrtimer(). Aside of that, the bulk of the per CPU state is not reset either, which means there are dangling pointers in the worst case. Address this by adding a corresponding startup() callback, which resets the stale per CPU state and sets the online flag. 
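A sketch of how such a startup() callback is typically wired into the CPU hotplug state table (illustration based on the prototype added below; the actual hookup lives in kernel/cpu.c and is not part of the hunk shown here):

	/* Assumed pairing of the starting and dying callbacks. */
	[CPUHP_AP_HRTIMERS_DYING] = {
		.name			= "hrtimers:dying",
		.startup.single		= hrtimers_cpu_starting,
		.teardown.single	= hrtimers_cpu_dying,
	},
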
[ tglx: Make the new callback unconditionally available, remove the online modification in the prepare() callback and clear the remaining state in the starting callback instead of the prepare callback ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Signed-off-by: Koichiro Den Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20241220134421.3809834-1-koichiro.den@canonical.com --- include/linux/hrtimer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7ef5f7ef31a9..f7bfdcf0dda3 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -386,6 +386,7 @@ extern void __init hrtimers_init(void); extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); +int hrtimers_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_cpu_dying(unsigned int cpu); #else -- cgit v1.2.3 From d960f14800b581d79e1c3df4db524d9d4b3aac9a Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:03 -0500 Subject: ACPI: platform_profile: Replace *class_dev member with class_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of holding a reference to the class device, embed it into the platform_profile_handler. This involves manually creating and registering the device and replacing dev_get_drvdata() with the newly created to_pprof_handler() macro. Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-2-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index f1cd4b65e351..8a9b8754f9ac 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -9,6 +9,7 @@ #ifndef _PLATFORM_PROFILE_H_ #define _PLATFORM_PROFILE_H_ +#include #include /* @@ -30,7 +31,7 @@ enum platform_profile_option { struct platform_profile_handler { const char *name; struct device *dev; - struct device *class_dev; + struct device class_dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int (*profile_get)(struct platform_profile_handler *pprof, -- cgit v1.2.3 From 249c576f0f9d0556cb7473b8a437b30239afbd16 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:04 -0500 Subject: ACPI: platform_profile: Let drivers set drvdata to the class device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add *drvdata to platform_profile_register() signature and assign it to the class device. While at it, pass specific driver state as drvdata to replace uses of container_of() with dev_get_drvdata(). 
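A usage sketch against the amended signature (hypothetical driver, names assumed):

	/* Pass the driver's private state so callbacks can use dev_get_drvdata(). */
	err = platform_profile_register(&priv->pprof, priv);
	if (err)
		return err;
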
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-3-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8a9b8754f9ac..1c8fdda51eaa 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -40,9 +40,9 @@ struct platform_profile_handler { enum platform_profile_option profile); }; -int platform_profile_register(struct platform_profile_handler *pprof); +int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); int platform_profile_remove(struct platform_profile_handler *pprof); -int devm_platform_profile_register(struct platform_profile_handler *pprof); +int devm_platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); int platform_profile_cycle(void); void platform_profile_notify(struct platform_profile_handler *pprof); -- cgit v1.2.3 From cf3ea098dd3af415f079bc0b999055f213dd4a83 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:05 -0500 Subject: ACPI: platform_profile: Remove platform_profile_handler from callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Devices can now set drvdata to the class device, thus passing the platform_profile_handler to callbacks is unnecessary. Instead pass the class device. Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-4-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 1c8fdda51eaa..5296d886c243 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -34,10 +34,8 @@ struct platform_profile_handler { struct device class_dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - int (*profile_get)(struct platform_profile_handler *pprof, - enum platform_profile_option *profile); - int (*profile_set)(struct platform_profile_handler *pprof, - enum platform_profile_option profile); + int (*profile_get)(struct device *dev, enum platform_profile_option *profile); + int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); -- cgit v1.2.3 From b5ca1a4488a5e6dfb9962e2319c03c7414e50ec3 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:06 -0500 Subject: ACPI: platform_profile: Add `ops` member to handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace *profile_get and *profile_set members with a general *ops member. 
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-5-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 5296d886c243..6013c05d7b86 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,14 +28,20 @@ enum platform_profile_option { PLATFORM_PROFILE_LAST, /*must always be last */ }; +struct platform_profile_handler; + +struct platform_profile_ops { + int (*profile_get)(struct device *dev, enum platform_profile_option *profile); + int (*profile_set)(struct device *dev, enum platform_profile_option profile); +}; + struct platform_profile_handler { const char *name; struct device *dev; struct device class_dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - int (*profile_get)(struct device *dev, enum platform_profile_option *profile); - int (*profile_set)(struct device *dev, enum platform_profile_option profile); + const struct platform_profile_ops *ops; }; int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); -- cgit v1.2.3 From 58d5629dc8b8b8d9928fc649d9f2aaa361a8a5c5 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:07 -0500 Subject: ACPI: platform_profile: Add `probe` to platform_profile_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `probe` callback to platform_profile_ops, which lets drivers initialize the choices member manually. This is a step towards unexposing the struct platform_profile_handler from the consumer drivers. Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-6-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 6013c05d7b86..5ad1ab7b75e4 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -31,6 +31,7 @@ enum platform_profile_option { struct platform_profile_handler; struct platform_profile_ops { + int (*probe)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -- cgit v1.2.3 From ad41ddeeac216417a52fbc1060577f3098f4e90e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Tue, 14 Jan 2025 19:08:39 +0200 Subject: PCI: Add TLP Prefix reading to pcie_read_tlp_log() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcie_read_tlp_log() handles only 4 Header Log DWORDs but TLP Prefix Log (PCIe r6.1 secs 7.8.4.12 & 7.9.14.13) may also be present. Generalize pcie_read_tlp_log() and struct pcie_tlp_log to also handle TLP Prefix Log. The relevant registers are formatted identically in AER and DPC Capability, but has these variations: a) The offsets of TLP Prefix Log registers vary. b) DPC RP PIO TLP Prefix Log register can be < 4 DWORDs. c) AER TLP Prefix Log Present (PCIe r6.1 sec 7.8.4.7) can indicate Prefix Log is not present. 
Therefore callers must pass the offset of the TLP Prefix Log register and the entire length to pcie_read_tlp_log() to be able to read the correct number of TLP Prefix DWORDs from the correct offset. Link: https://lore.kernel.org/r/20250114170840.1633-8-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen [bhelgaas: squash ternary fix from https://lore.kernel.org/r/20250116172019.88116-1-colin.i.king@gmail.com] Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/aer.h | 1 + include/uapi/linux/pci_regs.h | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/aer.h b/include/linux/aer.h index 4ef6515c3205..947b63091902 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -27,6 +27,7 @@ struct pci_dev; struct pcie_tlp_log { u32 dw[PCIE_STD_NUM_TLP_HEADERLOG]; + u32 prefix[PCIE_STD_MAX_TLP_PREFIXLOG]; }; struct aer_capability_regs { diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 14a6306c4ce1..82866ac0bda7 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -790,10 +790,11 @@ /* Same bits as above */ #define PCI_ERR_CAP 0x18 /* Advanced Error Capabilities & Ctrl*/ #define PCI_ERR_CAP_FEP(x) ((x) & 0x1f) /* First Error Pointer */ -#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */ -#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */ -#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ -#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ +#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */ +#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */ +#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ +#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ +#define PCI_ERR_CAP_PREFIX_LOG_PRESENT 0x00000800 /* TLP Prefix Log Present */ #define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ #define PCI_ERR_ROOT_COMMAND 0x2c /* Root Error Command */ #define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */ @@ -809,6 +810,7 @@ #define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ #define PCI_ERR_ROOT_AER_IRQ 0xf8000000 /* Advanced Error Interrupt Message Number */ #define PCI_ERR_ROOT_ERR_SRC 0x34 /* Error Source Identification */ +#define PCI_ERR_PREFIX_LOG 0x38 /* TLP Prefix LOG Register (up to 16 bytes) */ /* Virtual Channel */ #define PCI_VC_PORT_CAP1 0x04 -- cgit v1.2.3 From d4679b79ffae994fea08bc7751ff6550ad057f05 Mon Sep 17 00:00:00 2001 From: Konrad Knitter Date: Wed, 6 Nov 2024 10:36:41 +0100 Subject: pldmfw: enable selected component update This patch enables updating a selected component from a PLDM image containing multiple components. Example usage: struct pldmfw; data.mode = PLDMFW_UPDATE_MODE_SINGLE_COMPONENT; data.component_identifier = DRIVER_FW_MGMT_COMPONENT_ID; Reviewed-by: Jacob Keller Reviewed-by: Marcin Szycik Reviewed-by: Przemek Kitszel Signed-off-by: Konrad Knitter Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- include/linux/pldmfw.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/pldmfw.h b/include/linux/pldmfw.h index 0fc831338226..f5047983004f 100644 --- a/include/linux/pldmfw.h +++ b/include/linux/pldmfw.h @@ -125,9 +125,17 @@ struct pldmfw_ops; * a pointer to their own data, used to implement the device specific * operations. 
*/ + +enum pldmfw_update_mode { + PLDMFW_UPDATE_MODE_FULL, + PLDMFW_UPDATE_MODE_SINGLE_COMPONENT, +}; + struct pldmfw { const struct pldmfw_ops *ops; struct device *dev; + u16 component_identifier; + enum pldmfw_update_mode mode; }; bool pldmfw_op_pci_match_record(struct pldmfw *context, struct pldmfw_record *record); -- cgit v1.2.3 From 0502bd2e06055646f43871f76fb7bdd57f7c0939 Mon Sep 17 00:00:00 2001 From: Konrad Knitter Date: Wed, 6 Nov 2024 10:36:42 +0100 Subject: devlink: add devl guard Add devl guard for scoped_guard(). Example usage: scoped_guard(devl, priv_to_devlink(pf)) { err = init_devlink(pf); if (err) return err; } Co-developed-by: Przemek Kitszel Signed-off-by: Przemek Kitszel Signed-off-by: Konrad Knitter Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index fc79fe2297a1..b8783126c1ed 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1535,6 +1535,7 @@ int devl_trylock(struct devlink *devlink); void devl_unlock(struct devlink *devlink); void devl_assert_locked(struct devlink *devlink); bool devl_lock_is_held(struct devlink *devlink); +DEFINE_GUARD(devl, struct devlink *, devl_lock(_T), devl_unlock(_T)); struct ib_device; -- cgit v1.2.3 From e7d0b023955ae3cb25796e19d44a4d55b5ad58de Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 16 Jan 2025 16:24:12 +0000 Subject: clk: bcm: rpi: Add disp clock BCM2712 has an extra clock exposed by the firmware called DISP, and used by (at least) the HVS. Let's add it to the list of clocks to register in Linux. Acked-by: Stephen Boyd Signed-off-by: Maxime Ripard Signed-off-by: Dave Stevenson Link: https://lore.kernel.org/r/20250116-bcm2712-clk-updates-v1-5-10bc92ffbf41@raspberrypi.com Reviewed-by: Florian Fainelli Signed-off-by: Stephen Boyd --- include/soc/bcm2835/raspberrypi-firmware.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/soc/bcm2835/raspberrypi-firmware.h b/include/soc/bcm2835/raspberrypi-firmware.h index 73cac8d0287e..e1f87fbfe554 100644 --- a/include/soc/bcm2835/raspberrypi-firmware.h +++ b/include/soc/bcm2835/raspberrypi-firmware.h @@ -152,6 +152,7 @@ enum rpi_firmware_clk_id { RPI_FIRMWARE_M2MC_CLK_ID, RPI_FIRMWARE_PIXEL_BVB_CLK_ID, RPI_FIRMWARE_VEC_CLK_ID, + RPI_FIRMWARE_DISP_CLK_ID, RPI_FIRMWARE_NUM_CLK_ID, }; -- cgit v1.2.3 From 3ba0262a8fed29efe28e3ce3162d1794a58aa94f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Jan 2025 20:42:27 +0000 Subject: net: mdio: add definition for clock stop capable bit Add a definition for the clock stop capable bit in the PCS MMD. This bit indicates whether the MAC is able to stop the transmit xMII clock while it is signalling LPI. 
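A hedged sketch of how the new definition might be tested from phylib (hypothetical caller; a query helper is added by a later patch in this series):

	int val = phy_read_mmd(phydev, MDIO_MMD_PCS, MDIO_STAT1);
	bool clkstop_capable = val >= 0 && (val & MDIO_PCS_STAT1_CLKSTOP_CAP);
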
Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Reviewed-by: Jacob Keller Link: https://patch.msgid.link/E1tYADb-0014PV-6T@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/uapi/linux/mdio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index f0d3f268240d..6975f182b22c 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -125,6 +125,7 @@ #define MDIO_STAT1_LPOWERABLE 0x0002 /* Low-power ability */ #define MDIO_STAT1_LSTATUS BMSR_LSTATUS #define MDIO_STAT1_FAULT 0x0080 /* Fault */ +#define MDIO_PCS_STAT1_CLKSTOP_CAP 0x0040 #define MDIO_AN_STAT1_LPABLE 0x0001 /* Link partner AN ability */ #define MDIO_AN_STAT1_ABLE BMSR_ANEGCAPABLE #define MDIO_AN_STAT1_RFAULT BMSR_RFAULT -- cgit v1.2.3 From a00e0d34c0362a69369f212b8be1be1f6f4c365d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Jan 2025 20:42:32 +0000 Subject: net: phy: add support for querying PHY clock stop capability Add support for querying whether the PHY allows the transmit xMII clock to be stopped while in LPI mode. This will be used by phylink to pass to the MAC driver so it can configure the generation of the xMII clock appropriately. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Reviewed-by: Jacob Keller Link: https://patch.msgid.link/E1tYADg-0014Pb-AJ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index afaae74d0949..244f747b3cd9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2146,6 +2146,7 @@ int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); +int phy_eee_tx_clock_stop_capable(struct phy_device *phydev); int phy_eee_rx_clock_stop(struct phy_device *phydev, bool clk_stop_enable); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); -- cgit v1.2.3 From 03abf2a7c65451e663b078b0ed1bfa648cd9380f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Jan 2025 20:42:42 +0000 Subject: net: phylink: add EEE management Add EEE management to phylink, making use of the phylib implementation. This will only be used where a MAC driver populates the methods and capabilities bitfield, otherwise we keep our old behaviour. Phylink will keep track of the EEE configuration, including the clock stop abilities at each end of the MAC to PHY link, programming the PHY appropriately and preserving the LPI configuration should the PHY go away. Phylink will call into the MAC driver when LPI needs to be enabled or disabled, with the requirement that the MAC have LPI disabled prior to the netdev being brought up (in other words, it will only call mac_disable_tx_lpi() if it has already called mac_enable_tx_lpi().) Support for phylink managed EEE is enabled by populating both tx_lpi MAC operations method pointers, and filling in both LPI interfaces and capabilities. If the methods are provided but the LPI interfaces or capabilities remain empty, this indicates to phylink that EEE is implemented by the driver but the hardware it is driving does not support EEE, and thus the ethtool set_eee() and get_eee() methods will return EOPNOTSUPP. No validation of the LPI timer value is performed by this patch. 
For interface modes which do not support LPI, we make no attempt to manipulate the phylib EEE advertisement, but instead refuse to activate LPI at the MAC, noting it at debug message level. We also restrict the advertisement and reported userspace support linkmode masks according to the lpi_capabilities provided to phylink by the MAC driver. Signed-off-by: Russell King (Oracle) Reviewed-by: Jacob Keller Link: https://patch.msgid.link/E1tYADq-0014Pn-J1@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index f19b7108c840..898b00451bbf 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -5,6 +5,8 @@ #include #include +#include + struct device_node; struct ethtool_cmd; struct fwnode_handle; @@ -143,11 +145,17 @@ enum phylink_op_type { * possible and avoid stopping it during suspend events. * @default_an_inband: if true, defaults to MLO_AN_INBAND rather than * MLO_AN_PHY. A fixed-link specification will override. + * @eee_rx_clk_stop_enable: if true, PHY can stop the receive clock during LPI * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode. * @supported_interfaces: bitmap describing which PHY_INTERFACE_MODE_xxx * are supported by the MAC/PCS. + * @lpi_interfaces: bitmap describing which PHY interface modes can support + * LPI signalling. * @mac_capabilities: MAC pause/speed/duplex capabilities. + * @lpi_capabilities: MAC speeds which can support LPI signalling + * @lpi_timer_default: Default EEE LPI timer setting. + * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time */ struct phylink_config { struct device *dev; @@ -156,10 +164,15 @@ struct phylink_config { bool mac_managed_pm; bool mac_requires_rxc; bool default_an_inband; + bool eee_rx_clk_stop_enable; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); DECLARE_PHY_INTERFACE_MASK(supported_interfaces); + DECLARE_PHY_INTERFACE_MASK(lpi_interfaces); unsigned long mac_capabilities; + unsigned long lpi_capabilities; + u32 lpi_timer_default; + bool eee_enabled_default; }; void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); @@ -173,6 +186,8 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); * @mac_finish: finish a major reconfiguration of the interface. * @mac_link_down: take the link down. * @mac_link_up: allow the link to come up. + * @mac_disable_tx_lpi: disable LPI. + * @mac_enable_tx_lpi: enable and configure LPI. * * The individual methods are described more fully below. */ @@ -193,6 +208,9 @@ struct phylink_mac_ops { struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, int duplex, bool tx_pause, bool rx_pause); + void (*mac_disable_tx_lpi)(struct phylink_config *config); + int (*mac_enable_tx_lpi)(struct phylink_config *config, u32 timer, + bool tx_clk_stop); }; #if 0 /* For kernel-doc purposes only. */ @@ -387,6 +405,33 @@ void mac_link_down(struct phylink_config *config, unsigned int mode, void mac_link_up(struct phylink_config *config, struct phy_device *phy, unsigned int mode, phy_interface_t interface, int speed, int duplex, bool tx_pause, bool rx_pause); + +/** + * mac_disable_tx_lpi() - disable LPI generation at the MAC + * @config: a pointer to a &struct phylink_config. 
+ * + * Disable generation of LPI at the MAC, effectively preventing the MAC + * from indicating that it is idle. + */ +void mac_disable_tx_lpi(struct phylink_config *config); + +/** + * mac_enable_tx_lpi() - configure and enable LPI generation at the MAC + * @config: a pointer to a &struct phylink_config. + * @timer: LPI timeout in microseconds. + * @tx_clk_stop: allow xMII transmit clock to be stopped during LPI + * + * Configure the LPI timeout accordingly. This will only be called when + * the link is already up, to cater for situations where the hardware + * needs to be programmed according to the link speed. + * + * Enable LPI generation at the MAC, and configure whether the xMII transmit + * clock may be stopped. + * + * Returns: 0 on success. Please consult with rmk before returning an error. + */ +int mac_enable_tx_lpi(struct phylink_config *config, u32 timer, + bool tx_clk_stop); #endif struct phylink_pcs_ops; -- cgit v1.2.3 From 4d27afbf256028a1f54363367f30efc8854433c3 Mon Sep 17 00:00:00 2001 From: Kyle Tso Date: Tue, 14 Jan 2025 22:24:35 +0800 Subject: usb: typec: tcpci: Prevent Sink disconnection before vPpsShutdown in SPR PPS The Source can drop its output voltage to the minimum of the requested PPS APDO voltage range when it is in Current Limit Mode. If this voltage falls within the range of vPpsShutdown, the Source initiates a Hard Reset and discharges Vbus. However, currently the Sink may disconnect before the voltage reaches vPpsShutdown, leading to unexpected behavior. Prevent premature disconnection by setting the Sink's disconnect threshold to the minimum vPpsShutdown value. Additionally, consider the voltage drop due to IR drop when calculating the appropriate threshold. This ensures a robust and reliable interaction between the Source and Sink during SPR PPS Current Limit Mode operation. Fixes: 4288debeaa4e ("usb: typec: tcpci: Fix up sink disconnect thresholds for PD") Cc: stable Signed-off-by: Kyle Tso Reviewed-by: Heikki Krogerus Reviewed-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20250114142435.2093857-1-kyletso@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 061da9546a81..b22e659f81ba 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -163,7 +163,8 @@ struct tcpc_dev { void (*frs_sourcing_vbus)(struct tcpc_dev *dev); int (*enable_auto_vbus_discharge)(struct tcpc_dev *dev, bool enable); int (*set_auto_vbus_discharge_threshold)(struct tcpc_dev *dev, enum typec_pwr_opmode mode, - bool pps_active, u32 requested_vbus_voltage); + bool pps_active, u32 requested_vbus_voltage, + u32 pps_apdo_min_voltage); bool (*is_vbus_vsafe0v)(struct tcpc_dev *dev); void (*set_partner_usb_comm_capable)(struct tcpc_dev *dev, bool enable); void (*check_contaminant)(struct tcpc_dev *dev); -- cgit v1.2.3 From 39d0be87438a0cc29151898c7fba24b43f2f3df8 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 13:57:59 +0000 Subject: serial: kgdb_nmi: Remove unused knock code kgdb_nmi_poll_knock() has been unused since it was added in 2013 in commit 0c57dfcc6c1d ("tty/serial: Add kgdb_nmi driver") Remove it, the static helpers, and module parameters it used. (The comment explaining why it might be used sounds sensible, but it's never been wired up, perhaps it's worth doing somewhere?) Signed-off-by: Dr. 
David Alan Gilbert Link: https://lore.kernel.org/r/20250112135759.105541-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/kgdb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 76e891ee9e37..51ef131e66b7 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -309,11 +309,9 @@ extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); #ifdef CONFIG_SERIAL_KGDB_NMI extern int kgdb_register_nmi_console(void); extern int kgdb_unregister_nmi_console(void); -extern bool kgdb_nmi_poll_knock(void); #else static inline int kgdb_register_nmi_console(void) { return 0; } static inline int kgdb_unregister_nmi_console(void) { return 0; } -static inline bool kgdb_nmi_poll_knock(void) { return true; } #endif extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); -- cgit v1.2.3 From 72d1c18262dd5a18d835a94391c31cf04252c748 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 14 Jan 2025 23:23:03 +0800 Subject: of: Do not expose of_alias_scan() and correct its comments of_alias_scan() has no external callers and returns void. Do not expose it and delete return value descriptions in its comments. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250114-of_core_fix-v5-1-b8bafd00a86f@quicinc.com Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 0cdd58ff0a41..eaf0e2a2b75c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -398,7 +398,6 @@ extern int of_phandle_iterator_args(struct of_phandle_iterator *it, uint32_t *args, int size); -extern void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)); extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); -- cgit v1.2.3 From 07f531b395db3cd1776ef0f7191abf4b077fcf21 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:17 -0500 Subject: ACPI: platform_profile: Remove platform_profile_handler from exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to protect the platform_profile_handler from API consumers, allocate it in platform_profile_register() and modify it's signature accordingly. Remove the platform_profile_handler from all consumer drivers and replace them with a pointer to the class device, which is now returned from platform_profile_register(). Replace *pprof with a pointer to the class device in the rest of exported symbols. 
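A usage sketch against the new signature (hypothetical driver; the error convention is assumed to follow the usual ERR_PTR pattern for returned class devices):

	struct device *ppdev;

	ppdev = devm_platform_profile_register(dev, "foo-profile", priv,
						&foo_profile_ops);
	if (IS_ERR(ppdev))
		return PTR_ERR(ppdev);
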
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-16-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 5ad1ab7b75e4..778d4c661c3c 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -45,10 +45,14 @@ struct platform_profile_handler { const struct platform_profile_ops *ops; }; -int platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); -int platform_profile_remove(struct platform_profile_handler *pprof); -int devm_platform_profile_register(struct platform_profile_handler *pprof, void *drvdata); +struct device *platform_profile_register(struct device *dev, const char *name, + void *drvdata, + const struct platform_profile_ops *ops); +int platform_profile_remove(struct device *dev); +struct device *devm_platform_profile_register(struct device *dev, const char *name, + void *drvdata, + const struct platform_profile_ops *ops); int platform_profile_cycle(void); -void platform_profile_notify(struct platform_profile_handler *pprof); +void platform_profile_notify(struct device *dev); #endif /*_PLATFORM_PROFILE_H_*/ -- cgit v1.2.3 From 6ef33895503583d0741a0d8faf820ca8143b9cf2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:18 -0500 Subject: ACPI: platform_profile: Move platform_profile_handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit platform_profile_handler is now an internal structure. Move it to platform_profile.c. Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-17-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 778d4c661c3c..eea1daf85616 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,23 +28,12 @@ enum platform_profile_option { PLATFORM_PROFILE_LAST, /*must always be last */ }; -struct platform_profile_handler; - struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -struct platform_profile_handler { - const char *name; - struct device *dev; - struct device class_dev; - int minor; - unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - const struct platform_profile_ops *ops; -}; - struct device *platform_profile_register(struct device *dev, const char *name, void *drvdata, const struct platform_profile_ops *ops); -- cgit v1.2.3 From ee7f3e2b4942e3f6d8837780d0d3d5d58de8801a Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 15 Jan 2025 19:27:20 -0500 Subject: ACPI: platform_profile: Add documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kerneldoc and sysfs class documentation. 
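A minimal sketch of a probe() implementation consistent with the documented semantics (hypothetical driver):

	static int foo_profile_probe(void *drvdata, unsigned long *choices)
	{
		set_bit(PLATFORM_PROFILE_LOW_POWER, choices);
		set_bit(PLATFORM_PROFILE_BALANCED, choices);
		set_bit(PLATFORM_PROFILE_PERFORMANCE, choices);
		return 0;
	}
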
Reviewed-by: Mario Limonciello Signed-off-by: Kurt Borja Reviewed-by: Mark Pearson Tested-by: Mark Pearson Link: https://lore.kernel.org/r/20250116002721.75592-19-kuurtb@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index eea1daf85616..8ab5b0e8eb2c 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,6 +28,16 @@ enum platform_profile_option { PLATFORM_PROFILE_LAST, /*must always be last */ }; +/** + * struct platform_profile_ops - platform profile operations + * @probe: Callback to setup choices available to the new class device. These + * choices will only be enforced when setting a new profile, not when + * getting the current one. + * @profile_get: Callback that will be called when showing the current platform + * profile in sysfs. + * @profile_set: Callback that will be called when storing a new platform + * profile in sysfs. + */ struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); -- cgit v1.2.3 From 42d7c87b4e1251f36eceac987e74623e7cda8577 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 15 Jan 2025 15:41:57 +0100 Subject: regulator: Add support for power budget Introduce power budget management for the regulator device. Enable tracking of available power capacity by providing helpers to request and release power budget allocations. Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250115-feature_regulator_pw_budget-v2-1-0a44b949e6bc@bootlin.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 21 +++++++++++++++++++++ include/linux/regulator/driver.h | 2 ++ include/linux/regulator/machine.h | 2 ++ 3 files changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 8c3c372ad735..8d1a6eca7eb9 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -258,6 +258,11 @@ int regulator_sync_voltage(struct regulator *regulator); int regulator_set_current_limit(struct regulator *regulator, int min_uA, int max_uA); int regulator_get_current_limit(struct regulator *regulator); +int regulator_get_unclaimed_power_budget(struct regulator *regulator); +int regulator_request_power_budget(struct regulator *regulator, + unsigned int pw_req); +void regulator_free_power_budget(struct regulator *regulator, + unsigned int pw); int regulator_set_mode(struct regulator *regulator, unsigned int mode); unsigned int regulator_get_mode(struct regulator *regulator); @@ -571,6 +576,22 @@ static inline int regulator_get_current_limit(struct regulator *regulator) return 0; } +static inline int regulator_get_unclaimed_power_budget(struct regulator *regulator) +{ + return INT_MAX; +} + +static inline int regulator_request_power_budget(struct regulator *regulator, + unsigned int pw_req) +{ + return -EOPNOTSUPP; +} + +static inline void regulator_free_power_budget(struct regulator *regulator, + unsigned int pw) +{ +} + static inline int regulator_set_mode(struct regulator *regulator, unsigned int mode) { diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 5b66caf1695d..4a216fdba354 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -656,6 +656,8 @@ 
struct regulator_dev { int cached_err; bool use_cached_err; spinlock_t err_lock; + + int pw_requested_mW; }; /* diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index b3db09a7429b..1fc440c5c4c7 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -113,6 +113,7 @@ struct notification_limit { * @min_uA: Smallest current consumers may set. * @max_uA: Largest current consumers may set. * @ilim_uA: Maximum input current. + * @pw_budget_mW: Power budget for the regulator in mW. * @system_load: Load that isn't captured by any consumer requests. * * @over_curr_limits: Limits for acting on over current. @@ -185,6 +186,7 @@ struct regulation_constraints { int max_uA; int ilim_uA; + int pw_budget_mW; int system_load; /* used for coupled regulators */ -- cgit v1.2.3 From fd8318a32573d73eb20637a0c80689de0dc98169 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 3 Jan 2025 16:41:13 +0800 Subject: PM: sleep: wakeirq: Introduce device-managed variant of dev_pm_set_wake_irq() Add device-managed variant of dev_pm_set_wake_irq which automatically clear the wake irq on device destruction to simplify error handling and resource management in drivers. Signed-off-by: Peng Fan Link: https://patch.msgid.link/20250103-wake_irq-v2-1-e3aeff5e9966@nxp.com Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeirq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h index d9642c6cf852..25b63ed51b76 100644 --- a/include/linux/pm_wakeirq.h +++ b/include/linux/pm_wakeirq.h @@ -10,6 +10,7 @@ extern int dev_pm_set_wake_irq(struct device *dev, int irq); extern int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq); extern int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq); extern void dev_pm_clear_wake_irq(struct device *dev); +extern int devm_pm_set_wake_irq(struct device *dev, int irq); #else /* !CONFIG_PM */ @@ -32,5 +33,10 @@ static inline void dev_pm_clear_wake_irq(struct device *dev) { } +static inline int devm_pm_set_wake_irq(struct device *dev, int irq) +{ + return 0; +} + #endif /* CONFIG_PM */ #endif /* _LINUX_PM_WAKEIRQ_H */ -- cgit v1.2.3 From 6a7e17b22062c84a111d7073c67cc677c4190f32 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 16 Jan 2025 17:02:54 +0000 Subject: block: Add common atomic writes enable flag Currently only stacked devices need to explicitly enable atomic writes by setting BLK_FEAT_ATOMIC_WRITES_STACKED flag. This does not work well for device mapper stacking devices, as there many sets of limits are stacked and what is the 'bottom' and 'top' device can swapped. This means that BLK_FEAT_ATOMIC_WRITES_STACKED needs to be set for many queue limits, which is messy. Generalize enabling atomic writes enabling by ensuring that all devices must explicitly set a flag - that includes NVMe, SCSI sd, and md raid. 
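As an illustrative sketch (hypothetical driver, not from this series), opting in becomes an explicit feature bit in the queue limits:

	struct queue_limits lim = { };

	/* Assumed driver-side opt-in; remaining limits elided. */
	lim.features |= BLK_FEAT_ATOMIC_WRITES;
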
Signed-off-by: John Garry Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20250116170301.474130-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ac153e4423a..76f0a4e7c2e5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -331,8 +331,8 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) -/* stacked device can/does support atomic writes */ -#define BLK_FEAT_ATOMIC_WRITES_STACKED \ +/* atomic writes enabled */ +#define BLK_FEAT_ATOMIC_WRITES \ ((__force blk_features_t)(1u << 16)) /* -- cgit v1.2.3 From 60295b944ff6805e677c48ae4178532b207d43be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Jan 2025 16:41:24 -0500 Subject: tracing: gfp: Fix the GFP enum values shown for user space tracing tools Tracing tools like perf and trace-cmd read the /sys/kernel/tracing/events/*/*/format files to know how to parse the data and also how to print it. For the "print fmt" portion of that file, if anything uses an enum that is not exported to the tracing system, user space will not be able to parse it. The GFP flags use to be defines, and defines get translated in the print fmt sections. But now they are converted to use enums, which is not. The mm_page_alloc trace event format use to have: print fmt: "page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", REC->pfn != -1UL ? (((struct page *)vmemmap_base) + (REC->pfn)) : ((void *)0), REC->pfn != -1UL ? REC->pfn : 0, REC->order, REC->migratetype, (REC->gfp_flags) ? __print_flags(REC->gfp_flags, "|", {( unsigned long)(((((((( gfp_t)(0x400u|0x800u)) | (( gfp_t)0x40u) | (( gfp_t)0x80u) | (( gfp_t)0x100000u)) | (( gfp_t)0x02u)) | (( gfp_t)0x08u) | (( gfp_t)0)) | (( gfp_t)0x40000u) | (( gfp_t)0x80000u) | (( gfp_t)0x2000u)) & ~(( gfp_t)(0x400u|0x800u))) | (( gfp_t)0x400u)), "GFP_TRANSHUGE"}, {( unsigned long)((((((( gfp_t)(0x400u|0x800u)) | (( gfp_t)0x40u) | (( gfp_t)0x80u) | (( gfp_t)0x100000u)) | (( gfp_t)0x02u)) | (( gfp_t)0x08u) | (( gfp_t)0)) ... Where the GFP values are shown and not their names. But after the GFP flags were converted to use enums, it has: print fmt: "page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s", REC->pfn != -1UL ? (vmemmap + (REC->pfn)) : ((void *)0), REC->pfn != -1UL ? REC->pfn : 0, REC->order, REC->migratetype, (REC->gfp_flags) ? __print_flags(REC->gfp_flags, "|", {( unsigned long)(((((((( gfp_t)(((((1UL))) << (___GFP_DIRECT_RECLAIM_BIT))|((((1UL))) << (___GFP_KSWAPD_RECLAIM_BIT)))) | (( gfp_t)((((1UL))) << (___GFP_IO_BIT))) | (( gfp_t)((((1UL))) << (___GFP_FS_BIT))) | (( gfp_t)((((1UL))) << (___GFP_HARDWALL_BIT)))) | (( gfp_t)((((1UL))) << (___GFP_HIGHMEM_BIT)))) | (( gfp_t)((((1UL))) << (___GFP_MOVABLE_BIT))) | (( gfp_t)0)) | (( gfp_t)((((1UL))) << (___GFP_COMP_BIT))) ... Where the enums names like ___GFP_KSWAPD_RECLAIM_BIT are shown and not their values. User space has no way to convert these names to their values and the output will fail to parse. What is shown is now: mm_page_alloc: page=0xffffffff981685f3 pfn=0x1d1ac1 order=0 migratetype=1 gfp_flags=0x140cca The TRACE_DEFINE_ENUM() macro was created to handle enums in the print fmt files. This causes them to be replaced at boot up with the numbers, so that user space tooling can parse it. 
By using this macro, the output is back to the human readable: mm_page_alloc: page=0xffffffff981685f3 pfn=0x122233 order=0 migratetype=1 gfp_flags=GFP_HIGHUSER_MOVABLE|__GFP_COMP Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Veronika Molnarova Cc: Suren Baghdasaryan Cc: Linus Torvalds Link: https://lore.kernel.org/20250116214438.749504792@goodmis.org Reported-by: Michael Petlan Closes: https://lore.kernel.org/all/87be5f7c-1a0-dad-daa0-54e342efaea7@redhat.com/ Fixes: 772dd0342727c ("mm: enumerate all gfp flags") Signed-off-by: Steven Rostedt (Google) --- include/trace/events/mmflags.h | 63 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'include') diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index bb8a59c6caa2..d36c857dd249 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -13,6 +13,69 @@ * Thus most bits set go first. */ +/* These define the values that are enums (the bits) */ +#define TRACE_GFP_FLAGS_GENERAL \ + TRACE_GFP_EM(DMA) \ + TRACE_GFP_EM(HIGHMEM) \ + TRACE_GFP_EM(DMA32) \ + TRACE_GFP_EM(MOVABLE) \ + TRACE_GFP_EM(RECLAIMABLE) \ + TRACE_GFP_EM(HIGH) \ + TRACE_GFP_EM(IO) \ + TRACE_GFP_EM(FS) \ + TRACE_GFP_EM(ZERO) \ + TRACE_GFP_EM(DIRECT_RECLAIM) \ + TRACE_GFP_EM(KSWAPD_RECLAIM) \ + TRACE_GFP_EM(WRITE) \ + TRACE_GFP_EM(NOWARN) \ + TRACE_GFP_EM(RETRY_MAYFAIL) \ + TRACE_GFP_EM(NOFAIL) \ + TRACE_GFP_EM(NORETRY) \ + TRACE_GFP_EM(MEMALLOC) \ + TRACE_GFP_EM(COMP) \ + TRACE_GFP_EM(NOMEMALLOC) \ + TRACE_GFP_EM(HARDWALL) \ + TRACE_GFP_EM(THISNODE) \ + TRACE_GFP_EM(ACCOUNT) \ + TRACE_GFP_EM(ZEROTAGS) + +#ifdef CONFIG_KASAN_HW_TAGS +# define TRACE_GFP_FLAGS_KASAN \ + TRACE_GFP_EM(SKIP_ZERO) \ + TRACE_GFP_EM(SKIP_KASAN) +#else +# define TRACE_GFP_FLAGS_KASAN +#endif + +#ifdef CONFIG_LOCKDEP +# define TRACE_GFP_FLAGS_LOCKDEP \ + TRACE_GFP_EM(NOLOCKDEP) +#else +# define TRACE_GFP_FLAGS_LOCKDEP +#endif + +#ifdef CONFIG_SLAB_OBJ_EXT +# define TRACE_GFP_FLAGS_SLAB \ + TRACE_GFP_EM(NO_OBJ_EXT) +#else +# define TRACE_GFP_FLAGS_SLAB +#endif + +#define TRACE_GFP_FLAGS \ + TRACE_GFP_FLAGS_GENERAL \ + TRACE_GFP_FLAGS_KASAN \ + TRACE_GFP_FLAGS_LOCKDEP \ + TRACE_GFP_FLAGS_SLAB + +#undef TRACE_GFP_EM +#define TRACE_GFP_EM(a) TRACE_DEFINE_ENUM(___GFP_##a##_BIT); + +TRACE_GFP_FLAGS + +/* Just in case these are ever used */ +TRACE_DEFINE_ENUM(___GFP_UNUSED_BIT); +TRACE_DEFINE_ENUM(___GFP_LAST_BIT); + #define gfpflag_string(flag) {(__force unsigned long)flag, #flag} #define __def_gfpflag_names \ -- cgit v1.2.3 From 3194e36488e2dae05f9a822c73eaa367e47b2e3d Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 16 Jan 2025 17:02:56 +0000 Subject: dm-table: atomic writes support Support stacking atomic write limits for DM devices. All the pre-existing code in blk_stack_atomic_writes_limits() already takes care of finding the aggregrate limits from the bottom devices. Feature flag DM_TARGET_ATOMIC_WRITES is introduced so that atomic writes can be enabled on personalities selectively. This is to ensure that atomic writes are only enabled when verified to be working properly (for a specific personality). In addition, it just may not make sense to enable atomic writes on some personalities (so this flag also helps there). 
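A sketch of how a target opts in (hypothetical target type; other fields elided):

	static struct target_type foo_target = {
		.name     = "foo",
		.version  = {1, 0, 0},
		.features = DM_TARGET_ATOMIC_WRITES,
		.module   = THIS_MODULE,
	};
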
Signed-off-by: John Garry Reviewed-by: Mike Snitzer Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 3 +++ include/uapi/linux/dm-ioctl.h | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 8321f65897f3..bcc6d7b69470 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -299,6 +299,9 @@ struct target_type { #define dm_target_supports_mixed_zoned_model(type) (false) #endif +#define DM_TARGET_ATOMIC_WRITES 0x00000400 +#define dm_target_supports_atomic_writes(type) ((type)->features & DM_TARGET_ATOMIC_WRITES) + struct dm_target { struct dm_table *table; struct target_type *type; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 1990b5700f69..b08c7378164d 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -286,9 +286,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 48 +#define DM_VERSION_MINOR 49 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2023-03-01)" +#define DM_VERSION_EXTRA "-ioctl (2025-01-17)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 61bc24ac974a4873e3040765e640f62fe99d6226 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 19:15:58 -0500 Subject: make sure that DNAME_INLINE_LEN is a multiple of word size ... calling the number of words DNAME_INLINE_WORDS. The next step will be to have a structure to hold inline name arrays (both in dentry and in name_snapshot) and use that to alias the existing arrays of unsigned char there. That will allow both full-structure copies and convenient word-by-word accesses. Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/dcache.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..42dd89beaf4e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,15 +68,17 @@ extern const struct qstr dotdot_name; * large memory footprint increase). */ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 40 /* 192 bytes */ +# define DNAME_INLINE_WORDS 5 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ +# define DNAME_INLINE_WORDS 9 /* 128 bytes */ # else -# define DNAME_INLINE_LEN 44 /* 128 bytes */ +# define DNAME_INLINE_WORDS 11 /* 128 bytes */ # endif #endif +#define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) + #define d_lock d_lockref.lock struct dentry { -- cgit v1.2.3 From 58cf9c383c5c686668082f83f7e0f3e0bd5cc2e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 9 Dec 2024 19:35:36 -0500 Subject: dcache: back inline names with a struct-wrapped array of unsigned long ... so that they can be copied with struct assignment (which generates better code) and accessed word-by-word. The type is union shortname_storage; it's a union of arrays of unsigned char and unsigned long. struct name_snapshot.inline_name turned into union shortname_storage; users (all in fs/dcache.c) adjusted. struct dentry.d_iname has some users outside of fs/dcache.c; to reduce the amount of noise in commit, it is replaced with union shortname_storage d_shortname and d_iname is turned into a macro that expands to d_shortname.string (similar to d_lock handling). 
That compat macro is temporary - most of the remaining instances will be taken out by debugfs series, and once that is merged and few others are taken care of this will go away. Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Al Viro --- include/linux/dcache.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 42dd89beaf4e..8bc567a35718 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -79,7 +79,13 @@ extern const struct qstr dotdot_name; #define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long)) +union shortname_store { + unsigned char string[DNAME_INLINE_LEN]; + unsigned long words[DNAME_INLINE_WORDS]; +}; + #define d_lock d_lockref.lock +#define d_iname d_shortname.string struct dentry { /* RCU lookup touched fields */ @@ -90,7 +96,7 @@ struct dentry { struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + union shortname_store d_shortname; /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ @@ -591,7 +597,7 @@ static inline struct inode *d_real_inode(const struct dentry *dentry) struct name_snapshot { struct qstr name; - unsigned char inline_name[DNAME_INLINE_LEN]; + union shortname_store inline_name; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); -- cgit v1.2.3 From 6a128cdf1926b20a94d6af7d7d03b76ba19a4f8b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 16 Jan 2025 12:46:25 +0200 Subject: net: ethtool: ts: add separate counter for unconfirmed one-step TX timestamps For packets with two-step timestamp requests, the hardware timestamp comes back to the driver through a confirmation mechanism of sorts, which allows the driver to confidently bump the successful "pkts" counter. For one-step PTP, the NIC is supposed to autonomously insert its hardware TX timestamp in the packet headers while simultaneously transmitting it. There may be a confirmation that this was done successfully, or there may not. None of the current drivers which implement ethtool_ops :: get_ts_stats() also support HWTSTAMP_TX_ONESTEP_SYNC or HWTSTAMP_TX_ONESTEP_SYNC, so it is a bit unclear which model to follow. But there are NICs, such as DSA, where there is no transmit confirmation at all. Here, it would be wrong / misleading to increment the successful "pkts" counter, because one-step PTP packets can be dropped on TX just like any other packets. So introduce a special counter which signifies "yes, an attempt was made, but we don't know whether it also exited the port or not". I expect that for one-step PTP packets where a confirmation is available, the "pkts" counter would be bumped. 
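A hedged sketch of a driver reporting the new counter (hypothetical driver state; only the relevant fields shown):

	static void foo_get_ts_stats(struct net_device *dev,
				     struct ethtool_ts_stats *ts_stats)
	{
		struct foo_priv *priv = netdev_priv(dev);

		ts_stats->pkts = priv->tx_ts_completed;
		ts_stats->onestep_pkts_unconfirmed = priv->tx_onestep_sent;
	}
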
Signed-off-by: Vladimir Oltean Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250116104628.123555-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 7 +++++++ include/uapi/linux/ethtool_netlink_generated.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e4136b0df892..64301ddf2f59 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -559,6 +559,12 @@ struct ethtool_rmon_stats { /** * struct ethtool_ts_stats - HW timestamping statistics * @pkts: Number of packets successfully timestamped by the hardware. + * @onestep_pkts_unconfirmed: Number of PTP packets with one-step TX + * timestamping that were sent, but for which the + * device offers no confirmation whether they made + * it onto the wire and the timestamp was inserted + * in the originTimestamp or correctionField, or + * not. * @lost: Number of hardware timestamping requests where the timestamping * information from the hardware never arrived for submission with * the skb. @@ -571,6 +577,7 @@ struct ethtool_rmon_stats { struct ethtool_ts_stats { struct_group(tx_stats, u64 pkts; + u64 onestep_pkts_unconfirmed; u64 lost; u64 err; ); diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 2e17ff348f89..fe24c3459ac0 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -382,6 +382,7 @@ enum { ETHTOOL_A_TS_STAT_TX_PKTS, ETHTOOL_A_TS_STAT_TX_LOST, ETHTOOL_A_TS_STAT_TX_ERR, + ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED, __ETHTOOL_A_TS_STAT_CNT, ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1) -- cgit v1.2.3 From 4b0a3ffa799b1c21a6be010c0c1efa012251080d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 16 Jan 2025 12:46:26 +0200 Subject: net: dsa: implement get_ts_stats ethtool operation for user ports Integrate with the standard infrastructure for reporting hardware packet timestamping statistics. Signed-off-by: Vladimir Oltean Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250116104628.123555-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9640d5c67f56..a0a9481c52c2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -906,6 +906,8 @@ struct dsa_switch_ops { void (*get_rmon_stats)(struct dsa_switch *ds, int port, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges); + void (*get_ts_stats)(struct dsa_switch *ds, int port, + struct ethtool_ts_stats *ts_stats); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); void (*get_pause_stats)(struct dsa_switch *ds, int port, -- cgit v1.2.3 From 8fbd24f3d17b9d26af6c66a28053fbf5f6da330d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 16 Jan 2025 12:46:27 +0200 Subject: net: mscc: ocelot: add TX timestamping statistics Add an u64 hardware timestamping statistics structure for each ocelot port. Export a function from the common switch library for reporting them to ethtool. This is called by the ocelot switchdev front-end for now. Note that for the switchdev driver, we report the one-step PTP packets as unconfirmed, even though in principle, for some transmission mechanisms like FDMA, we may be able to confirm transmission and bump the "pkts" counter in ocelot_fdma_tx_cleanup() instead. 
I don't have access to hardware which uses the switchdev front-end, and I've kept the implementation simple. Signed-off-by: Vladimir Oltean Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20250116104628.123555-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/soc/mscc/ocelot.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2db9ae0575b6..6db7fc9dbaa4 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -759,6 +759,14 @@ struct ocelot_mm_state { u8 active_preemptible_tcs; }; +struct ocelot_ts_stats { + u64 pkts; + u64 onestep_pkts_unconfirmed; + u64 lost; + u64 err; + struct u64_stats_sync syncp; +}; + struct ocelot_port; struct ocelot_port { @@ -778,6 +786,7 @@ struct ocelot_port { phy_interface_t phy_mode; + struct ocelot_ts_stats *ts_stats; struct sk_buff_head tx_skbs; unsigned int trap_proto; @@ -1023,6 +1032,8 @@ void ocelot_port_get_eth_mac_stats(struct ocelot *ocelot, int port, struct ethtool_eth_mac_stats *mac_stats); void ocelot_port_get_eth_phy_stats(struct ocelot *ocelot, int port, struct ethtool_eth_phy_stats *phy_stats); +void ocelot_port_get_ts_stats(struct ocelot *ocelot, int port, + struct ethtool_ts_stats *ts_stats); int ocelot_get_ts_info(struct ocelot *ocelot, int port, struct kernel_ethtool_ts_info *info); void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); -- cgit v1.2.3 From 46757a3e7d50dac923888e7fbe68377736f13c70 Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Fri, 17 Jan 2025 04:17:38 +1030 Subject: ALSA: FCP: Add Focusrite Control Protocol driver Add a new kernel driver for the Focusrite Control Protocol (FCP), which is used by Focusrite Scarlett 2nd Gen, 3rd Gen, 4th Gen, Clarett USB, Clarett+, and Vocaster series audio interfaces. This driver provides a user-space control interface via ALSA's hwdep subsystem. Unlike the existing Scarlett2 driver which implements all ALSA controls in kernel space, this new FCP driver takes a different approach by providing a minimal kernel interface that allows a user-space driver to send FCP commands and receive notifications. The only control implemented in kernel space is the Level Meter, since it requires frequent polling of volatile data. While this driver supports all interfaces that the Scarlett2 driver works with, it is initially enabled only for 4th Gen 16i16, 18i16, and 18i20 interfaces that are not supported by the Scarlett2 driver. Signed-off-by: Geoffrey D. Bennett Link: https://patch.msgid.link/597741a9b1198b965561547511d3d345f91cba20.1737048528.git.g@b4.vu Signed-off-by: Takashi Iwai --- include/uapi/sound/fcp.h | 120 +++++++++++++++++++++++++++++++++++++++++++++++ include/uapi/sound/tlv.h | 2 + 2 files changed, 122 insertions(+) create mode 100644 include/uapi/sound/fcp.h (limited to 'include') diff --git a/include/uapi/sound/fcp.h b/include/uapi/sound/fcp.h new file mode 100644 index 000000000000..aea428956d8f --- /dev/null +++ b/include/uapi/sound/fcp.h @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Focusrite Control Protocol Driver for ALSA + * + * Copyright (c) 2024-2025 by Geoffrey D. Bennett + */ +/* + * DOC: FCP (Focusrite Control Protocol) User-Space API + * + * This header defines the interface between the FCP kernel driver and + * user-space programs to enable the use of the proprietary features + * available in Focusrite USB audio interfaces. 
This includes Scarlett + * 2nd Gen, 3rd Gen, 4th Gen, Clarett USB, Clarett+, and Vocaster + * series devices. + * + * The interface is provided via ALSA's hwdep interface. Opening the + * hwdep device requires CAP_SYS_RAWIO privileges as this interface + * provides near-direct access. + * + * For details on the FCP protocol, refer to the kernel scarlett2 + * driver in sound/usb/mixer_scarlett2.c and the fcp-support project + * at https://github.com/geoffreybennett/fcp-support + * + * For examples of using these IOCTLs, see the fcp-server source in + * the fcp-support project. + * + * IOCTL Interface + * -------------- + * FCP_IOCTL_PVERSION: + * Returns the protocol version supported by the driver. + * + * FCP_IOCTL_INIT: + * Initialises the protocol and synchronises sequence numbers + * between the driver and device. Must be called at least once + * before sending commands. Can be safely called again at any time. + * + * FCP_IOCTL_CMD: + * Sends an FCP command to the device and returns the response. + * Requires prior initialisation via FCP_IOCTL_INIT. + * + * FCP_IOCTL_SET_METER_MAP: + * Configures the Level Meter control's mapping between device + * meters and control channels. Requires FCP_IOCTL_INIT to have been + * called first. The map size and number of slots cannot be changed + * after initial configuration, although the map itself can be + * updated. Once configured, the Level Meter remains functional even + * after the hwdep device is closed. + * + * FCP_IOCTL_SET_METER_LABELS: + * Set the labels for the Level Meter control. Requires + * FCP_IOCTL_SET_METER_MAP to have been called first. labels[] + * should contain a sequence of null-terminated labels corresponding + * to the control's channels. + */ +#ifndef __UAPI_SOUND_FCP_H +#define __UAPI_SOUND_FCP_H + +#include +#include + +#define FCP_HWDEP_MAJOR 2 +#define FCP_HWDEP_MINOR 0 +#define FCP_HWDEP_SUBMINOR 0 + +#define FCP_HWDEP_VERSION \ + ((FCP_HWDEP_MAJOR << 16) | \ + (FCP_HWDEP_MINOR << 8) | \ + FCP_HWDEP_SUBMINOR) + +#define FCP_HWDEP_VERSION_MAJOR(v) (((v) >> 16) & 0xFF) +#define FCP_HWDEP_VERSION_MINOR(v) (((v) >> 8) & 0xFF) +#define FCP_HWDEP_VERSION_SUBMINOR(v) ((v) & 0xFF) + +/* Get protocol version */ +#define FCP_IOCTL_PVERSION _IOR('S', 0x60, int) + +/* Start the protocol */ + +/* Step 0 and step 2 responses are variable length and placed in + * resp[] one after the other. + */ +struct fcp_init { + __u16 step0_resp_size; + __u16 step2_resp_size; + __u32 init1_opcode; + __u32 init2_opcode; + __u8 resp[]; +} __attribute__((packed)); + +#define FCP_IOCTL_INIT _IOWR('S', 0x64, struct fcp_init) + +/* Perform a command */ + +/* The request data is placed in data[] and the response data will + * overwrite it. 
+ */ +struct fcp_cmd { + __u32 opcode; + __u16 req_size; + __u16 resp_size; + __u8 data[]; +} __attribute__((packed)); +#define FCP_IOCTL_CMD _IOWR('S', 0x65, struct fcp_cmd) + +/* Set the meter map */ +struct fcp_meter_map { + __u16 map_size; + __u16 meter_slots; + __s16 map[]; +} __attribute__((packed)); +#define FCP_IOCTL_SET_METER_MAP _IOW('S', 0x66, struct fcp_meter_map) + +/* Set the meter labels */ +struct fcp_meter_labels { + __u16 labels_size; + char labels[]; +} __attribute__((packed)); +#define FCP_IOCTL_SET_METER_LABELS _IOW('S', 0x67, struct fcp_meter_labels) + +#endif /* __UAPI_SOUND_FCP_H */ diff --git a/include/uapi/sound/tlv.h b/include/uapi/sound/tlv.h index b99a2414b53d..5bb55c80e095 100644 --- a/include/uapi/sound/tlv.h +++ b/include/uapi/sound/tlv.h @@ -18,6 +18,8 @@ #define SNDRV_CTL_TLVT_CHMAP_VAR 0x102 /* channels freely swappable */ #define SNDRV_CTL_TLVT_CHMAP_PAIRED 0x103 /* pair-wise swappable */ +#define SNDRV_CTL_TLVT_FCP_CHANNEL_LABELS 0x110 /* channel labels */ + /* * TLV structure is right behind the struct snd_ctl_tlv: * unsigned int type - see SNDRV_CTL_TLVT_* -- cgit v1.2.3 From 3156ceb222414456084d964f43ada071206039b8 Mon Sep 17 00:00:00 2001 From: Rickard Andersson Date: Mon, 16 Dec 2024 09:54:19 +0100 Subject: ubi: Expose interface for detailed erase counters Using the ioctl command 'UBI_IOCECNFO' user space can obtain detailed erase counter information of all blocks of a device. Signed-off-by: Rickard Andersson Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- include/uapi/mtd/ubi-user.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/uapi/mtd/ubi-user.h b/include/uapi/mtd/ubi-user.h index e1571603175e..aa872a41ffb9 100644 --- a/include/uapi/mtd/ubi-user.h +++ b/include/uapi/mtd/ubi-user.h @@ -175,6 +175,8 @@ #define UBI_IOCRPEB _IOW(UBI_IOC_MAGIC, 4, __s32) /* Force scrubbing on the specified PEB */ #define UBI_IOCSPEB _IOW(UBI_IOC_MAGIC, 5, __s32) +/* Read detailed device erase counter information */ +#define UBI_IOCECNFO _IOWR(UBI_IOC_MAGIC, 6, struct ubi_ecinfo_req) /* ioctl commands of the UBI control character device */ @@ -412,6 +414,37 @@ struct ubi_rnvol_req { } ents[UBI_MAX_RNVOL]; } __packed; +/** + * struct ubi_ecinfo_req - a data structure used for requesting and receiving + * erase block counter information from a UBI device. + * + * @start: index of first physical erase block to read (in) + * @length: number of erase counters to read (in) + * @read_length: number of erase counters that was actually read (out) + * @padding: reserved for future, not used, has to be zeroed + * @erase_counters: array of erase counter values (out) + * + * This structure is used to retrieve erase counter information for a specified + * range of PEBs on a UBI device. + * Erase counters are read from @start and attempts to read @length number of + * erase counters. + * The retrieved values are stored in the @erase_counters array. It is the + * responsibility of the caller to allocate enough memory for storing @length + * elements in the @erase_counters array. + * If a block is bad or if the erase counter is unknown the corresponding value + * in the array will be set to -1. + * The @read_length field will indicate the number of erase counters actually + * read. Typically @read_length will be limited due to memory or the number of + * PEBs on the UBI device. 
+ */ +struct ubi_ecinfo_req { + __s32 start; + __s32 length; + __s32 read_length; + __s8 padding[16]; + __s32 erase_counters[]; +} __packed; + /** * struct ubi_leb_change_req - a data structure used in atomic LEB change * requests. -- cgit v1.2.3 From 4bf97069239bcfca9840936313c7ac35a6e04488 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 13 Nov 2024 18:21:20 -0800 Subject: riscv: Add ghostwrite vulnerability Follow the patterns of the other architectures that use GENERIC_CPU_VULNERABILITIES for riscv to introduce the ghostwrite vulnerability and mitigation. The mitigation is to disable all vector support, which is accomplished by clearing the bit from the cpufeature field. Ghostwrite only affects thead c9xx CPUs that implement xtheadvector, so the vulnerability will only be mitigated on these CPUs. Signed-off-by: Charlie Jenkins Tested-by: Yangyu Chen Link: https://lore.kernel.org/r/20241113-xtheadvector-v11-14-236c22791ef9@rivosinc.com Signed-off-by: Palmer Dabbelt --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index bdcec1732445..6a0a8f1c7c90 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -77,6 +77,7 @@ extern ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_ghostwrite(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From f546e8033d8f3e45d49622f04ca2fde650b80f6d Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 9 Dec 2024 14:06:23 +0100 Subject: PCI: Export pci_intx_unmanaged() and pcim_intx() pci_intx() is a hybrid function which sometimes performs devres operations, depending on whether pcim_enable_device() has been used to enable the pci_dev. This sometimes-managed nature of the function is problematic. Notably, it causes the function to allocate under some circumstances which makes it unusable from interrupt context. Export pcim_intx() (which is always managed) and rename __pcim_intx() (which is never managed) to pci_intx_unmanaged() and export it as well. Then all callers of pci_intx() can be ported to the version they need, depending on whether they use pci_enable_device() or pcim_enable_device().
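A short, hedged sketch of where this series is heading; the probe functions are hypothetical and error handling is trimmed. Drivers that enable the device with pcim_enable_device() move to the always-managed pcim_intx(), while pci_enable_device() users keep calling pci_intx(), which the follow-up patch below turns into a purely unmanaged helper.

static int foo_probe_managed(struct pci_dev *pdev)
{
	int ret;

	ret = pcim_enable_device(pdev);	/* devres-managed enable */
	if (ret)
		return ret;

	/* always managed: INTx is disabled again on driver detach */
	return pcim_intx(pdev, 1);
}

static int foo_probe_unmanaged(struct pci_dev *pdev)
{
	int ret;

	ret = pci_enable_device(pdev);	/* plain enable */
	if (ret)
		return ret;

	/* never managed; the driver disables INTx itself on remove */
	pci_intx(pdev, 1);
	return 0;
}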
Link: https://lore.kernel.org/r/20241209130632.132074-3-pstanner@redhat.com Signed-off-by: Philipp Stanner [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Damien Le Moal --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index db9b47ce3eef..b5eb8bda655d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1350,6 +1350,7 @@ int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); void pci_disable_parity(struct pci_dev *dev); +void pci_intx_unmanaged(struct pci_dev *pdev, int enable); void pci_intx(struct pci_dev *dev, int enable); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); @@ -2297,6 +2298,7 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass, struct pci_dev *dev) { } #endif +int pcim_intx(struct pci_dev *pdev, int enabled); int pcim_request_all_regions(struct pci_dev *pdev, const char *name); void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar, -- cgit v1.2.3 From dfa2f4d5f9e5d757700cefa8ee480099889f1c69 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 9 Dec 2024 14:06:33 +0100 Subject: PCI: Remove devres from pci_intx() pci_intx() is a hybrid function which can sometimes be managed through devres. This hybrid nature is undesirable. Since all users of pci_intx() have by now been ported either to always-managed pcim_intx() or never-managed pci_intx_unmanaged(), the devres functionality can be removed from pci_intx(). Consequently, pci_intx_unmanaged() is now redundant, because pci_intx() itself is now unmanaged. Remove the devres functionality from pci_intx(). Have all users of pci_intx_unmanaged() call pci_intx(). Remove pci_intx_unmanaged(). Link: https://lore.kernel.org/r/20241209130632.132074-13-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Acked-by: Paolo Abeni --- include/linux/pci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index b5eb8bda655d..f05903dd7695 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1350,7 +1350,6 @@ int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); void pci_disable_parity(struct pci_dev *dev); -void pci_intx_unmanaged(struct pci_dev *pdev, int enable); void pci_intx(struct pci_dev *dev, int enable); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); -- cgit v1.2.3 From 4dbf0155dfcfa65440b5f70d3e905261208b387e Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 19 Nov 2024 14:44:19 -0500 Subject: of: address: Add parent_bus_addr to struct of_pci_range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new field called 'parent_bus_addr' to struct of_pci_range to use when retrieving parent bus address information. Refer to the diagram below to better understand that the bus fabric in some systems (like i.MX8QXP) does not always use a 1:1 address map between input and output. 
Currently, many controller drivers use the cpu_addr_fixup() callback that would often hardcode address translation directly in the code, e.g., "cpu_addr & CDNS_PLAT_CPU_TO_BUS_ADDR" or "cpu_addr + BUS_IATU_OFFSET", etc., even though those translations *should* be described via DT. However, the cpu_addr_fixup() can be eliminated if DT correctly reflects hardware behavior and drivers use 'parent_bus_addr' in struct of_pci_range. ┌─────────┐ ┌────────────┐ ┌─────┐ │ │ IA: 0x8ff8_0000 │ │ │ CPU ├───►│ ┌────►├─────────────────┐ │ PCI │ └─────┘ │ │ │ IA: 0x8ff0_0000 │ │ │ CPU Addr │ │ ┌─►├─────────────┐ │ │ Controller │ 0x7ff8_0000─┼───┘ │ │ │ │ │ │ │ │ │ │ │ │ │ PCI Addr 0x7ff0_0000─┼──────┘ │ │ └──► IOSpace ─┼────────────► │ │ │ │ │ 0 0x7000_0000─┼────────►├─────────┐ │ │ │ └─────────┘ │ └──────► CfgSpace ─┼────────────► BUS Fabric │ │ │ 0 │ │ │ └──────────► MemSpace ─┼────────────► IA: 0x8000_0000 │ │ 0x8000_0000 └────────────┘ bus@5f000000 { compatible = "simple-bus"; #address-cells = <1>; #size-cells = <1>; ranges = <0x80000000 0x0 0x70000000 0x10000000>; pcie@5f010000 { compatible = "fsl,imx8q-pcie"; reg = <0x5f010000 0x10000>, <0x8ff00000 0x80000>; reg-names = "dbi", "config"; #address-cells = <3>; #size-cells = <2>; device_type = "pci"; bus-range = <0x00 0xff>; ranges = <0x81000000 0 0x00000000 0x8ff80000 0 0x00010000>, <0x82000000 0 0x80000000 0x80000000 0 0x0ff00000>; ... }; }; In the diagram above, the 'parent_bus_addr' field in struct of_pci_range can indicate internal address (IA) address information. Link: https://lore.kernel.org/r/20241119-pci_fixup_addr-v8-1-c4bfa5193288@nxp.com Signed-off-by: Frank Li [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring (Arm) Acked-by: Manivannan Sadhasivam --- include/linux/of_address.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/of_address.h b/include/linux/of_address.h index 9e034363788a..0cff90365391 100644 --- a/include/linux/of_address.h +++ b/include/linux/of_address.h @@ -26,6 +26,7 @@ struct of_pci_range { u64 bus_addr; }; u64 cpu_addr; + u64 parent_bus_addr; u64 size; u32 flags; }; -- cgit v1.2.3 From e4b1d67e71419c4af581890ecea84b04920d4116 Mon Sep 17 00:00:00 2001 From: Valentina Fernandez Date: Tue, 17 Dec 2024 11:31:34 +0000 Subject: mailbox: add Microchip IPC support Add a mailbox controller driver for the Microchip Inter-processor Communication (IPC), which is used to send and receive data between processors. The driver uses the RISC-V Supervisor Binary Interface (SBI) to communicate with software running in machine mode (M-mode) to access the IPC hardware block. Additional details on the Microchip vendor extension and the IPC function IDs described in the driver can be found in the following documentation: https://github.com/linux4microchip/microchip-sbi-ecall-extension This SBI interface in this driver is compatible with the Mi-V Inter-hart Communication (IHC) IP. Transmitting and receiving data through the mailbox framework is done through struct mchp_ipc_msg. 
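For context, a minimal sketch of how a mailbox client might hand a message to this controller through the generic mailbox API; the channel is assumed to have been requested already, and the function name and payload are invented.

#include <linux/mailbox_client.h>
#include <linux/mailbox/mchp-ipc.h>

static int foo_ipc_send(struct mbox_chan *chan)
{
	u32 payload[4] = { 0x11, 0x22, 0x33, 0x44 };
	struct mchp_ipc_msg msg = {
		.buf  = payload,
		.size = sizeof(payload),
	};

	/* the framework passes &msg down to the controller's send_data() */
	return mbox_send_message(chan, &msg);
}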
Signed-off-by: Valentina Fernandez Signed-off-by: Jassi Brar --- include/linux/mailbox/mchp-ipc.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/linux/mailbox/mchp-ipc.h (limited to 'include') diff --git a/include/linux/mailbox/mchp-ipc.h b/include/linux/mailbox/mchp-ipc.h new file mode 100644 index 000000000000..f084ac9e291b --- /dev/null +++ b/include/linux/mailbox/mchp-ipc.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + *Copyright (c) 2024 Microchip Technology Inc. All rights reserved. + */ + +#ifndef _LINUX_MCHP_IPC_H_ +#define _LINUX_MCHP_IPC_H_ + +#include +#include + +struct mchp_ipc_msg { + u32 *buf; + u16 size; +}; + +struct mchp_ipc_sbi_chan { + void *buf_base_tx; + void *buf_base_rx; + void *msg_buf_tx; + void *msg_buf_rx; + phys_addr_t buf_base_tx_addr; + phys_addr_t buf_base_rx_addr; + phys_addr_t msg_buf_tx_addr; + phys_addr_t msg_buf_rx_addr; + int chan_aggregated_irq; + int mp_irq; + int mc_irq; + u32 id; + u32 max_msg_size; +}; + +#endif /* _LINUX_MCHP_IPC_H_ */ -- cgit v1.2.3 From fbf7e5ce408e0619072e84e93e875de52f2b5fa5 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 15 Jan 2025 14:18:15 +0000 Subject: mailbox: add Samsung Exynos driver The Samsung Exynos mailbox controller, used on Google GS101 SoC, has 16 flag bits for hardware interrupt generation and a shared register for passing mailbox messages. When the controller is used by the ACPM interface the shared register is ignored and the mailbox controller acts as a doorbell. The controller just raises the interrupt to APM after the ACPM interface has written the message to SRAM. Add support for the Samsung Exynos mailbox controller. Signed-off-by: Tudor Ambarus Signed-off-by: Jassi Brar --- include/linux/mailbox/exynos-message.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/mailbox/exynos-message.h (limited to 'include') diff --git a/include/linux/mailbox/exynos-message.h b/include/linux/mailbox/exynos-message.h new file mode 100644 index 000000000000..5a9ed5ce2046 --- /dev/null +++ b/include/linux/mailbox/exynos-message.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Exynos mailbox message. + * + * Copyright 2024 Linaro Ltd. + */ + +#ifndef _LINUX_EXYNOS_MESSAGE_H_ +#define _LINUX_EXYNOS_MESSAGE_H_ + +#define EXYNOS_MBOX_CHAN_TYPE_DOORBELL 0 +#define EXYNOS_MBOX_CHAN_TYPE_DATA 1 + +struct exynos_mbox_msg { + unsigned int chan_id; + unsigned int chan_type; +}; + +#endif /* _LINUX_EXYNOS_MESSAGE_H_ */ -- cgit v1.2.3 From 12d5151be01017401b8d2681f3c975265a233eaf Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 16 Jan 2025 22:38:40 +0100 Subject: net: phy: remove leftovers from switch to linkmode bitmaps We have some leftovers from the switch to linkmode bitmaps which - have never been used - are not used any longer - have no user outside phy_device.c So remove them. 
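As a reminder of why the old SUPPORTED_* based defines could go: PHY feature sets are expressed as linkmode bitmaps nowadays. A hedged sketch (the function name is invented) of building roughly what PHY_DEFAULT_FEATURES plus the 100BASE-T modes used to describe, using the linkmode helpers:

static void foo_build_features(struct phy_device *phydev)
{
	__ETHTOOL_DECLARE_LINK_MODE_MASK(features) = { 0, };

	linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, features);
	linkmode_set_bit(ETHTOOL_LINK_MODE_TP_BIT, features);
	linkmode_set_bit(ETHTOOL_LINK_MODE_MII_BIT, features);
	linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, features);
	linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, features);

	linkmode_copy(phydev->supported, features);
}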
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/5493b96e-88bb-4230-a911-322659ec5167@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 244f747b3cd9..19f076a71f94 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -32,19 +32,6 @@ #include #include -#define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ - SUPPORTED_TP | \ - SUPPORTED_MII) - -#define PHY_10BT_FEATURES (SUPPORTED_10baseT_Half | \ - SUPPORTED_10baseT_Full) - -#define PHY_100BT_FEATURES (SUPPORTED_100baseT_Half | \ - SUPPORTED_100baseT_Full) - -#define PHY_1000BT_FEATURES (SUPPORTED_1000baseT_Half | \ - SUPPORTED_1000baseT_Full) - extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1s_p2mp_features) __ro_after_init; @@ -62,16 +49,11 @@ extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_eee_cap2_features) __ro_after_init; #define PHY_BASIC_T1S_P2MP_FEATURES ((unsigned long *)&phy_basic_t1s_p2mp_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) -#define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) -#define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) -#define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) #define PHY_EEE_CAP1_FEATURES ((unsigned long *)&phy_eee_cap1_features) #define PHY_EEE_CAP2_FEATURES ((unsigned long *)&phy_eee_cap2_features) extern const int phy_basic_ports_array[3]; -extern const int phy_fibre_port_array[1]; -extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; extern const int phy_basic_t1_features_array[3]; extern const int phy_basic_t1s_p2mp_features_array[2]; -- cgit v1.2.3 From f50fcd23c9b9d99bf03d0ab9f30cba4665e6326e Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 16:30:59 +0000 Subject: crypto: asymmetric_keys - Remove unused key_being_used_for[] key_being_used_for[] is an unused array of textual names for the elements of the enum key_being_used_for. It was added in 2015 by commit 99db44350672 ("PKCS#7: Appropriately restrict authenticated attributes and content type") Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Herbert Xu --- include/linux/verification.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/verification.h b/include/linux/verification.h index cb2d47f28091..4f3022d081c3 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -38,8 +38,6 @@ enum key_being_used_for { VERIFYING_UNSPECIFIED_SIGNATURE, NR__KEY_BEING_USED_FOR }; -extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR]; - #ifdef CONFIG_SYSTEM_DATA_VERIFICATION struct key; -- cgit v1.2.3 From 8d738c1869f611955d91d8d0fd0012d9ef207201 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 6 Jan 2025 23:40:50 +0100 Subject: netfilter: nf_tables: fix set size with rbtree backend The existing rbtree implementation uses singleton elements to represent ranges, however, userspace provides a set size according to the number of ranges in the set. 
Adjust provided userspace set size to the number of singleton elements in the kernel by multiplying the range by two. Check if the no-match all-zero element is already in the set, in such case release one slot in the set size. Fixes: 0ed6389c483d ("netfilter: nf_tables: rename set implementations") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0027beca5cd5..f6958118986a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -442,6 +442,9 @@ struct nft_set_ext; * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements + * @ksize: kernel set size + * @usize: userspace set size + * @adjust_maxsize: delta to adjust maximum set size * @commit: commit set elements * @abort: abort set elements * @privsize: function to return size of set private data @@ -495,6 +498,9 @@ struct nft_set_ops { const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); + u32 (*ksize)(u32 size); + u32 (*usize)(u32 size); + u32 (*adjust_maxsize)(const struct nft_set *set); void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], -- cgit v1.2.3 From b7c2d793c28cda7dbb67d6b427e3280b7c1e601a Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 9 Jan 2025 18:31:33 +0100 Subject: netfilter: nf_tables: Store user-defined hook ifname Prepare for hooks with NULL ops.dev pointer (due to non-existent device) and store the interface name and length as specified by the user upon creation. No functional change intended. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6958118986a..bd93d085b6fb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1201,6 +1201,8 @@ struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; + char ifname[IFNAMSIZ]; + u8 ifnamelen; }; /** -- cgit v1.2.3 From fc0133428e7ad65aa6b7c8e65ccfe86e469e4512 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 9 Jan 2025 18:31:36 +0100 Subject: netfilter: nf_tables: Tolerate chains with no remaining hooks Do not drop a netdev-family chain if the last interface it is registered for vanishes. Users dumping and storing the ruleset upon shutdown to restore it upon next boot may otherwise lose the chain and all contained rules. They will still lose the list of devices, a later patch will fix that. For now, this aligns the event handler's behaviour with that for flowtables. The controversal situation at netns exit should be no problem here: event handler will unregister the hooks, core nftables cleanup code will drop the chain itself. 
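Returning to the rbtree set size fix above: a minimal sketch of what the new ksize()/usize() callbacks amount to for a backend that stores each range as two singleton elements. This is illustrative rather than the literal upstream code, and it leaves out the all-zero end-element adjustment mentioned in that commit message.

static u32 foo_rbtree_ksize(u32 size)
{
	/* userspace counts ranges; the kernel stores two elements each */
	return size * 2;
}

static u32 foo_rbtree_usize(u32 size)
{
	/* report the size back to userspace in range terms */
	return size / 2;
}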
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bd93d085b6fb..60d5dcdb289c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1238,8 +1238,6 @@ static inline bool nft_is_base_chain(const struct nft_chain *chain) return chain->flags & NFT_CHAIN_BASE; } -int __nft_release_basechain(struct nft_ctx *ctx); - unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) -- cgit v1.2.3 From 31768596b15aa8c9c55f078acad29d0238c8269b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jan 2025 00:50:35 +0100 Subject: netfilter: conntrack: remove skb argument from nf_ct_refresh Its not used (and could be NULL), so remove it. This allows to use nf_ct_refresh in places where we don't have an skb without having to double-check that skb == NULL would be safe. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index cba3ccf03fcc..3cbf29dd0b71 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -204,8 +204,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - u32 extra_jiffies, bool do_acct); + u32 extra_jiffies, unsigned int bytes); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, @@ -213,15 +212,14 @@ static inline void nf_ct_refresh_acct(struct nf_conn *ct, const struct sk_buff *skb, u32 extra_jiffies) { - __nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true); + __nf_ct_refresh_acct(ct, ctinfo, extra_jiffies, skb->len); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, - const struct sk_buff *skb, u32 extra_jiffies) { - __nf_ct_refresh_acct(ct, 0, skb, extra_jiffies, false); + __nf_ct_refresh_acct(ct, 0, extra_jiffies, 0); } /* kill conntrack and do accounting */ -- cgit v1.2.3 From 03428ca5cee9f0792edc996c06ce4514816af1fb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jan 2025 00:50:36 +0100 Subject: netfilter: conntrack: rework offload nf_conn timeout extension logic Offload nf_conn entries may not see traffic for a very long time. To prevent incorrect 'ct is stale' checks during nf_conntrack table lookup, the gc worker extends the timeout nf_conn entries marked for offload to a large value. The existing logic suffers from a few problems. Garbage collection runs without locks, its unlikely but possible that @ct is removed right after the 'offload' bit test. In that case, the timeout of a new/reallocated nf_conn entry will be increased. Prevent this by obtaining a reference count on the ct object and re-check of the confirmed and offload bits. If those are not set, the ct is being removed, skip the timeout extension in this case. Parallel teardown is also problematic: cpu1 cpu2 gc_worker calls flow_offload_teardown() tests OFFLOAD bit, set clear OFFLOAD bit ct->timeout is repaired (e.g. 
set to timeout[UDP_CT_REPLIED]) nf_ct_offload_timeout() called expire value is fetched -> NF_CT_DAY timeout for flow that isn't offloaded (and might not see any further packets). Use cmpxchg: if ct->timeout was repaired after the 2nd 'offload bit' test passed, then ct->timeout will only be updated of ct->timeout was not altered in between. As we already have a gc worker for flowtable entries, ct->timeout repair can be handled from the flowtable gc worker. This avoids having flowtable specific logic in the conntrack core and avoids checking entries that were never offloaded. This allows to remove the nf_ct_offload_timeout helper. Its safe to use in the add case, but not on teardown. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3cbf29dd0b71..3f02a45773e8 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -312,16 +312,6 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct) #define NF_CT_DAY (86400 * HZ) -/* Set an arbitrary timeout large enough not to ever expire, this save - * us a check for the IPS_OFFLOAD_BIT from the packet path via - * nf_ct_is_expired(). - */ -static inline void nf_ct_offload_timeout(struct nf_conn *ct) -{ - if (nf_ct_expires(ct) < NF_CT_DAY / 2) - WRITE_ONCE(ct->timeout, nfct_time_stamp + NF_CT_DAY); -} - struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); -- cgit v1.2.3 From fdbaf5163331342e90a2c29b87629021f4c15f0c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 14 Jan 2025 00:50:38 +0100 Subject: netfilter: flowtable: add CLOSING state tcp rst/fin packet triggers an immediate teardown of the flow which results in sending flows back to the classic forwarding path. This behaviour was introduced by: da5984e51063 ("netfilter: nf_flow_table: add support for sending flows back to the slow path") b6f27d322a0a ("netfilter: nf_flow_table: tear down TCP flows if RST or FIN was seen") whose goal is to expedite removal of flow entries from the hardware table. Before these patches, the flow was released after the flow entry timed out. However, this approach leads to packet races when restoring the conntrack state as well as late flow re-offload situations when the TCP connection is ending. This patch adds a new CLOSING state that is is entered when tcp rst/fin packet is seen. This allows for an early removal of the flow entry from the hardware table. But the flow entry still remains in software, so tcp packets to shut down the flow are not sent back to slow path. If syn packet is seen from this new CLOSING state, then this flow enters teardown state, ct state is set to TCP_CONNTRACK_CLOSE state and packet is sent to slow path, so this TCP reopen scenario can be handled by conntrack. TCP_CONNTRACK_CLOSE provides a small timeout that aims at quickly releasing this stale entry from the conntrack table. Moreover, skip hardware re-offload from flowtable software packet if the flow is in CLOSING state. 
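A rough sketch of the race-free timeout extension described above, as it could run from the flowtable gc worker; the helper name is invented, and the refcount plus confirmed/offload re-checks from the commit message are reduced to a single test.

static void foo_extend_ct_timeout(struct nf_conn *ct)
{
	u32 expires, extend;

	if (!nf_ct_is_confirmed(ct))
		return;

	expires = READ_ONCE(ct->timeout);
	extend = nfct_time_stamp + NF_CT_DAY;

	/* only extend if nothing (e.g. flow_offload_teardown()) repaired
	 * the timeout between the read above and this update
	 */
	if (expires - nfct_time_stamp < NF_CT_DAY / 2)
		cmpxchg(&ct->timeout, expires, extend);
}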
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b63d53bb9dd6..d711642e78b5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -163,6 +163,7 @@ struct flow_offload_tuple_rhash { enum nf_flow_flags { NF_FLOW_SNAT, NF_FLOW_DNAT, + NF_FLOW_CLOSING, NF_FLOW_TEARDOWN, NF_FLOW_HW, NF_FLOW_HW_DYING, -- cgit v1.2.3 From e876695aab1e3d4743e11633219cea456820660b Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Fri, 17 Jan 2025 22:27:24 +0800 Subject: cpumask: Rephrase comments for cpumask_any*() APIs The cpumask_any*() APIs comment states that it returns a "random" cpu within the given cpumask. However it's not actually random as random itself stands a meaning for uniform distribution. cpumask_any*() APIs are a naming convention for the caller to states that it doesn't care which CPU it gets, so change "random" to "arbitrary" would be more appropriate. CC: Mark Rutland Signed-off-by: I Hsin Cheng Reviewed-by: Kuan-Wei Chiu Signed-off-by: Yury Norov --- include/linux/cpumask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 590d8438514c..36a890d0dd57 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -391,7 +391,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** - * cpumask_any_but - return a "random" in a cpumask, but not this one. + * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * @@ -411,7 +411,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) } /** - * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, but not this one. + * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore @@ -840,7 +840,7 @@ void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) } /** - * cpumask_any - pick a "random" cpu from *srcp + * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. @@ -848,7 +848,7 @@ void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) #define cpumask_any(srcp) cpumask_first(srcp) /** - * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 + * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * -- cgit v1.2.3 From b489e7946656ed67fea1a30f5103eb62a8686e04 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 9 Jan 2024 11:57:48 +0530 Subject: PM / OPP: Add reference counting helpers for Rust implementation To ensure that resources such as OPP tables or OPP nodes are not freed while in use by the Rust implementation, it is necessary to increment their reference count from Rust code. This commit introduces a new helper function, dev_pm_opp_get_opp_table_ref(), to increment the reference count of an OPP table and declares the existing helper dev_pm_opp_get() in pm_opp.h. 
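To illustrate the intended lifetime rules, here is a hedged sketch of a consumer pinning an OPP table and an individual OPP; the function is hypothetical and error handling is abbreviated. The new dev_pm_opp_get_opp_table_ref() would be used when an already-looked-up table pointer is handed to another owner (for example the Rust bindings) that drops the reference itself with dev_pm_opp_put_opp_table().

static int foo_pin_opp(struct device *dev, unsigned long freq)
{
	struct opp_table *opp_table;
	struct dev_pm_opp *opp;

	opp_table = dev_pm_opp_get_opp_table(dev);	/* takes a reference */
	if (IS_ERR(opp_table))
		return PTR_ERR(opp_table);

	opp = dev_pm_opp_find_freq_ceil(dev, &freq);	/* also referenced */
	if (!IS_ERR(opp)) {
		/* ... use opp and opp_table ... */
		dev_pm_opp_put(opp);
	}

	dev_pm_opp_put_opp_table(opp_table);		/* drop our reference */
	return 0;
}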
Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 414146abfe81..c247317aae38 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -100,6 +100,7 @@ struct dev_pm_opp_data { #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); +void dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table); void dev_pm_opp_put_opp_table(struct opp_table *opp_table); unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index); @@ -160,6 +161,7 @@ struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, unsigned int *bw, int index); +void dev_pm_opp_get(struct dev_pm_opp *opp); void dev_pm_opp_put(struct dev_pm_opp *opp); int dev_pm_opp_add_dynamic(struct device *dev, struct dev_pm_opp_data *opp); @@ -205,6 +207,8 @@ static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device * return ERR_PTR(-EOPNOTSUPP); } +static inline void dev_pm_opp_get_opp_table_ref(struct opp_table *opp_table) {} + static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} static inline unsigned long dev_pm_opp_get_bw(struct dev_pm_opp *opp, bool peak, int index) @@ -341,6 +345,8 @@ static inline struct dev_pm_opp *dev_pm_opp_find_bw_floor(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline void dev_pm_opp_get(struct dev_pm_opp *opp) {} + static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {} static inline int -- cgit v1.2.3 From a12c76a03386e32413ae8eaaefa337e491880632 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 15 Jan 2025 09:27:54 -0500 Subject: net: sched: refine software bypass handling in tc_run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch addresses issues with filter counting in block (tcf_block), particularly for software bypass scenarios, by introducing a more accurate mechanism using useswcnt. Previously, filtercnt and skipswcnt were introduced by: Commit 2081fd3445fe ("net: sched: cls_api: add filter counter") and Commit f631ef39d819 ("net: sched: cls_api: add skip_sw counter") filtercnt tracked all tp (tcf_proto) objects added to a block, and skipswcnt counted tp objects with the skipsw attribute set. The problem is: a single tp can contain multiple filters, some with skipsw and others without. The current implementation fails in the case: When the first filter in a tp has skipsw, both skipswcnt and filtercnt are incremented, then adding a second filter without skipsw to the same tp does not modify these counters because tp->counted is already set. This results in bypass software behavior based solely on skipswcnt equaling filtercnt, even when the block includes filters without skipsw. Consequently, filters without skipsw are inadvertently bypassed. To address this, the patch introduces useswcnt in block to explicitly count tp objects containing at least one filter without skipsw. Key changes include: Whenever a filter without skipsw is added, its tp is marked with usesw and counted in useswcnt. tc_run() now uses useswcnt to determine software bypass, eliminating reliance on filtercnt and skipswcnt. This refined approach prevents software bypass for blocks containing mixed filters, ensuring correct behavior in tc_run(). 
Additionally, as atomic operations on useswcnt ensure thread safety and tp->lock guards access to tp->usesw and tp->counted, the broader lock down_write(&block->cb_lock) is no longer required in tc_new_tfilter(), and this resolves a performance regression caused by the filter counting mechanism during parallel filter insertions. The improvement can be demonstrated using the following script: # cat insert_tc_rules.sh tc qdisc add dev ens1f0np0 ingress for i in $(seq 16); do taskset -c $i tc -b rules_$i.txt & done wait Each of rules_$i.txt files above includes 100000 tc filter rules to a mlx5 driver NIC ens1f0np0. Without this patch: # time sh insert_tc_rules.sh real 0m50.780s user 0m23.556s sys 4m13.032s With this patch: # time sh insert_tc_rules.sh real 0m17.718s user 0m7.807s sys 3m45.050s Fixes: 047f340b36fc ("net: sched: make skip_sw actually skip software") Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Reviewed-by: Asbjørn Sloth Tønnesen Tested-by: Asbjørn Sloth Tønnesen Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 13 +++++++++++-- include/net/sch_generic.h | 5 ++--- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 22c5ab4269d7..c64fd896b1f9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -75,11 +75,11 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block) } #ifdef CONFIG_NET_CLS_ACT -DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); static inline bool tcf_block_bypass_sw(struct tcf_block *block) { - return block && block->bypass_wanted; + return block && !atomic_read(&block->useswcnt); } #endif @@ -760,6 +760,15 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->extack = extack; } +static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) +{ + if (tp->usesw) + return; + if (tc_skip_sw(flags) && tc_in_hw(flags)) + return; + tp->usesw = true; +} + #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 8074322dd636..d635c5b47eba 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -425,6 +425,7 @@ struct tcf_proto { spinlock_t lock; bool deleting; bool counted; + bool usesw; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; @@ -474,9 +475,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; - bool bypass_wanted; - atomic_t filtercnt; /* Number of filters */ - atomic_t skipswcnt; /* Number of skip_sw filters */ + atomic_t useswcnt; atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ -- cgit v1.2.3 From 457bb7970a0f10effd6b4ca9e1057727963c473a Mon Sep 17 00:00:00 2001 From: Ales Nezbeda Date: Fri, 17 Jan 2025 12:22:28 +0100 Subject: net: macsec: Add endianness annotations in salt struct This change resolves warning produced by sparse tool as currently there is a mismatch between normal generic type in salt and endian annotated type in macsec driver code. Endian annotated types should be used here. 
Sparse output: warning: restricted ssci_t degrades to integer warning: incorrect type in assignment (different base types) expected restricted ssci_t [usertype] ssci got unsigned int warning: restricted __be64 degrades to integer warning: incorrect type in assignment (different base types) expected restricted __be64 [usertype] pn got unsigned long long Signed-off-by: Ales Nezbeda Reviewed-by: Simon Horman Reviewed-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/macsec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index de216cbc6b05..bc7de5b53e54 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -38,8 +38,8 @@ struct metadata_dst; typedef union salt { struct { - u32 ssci; - u64 pn; + ssci_t ssci; + __be64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; -- cgit v1.2.3 From 454d402481d45af79ee7eea7e64bce02bbbe9766 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:34 +0900 Subject: net: dropreason: Gather SOCKET_ drop reasons. The following patch adds a new drop reason starting with the SOCKET_ prefix. Let's gather the existing SOCKET_ reasons. Note that the order is not part of uAPI. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index ed864934e20b..f3714cbea50d 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,9 +6,10 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_FILTER) \ + FN(SOCKET_RCVBUFF) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ - FN(SOCKET_FILTER) \ FN(UDP_CSUM) \ FN(NETFILTER_DROP) \ FN(OTHERHOST) \ @@ -18,7 +19,6 @@ FN(UNICAST_IN_L2_MULTICAST) \ FN(XFRM_POLICY) \ FN(IP_NOPROTO) \ - FN(SOCKET_RCVBUFF) \ FN(PROTO_MEM) \ FN(TCP_AUTH_HDR) \ FN(TCP_MD5NOTFOUND) \ @@ -138,12 +138,14 @@ enum skb_drop_reason { * 3) no valid child socket during 3WHS process */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ + SKB_DROP_REASON_SOCKET_FILTER, + /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ + SKB_DROP_REASON_SOCKET_RCVBUFF, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ SKB_DROP_REASON_TCP_CSUM, - /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ - SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */ SKB_DROP_REASON_UDP_CSUM, /** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */ @@ -174,8 +176,6 @@ enum skb_drop_reason { SKB_DROP_REASON_XFRM_POLICY, /** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */ SKB_DROP_REASON_IP_NOPROTO, - /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ - SKB_DROP_REASON_SOCKET_RCVBUFF, /** * @SKB_DROP_REASON_PROTO_MEM: proto memory limitation, such as * udp packet drop out of udp_memory_allocated. -- cgit v1.2.3 From c32f0bd7d4838982c6724fca0da92353f27c6f88 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:35 +0900 Subject: af_unix: Set drop reason in unix_release_sock(). unix_release_sock() is called when the last refcnt of struct file is released. 
Let's define a new drop reason SKB_DROP_REASON_SOCKET_CLOSE and set it for kfree_skb() in unix_release_sock(). # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX) >>> s1.send(b'hello world') >>> s2.close() # cat /sys/kernel/tracing/trace_pipe ... python3-280 ... kfree_skb: ... protocol=0 location=unix_release_sock+0x260/0x420 reason: SOCKET_CLOSE To be precise, unix_release_sock() is also called for a new child socket in unix_stream_connect() when something fails, but the new sk does not have skb in the recv queue then and no event is logged. Note that only tcp_inbound_ao_hash() uses a similar drop reason, SKB_DROP_REASON_TCP_CLOSE, and this can be generalised later. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index f3714cbea50d..b9e7ff853ce3 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,6 +6,7 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ FN(PKT_TOO_SMALL) \ @@ -138,6 +139,8 @@ enum skb_drop_reason { * 3) no valid child socket during 3WHS process */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_CLOSE: socket is close()d */ + SKB_DROP_REASON_SOCKET_CLOSE, /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ -- cgit v1.2.3 From 533643b091dd6e246d57caf81e6892fa9cbb1cc9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:38 +0900 Subject: af_unix: Set drop reason in manage_oob(). AF_UNIX SOCK_STREAM socket supports MSG_OOB. When OOB data is sent to a socket, recv() will break at that point. If the next recv() does not have MSG_OOB, the normal data following the OOB data is returned. Then, the OOB skb is dropped. Let's define a new drop reason for that case in manage_oob(). # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX) >>> s1.send(b'a', MSG_OOB) >>> s1.send(b'b') >>> s2.recv(2) b'b' # cat /sys/kernel/tracing/trace_pipe ... python3-223 ... kfree_skb: ... location=unix_stream_read_generic+0x59e/0xc20 reason: UNIX_SKIP_OOB Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index b9e7ff853ce3..d6c9d841eb11 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -9,6 +9,7 @@ FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ + FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ FN(UDP_CSUM) \ @@ -145,6 +146,11 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by + * recv() without MSG_OOB so dropped. 
+ */ + SKB_DROP_REASON_UNIX_SKIP_OOB, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ -- cgit v1.2.3 From b3e365bbf4f47b8f76b25b0fcf3f38916ca53e42 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:40 +0900 Subject: af_unix: Set drop reason in unix_dgram_disconnected(). unix_dgram_disconnected() is called from two places: 1. when a connect()ed socket dis-connect()s or re-connect()s to another socket 2. when sendmsg() fails because the peer socket that the client has connect()ed to has been close()d Then, the client's recv queue is purged to remove all messages from the old peer socket. Let's define a new drop reason for that case. # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> >>> # s1 has a message from s2 >>> s1, s2 = socketpair(AF_UNIX, SOCK_DGRAM) >>> s2.send(b'hello world') >>> >>> # re-connect() drops the message from s2 >>> s3 = socket(AF_UNIX, SOCK_DGRAM) >>> s3.bind('') >>> s1.connect(s3.getsockname()) # cat /sys/kernel/tracing/trace_pipe python3-250 ... kfree_skb: ... location=skb_queue_purge_reason+0xdc/0x110 reason: UNIX_DISCONNECT Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d6c9d841eb11..32a34dfe8cc5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -9,6 +9,7 @@ FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ + FN(UNIX_DISCONNECT) \ FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ @@ -146,6 +147,12 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_DISCONNECT: recv queue is purged when SOCK_DGRAM + * or SOCK_SEQPACKET socket re-connect()s to another socket or notices + * during send() that the peer has been close()d. + */ + SKB_DROP_REASON_UNIX_DISCONNECT, /** * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by * recv() without MSG_OOB so dropped. -- cgit v1.2.3 From 3c836451ca9041cfb32a7d8f59ea15b3b991bbb3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:11 -0800 Subject: net: move HDS config from ethtool state Separate the HDS config from the ethtool state struct. The HDS config contains just simple parameters, not state. Having it as a separate struct will make it easier to clone / copy and also long term potentially make it per-queue. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ---- include/linux/netdevice.h | 3 +++ include/net/netdev_queues.h | 10 ++++++++++ 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 64301ddf2f59..870994cc3ef7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1171,16 +1171,12 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. - * @hds_thresh: HDS Threshold value. - * @hds_config: HDS value from userspace. 
* @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; - u32 hds_thresh; - u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8308d9c75918..173a8b3a9eb2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -63,6 +63,7 @@ struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; +struct netdev_config; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; @@ -2410,6 +2411,8 @@ struct net_device { const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; + /** @cfg: net_device queue-related configuration */ + struct netdev_config *cfg; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 5ca019d294ca..b02bb9f109d5 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -4,6 +4,16 @@ #include +/** + * struct netdev_config - queue-related configuration for a netdev + * @hds_thresh: HDS Threshold value. + * @hds_config: HDS value from userspace. + */ +struct netdev_config { + u32 hds_thresh; + u8 hds_config; +}; + /* See the netdev.yaml spec for definition of each statistic */ struct netdev_queue_stats_rx { u64 bytes; -- cgit v1.2.3 From 32ad1f7a050d0c17e1e52e1dfdd9f6221ae20ef9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:13 -0800 Subject: net: provide pending ring configuration in net_device Record the pending configuration in net_device struct. ethtool core duplicates the current config and the specific handlers (for now just ringparam) can modify it. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 173a8b3a9eb2..8da4c61f97b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2413,6 +2413,12 @@ struct net_device { /** @cfg: net_device queue-related configuration */ struct netdev_config *cfg; + /** + * @cfg_pending: same as @cfg but when device is being actively + * reconfigured includes any changes to the configuration + * requested by the user, but which may or may not be rejected. + */ + struct netdev_config *cfg_pending; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ -- cgit v1.2.3 From f7a6082b5e4c15f34fd766cf0960f7e082009c54 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 15 Jan 2025 17:05:58 +0900 Subject: ipv6: Add __in6_dev_get_rtnl_net(). We will convert rtnl_lock() with rtnl_net_lock(), and we want to convert __in6_dev_get() too. __in6_dev_get() uses rcu_dereference_rtnl(), but as written in its comment, rtnl_dereference() or rcu_dereference() is preferable. Let's add __in6_dev_get_rtnl_net() that uses rtnl_net_dereference(). We can add the RCU version helper later if needed. 
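For illustration only, and not part of the patch above: a minimal sketch of the calling pattern the new helper targets, assuming the per-netns RTNL helpers rtnl_net_lock()/rtnl_net_unlock() that this conversion effort is built around; example_ipv6_work() and do_ipv6_work() are hypothetical names.

    static void example_ipv6_work(struct net *net, struct net_device *dev)
    {
            struct inet6_dev *idev;

            rtnl_net_lock(net);
            idev = __in6_dev_get_rtnl_net(dev);     /* checked by rtnl_net_dereference() */
            if (idev)
                    do_ipv6_work(idev);             /* hypothetical work done under the per-netns lock */
            rtnl_net_unlock(net);
    }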
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250115080608.28127-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f8f91b2038ea..9e5e95988b9e 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -347,6 +347,11 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) +{ + return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); +} + /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device -- cgit v1.2.3 From 5f56d41a21b6d17b59525958a57feffe597b7de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Mon, 25 Nov 2024 12:03:40 +0100 Subject: keys: drop shadowing dead prototype MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The global variable pkcs7 does not exist. Drop the variable declaration, but keep the struct prototype needed for is_key_on_revocation_list(). Reported by clang: ./include/keys/system_keyring.h:104:67: warning: declaration shadows a variable in the global scope [-Wshadow] 104 | static inline int is_key_on_revocation_list(struct pkcs7_message *pkcs7) | ^ ./include/keys/system_keyring.h:76:30: note: previous declaration is here 76 | extern struct pkcs7_message *pkcs7; | ^ Fixes: 56c5812623f9 ("certs: Add EFI_CERT_X509_GUID support for dbx entries") Signed-off-by: Christian Göttsche Signed-off-by: Jarkko Sakkinen --- include/keys/system_keyring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index 8365adf842ef..a6c2897bcc63 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -73,7 +73,6 @@ static inline void __init set_machine_trusted_keys(struct key *keyring) } #endif -extern struct pkcs7_message *pkcs7; #ifdef CONFIG_SYSTEM_BLACKLIST_KEYRING extern int mark_hash_blacklisted(const u8 *hash, size_t hash_len, enum blacklist_hash_type hash_type); @@ -93,6 +92,7 @@ static inline int is_binary_blacklisted(const u8 *hash, size_t hash_len) } #endif +struct pkcs7_message; #ifdef CONFIG_SYSTEM_REVOCATION_LIST extern int add_key_to_revocation_list(const char *data, size_t size); extern int is_key_on_revocation_list(struct pkcs7_message *pkcs7); -- cgit v1.2.3 From d6658d3338f84173fb55c9d6c6cdfa57f879712d Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 16 Nov 2024 04:20:45 +0100 Subject: misc: pci_endpoint_test: Add consecutive BAR test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a more advanced BAR test that writes all BARs in one go, and then reads them back and verifies that the value matches the BAR number bitwise OR'ed with offset, this allows us to verify: - The BAR number was what we intended to read - The offset was what we intended to read This allows us to detect potential address translation issues on the EP. Reading back the BAR directly after writing will not allow us to detect the case where inbound address translation on the endpoint incorrectly causes multiple BARs to be redirected to the same memory region (within the EP). 
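For illustration only, and not part of the patch above: a rough sketch of the two-phase check described in this commit message. NUM_BARS, bar_size(), pcitest_bar_write() and pcitest_bar_read() are hypothetical stand-ins for however the endpoint test driver actually touches BAR memory.

    static int example_consecutive_bar_check(void)
    {
            u32 bar, off;

            /* Phase 1: write every BAR first; value = BAR number OR'ed with offset. */
            for (bar = 0; bar < NUM_BARS; bar++)
                    for (off = 0; off < bar_size(bar); off += sizeof(u32))
                            pcitest_bar_write(bar, off, bar | off);

            /*
             * Phase 2: only then read everything back.  If inbound address
             * translation maps two BARs onto the same EP memory, the BAR-number
             * bits read back from one of them will not match what phase 1 wrote.
             */
            for (bar = 0; bar < NUM_BARS; bar++)
                    for (off = 0; off < bar_size(bar); off += sizeof(u32))
                            if (pcitest_bar_read(bar, off) != (bar | off))
                                    return -EIO;

            return 0;
    }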
Link: https://lore.kernel.org/r/20241116032045.2574168-2-cassel@kernel.org Signed-off-by: Niklas Cassel Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Reviewed-by: Manivannan Sadhasivam --- include/uapi/linux/pcitest.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h index 94b46b043b53..acd261f49866 100644 --- a/include/uapi/linux/pcitest.h +++ b/include/uapi/linux/pcitest.h @@ -20,6 +20,7 @@ #define PCITEST_MSIX _IOW('P', 0x7, int) #define PCITEST_SET_IRQTYPE _IOW('P', 0x8, int) #define PCITEST_GET_IRQTYPE _IO('P', 0x9) +#define PCITEST_BARS _IO('P', 0xa) #define PCITEST_CLEAR_IRQ _IO('P', 0x10) #define PCITEST_FLAGS_USE_DMA 0x00000001 -- cgit v1.2.3 From ead11ac50ad4b8ef1b64806e962ea984862d96ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 13 Jan 2025 17:29:03 -0500 Subject: nfs: fix incorrect error handling in LOCALIO nfs4_stat_to_errno() expects a NFSv4 error code as an argument and returns a POSIX errno. The problem is LOCALIO is passing nfs4_stat_to_errno() the POSIX errno return values from filp->f_op->read_iter(), filp->f_op->write_iter() and vfs_fsync_range(). So the POSIX errno that nfs_local_pgio_done() and nfs_local_commit_done() are passing to nfs4_stat_to_errno() are failing to match any NFSv4 error code, which results in nfs4_stat_to_errno() defaulting to returning -EREMOTEIO. This causes assertions in upper layers due to -EREMOTEIO not being a valid NFSv4 error code. Fix this by updating nfs_local_pgio_done() and nfs_local_commit_done() to use the new nfs_localio_errno_to_nfs4_stat() to map a POSIX errno to an NFSv4 error code. Care was taken to factor out nfs4_errtbl_common[] to avoid duplicating the same NFS error to errno table. nfs4_errtbl_common[] is checked first by both nfs4_stat_to_errno and nfs_localio_errno_to_nfs4_stat before they check their own more specialized tables (nfs4_errtbl[] and nfs4_errtbl_localio[] respectively). While auditing the associated error mapping tables, the (ab)use of -1 for the last table entry was removed in favor of using ARRAY_SIZE to iterate the nfs_errtbl[] and nfs4_errtbl[]. And 'errno_NFSERR_IO' was removed because it caused needless obfuscation. Fixes: 70ba381e1a431 ("nfs: add LOCALIO support") Reported-by: Trond Myklebust Signed-off-by: Mike Snitzer Signed-off-by: Anna Schumaker --- include/linux/nfs_common.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h index 5fc02df88252..a541c3a02887 100644 --- a/include/linux/nfs_common.h +++ b/include/linux/nfs_common.h @@ -9,9 +9,10 @@ #include /* Mapping from NFS error code to "errno" error code. */ -#define errno_NFSERR_IO EIO int nfs_stat_to_errno(enum nfs_stat status); int nfs4_stat_to_errno(int stat); +__u32 nfs_localio_errno_to_nfs4_stat(int errno); + #endif /* _LINUX_NFS_COMMON_H */ -- cgit v1.2.3 From e721f619e3ec9bae08bf419c3944cf1e6966c821 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 20 Jan 2025 11:50:51 -0800 Subject: iommufd: Fix struct iommu_hwpt_pgfault init and padding The iommu_hwpt_pgfault is used to report IO page fault data to userspace, but iommufd_fault_fops_read was never zeroing its padding. This leaks the content of the kernel stack memory to userspace. Also, the iommufd uAPI requires explicit padding and use of __aligned_u64 to ensure ABI compatibility's with 32 bit. 
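For illustration only, and not part of the patch: with @addr declared __aligned_u64 and the explicit @__reserved member, the layout is the same on 32-bit and 64-bit builds (see the pahole output below), so it could be pinned with compile-time guards such as the ones sketched here; on an i386-style 32-bit ABI, where a plain __u64 is only 4-byte aligned, both assertions would fail before this fix.

    static_assert(offsetof(struct iommu_hwpt_pgfault, addr) == 24);
    static_assert(sizeof(struct iommu_hwpt_pgfault) == 40);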
pahole result, before: struct iommu_hwpt_pgfault { __u32 flags; /* 0 4 */ __u32 dev_id; /* 4 4 */ __u32 pasid; /* 8 4 */ __u32 grpid; /* 12 4 */ __u32 perm; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ __u64 addr; /* 24 8 */ __u32 length; /* 32 4 */ __u32 cookie; /* 36 4 */ /* size: 40, cachelines: 1, members: 8 */ /* sum members: 36, holes: 1, sum holes: 4 */ /* last cacheline: 40 bytes */ }; pahole result, after: struct iommu_hwpt_pgfault { __u32 flags; /* 0 4 */ __u32 dev_id; /* 4 4 */ __u32 pasid; /* 8 4 */ __u32 grpid; /* 12 4 */ __u32 perm; /* 16 4 */ __u32 __reserved; /* 20 4 */ __u64 addr __attribute__((__aligned__(8))); /* 24 8 */ __u32 length; /* 32 4 */ __u32 cookie; /* 36 4 */ /* size: 40, cachelines: 1, members: 9 */ /* forced alignments: 1 */ /* last cacheline: 40 bytes */ } __attribute__((__aligned__(8))); Fixes: c714f15860fc ("iommufd: Add fault and response message definitions") Link: https://patch.msgid.link/r/20250120195051.2450-1-nicolinc@nvidia.com Cc: stable@vger.kernel.org Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 34810f6ae2b5..78747b24bd0f 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -868,6 +868,7 @@ enum iommu_hwpt_pgfault_perm { * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @__reserved: Must be 0. * @addr: Fault address * @length: a hint of how much data the requestor is expecting to fetch. For * example, if the PRI initiator knows it is going to do a 10MB @@ -883,7 +884,8 @@ struct iommu_hwpt_pgfault { __u32 pasid; __u32 grpid; __u32 perm; - __u64 addr; + __u32 __reserved; + __aligned_u64 addr; __u32 length; __u32 cookie; }; -- cgit v1.2.3 From 8e1d32273ab7d06b6f78771e05824bfab01141f4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:54 -0500 Subject: nfs_common: make include/linux/nfs4.h include generated nfs4_1.h In the long run, the NFS development community intends to autogenerate a lot of the XDR handling code. Both the NFS client and server include "include/linux/nfs4.hi". That file was hand-rolled, and some of the symbols in it conflict with the autogenerated symbols. Add a small nfs4_1.x to Documentation that currently just has the necessary definitions for the delstid draft, and generate the relevant header and source files. Make include/linux/nfs4.h include the generated include/linux/sunrpc/xdrgen/nfs4_1.h and remove the conflicting definitions from it and nfs_xdr.h. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 7 +- include/linux/nfs_xdr.h | 5 -- include/linux/sunrpc/xdrgen/nfs4_1.h | 124 +++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+), 11 deletions(-) create mode 100644 include/linux/sunrpc/xdrgen/nfs4_1.h (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8d7430d9f218..b90719244775 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -17,6 +17,7 @@ #include #include #include +#include enum nfs4_acl_whotype { NFS4_ACL_WHO_NAMED = 0, @@ -512,12 +513,6 @@ enum { FATTR4_XATTR_SUPPORT = 82, }; -enum { - FATTR4_TIME_DELEG_ACCESS = 84, - FATTR4_TIME_DELEG_MODIFY = 85, - FATTR4_OPEN_ARGUMENTS = 86, -}; - /* * The following internal definitions enable processing the above * attribute bits within 32-bit word boundaries. diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..e74a87bb18a4 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1315,11 +1315,6 @@ struct nfs4_fsid_present_res { #endif /* CONFIG_NFS_V4 */ -struct nfstime4 { - u64 seconds; - u32 nseconds; -}; - #ifdef CONFIG_NFS_V4_1 struct pnfs_commit_bucket { diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h new file mode 100644 index 000000000000..6025ab6b7398 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Generated by xdrgen. Manual edits will be lost. */ +/* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ +/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ + +#ifndef _LINUX_XDRGEN_NFS4_1_DEF_H +#define _LINUX_XDRGEN_NFS4_1_DEF_H + +#include +#include + +typedef s64 int64_t; + +typedef u32 uint32_t; + +typedef struct { + u32 count; + uint32_t *element; +} bitmap4; + +struct nfstime4 { + int64_t seconds; + uint32_t nseconds; +}; + +typedef bool fattr4_offline; + +enum { FATTR4_OFFLINE = 83 }; + +struct open_arguments4 { + bitmap4 oa_share_access; + bitmap4 oa_share_deny; + bitmap4 oa_share_access_want; + bitmap4 oa_open_claim; + bitmap4 oa_create_mode; +}; + +enum open_args_share_access4 { + OPEN_ARGS_SHARE_ACCESS_READ = 1, + OPEN_ARGS_SHARE_ACCESS_WRITE = 2, + OPEN_ARGS_SHARE_ACCESS_BOTH = 3, +}; +typedef enum open_args_share_access4 open_args_share_access4; + +enum open_args_share_deny4 { + OPEN_ARGS_SHARE_DENY_NONE = 0, + OPEN_ARGS_SHARE_DENY_READ = 1, + OPEN_ARGS_SHARE_DENY_WRITE = 2, + OPEN_ARGS_SHARE_DENY_BOTH = 3, +}; +typedef enum open_args_share_deny4 open_args_share_deny4; + +enum open_args_share_access_want4 { + OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG = 3, + OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG = 4, + OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL = 5, + OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 17, + OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 18, + OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20, + OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21, +}; +typedef enum open_args_share_access_want4 open_args_share_access_want4; + +enum open_args_open_claim4 { + OPEN_ARGS_OPEN_CLAIM_NULL = 0, + OPEN_ARGS_OPEN_CLAIM_PREVIOUS = 1, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR = 2, + OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV = 3, + OPEN_ARGS_OPEN_CLAIM_FH = 4, + OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH = 5, + OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6, +}; +typedef enum open_args_open_claim4 open_args_open_claim4; + +enum open_args_createmode4 { + OPEN_ARGS_CREATEMODE_UNCHECKED4 = 0, + 
OPEN_ARGS_CREATE_MODE_GUARDED = 1, + OPEN_ARGS_CREATEMODE_EXCLUSIVE4 = 2, + OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3, +}; +typedef enum open_args_createmode4 open_args_createmode4; + +typedef struct open_arguments4 fattr4_open_arguments; + +enum { FATTR4_OPEN_ARGUMENTS = 86 }; + +enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; + +enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; + +typedef struct nfstime4 fattr4_time_deleg_access; + +typedef struct nfstime4 fattr4_time_deleg_modify; + +enum { FATTR4_TIME_DELEG_ACCESS = 84 }; + +enum { FATTR4_TIME_DELEG_MODIFY = 85 }; + +enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; + +#define NFS4_int64_t_sz \ + (XDR_hyper) +#define NFS4_uint32_t_sz \ + (XDR_unsigned_int) +#define NFS4_bitmap4_sz (XDR_unsigned_int) +#define NFS4_nfstime4_sz \ + (NFS4_int64_t_sz + NFS4_uint32_t_sz) +#define NFS4_fattr4_offline_sz \ + (XDR_bool) +#define NFS4_open_arguments4_sz \ + (NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz + NFS4_bitmap4_sz) +#define NFS4_open_args_share_access4_sz (XDR_int) +#define NFS4_open_args_share_deny4_sz (XDR_int) +#define NFS4_open_args_share_access_want4_sz (XDR_int) +#define NFS4_open_args_open_claim4_sz (XDR_int) +#define NFS4_open_args_createmode4_sz (XDR_int) +#define NFS4_fattr4_open_arguments_sz \ + (NFS4_open_arguments4_sz) +#define NFS4_fattr4_time_deleg_access_sz \ + (NFS4_nfstime4_sz) +#define NFS4_fattr4_time_deleg_modify_sz \ + (NFS4_nfstime4_sz) + +#endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */ -- cgit v1.2.3 From 8dfbea8bde6e976136948421325b24b5bdb76ad3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:55 -0500 Subject: nfsd: switch to autogenerated definitions for open_delegation_type4 Rename the enum with the same name in include/linux/nfs4.h, add the proper enum to nfs4_1.x and regenerate the headers and source files. Do a mass rename of all NFS4_OPEN_DELEGATE_* to OPEN_DELEGATE_* in the nfsd directory. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 2 +- include/linux/sunrpc/xdrgen/nfs4_1.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index b90719244775..71fbebfa43c7 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -366,7 +366,7 @@ enum limit_by4 { NFS4_LIMIT_BLOCKS = 2 }; -enum open_delegation_type4 { +enum nfs4_open_delegation_type4 { NFS4_OPEN_DELEGATE_NONE = 0, NFS4_OPEN_DELEGATE_READ = 1, NFS4_OPEN_DELEGATE_WRITE = 2, diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 6025ab6b7398..9ca83a4a04cf 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. 
*/ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Thu Oct 3 11:30:59 2024 */ +/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -98,6 +98,16 @@ enum { FATTR4_TIME_DELEG_MODIFY = 85 }; enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; +enum open_delegation_type4 { + OPEN_DELEGATE_NONE = 0, + OPEN_DELEGATE_READ = 1, + OPEN_DELEGATE_WRITE = 2, + OPEN_DELEGATE_NONE_EXT = 3, + OPEN_DELEGATE_READ_ATTRS_DELEG = 4, + OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5, +}; +typedef enum open_delegation_type4 open_delegation_type4; + #define NFS4_int64_t_sz \ (XDR_hyper) #define NFS4_uint32_t_sz \ @@ -120,5 +130,6 @@ enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; (NFS4_nfstime4_sz) #define NFS4_fattr4_time_deleg_modify_sz \ (NFS4_nfstime4_sz) +#define NFS4_open_delegation_type4_sz (XDR_int) #endif /* _LINUX_XDRGEN_NFS4_1_DEF_H */ -- cgit v1.2.3 From c9c99a33e2b0083c83a2c29eebfad92c78e16791 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:56 -0500 Subject: nfsd: rename NFS4_SHARE_WANT_* constants to OPEN4_SHARE_ACCESS_WANT_* Add the OPEN4_SHARE_ACCESS_WANT constants from the nfs4.1 and delstid draft into the nfs4_1.x file, and regenerate the headers and source files. Do a mass renaming of NFS4_SHARE_WANT_* to OPEN4_SHARE_ACCESS_WANT_* in the nfsd directory. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/nfs4_1.h | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdrgen/nfs4_1.h b/include/linux/sunrpc/xdrgen/nfs4_1.h index 9ca83a4a04cf..cf21a14aa885 100644 --- a/include/linux/sunrpc/xdrgen/nfs4_1.h +++ b/include/linux/sunrpc/xdrgen/nfs4_1.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by xdrgen. Manual edits will be lost. 
*/ /* XDR specification file: ../../Documentation/sunrpc/xdr/nfs4_1.x */ -/* XDR specification modification time: Sat Oct 12 08:10:54 2024 */ +/* XDR specification modification time: Mon Oct 14 09:10:13 2024 */ #ifndef _LINUX_XDRGEN_NFS4_1_DEF_H #define _LINUX_XDRGEN_NFS4_1_DEF_H @@ -84,8 +84,6 @@ typedef struct open_arguments4 fattr4_open_arguments; enum { FATTR4_OPEN_ARGUMENTS = 86 }; -enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; - enum { OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010 }; typedef struct nfstime4 fattr4_time_deleg_access; @@ -96,8 +94,28 @@ enum { FATTR4_TIME_DELEG_ACCESS = 84 }; enum { FATTR4_TIME_DELEG_MODIFY = 85 }; +enum { OPEN4_SHARE_ACCESS_WANT_DELEG_MASK = 0xFF00 }; + +enum { OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE = 0x0000 }; + +enum { OPEN4_SHARE_ACCESS_WANT_READ_DELEG = 0x0100 }; + +enum { OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG = 0x0200 }; + +enum { OPEN4_SHARE_ACCESS_WANT_ANY_DELEG = 0x0300 }; + +enum { OPEN4_SHARE_ACCESS_WANT_NO_DELEG = 0x0400 }; + +enum { OPEN4_SHARE_ACCESS_WANT_CANCEL = 0x0500 }; + +enum { OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 0x10000 }; + +enum { OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED = 0x20000 }; + enum { OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 0x100000 }; +enum { OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 0x200000 }; + enum open_delegation_type4 { OPEN_DELEGATE_NONE = 0, OPEN_DELEGATE_READ = 1, -- cgit v1.2.3 From cee9b4ef42512a6e57562460a15f18a022c84dda Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:13:59 -0500 Subject: nfsd: rework NFS4_SHARE_WANT_* flag handling The delstid draft adds new NFS4_SHARE_WANT_TYPE_MASK values that don't fit neatly into the existing WANT_MASK or WHEN_MASK. Add a new NFS4_SHARE_WANT_MOD_MASK value and redefine NFS4_SHARE_WANT_MASK to include it. Also fix the checks in nfsd4_deleg_xgrade_none_ext() to check for the flags instead of equality, since there may be modifier flags in the value. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/uapi/linux/nfs4.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index caf4db2fcbb9..4273e0249fcb 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -58,7 +58,7 @@ #define NFS4_SHARE_DENY_BOTH 0x0003 /* nfs41 */ -#define NFS4_SHARE_WANT_MASK 0xFF00 +#define NFS4_SHARE_WANT_TYPE_MASK 0xFF00 #define NFS4_SHARE_WANT_NO_PREFERENCE 0x0000 #define NFS4_SHARE_WANT_READ_DELEG 0x0100 #define NFS4_SHARE_WANT_WRITE_DELEG 0x0200 @@ -66,13 +66,16 @@ #define NFS4_SHARE_WANT_NO_DELEG 0x0400 #define NFS4_SHARE_WANT_CANCEL 0x0500 -#define NFS4_SHARE_WHEN_MASK 0xF0000 +#define NFS4_SHARE_WHEN_MASK 0xF0000 #define NFS4_SHARE_SIGNAL_DELEG_WHEN_RESRC_AVAIL 0x10000 #define NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED 0x20000 +#define NFS4_SHARE_WANT_MOD_MASK 0xF00000 #define NFS4_SHARE_WANT_DELEG_TIMESTAMPS 0x100000 #define NFS4_SHARE_WANT_OPEN_XOR_DELEGATION 0x200000 +#define NFS4_SHARE_WANT_MASK (NFS4_SHARE_WANT_TYPE_MASK | NFS4_SHARE_WANT_MOD_MASK) + #define NFS4_CDFC4_FORE 0x1 #define NFS4_CDFC4_BACK 0x2 #define NFS4_CDFC4_BOTH 0x3 -- cgit v1.2.3 From 6ae30d6eb26bce02c48c60074b4306270e2434c1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2024 16:14:00 -0500 Subject: nfsd: add support for delegated timestamps Add support for the delegated timestamps on write delegations. This allows the server to proxy timestamps from the delegation holder to other clients that are doing GETATTRs vs. 
the same inode. When OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS bit is set in the OPEN call, set the dl_type to the *_ATTRS_DELEG flavor of delegation. Add timespec64 fields to nfs4_cb_fattr and decode the timestamps into those. Vet those timestamps according to the delstid spec and update the inode attrs if necessary. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/time64.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/time64.h b/include/linux/time64.h index f1bcea8c124a..9934331c7b86 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -49,6 +49,11 @@ static inline int timespec64_equal(const struct timespec64 *a, return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } +static inline bool timespec64_is_epoch(const struct timespec64 *ts) +{ + return ts->tv_sec == 0 && ts->tv_nsec == 0; +} + /* * lhs < rhs: return <0 * lhs == rhs: return 0 -- cgit v1.2.3 From ee0d90d4b97a9787ed55b22c85c72376329d86ac Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 10 Dec 2024 01:02:23 +0000 Subject: sunrpc: Remove unused xprt_iter_get_xprt xprt_iter_get_xprt() was added by commit 80b14d5e61ca ("SUNRPC: Add a structure to track multiple transports") but is unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xprtmultipath.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index c0514c684b2c..e411368cdacf 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -75,7 +75,6 @@ extern struct rpc_xprt_switch *xprt_iter_xchg_switch( struct rpc_xprt_switch *newswitch); extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi); -extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi); extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi); extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, -- cgit v1.2.3 From afc52b1eeb36f20eea321f50e338e38d00a8a61f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Tue, 10 Dec 2024 01:02:24 +0000 Subject: sunrpc: Remove gss_generic_token deadcode Commit ec596aaf9b48 ("SUNRPC: Remove code behind CONFIG_RPCSEC_GSS_KRB5_SIMPLIFIED") was the last user of the routines in gss_generic_token.c. Remove the routines and associated header. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/gss_asn1.h | 81 ----------------------------------------- include/linux/sunrpc/gss_krb5.h | 1 - 2 files changed, 82 deletions(-) delete mode 100644 include/linux/sunrpc/gss_asn1.h (limited to 'include') diff --git a/include/linux/sunrpc/gss_asn1.h b/include/linux/sunrpc/gss_asn1.h deleted file mode 100644 index 3ccecd0ad229..000000000000 --- a/include/linux/sunrpc/gss_asn1.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * linux/include/linux/sunrpc/gss_asn1.h - * - * minimal asn1 for generic encoding/decoding of gss tokens - * - * Adapted from MIT Kerberos 5-1.2.1 lib/include/krb5.h, - * lib/gssapi/krb5/gssapiP_krb5.h, and others - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson - */ - -/* - * Copyright 1995 by the Massachusetts Institute of Technology. - * All Rights Reserved. 
- * - * Export of this software from the United States of America may - * require a specific license from the United States Government. - * It is the responsibility of any person or organization contemplating - * export to obtain such a license before exporting. - * - * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and - * distribute this software and its documentation for any purpose and - * without fee is hereby granted, provided that the above copyright - * notice appear in all copies and that both that copyright notice and - * this permission notice appear in supporting documentation, and that - * the name of M.I.T. not be used in advertising or publicity pertaining - * to distribution of the software without specific, written prior - * permission. Furthermore if you modify this software you must label - * your software as modified software and not distribute it in such a - * fashion that it might be confused with the original M.I.T. software. - * M.I.T. makes no representations about the suitability of - * this software for any purpose. It is provided "as is" without express - * or implied warranty. - * - */ - - -#include - -#define SIZEOF_INT 4 - -/* from gssapi_err_generic.h */ -#define G_BAD_SERVICE_NAME (-2045022976L) -#define G_BAD_STRING_UID (-2045022975L) -#define G_NOUSER (-2045022974L) -#define G_VALIDATE_FAILED (-2045022973L) -#define G_BUFFER_ALLOC (-2045022972L) -#define G_BAD_MSG_CTX (-2045022971L) -#define G_WRONG_SIZE (-2045022970L) -#define G_BAD_USAGE (-2045022969L) -#define G_UNKNOWN_QOP (-2045022968L) -#define G_NO_HOSTNAME (-2045022967L) -#define G_BAD_HOSTNAME (-2045022966L) -#define G_WRONG_MECH (-2045022965L) -#define G_BAD_TOK_HEADER (-2045022964L) -#define G_BAD_DIRECTION (-2045022963L) -#define G_TOK_TRUNC (-2045022962L) -#define G_REFLECT (-2045022961L) -#define G_WRONG_TOKID (-2045022960L) - -#define g_OID_equal(o1,o2) \ - (((o1)->len == (o2)->len) && \ - (memcmp((o1)->data,(o2)->data,(int) (o1)->len) == 0)) - -u32 g_verify_token_header( - struct xdr_netobj *mech, - int *body_size, - unsigned char **buf_in, - int toksize); - -int g_token_size( - struct xdr_netobj *mech, - unsigned int body_size); - -void g_make_token_header( - struct xdr_netobj *mech, - int body_size, - unsigned char **buf); diff --git a/include/linux/sunrpc/gss_krb5.h b/include/linux/sunrpc/gss_krb5.h index 78a80bf3fdcb..43950b5237c8 100644 --- a/include/linux/sunrpc/gss_krb5.h +++ b/include/linux/sunrpc/gss_krb5.h @@ -40,7 +40,6 @@ #include #include #include -#include /* Length of constant used in key derivation */ #define GSS_KRB5_K5CLENGTH (5) -- cgit v1.2.3 From 188973e3536a18aebee3af486cd3d2ef82b09f88 Mon Sep 17 00:00:00 2001 From: Dongdong Zhang Date: Mon, 16 Dec 2024 09:35:36 +0800 Subject: PCI: Remove redundant PCI_VSEC_HDR and PCI_VSEC_HDR_LEN_SHIFT Remove duplicate macro PCI_VSEC_HDR and its related macro PCI_VSEC_HDR_LEN_SHIFT from pci_regs.h to avoid redundancy and inconsistencies. Update VFIO PCI code to use PCI_VNDR_HEADER and PCI_VNDR_HEADER_LEN() for consistent naming and functionality. These changes aim to streamline header handling while minimizing impact, given the niche usage of these macros in userspace. 
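For illustration only, and not part of the patch: a sketch of how a reader of the vendor-specific extended capability header moves from the removed macros to the retained PCI_VNDR_* ones. Here 'pdev' is the PCI device and 'pos' the capability offset, e.g. from pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_VNDR).

    u32 hdr, len;

    /* before: duplicate macro plus an open-coded shift */
    pci_read_config_dword(pdev, pos + PCI_VSEC_HDR, &hdr);
    len = hdr >> PCI_VSEC_HDR_LEN_SHIFT;

    /* after: the equivalent using the canonical names */
    pci_read_config_dword(pdev, pos + PCI_VNDR_HEADER, &hdr);
    len = PCI_VNDR_HEADER_LEN(hdr);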
Link: https://lore.kernel.org/r/20241216013536.4487-1-zhangdongdong@eswincomputing.com Signed-off-by: Dongdong Zhang Signed-off-by: Bjorn Helgaas Acked-by: Alex Williamson --- include/uapi/linux/pci_regs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 02d0ba2999e0..c2ccc85c43ed 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1001,9 +1001,6 @@ #define PCI_ACS_CTRL 0x06 /* ACS Control Register */ #define PCI_ACS_EGRESS_CTL_V 0x08 /* ACS Egress Control Vector */ -#define PCI_VSEC_HDR 4 /* extended cap - vendor-specific */ -#define PCI_VSEC_HDR_LEN_SHIFT 20 /* shift for length field */ - /* SATA capability */ #define PCI_SATA_REGS 4 /* SATA REGs specifier */ #define PCI_SATA_REGS_MASK 0xF /* location - BAR#/inline */ -- cgit v1.2.3 From 66611c0475709607f398e2a5d691b1fc72fe9dfc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Jan 2025 19:44:36 -0500 Subject: fgraph: Remove calltime and rettime from generic operations The function graph infrastructure is now generic so that kretprobes, fprobes and BPF can use it. But there is still some leftover logic that only the function graph tracer itself uses. This is the calculation of the calltime and return time of the functions. The calculation of the calltime has been moved into the function graph tracer and those users that need it so that it doesn't cause overhead to the other users. But the return function timestamp was still called. Instead of just moving the taking of the timestamp into the function graph trace remove the calltime and rettime completely from the ftrace_graph_ret structure. Instead, move it into the function graph return entry event structure and this also moves all the calltime and rettime logic out of the generic fgraph.c code and into the tracing code that uses it. This has been reported to decrease the overhead by ~27%. Link: https://lore.kernel.org/all/Z3aSuql3fnXMVMoM@krava/ Link: https://lore.kernel.org/all/173665959558.1629214.16724136597211810729.stgit@devnote2/ Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250121194436.15bdf71a@gandalf.local.home Reported-by: Jiri Olsa Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07092dfb21a4..fbabc3d848b3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1151,8 +1151,6 @@ struct ftrace_graph_ret { int depth; /* Number of functions that overran the depth limit for current task */ unsigned int overrun; - unsigned long long calltime; - unsigned long long rettime; } __packed; struct fgraph_ops; -- cgit v1.2.3 From f79b163c42314a1f46f4bcc40a19c8a75cf1e7a3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 22 Jan 2025 10:35:56 +0100 Subject: Revert "serial: 8250: Switch to nbcon console" This reverts commit b63e6f60eab45b16a1bf734fef9035a4c4187cd5. kernel test robot has found problems with this commit so revert it for now. 
Link: https://lore.kernel.org/r/202501221029.fb0d574d-lkp@intel.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202501221029.fb0d574d-lkp@intel.com Cc: John Ogness Cc: Petr Mladek Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 57875c37023a..144de7a7948d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -150,17 +150,8 @@ struct uart_8250_port { #define LSR_SAVE_FLAGS UART_LSR_BRK_ERROR_BITS u16 lsr_saved_flags; u16 lsr_save_mask; - - /* - * Track when a console line has been fully written to the - * hardware, i.e. true when the most recent byte written to - * UART_TX by the console was '\n'. - */ - bool console_line_ended; - #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; - struct irq_work modem_status_work; struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -211,8 +202,8 @@ void serial8250_tx_chars(struct uart_8250_port *up); unsigned int serial8250_modem_status(struct uart_8250_port *up); void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); -void serial8250_console_write(struct uart_8250_port *up, - struct nbcon_write_context *wctxt, bool in_atomic); +void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); -- cgit v1.2.3 From 8259cb14a70680553d5e82d65d1302fe589e9b39 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 15 Jan 2025 16:15:48 +0100 Subject: rv: Reset per-task monitors also for idle tasks RV per-task monitors are implemented through a monitor structure available for each task_struct. This structure is reset every time the monitor is (re-)started, to avoid inconsistencies if the monitor was activated previously. To do so, we reset the monitor on all threads using the macro for_each_process_thread. However, this macro excludes the idle tasks on each CPU. Idle tasks could be considered tasks on their own right and it should be up to the model whether to ignore them or not. Reset monitors also on the idle tasks for each present CPU whenever we reset all per-task monitors. 
Cc: stable@vger.kernel.org Cc: Juri Lelli Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: John Kacur Link: https://lore.kernel.org/20250115151547.605750-2-gmonaco@redhat.com Fixes: 792575348ff7 ("rv/include: Add deterministic automata monitor definition via C macros") Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- include/rv/da_monitor.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index 9705b2a98e49..510c88bfabd4 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_RV_REACTORS @@ -324,10 +325,13 @@ static inline struct da_monitor *da_get_monitor_##name(struct task_struct *tsk) static void da_monitor_reset_all_##name(void) \ { \ struct task_struct *g, *p; \ + int cpu; \ \ read_lock(&tasklist_lock); \ for_each_process_thread(g, p) \ da_monitor_reset_##name(da_get_monitor_##name(p)); \ + for_each_present_cpu(cpu) \ + da_monitor_reset_##name(da_get_monitor_##name(idle_task(cpu))); \ read_unlock(&tasklist_lock); \ } \ \ -- cgit v1.2.3 From fa3595523d72d13508befd28cf2ca642cafc69f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Jan 2025 20:00:57 -0700 Subject: io_uring: get rid of alloc cache init_once handling init_once is called when an object doesn't come from the cache, and hence needs initial clearing of certain members. While the whole struct could get cleared by memset() in that case, a few of the cache members are large enough that this may cause unnecessary overhead if the caches used aren't large enough to satisfy the workload. For those cases, some churn of kmalloc+kfree is to be expected. Ensure that the 3 users that need clearing put the members they need cleared at the start of the struct, and wrap the rest of the struct in a struct group so the offset is known. While at it, improve the interaction with KASAN such that when/if KASAN writes to members inside the struct that should be retained over caching, it won't trip over itself. For rw and net, the retaining of the iovec over caching is disabled if KASAN is enabled. A helper will free and clear those members in that case. 
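For illustration only, and not part of the patch: the general layout trick described above, with the members that must be zeroed on a fresh allocation placed first and the members retained across cache reuse wrapped in a struct_group(), so the clear size falls out of offsetof(). The struct and field names below are made up, not the actual io_uring ones.

    struct cached_obj {
            void *op_data;                  /* must be NULL on a freshly allocated object */
            struct_group(retained,          /* preserved when the object comes from the cache */
                    struct iovec iov;
                    int nr_segs;
            );
    };

    /* Only on a cache miss, clear just the members ahead of the group: */
    memset(obj, 0, offsetof(struct cached_obj, retained));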
Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- include/linux/io_uring_types.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index a3ce553413de..abd0c8bd950b 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -19,8 +19,8 @@ struct io_uring_cmd { }; struct io_uring_cmd_data { - struct io_uring_sqe sqes[2]; void *op_data; + struct io_uring_sqe sqes[2]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 623d8e798a11..3def525a1da3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -222,7 +222,8 @@ struct io_alloc_cache { void **entries; unsigned int nr_cached; unsigned int max_cached; - size_t elem_size; + unsigned int elem_size; + unsigned int init_clear; }; struct io_ring_ctx { -- cgit v1.2.3 From da6b353786997c0ffa67127355ad1d54ed3324c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 23 Jan 2025 18:27:07 +0100 Subject: pwm: Ensure callbacks exist before calling them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If one of the waveform functions is called for a chip that only supports .apply(), we want that an error code is returned and not a NULL pointer exception. Fixes: 6c5126c6406d ("pwm: Provide new consumer API functions for waveforms") Cc: stable@vger.kernel.org Signed-off-by: Uwe Kleine-König Tested-by: Trevor Gamblin Link: https://lore.kernel.org/r/20250123172709.391349-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 78827f312407..b8d78009e779 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -347,6 +347,23 @@ struct pwm_chip { struct pwm_device pwms[] __counted_by(npwm); }; +/** + * pwmchip_supports_waveform() - checks if the given chip supports waveform callbacks + * @chip: The pwm_chip to test + * + * Returns true iff the pwm chip support the waveform functions like + * pwm_set_waveform_might_sleep() and pwm_round_waveform_might_sleep() + */ +static inline bool pwmchip_supports_waveform(struct pwm_chip *chip) +{ + /* + * only check for .write_waveform(). If that is available, + * .round_waveform_tohw() and .round_waveform_fromhw() asserted to be + * available, too, in pwmchip_add(). + */ + return chip->ops->write_waveform != NULL; +} + static inline struct device *pwmchip_parent(const struct pwm_chip *chip) { return chip->dev.parent; -- cgit v1.2.3 From 4891cd3eba62ac611a7929948cf5588a1abed909 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 16 Jan 2025 17:43:29 +0200 Subject: PM: Revert "Add EXPORT macros for exporting PM functions" Revert commit 41a337b40e98 ("Add EXPORT macros for exporting PM functions") because the macros added by it are still unused almost two years after they had been introduced. Reported-by: Adrian Hunter Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250116154354.149297-1-andriy.shevchenko@linux.intel.com [ rjw: New changelog ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index e7f0260f15ad..0627a795892b 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -384,12 +384,8 @@ const struct dev_pm_ops name = { \ #ifdef CONFIG_PM #define _EXPORT_DEV_PM_OPS(name, license, ns) _EXPORT_PM_OPS(name, license, ns) -#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, "ns") #else #define _EXPORT_DEV_PM_OPS(name, license, ns) _DISCARD_PM_OPS(name, license, ns) -#define EXPORT_PM_FN_GPL(name) -#define EXPORT_PM_FN_NS_GPL(name, ns) #endif #ifdef CONFIG_PM_SLEEP -- cgit v1.2.3 From 24fe962c86f55347385933a1b06ca71b60854690 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Mon, 20 Jan 2025 02:28:59 +0100 Subject: fuse: {io-uring} Handle SQEs - register commands This adds basic support for ring SQEs (with opcode=IORING_OP_URING_CMD). For now only FUSE_IO_URING_CMD_REGISTER is handled to register queue entries. Signed-off-by: Bernd Schubert Reviewed-by: Pavel Begunkov # io_uring Reviewed-by: Luis Henriques Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index f1e99458e29e..5e0eb41d967e 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -220,6 +220,15 @@ * * 7.41 * - add FUSE_ALLOW_IDMAP + * 7.42 + * - Add FUSE_OVER_IO_URING and all other io-uring related flags and data + * structures: + * - struct fuse_uring_ent_in_out + * - struct fuse_uring_req_header + * - struct fuse_uring_cmd_req + * - FUSE_URING_IN_OUT_HEADER_SZ + * - FUSE_URING_OP_IN_OUT_SZ + * - enum fuse_uring_cmd */ #ifndef _LINUX_FUSE_H @@ -255,7 +264,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 41 +#define FUSE_KERNEL_MINOR_VERSION 42 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -425,6 +434,7 @@ struct fuse_file_lock { * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit * of the request ID indicates resend requests * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts + * FUSE_OVER_IO_URING: Indicate that client supports io-uring */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -471,6 +481,7 @@ struct fuse_file_lock { /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_ALLOW_IDMAP (1ULL << 40) +#define FUSE_OVER_IO_URING (1ULL << 41) /** * CUSE INIT request/reply flags @@ -1206,4 +1217,67 @@ struct fuse_supp_groups { uint32_t groups[]; }; +/** + * Size of the ring buffer header + */ +#define FUSE_URING_IN_OUT_HEADER_SZ 128 +#define FUSE_URING_OP_IN_OUT_SZ 128 + +/* Used as part of the fuse_uring_req_header */ +struct fuse_uring_ent_in_out { + uint64_t flags; + + /* + * commit ID to be used in a reply to a ring request (see also + * struct fuse_uring_cmd_req) + */ + uint64_t commit_id; + + /* size of user payload buffer */ + uint32_t payload_sz; + uint32_t padding; + + uint64_t reserved; +}; + +/** + * Header for all fuse-io-uring requests + */ +struct fuse_uring_req_header { + /* struct fuse_in_header / struct fuse_out_header */ + char in_out[FUSE_URING_IN_OUT_HEADER_SZ]; + + /* per op code header */ + char op_in[FUSE_URING_OP_IN_OUT_SZ]; + + struct 
fuse_uring_ent_in_out ring_ent_in_out; +}; + +/** + * sqe commands to the kernel + */ +enum fuse_uring_cmd { + FUSE_IO_URING_CMD_INVALID = 0, + + /* register the request buffer and fetch a fuse request */ + FUSE_IO_URING_CMD_REGISTER = 1, + + /* commit fuse request result and fetch next request */ + FUSE_IO_URING_CMD_COMMIT_AND_FETCH = 2, +}; + +/** + * In the 80B command area of the SQE. + */ +struct fuse_uring_cmd_req { + uint64_t flags; + + /* entry identifier for commits */ + uint64_t commit_id; + + /* queue the command is for (queue index) */ + uint16_t qid; + uint8_t padding[6]; +}; + #endif /* _LINUX_FUSE_H */ -- cgit v1.2.3 From 931656b9e2ff7029aee0b36e17780621948a6ac1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 23 Jan 2025 07:35:43 -0800 Subject: kvm: defer huge page recovery vhost task to later Some libraries want to ensure they are single threaded before forking, so making the kernel's kvm huge page recovery process a vhost task of the user process breaks those. The minijail library used by crosvm is one such affected application. Defer the task to after the first VM_RUN call, which occurs after the parent process has forked all its jailed processes. This needs to happen only once for the kvm instance, so introduce some general-purpose infrastructure for that, too. It's similar in concept to pthread_once; except it is actually usable, because the callback takes a parameter. Cc: Sean Christopherson Cc: Paolo Bonzini Tested-by: Alyssa Ross Signed-off-by: Keith Busch Message-ID: <20250123153543.2769928-1-kbusch@meta.com> [Move call_once API to include/linux. - Paolo] Cc: stable@vger.kernel.org Fixes: d96c77bd4eeb ("KVM: x86: switch hugepage recovery thread to vhost_task") Signed-off-by: Paolo Bonzini --- include/linux/call_once.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 include/linux/call_once.h (limited to 'include') diff --git a/include/linux/call_once.h b/include/linux/call_once.h new file mode 100644 index 000000000000..6261aa0b3fb0 --- /dev/null +++ b/include/linux/call_once.h @@ -0,0 +1,45 @@ +#ifndef _LINUX_CALL_ONCE_H +#define _LINUX_CALL_ONCE_H + +#include +#include + +#define ONCE_NOT_STARTED 0 +#define ONCE_RUNNING 1 +#define ONCE_COMPLETED 2 + +struct once { + atomic_t state; + struct mutex lock; +}; + +static inline void __once_init(struct once *once, const char *name, + struct lock_class_key *key) +{ + atomic_set(&once->state, ONCE_NOT_STARTED); + __mutex_init(&once->lock, name, key); +} + +#define once_init(once) \ +do { \ + static struct lock_class_key __key; \ + __once_init((once), #once, &__key); \ +} while (0) + +static inline void call_once(struct once *once, void (*cb)(struct once *)) +{ + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return; + + guard(mutex)(&once->lock); + WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); + if (atomic_read(&once->state) != ONCE_NOT_STARTED) + return; + + atomic_set(&once->state, ONCE_RUNNING); + cb(once); + atomic_set_release(&once->state, ONCE_COMPLETED); +} + +#endif /* _LINUX_CALL_ONCE_H */ -- cgit v1.2.3 From 71ee9b16251ea4bf7c1fe222517c82bdb3220acc Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:11:24 +0000 Subject: minmax.h: add whitespace around operators and after commas Patch series "minmax.h: Cleanups and minor optimisations". Some tidyups and minor changes to minmax.h. 
This patch (of 7): Link: https://lkml.kernel.org/r/c50365d214e04f9ba256d417c8bebbc0@AcuMS.aculab.com Link: https://lkml.kernel.org/r/f04b2e1310244f62826267346fde0553@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 98008dd92153..51b0d988e322 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -51,10 +51,10 @@ * only need to be careful to not cause warnings for * pointer use. */ -#define __signed_type_use(x,ux) (2+__is_nonneg(x,ux)) -#define __unsigned_type_use(x,ux) (1+2*(sizeof(ux)<4)) -#define __sign_use(x,ux) (is_signed_type(typeof(ux))? \ - __signed_type_use(x,ux):__unsigned_type_use(x,ux)) +#define __signed_type_use(x, ux) (2 + __is_nonneg(x, ux)) +#define __unsigned_type_use(x, ux) (1 + 2 * (sizeof(ux) < 4)) +#define __sign_use(x, ux) (is_signed_type(typeof(ux)) ? \ + __signed_type_use(x, ux) : __unsigned_type_use(x, ux)) /* * To avoid warnings about casting pointers to integers @@ -74,15 +74,15 @@ #ifdef CONFIG_64BIT #define __signed_type(ux) long #else - #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux)>4,1LL,1L)) + #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)) #endif -#define __is_nonneg(x,ux) statically_true((__signed_type(ux))(x)>=0) +#define __is_nonneg(x, ux) statically_true((__signed_type(ux))(x) >= 0) -#define __types_ok(x,y,ux,uy) \ - (__sign_use(x,ux) & __sign_use(y,uy)) +#define __types_ok(x, y, ux, uy) \ + (__sign_use(x, ux) & __sign_use(y, uy)) -#define __types_ok3(x,y,z,ux,uy,uz) \ - (__sign_use(x,ux) & __sign_use(y,uy) & __sign_use(z,uz)) +#define __types_ok3(x, y, z, ux, uy, uz) \ + (__sign_use(x, ux) & __sign_use(y, uy) & __sign_use(z, uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -97,7 +97,7 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - BUILD_BUG_ON_MSG(!__types_ok(x,y,ux,uy), \ + BUILD_BUG_ON_MSG(!__types_ok(x, y, ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) @@ -114,7 +114,7 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(val,lo,hi,uval,ulo,uhi), \ + BUILD_BUG_ON_MSG(!__types_ok3(val, lo, hi, uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) @@ -154,7 +154,7 @@ #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ - BUILD_BUG_ON_MSG(!__types_ok3(x,y,z,ux,uy,uz), \ + BUILD_BUG_ON_MSG(!__types_ok3(x, y, z, ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) @@ -326,9 +326,9 @@ static inline bool in_range32(u32 val, u32 start, u32 len) * Use these carefully: no type checking, and uses the arguments * multiple times. Use for obvious constants only. 
*/ -#define MIN(a,b) __cmp(min,a,b) -#define MAX(a,b) __cmp(max,a,b) -#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b)) -#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b)) +#define MIN(a, b) __cmp(min, a, b) +#define MAX(a, b) __cmp(max, a, b) +#define MIN_T(type, a, b) __cmp(min, (type)(a), (type)(b)) +#define MAX_T(type, a, b) __cmp(max, (type)(a), (type)(b)) #endif /* _LINUX_MINMAX_H */ -- cgit v1.2.3 From 10666e99204818ef45c702469488353b5bb09ec7 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:12:07 +0000 Subject: minmax.h: update some comments - Change three to several. - Remove the comment about retaining constant expressions, no longer true. - Realign to nearer 80 columns and break on major punctiation. - Add a leading comment to the block before __signed_type() and __is_nonneg() Otherwise the block explaining the cast is a bit 'floating'. Reword the rest of that comment to improve readability. Link: https://lkml.kernel.org/r/85b050c81c1d4076aeb91a6cded45fee@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 53 +++++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 51b0d988e322..24e4b372649a 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -8,13 +8,10 @@ #include /* - * min()/max()/clamp() macros must accomplish three things: + * min()/max()/clamp() macros must accomplish several things: * * - Avoid multiple evaluations of the arguments (so side-effects like * "x++" happen only once) when non-constant. - * - Retain result as a constant expressions when called with only - * constant expressions (to avoid tripping VLA warnings in stack - * allocation usage). * - Perform signed v unsigned type-checking (to generate compile * errors instead of nasty runtime surprises). * - Unsigned char/short are always promoted to signed int and can be @@ -31,25 +28,23 @@ * bit #0 set if ok for unsigned comparisons * bit #1 set if ok for signed comparisons * - * In particular, statically non-negative signed integer - * expressions are ok for both. + * In particular, statically non-negative signed integer expressions + * are ok for both. * - * NOTE! Unsigned types smaller than 'int' are implicitly - * converted to 'int' in expressions, and are accepted for - * signed conversions for now. This is debatable. + * NOTE! Unsigned types smaller than 'int' are implicitly converted to 'int' + * in expressions, and are accepted for signed conversions for now. + * This is debatable. * - * Note that 'x' is the original expression, and 'ux' is - * the unique variable that contains the value. + * Note that 'x' is the original expression, and 'ux' is the unique variable + * that contains the value. * - * We use 'ux' for pure type checking, and 'x' for when - * we need to look at the value (but without evaluating - * it for side effects! Careful to only ever evaluate it - * with sizeof() or __builtin_constant_p() etc). + * We use 'ux' for pure type checking, and 'x' for when we need to look at the + * value (but without evaluating it for side effects! + * Careful to only ever evaluate it with sizeof() or __builtin_constant_p() etc). 
* - * Pointers end up being checked by the normal C type - * rules at the actual comparison, and these expressions - * only need to be careful to not cause warnings for - * pointer use. + * Pointers end up being checked by the normal C type rules at the actual + * comparison, and these expressions only need to be careful to not cause + * warnings for pointer use. */ #define __signed_type_use(x, ux) (2 + __is_nonneg(x, ux)) #define __unsigned_type_use(x, ux) (1 + 2 * (sizeof(ux) < 4)) @@ -57,19 +52,19 @@ __signed_type_use(x, ux) : __unsigned_type_use(x, ux)) /* - * To avoid warnings about casting pointers to integers - * of different sizes, we need that special sign type. + * Check whether a signed value is always non-negative. * - * On 64-bit we can just always use 'long', since any - * integer or pointer type can just be cast to that. + * A cast is needed to avoid any warnings from values that aren't signed + * integer types (in which case the result doesn't matter). * - * This does not work for 128-bit signed integers since - * the cast would truncate them, but we do not use s128 - * types in the kernel (we do use 'u128', but they will - * be handled by the !is_signed_type() case). + * On 64-bit any integer or pointer type can safely be cast to 'long'. + * But on 32-bit we need to avoid warnings about casting pointers to integers + * of different sizes without truncating 64-bit values so 'long' or 'long long' + * must be used depending on the size of the value. * - * NOTE! The cast is there only to avoid any warnings - * from when values that aren't signed integer types. + * This does not work for 128-bit signed integers since the cast would truncate + * them, but we do not use s128 types in the kernel (we do use 'u128', + * but they are handled by the !is_signed_type() case). */ #ifdef CONFIG_64BIT #define __signed_type(ux) long -- cgit v1.2.3 From b280bb27a9f7c91ddab730e1ad91a9c18a051f41 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:12:50 +0000 Subject: minmax.h: reduce the #define expansion of min(), max() and clamp() Since the test for signed values being non-negative only relies on __builtion_constant_p() (not is_constexpr()) it can use the 'ux' variable instead of the caller supplied expression. This means that the #define parameters are only expanded twice. Once in the code and once quoted in the error message. Link: https://lkml.kernel.org/r/051afc171806425da991908ed8688a98@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 24e4b372649a..6f7ea669d305 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -46,10 +46,10 @@ * comparison, and these expressions only need to be careful to not cause * warnings for pointer use. */ -#define __signed_type_use(x, ux) (2 + __is_nonneg(x, ux)) -#define __unsigned_type_use(x, ux) (1 + 2 * (sizeof(ux) < 4)) -#define __sign_use(x, ux) (is_signed_type(typeof(ux)) ? \ - __signed_type_use(x, ux) : __unsigned_type_use(x, ux)) +#define __signed_type_use(ux) (2 + __is_nonneg(ux)) +#define __unsigned_type_use(ux) (1 + 2 * (sizeof(ux) < 4)) +#define __sign_use(ux) (is_signed_type(typeof(ux)) ? 
\ + __signed_type_use(ux) : __unsigned_type_use(ux)) /* * Check whether a signed value is always non-negative. @@ -71,13 +71,13 @@ #else #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)) #endif -#define __is_nonneg(x, ux) statically_true((__signed_type(ux))(x) >= 0) +#define __is_nonneg(ux) statically_true((__signed_type(ux))(ux) >= 0) -#define __types_ok(x, y, ux, uy) \ - (__sign_use(x, ux) & __sign_use(y, uy)) +#define __types_ok(ux, uy) \ + (__sign_use(ux) & __sign_use(uy)) -#define __types_ok3(x, y, z, ux, uy, uz) \ - (__sign_use(x, ux) & __sign_use(y, uy) & __sign_use(z, uz)) +#define __types_ok3(ux, uy, uz) \ + (__sign_use(ux) & __sign_use(uy) & __sign_use(uz)) #define __cmp_op_min < #define __cmp_op_max > @@ -92,7 +92,7 @@ #define __careful_cmp_once(op, x, y, ux, uy) ({ \ __auto_type ux = (x); __auto_type uy = (y); \ - BUILD_BUG_ON_MSG(!__types_ok(x, y, ux, uy), \ + BUILD_BUG_ON_MSG(!__types_ok(ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) @@ -109,7 +109,7 @@ static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ (lo) <= (hi), true), \ "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(val, lo, hi, uval, ulo, uhi), \ + BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) @@ -149,7 +149,7 @@ #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ - BUILD_BUG_ON_MSG(!__types_ok3(x, y, z, ux, uy, uz), \ + BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) -- cgit v1.2.3 From a5743f32baec4728711bbc01d6ac2b33d4c67040 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:13:31 +0000 Subject: minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp() Use BUILD_BUG_ON_MSG(statically_true(ulo > uhi), ...) for the sanity check of the bounds in clamp(). Gives better error coverage and one less expansion of the arguments. Link: https://lkml.kernel.org/r/34d53778977747f19cce2abb287bb3e6@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 6f7ea669d305..91aa1b90c1bb 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -106,8 +106,7 @@ __auto_type uval = (val); \ __auto_type ulo = (lo); \ __auto_type uhi = (hi); \ - static_assert(__builtin_choose_expr(__is_constexpr((lo) > (hi)), \ - (lo) <= (hi), true), \ + BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ "clamp() low limit " #lo " greater than high limit " #hi); \ BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ -- cgit v1.2.3 From c3939872ee4a6b8bdcd0e813c66823b31e6e26f7 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:14:19 +0000 Subject: minmax.h: move all the clamp() definitions after the min/max() ones At some point the definitions for clamp() got added in the middle of the ones for min() and max(). Re-order the definitions so they are more sensibly grouped. 
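For reference, the signedness checking that these min()/max()/clamp() macros enforce (and which the reordering leaves untouched) behaves roughly as follows. This is an illustrative sketch, not code from any of the patches; 'bytes' and 'delta' are made-up locals:

  static void minmax_usage_sketch(void)
  {
          size_t bytes = 32;
          int delta = -4;

          size_t a = min(bytes, 16U);    /* ok: both sides are unsigned */
          int b = min(delta, -3);        /* ok: both sides are signed int */
          size_t c = min(bytes, 200);    /* ok: 200 is statically non-negative */
          size_t d = min(bytes, delta);  /* rejected at build time:
                                          * "min(bytes, delta) signedness error" */
          int e = clamp(delta, 0, bytes);/* rejected: signed value, unsigned limit */
  }

The last two lines are the cases __types_ok()/__types_ok3() exist to catch: mixing a possibly-negative signed expression with an unsigned one turns the BUILD_BUG_ON_MSG() check into a compile error instead of a silent wrap-around at run time.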
Link: https://lkml.kernel.org/r/8bb285818e4846469121c8abc3dfb6e2@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 109 +++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 91aa1b90c1bb..75fb7a6ad4c6 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -99,22 +99,6 @@ #define __careful_cmp(op, x, y) \ __careful_cmp_once(op, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) -#define __clamp(val, lo, hi) \ - ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) - -#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ - __auto_type uval = (val); \ - __auto_type ulo = (lo); \ - __auto_type uhi = (hi); \ - BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ - "clamp() low limit " #lo " greater than high limit " #hi); \ - BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ - "clamp("#val", "#lo", "#hi") signedness error"); \ - __clamp(uval, ulo, uhi); }) - -#define __careful_clamp(val, lo, hi) \ - __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) - /** * min - return minimum of two values of the same or compatible types * @x: first value @@ -170,6 +154,22 @@ #define max3(x, y, z) \ __careful_op3(max, x, y, z, __UNIQUE_ID(x_), __UNIQUE_ID(y_), __UNIQUE_ID(z_)) +/** + * min_t - return minimum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define min_t(type, x, y) __cmp_once(min, type, x, y) + +/** + * max_t - return maximum of two values, using the specified type + * @type: data type to use + * @x: first value + * @y: second value + */ +#define max_t(type, x, y) __cmp_once(max, type, x, y) + /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero * @x: value1 @@ -180,6 +180,22 @@ typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); }) +#define __clamp(val, lo, hi) \ + ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) + +#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ + __auto_type uval = (val); \ + __auto_type ulo = (lo); \ + __auto_type uhi = (hi); \ + BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ + "clamp() low limit " #lo " greater than high limit " #hi); \ + BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ + "clamp("#val", "#lo", "#hi") signedness error"); \ + __clamp(uval, ulo, uhi); }) + +#define __careful_clamp(val, lo, hi) \ + __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) + /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value @@ -191,28 +207,30 @@ */ #define clamp(val, lo, hi) __careful_clamp(val, lo, hi) -/* - * ..and if you can't take the strict - * types, you can specify one yourself. - * - * Or not use min/max/clamp at all, of course. 
- */ - /** - * min_t - return minimum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value + * clamp_t - return a value clamped to a given range using a given type + * @type: the type of variable to use + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of type + * @type to make all the comparisons. */ -#define min_t(type, x, y) __cmp_once(min, type, x, y) +#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) /** - * max_t - return maximum of two values, using the specified type - * @type: data type to use - * @x: first value - * @y: second value + * clamp_val - return a value clamped to a given range using val's type + * @val: current value + * @lo: minimum allowable value + * @hi: maximum allowable value + * + * This macro does no typechecking and uses temporary variables of whatever + * type the input argument @val is. This is useful when @val is an unsigned + * type and @lo and @hi are literals that will otherwise be assigned a signed + * integer type. */ -#define max_t(type, x, y) __cmp_once(max, type, x, y) +#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) /* * Do not check the array parameter using __must_be_array(). @@ -257,31 +275,6 @@ */ #define max_array(array, len) __minmax_array(max, array, len) -/** - * clamp_t - return a value clamped to a given range using a given type - * @type: the type of variable to use - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of type - * @type to make all the comparisons. - */ -#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) - -/** - * clamp_val - return a value clamped to a given range using val's type - * @val: current value - * @lo: minimum allowable value - * @hi: maximum allowable value - * - * This macro does no typechecking and uses temporary variables of whatever - * type the input argument @val is. This is useful when @val is an unsigned - * type and @lo and @hi are literals that will otherwise be assigned a signed - * integer type. - */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) - static inline bool in_range64(u64 val, u64 start, u64 len) { return (val - start) < len; -- cgit v1.2.3 From 495bba17cdf95e9703af1b8ef773c55ef0dfe703 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:15:05 +0000 Subject: minmax.h: simplify the variants of clamp() Always pass a 'type' through to __clamp_once(), pass '__auto_type' from clamp() itself. The expansion of __types_ok3() is reasonable so it isn't worth the added complexity of avoiding it when a fixed type is used for all three values. Link: https://lkml.kernel.org/r/8f69f4deac014f558bab186444bac2e8@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. 
Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 75fb7a6ad4c6..2bbdd5b5e07e 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -183,29 +183,29 @@ #define __clamp(val, lo, hi) \ ((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val))) -#define __clamp_once(val, lo, hi, uval, ulo, uhi) ({ \ - __auto_type uval = (val); \ - __auto_type ulo = (lo); \ - __auto_type uhi = (hi); \ +#define __clamp_once(type, val, lo, hi, uval, ulo, uhi) ({ \ + type uval = (val); \ + type ulo = (lo); \ + type uhi = (hi); \ BUILD_BUG_ON_MSG(statically_true(ulo > uhi), \ "clamp() low limit " #lo " greater than high limit " #hi); \ BUILD_BUG_ON_MSG(!__types_ok3(uval, ulo, uhi), \ "clamp("#val", "#lo", "#hi") signedness error"); \ __clamp(uval, ulo, uhi); }) -#define __careful_clamp(val, lo, hi) \ - __clamp_once(val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) +#define __careful_clamp(type, val, lo, hi) \ + __clamp_once(type, val, lo, hi, __UNIQUE_ID(v_), __UNIQUE_ID(l_), __UNIQUE_ID(h_)) /** - * clamp - return a value clamped to a given range with strict typechecking + * clamp - return a value clamped to a given range with typechecking * @val: current value * @lo: lowest allowable value * @hi: highest allowable value * - * This macro does strict typechecking of @lo/@hi to make sure they are of the - * same type as @val. See the unnecessary pointer comparisons. + * This macro checks @val/@lo/@hi to make sure they have compatible + * signedness. */ -#define clamp(val, lo, hi) __careful_clamp(val, lo, hi) +#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi) /** * clamp_t - return a value clamped to a given range using a given type @@ -217,7 +217,7 @@ * This macro does no typechecking and uses temporary variables of type * @type to make all the comparisons. */ -#define clamp_t(type, val, lo, hi) __careful_clamp((type)(val), (type)(lo), (type)(hi)) +#define clamp_t(type, val, lo, hi) __careful_clamp(type, val, lo, hi) /** * clamp_val - return a value clamped to a given range using val's type @@ -230,7 +230,7 @@ * type and @lo and @hi are literals that will otherwise be assigned a signed * integer type. */ -#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) +#define clamp_val(val, lo, hi) __careful_clamp(typeof(val), val, lo, hi) /* * Do not check the array parameter using __must_be_array(). -- cgit v1.2.3 From 2b97aaf74ed534fb838d09867d09a3ca5d795208 Mon Sep 17 00:00:00 2001 From: David Laight Date: Mon, 18 Nov 2024 19:15:51 +0000 Subject: minmax.h: remove some #defines that are only expanded once The bodies of __signed_type_use() and __unsigned_type_use() are much the same size as their names - so put the bodies in the only line that expands them. Similarly __signed_type() is defined separately for 64bit and then used exactly once just below. Change the test for __signed_type from CONFIG_64BIT to one based on gcc defined macros so that the code is valid if it gets used outside of a kernel build. Link: https://lkml.kernel.org/r/9386d1ebb8974fbabbed2635160c3975@AcuMS.aculab.com Signed-off-by: David Laight Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dan Carpenter Cc: Jason A. 
Donenfeld Cc: Jens Axboe Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/minmax.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 2bbdd5b5e07e..eaaf5c008e4d 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -46,10 +46,8 @@ * comparison, and these expressions only need to be careful to not cause * warnings for pointer use. */ -#define __signed_type_use(ux) (2 + __is_nonneg(ux)) -#define __unsigned_type_use(ux) (1 + 2 * (sizeof(ux) < 4)) #define __sign_use(ux) (is_signed_type(typeof(ux)) ? \ - __signed_type_use(ux) : __unsigned_type_use(ux)) + (2 + __is_nonneg(ux)) : (1 + 2 * (sizeof(ux) < 4))) /* * Check whether a signed value is always non-negative. @@ -57,7 +55,7 @@ * A cast is needed to avoid any warnings from values that aren't signed * integer types (in which case the result doesn't matter). * - * On 64-bit any integer or pointer type can safely be cast to 'long'. + * On 64-bit any integer or pointer type can safely be cast to 'long long'. * But on 32-bit we need to avoid warnings about casting pointers to integers * of different sizes without truncating 64-bit values so 'long' or 'long long' * must be used depending on the size of the value. @@ -66,12 +64,12 @@ * them, but we do not use s128 types in the kernel (we do use 'u128', * but they are handled by the !is_signed_type() case). */ -#ifdef CONFIG_64BIT - #define __signed_type(ux) long +#if __SIZEOF_POINTER__ == __SIZEOF_LONG_LONG__ +#define __is_nonneg(ux) statically_true((long long)(ux) >= 0) #else - #define __signed_type(ux) typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)) +#define __is_nonneg(ux) statically_true( \ + (typeof(__builtin_choose_expr(sizeof(ux) > 4, 1LL, 1L)))(ux) >= 0) #endif -#define __is_nonneg(ux) statically_true((__signed_type(ux))(ux) >= 0) #define __types_ok(ux, uy) \ (__sign_use(ux) & __sign_use(uy)) -- cgit v1.2.3 From f0ef073e213a3a1a494b6f55a76ce1242600a453 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 14 Jan 2025 21:04:54 +0800 Subject: include/linux/lz4.h: add some missing macros Currently, LZ4_DISTANCE_MAX and LZ4_DECOMPRESS_INPLACE_MARGIN are defined in the erofs subsystem for LZ4 in-place decompression, which is somewhat unsuitable since they should belong to the LZ4 itself and may change with future LZ4 codebase updates. Move them to include/linux/lz4.h to match the upstream LZ4 library [1]. No logic changes. 
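These macros exist to support in-place decompression: the compressed input is staged at the tail of a single buffer whose size includes LZ4_DECOMPRESS_INPLACE_MARGIN(), so the decompressor can write from the front without overrunning the not-yet-consumed input. A rough sizing sketch follows (illustrative only; 'compressed_size' and 'decompressed_size' are assumed to be known, and the real erofs call sites are more involved):

  /* Hypothetical in-place decompression buffer layout. */
  size_t margin   = LZ4_DECOMPRESS_INPLACE_MARGIN(compressed_size);
  size_t buf_size = decompressed_size + margin;
  char *buf = kmalloc(buf_size, GFP_KERNEL);
  char *src = buf + buf_size - compressed_size;  /* compressed bytes staged at the tail */
  /* copy the compressed data to 'src', then decompress from 'src' into 'buf' */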
[1] https://github.com/lz4/lz4/blob/v1.10.0/lib/lz4.h#L670 Link: https://lkml.kernel.org/r/20250114130454.1191150-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang Cc: Yann Collet Cc: Nick Terrell Cc: Chao Yu Cc: Yue Hu Cc; Jeffle Xu Cc: Sandeep Dhavale Signed-off-by: Andrew Morton --- include/linux/lz4.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index b16e15b9587a..ad6042a718b5 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -645,4 +645,10 @@ int LZ4_decompress_safe_usingDict(const char *source, char *dest, int LZ4_decompress_fast_usingDict(const char *source, char *dest, int originalSize, const char *dictStart, int dictSize); +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + #endif -- cgit v1.2.3 From 6beaa75cd24d660e7913c60aff702ec809ff9b28 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 12 Jan 2025 01:20:49 +0000 Subject: kdb: Remove unused flags stack kdb_restore_flags() and kdb_save_flags() were added in 2010 by commit 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)") but have remained unused. Remove them, and their associated storage. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20250112012049.319515-1-linux@treblig.org Signed-off-by: Daniel Thompson (RISCstar) --- include/linux/kdb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index f6c2ddb16b95..905a2e2f45f6 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -140,9 +140,6 @@ extern const char *kdb_diemsg; extern unsigned int kdb_flags; /* Global flags, see kdb_state for per cpu state */ -extern void kdb_save_flags(void); -extern void kdb_restore_flags(void); - #define KDB_FLAG(flag) (kdb_flags & KDB_FLAG_##flag) #define KDB_FLAG_SET(flag) ((void)(kdb_flags |= KDB_FLAG_##flag)) #define KDB_FLAG_CLEAR(flag) ((void)(kdb_flags &= ~KDB_FLAG_##flag)) -- cgit v1.2.3 From 40733e7e0c260d540447d3646e451274bc5d3374 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Dec 2024 19:46:31 +0800 Subject: mm/swap_cgroup: remove swap_cgroup_cmpxchg This function is never used after commit 6b611388b626 ("memcg-v1: remove charge move code"). 
Link: https://lkml.kernel.org/r/20241218114633.85196-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Yosry Ahmed Reviewed-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: Chris Li Cc: Barry Song Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include')
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index ae73a87775b3..d521ad1c4164 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,8 +6,6 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -- cgit v1.2.3
From 6769183166b33b1a5de8f938d1ff4d5f4be0f428 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 18 Dec 2024 19:46:33 +0800 Subject: mm/swap_cgroup: decouple swap cgroup recording and clearing The current implementation of swap cgroup tracking is a bit complex and fragile: On the charging path, swap_cgroup_record always records an actual memcg id, and it depends on the caller to make sure all entries passed in belong to one single folio. As folios are always charged or uncharged as a whole, and always charged and uncharged in order, swap_cgroup doesn't need an extra lock. On the uncharging path, swap_cgroup_record always sets the record to zero. These entries won't be charged again until uncharging is done. So there is no extra lock needed either. Worth noting that swap cgroup clearing may happen without a folio involved, e.g. exiting processes will zap their page tables without swapin. The xchg/cmpxchg provides atomic operations and barriers to ensure no tearing or synchronization issue of these swap cgroup records. It works, but it is quite error-prone. Things can be made much clearer and more robust by decoupling recording and clearing into two helpers.
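The intended split, shown as an illustrative sketch of the two call sites (the real callers in the charge/uncharge paths differ in detail; 'folio', 'entry', 'nr_pages' and 'old_id' are placeholders):

  /* Charge side: the folio being swapped out is at hand. */
  swap_cgroup_record(folio, entry);

  /* Uncharge/zap side: no folio needed, just clear the range of entries. */
  old_id = swap_cgroup_clear(entry, nr_pages);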
Recording takes the actual folio being charged as its argument, clearing always sets the record to zero, and the debug sanity checks are refined to better reflect their usage. Benchmarks even showed a very slight improvement as it saved some extra arguments and lookups: make -j96 with defconfig on tmpfs in 1.5G memory cgroup using 4k folios: Before: sys 9617.23 (stdev 37.764062) After : sys 9541.54 (stdev 42.973976) make -j96 with defconfig on tmpfs in 2G memory cgroup using 64k folios: Before: sys 7358.98 (stdev 54.927593) After : sys 7337.82 (stdev 39.398956)
Link: https://lkml.kernel.org/r/20241218114633.85196-5-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Barry Song Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap_cgroup.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include')
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index d521ad1c4164..b5ec038069da 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -6,8 +6,8 @@ #if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents); +extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent); +extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -15,8 +15,12 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, - unsigned int nr_ents) +void swap_cgroup_record(struct folio *folio, swp_entry_t ent) +{ +} + +static inline +unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents) { return 0; } -- cgit v1.2.3
From 04f13d241b8b146b23038bffd907cb8278391d07 Mon Sep 17 00:00:00 2001 From: yangge Date: Sat, 11 Jan 2025 15:58:20 +0800 Subject: mm: replace free hugepage folios after migration My machine has 4 NUMA nodes, each equipped with 32GB of memory. I have configured each NUMA node with 16GB of CMA and 16GB of in-use hugetlb pages. The allocation of contiguous memory via cma_alloc() can fail probabilistically. When there are free hugetlb folios in the hugetlb pool, during the migration of in-use hugetlb folios, new folios are allocated from the free hugetlb pool. After the migration is completed, the old folios are released back to the free hugetlb pool instead of being returned to the buddy system. This can cause the test_pages_isolated() check to fail, ultimately leading to the failure of cma_alloc(). Call trace: cma_alloc() __alloc_contig_migrate_range() // migrate in-use hugepage test_pages_isolated() __test_page_isolated_in_pageblock() PageBuddy(page) // check if the page is in buddy To address this issue, we introduce a function named replace_free_hugepage_folios(). This function will replace the hugepage in the free hugepage pool with a new one and release the old one to the buddy system. After the migration of in-use hugetlb pages is completed, we will invoke replace_free_hugepage_folios() to ensure that these hugepages are properly released to the buddy system. Following this step, when test_pages_isolated() is executed for inspection, it will successfully pass.
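In terms of the call trace above, the new helper slots in roughly as follows; this is only a sketch of the intended sequence, not the literal page allocator code, and migrate_range() is a made-up placeholder for the existing migration step:

  int ret = migrate_range(start_pfn, end_pfn);       /* hypothetical: migrate in-use hugetlb folios */
  if (!ret)
          ret = replace_free_hugepage_folios(start_pfn, end_pfn);
  /* test_pages_isolated() can now find the freed pages in the buddy allocator */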
Additionally, when alloc_contig_range() is used to migrate multiple in-use hugetlb pages, it can result in some in-use hugetlb pages being released back to the free hugetlb pool and subsequently being reallocated and used again. For example: [huge 0] [huge 1] To migrate huge 0, we obtain huge x from the pool. After the migration is completed, we return the now-freed huge 0 back to the pool. When it's time to migrate huge 1, we can simply reuse the now-freed huge 0 from the pool. As a result, when replace_free_hugepage_folios() is executed, it cannot release huge 0 back to the buddy system. To address this issue, we should prevent the reuse of isolated free hugepages during the migration process. Link: https://lkml.kernel.org/r/1734503588-16254-1-git-send-email-yangge1116@126.com Link: https://lkml.kernel.org/r/1736582300-11364-1-git-send-email-yangge1116@126.com Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ae4fe8615bb6..10faf42ca96a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -681,6 +681,7 @@ struct huge_bootmem_page { }; int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, @@ -1059,6 +1060,12 @@ static inline int isolate_or_dissolve_huge_page(struct page *page, return -ENOMEM; } +static inline int replace_free_hugepage_folios(unsigned long start_pfn, + unsigned long end_pfn) +{ + return 0; +} + static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) -- cgit v1.2.3 From 44d46b76c3a4b514a0cc9dab147ed430e5c1d699 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 20 Dec 2024 16:07:09 -0500 Subject: mm: add build-time option for hotplug memory default online type Memory hotplug presently auto-onlines memory into a zone the kernel deems appropriate if CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y. The memhp_default_state boot param enables runtime config, but it's not possible to do this at build-time. Remove CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE, and replace it with CONFIG_MHP_DEFAULT_ONLINE_TYPE_* choices that sync with the boot param. Selections: CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE => mhp_default_online_type = "offline" Memory will not be onlined automatically. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO => mhp_default_online_type = "online" Memory will be onlined automatically in a zone deemed. appropriate by the kernel. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_KERNEL => mhp_default_online_type = "online_kernel" Memory will be onlined automatically. The zone may allow kernel data (e.g. ZONE_NORMAL). CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE => mhp_default_online_type = "online_movable" Memory will be onlined automatically. The zone will be ZONE_MOVABLE. Default to CONFIG_MHP_DEFAULT_ONLINE_TYPE_OFFLINE to match the existing default CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=n behavior. Existing users of CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y should use CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_AUTO. 
[gourry@gourry.net: update KConfig comments] Link: https://lkml.kernel.org/r/20241226182918.648799-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20241220210709.300066-1-gourry@gourry.net Signed-off-by: Gregory Price Acked-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huacai Chen Cc: Jonathan Corbet Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b27ddce5d324..eaac5ae8c05c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -144,8 +144,6 @@ extern u64 max_mem_size; extern int mhp_online_type_from_str(const char *str); -/* Default online_type (MMOP_*) when new memory blocks are added. */ -extern int mhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) @@ -303,6 +301,9 @@ static inline void __remove_memory(u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_MEMORY_HOTPLUG +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int mhp_get_default_online_type(void); +extern void mhp_set_default_online_type(int online_type); extern void __ref free_area_init_core_hotplug(struct pglist_data *pgdat); extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags); -- cgit v1.2.3 From 98a7e47faa3ec38260b851a1c5823cbd45d5a229 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 8 Jan 2025 14:57:19 +0800 Subject: asm-generic: pgalloc: provide generic p4d_{alloc_one,free} Four architectures currently implement 5-level pgtables: arm64, riscv, x86 and s390. The first three have essentially the same implementation for p4d_alloc_one() and p4d_free(), so we've got an opportunity to reduce duplication like at the lower levels. Provide a generic version of p4d_alloc_one() and p4d_free(), and make use of it on those architectures. Their implementation is the same as at PUD level, except that p4d_free() performs a runtime check by calling mm_p4d_folded(). 5-level pgtables depend on a runtime-detected hardware feature on all supported architectures, so we might as well include this check in the generic implementation. No runtime check is required in p4d_alloc_one() as the top-level p4d_alloc() already does the required check. 
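An architecture that still needs extra setup at this level can keep its own hook and reuse the generic allocator underneath. A hedged sketch of what such an override in an arch asm/pgalloc.h could look like, mirroring the guard pattern this patch adds (arch_init_p4d() is a made-up placeholder, not a real kernel function):

  #define __HAVE_ARCH_P4D_ALLOC_ONE
  static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
  {
          p4d_t *p4d = __p4d_alloc_one_noprof(mm, addr);

          if (p4d)
                  arch_init_p4d(p4d);    /* placeholder for arch-specific setup */
          return p4d;
  }
  #define p4d_alloc_one(...)     alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__))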
Link: https://lkml.kernel.org/r/26d69c74a29183ecc335b9b407040d8e4cd70c6a.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Kevin Brodsky Signed-off-by: Qi Zheng Acked-by: Dave Hansen Acked-by: Arnd Bergmann [asm-generic] Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Peter Zijlstra (Intel) Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 7c48f5fbf8aa..59131629ac9c 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -215,6 +215,51 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#if CONFIG_PGTABLE_LEVELS > 4 + +static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, 0); + if (!ptdesc) + return NULL; + + return ptdesc_address(ptdesc); +} +#define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) + +#ifndef __HAVE_ARCH_P4D_ALLOC_ONE +static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr) +{ + return __p4d_alloc_one_noprof(mm, addr); +} +#define p4d_alloc_one(...) alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__)) +#endif + +static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(p4d); + + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_free(ptdesc); +} + +#ifndef __HAVE_ARCH_P4D_FREE +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + if (!mm_p4d_folded(mm)) + __p4d_free(mm, p4d); +} +#endif + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { -- cgit v1.2.3 From 5fcf5fa61218176acf198d9e63fb5739dd147244 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:20 +0800 Subject: mm: pgtable: add statistics for P4D level page table Like other levels of page tables, add statistics for P4D level page table. 
Link: https://lkml.kernel.org/r/d55fe3c286305aae84457da9e1066df99b3de125.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 2 ++ include/linux/mm.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 59131629ac9c..bb482eeca0c3 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -230,6 +230,7 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long if (!ptdesc) return NULL; + pagetable_p4d_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -247,6 +248,7 @@ static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + pagetable_p4d_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/include/linux/mm.h b/include/linux/mm.h index e7c54b9aac6d..2e56a9634a97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3175,6 +3175,22 @@ static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + +static inline void pagetable_p4d_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From db6b435d731a8d82c38e558175db55466cb5832a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:23 +0800 Subject: mm: pgtable: introduce pagetable_dtor() The pagetable_p*_dtor() are exactly the same except for the handling of ptlock. If we make ptlock_free() handle the case where ptdesc->ptl is NULL and remove VM_BUG_ON_PAGE() from pmd_ptlock_free(), we can unify pagetable_p*_dtor() into one function. Let's introduce pagetable_dtor() to do this. Later, pagetable_dtor() will be moved to tlb_remove_ptdesc(), so that ptlock and page table pages can be freed together (regardless of whether RCU is used). This prevents the use-after-free problem where the ptlock is freed immediately but the page table pages is freed later via RCU. 
Link: https://lkml.kernel.org/r/47f44fff9dc68d9d9e9a0d6c036df275f820598a.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Originally-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Acked-by: Alexander Gordeev [s390] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 8 +++---- include/linux/mm.h | 52 ++++++++----------------------------------- 2 files changed, 13 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index bb482eeca0c3..4afb346eae25 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -109,7 +109,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_pte_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -153,7 +153,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_pmd_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } #endif @@ -202,7 +202,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_pud_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } @@ -248,7 +248,7 @@ static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - pagetable_p4d_dtor(ptdesc); + pagetable_dtor(ptdesc); pagetable_free(ptdesc); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e56a9634a97..a3b2263f1c1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2992,6 +2992,15 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline void pagetable_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3003,15 +3012,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) return true; } -static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) @@ -3088,14 +3088,6 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); -#endif - ptlock_free(ptdesc); -} - #define pmd_huge_pte(mm, pmd) 
(pmd_ptdesc(pmd)->pmd_huge_pte) #else @@ -3106,7 +3098,6 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3131,15 +3122,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) return true; } -static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - pmd_ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be @@ -3167,14 +3149,6 @@ static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3183,14 +3157,6 @@ static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) lruvec_stat_add_folio(folio, NR_PAGETABLE); } -static inline void pagetable_p4d_dtor(struct ptdesc *ptdesc) -{ - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); -} - extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 2dccdf7076f671764d02c850e83a8b457105268d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:30 +0800 Subject: mm: pgtable: introduce generic __tlb_remove_table() Several architectures (arm, arm64, riscv and x86) define exactly the same __tlb_remove_table(), just introduce generic __tlb_remove_table() to eliminate these duplications. The s390 __tlb_remove_table() is nearly the same, so also make s390 __tlb_remove_table() version generic. Link: https://lkml.kernel.org/r/ea372633d94f4d3f9f56a7ec5994bf050bf77e39.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Kevin Brodsky Acked-by: Andreas Larsson [sparc] Acked-by: Alexander Gordeev [s390] Acked-by: Arnd Bergmann [asm-generic] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V (Arm) Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Peter Zijlstra (Intel) Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 709830274b75..69de47c7ef3c 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -153,8 +153,9 @@ * * Useful if your architecture has non-page page directories. * - * When used, an architecture is expected to provide __tlb_remove_table() - * which does the actual freeing of these pages. + * When used, an architecture is expected to provide __tlb_remove_table() or + * use the generic __tlb_remove_table(), which does the actual freeing of these + * pages. 
* * MMU_GATHER_RCU_TABLE_FREE * @@ -207,6 +208,16 @@ struct mmu_table_batch { #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) +#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE +static inline void __tlb_remove_table(void *table) +{ + struct ptdesc *ptdesc = (struct ptdesc *)table; + + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} +#endif + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ -- cgit v1.2.3 From 92ec7fd136a1f900656bbf6788e15127529e5387 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:31 +0800 Subject: mm: pgtable: completely move pagetable_dtor() to generic tlb_remove_table() For the generic tlb_remove_table(), it is implemented in the following two forms: 1) CONFIG_MMU_GATHER_TABLE_FREE is enabled tlb_remove_table --> generic __tlb_remove_table() 2) CONFIG_MMU_GATHER_TABLE_FREE is disabled tlb_remove_table --> tlb_remove_page For case 1), the pagetable_dtor() has already been moved to generic __tlb_remove_table(). For case 2), now only arm will call tlb_remove_table()/tlb_remove_ptdesc() when CONFIG_MMU_GATHER_TABLE_FREE is disabled. Let's move pagetable_dtor() completely to generic tlb_remove_table(), so that the architectures can follow more easily. Link: https://lkml.kernel.org/r/0c733ac867b287ec08190676496d1decebf49da2.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Kevin Brodsky Reviewed-by: Kevin Brodsky Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Peter Zijlstra (Intel) Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 69de47c7ef3c..53ae7748f555 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -220,14 +220,20 @@ static inline void __tlb_remove_table(void *table) extern void tlb_remove_table(struct mmu_gather *tlb, void *table); -#else /* !CONFIG_MMU_GATHER_HAVE_TABLE_FREE */ +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ +static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ -#define tlb_remove_table(tlb, page) tlb_remove_page((tlb), (page)) +static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct page *page = (struct page *)table; + pagetable_dtor(page_ptdesc(page)); + tlb_remove_page(tlb, page); +} #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE -- cgit v1.2.3 From 553e77529fb61e5520b839a0ce412a46cba996e0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 8 Jan 2025 14:57:33 +0800 Subject: mm: pgtable: introduce generic pagetable_dtor_free() The pte_free(), pmd_free(), __pud_free() and __p4d_free() in asm-generic/pgalloc.h and the generic __tlb_remove_table() are basically the same, so let's introduce pagetable_dtor_free() to deduplicate them. 
In addition, the pagetable_dtor_free() in s390 does the same thing, so let's have s390 also call the generic pagetable_dtor_free(). Link: https://lkml.kernel.org/r/1663a0565aca881d1338ceb7d1db4aa9c333abd6.1736317725.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Suggested-by: Peter Zijlstra (Intel) Reviewed-by: Kevin Brodsky Acked-by: Alexander Gordeev [s390] Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andreas Larsson Cc: Aneesh Kumar K.V (Arm) Cc: Arnd Bergmann Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 12 ++++-------- include/asm-generic/tlb.h | 3 +-- include/linux/mm.h | 6 ++++++ 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include')
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 4afb346eae25..e3977ddca15e 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -109,8 +109,7 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { struct ptdesc *ptdesc = page_ptdesc(pte_page); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } @@ -153,8 +152,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) struct ptdesc *ptdesc = virt_to_ptdesc(pmd); BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif @@ -202,8 +200,7 @@ static inline void __pud_free(struct mm_struct *mm, pud_t *pud) struct ptdesc *ptdesc = virt_to_ptdesc(pud); BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE @@ -248,8 +245,7 @@ static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d) struct ptdesc *ptdesc = virt_to_ptdesc(p4d); BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_P4D_FREE
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 53ae7748f555..e402aef79c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -213,8 +213,7 @@ static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; - pagetable_dtor(ptdesc); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h index a3b2263f1c1a..15a903d59d09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3001,6 +3001,12 @@ static inline void pagetable_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } +static inline void pagetable_dtor_free(struct ptdesc *ptdesc) +{ + pagetable_dtor(ptdesc); + pagetable_free(ptdesc); +} + static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); -- cgit v1.2.3
From 30cee1e4861b59200aa09c94a4d789c461e5f408 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Mon, 30 Dec 2024 15:40:43 +0530 Subject: lib/list_debug.c: add object information in case of invalid object As of now, during linked list corruption it prints the culprit address and its wrong value, but sometimes that is not enough to catch the actual issue point.
If it prints allocation and free path of that corrupted node, it will be a lot easier to find and fix the issues. Adding the same information when data mismatch is found in link list debug data: [ 14.243055] slab kmalloc-32 start ffff0000cda19320 data offset 32 pointer offset 8 size 32 allocated at add_to_list+0x28/0xb0 [ 14.245259] __kmalloc_cache_noprof+0x1c4/0x358 [ 14.245572] add_to_list+0x28/0xb0 ... [ 14.248632] do_el0_svc_compat+0x1c/0x34 [ 14.249018] el0_svc_compat+0x2c/0x80 [ 14.249244] Free path: [ 14.249410] kfree+0x24c/0x2f0 [ 14.249724] do_force_corruption+0xbc/0x100 ... [ 14.252266] el0_svc_common.constprop.0+0x40/0xe0 [ 14.252540] do_el0_svc_compat+0x1c/0x34 [ 14.252763] el0_svc_compat+0x2c/0x80 [ 14.253071] ------------[ cut here ]------------ [ 14.253303] list_del corruption. next->prev should be ffff0000cda192a8, but was 6b6b6b6b6b6b6b6b. (next=ffff0000cda19348) [ 14.254255] WARNING: CPU: 3 PID: 84 at lib/list_debug.c:65 __list_del_entry_valid_or_report+0x158/0x164 Moved prototype of mem_dump_obj() to bug.h, as mm.h can not be included in bug.h. Link: https://lkml.kernel.org/r/20241230101043.53773-1-maninder1.s@samsung.com Signed-off-by: Maninder Singh Acked-by: Jan Kara Cc: Al Viro Cc: Christian Brauner Cc: Marco Elver Cc: Rohit Thapliyal Signed-off-by: Andrew Morton --- include/linux/bug.h | 10 +++++++++- include/linux/mm.h | 6 ------ 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index 348acf2558f3..a9948a9f1093 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -73,15 +73,23 @@ static inline void generic_bug_clear_once(void) {} #endif /* CONFIG_GENERIC_BUG */ +#ifdef CONFIG_PRINTK +void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif + /* * Since detected data corruption should stop operation on the affected * structures. Return value must be checked and sanely acted on by caller. */ static inline __must_check bool check_data_corruption(bool v) { return v; } -#define CHECK_DATA_CORRUPTION(condition, fmt, ...) \ +#define CHECK_DATA_CORRUPTION(condition, addr, fmt, ...) \ check_data_corruption(({ \ bool corruption = unlikely(condition); \ if (corruption) { \ + if (addr) \ + mem_dump_obj(addr); \ if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION)) { \ pr_err(fmt, ##__VA_ARGS__); \ BUG(); \ diff --git a/include/linux/mm.h b/include/linux/mm.h index 15a903d59d09..c550912a5d6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4084,12 +4084,6 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; -#ifdef CONFIG_PRINTK -void mem_dump_obj(void *object); -#else -static inline void mem_dump_obj(void *object) {} -#endif - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, -- cgit v1.2.3 From b0d66d82fce60161de6f3d57f87016c3a6f7a121 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 3 Jan 2025 19:35:35 +0000 Subject: mm/debug: introduce VM_WARN_ON_VMG() to dump VMA merge state Patch series "mm/debug: introduce and use VM_WARN_ON_VMG()". We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set, during VMA merge operations to ensure state is as expected. However, when syzkaller or the like encounters these asserts, often the information provided by the report is insufficient to narrow down what the problem is. 
We noticed this recently in [0], where a non-repro issue resisted debugging due to simply not having sufficient information to go on. This series improves the situation by providing VM_WARN_ON_VMG() which acts like VM_WARN_ON() (i.e. only actually being invoked if CONFIG_DEBUG_VM is set), while dumping significant information about the VMA merge state, the mm_struct describing the virtual address space, all associated VMAs and, if CONFIG_DEBUG_VM_MAPLE_TREE is set, the associated maple tree. [0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/ This patch (of 2): We use a number of asserts, enabled only when CONFIG_DEBUG_VM is set, during VMA merge operations to ensure state is as expected. However, when syzkaller or the like encounters these asserts, often the information provided by the report is insufficient to narrow down what the problem is. This might not be so much of an issue if the reported problem is reproducible, but if it is a rarely encountered race or some other case which precludes a repro, it is a very big problem (see [0] for the motivating case). It is therefore sensible to provide a means by which we can easily and conveniently dump a lot more information in these circumstances. The aggregation of merge state into a single struct threaded through the operation makes this trivial - we can simply introduce a variant on VM_WARN_ON() which takes the VMA merge state object (vmg) and use that to dump information. This patch therefore introduces VM_WARN_ON_VMG() which provides this functionality. It additionally dumps full mm state, VMA state for each of the three VMAs the vmg contains (prev, next, vma) and if CONFIG_DEBUG_VM_MAPLE_TREE is enabled, dumps the maple tree from the provided VMA iterator if non-NULL. This patch has no functional impact if CONFIG_DEBUG_VM is not set. [0]:https://lore.kernel.org/all/6774c98f.050a0220.25abdd.0991.GAE@google.com/ Link: https://lkml.kernel.org/r/cover.1735932169.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/13b09b52d4d103ee86acaf0ae612539648ae29e0.1735932169.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmdebug.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index d7cb1e5ecbda..a0a3894900ed 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -9,10 +9,12 @@ struct page; struct vm_area_struct; struct mm_struct; struct vma_iterator; +struct vma_merge_struct; void dump_page(const struct page *page, const char *reason); void dump_vma(const struct vm_area_struct *vma); void dump_mm(const struct mm_struct *mm); +void dump_vmg(const struct vma_merge_struct *vmg, const char *reason); void vma_iter_dump_tree(const struct vma_iterator *vmi); #ifdef CONFIG_DEBUG_VM @@ -87,6 +89,15 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_VMG(cond, vmg) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_vmg(vmg, "VM_WARN_ON_VMG(" __stringify(cond)")"); \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -104,9 +115,10 @@ void vma_iter_dump_tree(const struct vma_iterator *vmi); #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_MM(cond, mm) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_VMG(cond, vmg) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) -#endif +#endif /* CONFIG_DEBUG_VM */ #ifdef CONFIG_DEBUG_VM_IRQSOFF #define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) -- cgit v1.2.3 From 11e2400b21a3e2dfbc95e31a9a849a30191f7a92 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:10 +0000 Subject: mm: move common part of pagetable_*_ctor to helper Patch series "Account page tables at all levels". This series should be considered in conjunction with Qi's series [1]. Together, they ensure that page table ctor/dtor are called at all levels (PTE to PGD) and all architectures, where page tables are regular pages. Besides the improvement in accounting and general cleanup, this also create a single place where construction/destruction hooks can be called for all page tables, namely the now-generic pagetable_dtor() introduced by Qi, and __pagetable_ctor() introduced in this series. [1] https://lore.kernel.org/linux-mm/cover.1735549103.git.zhengqi.arch@bytedance.com/ This patch (of 6): pagetable_*_ctor all have the same basic implementation. Move the common part to a helper to reduce duplication. 
Link: https://lkml.kernel.org/r/20250103184415.2744423-1-kevin.brodsky@arm.com Link: https://lkml.kernel.org/r/20250103184415.2744423-2-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Cc: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index c550912a5d6d..2949b58fd633 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2992,6 +2992,14 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline void __pagetable_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3009,12 +3017,9 @@ static inline void pagetable_dtor_free(struct ptdesc *ptdesc) static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } @@ -3118,13 +3123,10 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - if (!pmd_ptlock_init(ptdesc)) return false; - __folio_set_pgtable(folio); ptdesc_pmd_pts_init(ptdesc); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); return true; } @@ -3149,18 +3151,12 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); - - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); -- cgit v1.2.3 From a9b3c355c2e6388b0a3b67627460a516d88bdbc9 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:14 +0000 Subject: asm-generic: pgalloc: provide generic __pgd_{alloc,free} We already have a generic implementation of alloc/free up to P4D level, as well as pgd_free(). Let's finish the work and add a generic PGD-level alloc helper as well. Unlike at lower levels, almost all architectures need some specific magic at PGD level (typically initialising PGD entries), so introducing a generic pgd_alloc() isn't worth it. Instead we introduce two new helpers, __pgd_alloc() and __pgd_free(), and make use of them in the arch-specific pgd_alloc() and pgd_free() wherever possible. To accommodate as many arch as possible, __pgd_alloc() takes a page allocation order. Because pagetable_alloc() allocates zeroed pages, explicit zeroing in pgd_alloc() becomes redundant and we can get rid of it. 
Some trivial implementations of pgd_free() also become unnecessary once __pgd_alloc() is used; remove them. Another small improvement is consistent accounting of PGD pages by using GFP_PGTABLE_{USER,KERNEL} as appropriate. Not all PGD allocations can be handled by the generic helpers. In particular, multiple architectures allocate PGDs from a kmem_cache, and those PGDs may not be page-sized. Link: https://lkml.kernel.org/r/20250103184415.2744423-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Ingo Molnar Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index e3977ddca15e..de4df14158e6 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -258,10 +258,35 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) #endif /* CONFIG_PGTABLE_LEVELS > 4 */ +static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order) +{ + gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc_noprof(gfp, order); + if (!ptdesc) + return NULL; + + return ptdesc_address(ptdesc); +} +#define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) + +static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); + + BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); + pagetable_free(ptdesc); +} + #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - pagetable_free(virt_to_ptdesc(pgd)); + __pgd_free(mm, pgd); } #endif -- cgit v1.2.3 From d95936a2267c11a38917d5fc7bf3862a64fe13d8 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 3 Jan 2025 18:44:15 +0000 Subject: mm: introduce ctor/dtor at PGD level Following on from the introduction of P4D-level ctor/dtor, let's finish the job and introduce ctor/dtor at PGD level. The incurred improvement in page accounting is minimal - the main motivation is to create a single, generic place where construction/destruction hooks can be added for all page table pages. This patch should cover all architectures and all configurations where PGDs are one or more regular pages. This excludes any configuration where PGDs are allocated from a kmem_cache object. 
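To illustrate the intended call pattern, an arch-level pgd_alloc() built on the new helpers could look roughly like the sketch below; the initialisation step is only a placeholder, since (as noted above) almost every architecture has its own PGD entry setup:

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = __pgd_alloc(mm, 0);	/* order 0: a single page */

	if (!pgd)
		return NULL;

	/* arch-specific initialisation of kernel PGD entries would go here */

	return pgd;
}

Architectures with no extra work to do can simply keep the generic pgd_free(), which now forwards to __pgd_free() as shown in the hunk above.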
Link: https://lkml.kernel.org/r/20250103184415.2744423-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Acked-by: Dave Hansen Acked-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Ingo Molnar Cc: Linus Walleij Cc: Matthew Wilcox (Oracle) Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 3 ++- include/linux/mm.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index de4df14158e6..892ece4558a2 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -271,6 +271,7 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order if (!ptdesc) return NULL; + pagetable_pgd_ctor(ptdesc); return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) @@ -280,7 +281,7 @@ static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd) struct ptdesc *ptdesc = virt_to_ptdesc(pgd); BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - pagetable_free(ptdesc); + pagetable_dtor_free(ptdesc); } #ifndef __HAVE_ARCH_PGD_FREE diff --git a/include/linux/mm.h b/include/linux/mm.h index 2949b58fd633..3550cbeed488 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3159,6 +3159,11 @@ static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) __pagetable_ctor(ptdesc); } +static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) +{ + __pagetable_ctor(ptdesc); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 42b7491af14cbba2393329ce43d508a957bd94fa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Jan 2025 09:43:53 -0800 Subject: mm/damon/core: introduce damon_call() Introduce a new DAMON core API function, damon_call(). It aims to replace some damon_callback usages that access damon_ctx of ongoing kdamond with additional synchronizations. It receives a function pointer, let the parallel kdamond invokes the function, and returns after the invocation is finished, or canceled due to some races. kdamond invokes the function inside the main loop after sampling is done. If it is deactivated by DAMOS watermarks or already out of the main loop, mark the request as canceled so that damon_call() can wakeup and return. Link: https://lkml.kernel.org/r/20250103174400.54890-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index a67f2c4940e9..ac2d42a50751 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -552,6 +552,27 @@ struct damon_callback { void (*before_terminate)(struct damon_ctx *context); }; +/* + * struct damon_call_control - Control damon_call(). + * + * @fn: Function to be called back. + * @data: Data that will be passed to @fn. + * @return_code: Return code from @fn invocation. + * + * Control damon_call(), which requests specific kdamond to invoke a given + * function. Refer to damon_call() for more details. 
+ */ +struct damon_call_control { + int (*fn)(void *data); + void *data; + int return_code; +/* private: internal use only */ + /* informs if the kdamond finished handling of the request */ + struct completion completion; + /* informs if the kdamond canceled @fn infocation */ + bool canceled; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * @@ -632,6 +653,9 @@ struct damon_ctx { /* for scheme quotas prioritization */ unsigned long *regions_score_histogram; + struct damon_call_control *call_control; + struct mutex call_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -779,6 +803,8 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); + int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); -- cgit v1.2.3 From bf0eaba0ff9c9c8e6fd58ddfa1a8b6df4b813f61 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 3 Jan 2025 09:43:57 -0800 Subject: mm/damon/core: implement damos_walk() Introduce a new core layer interface, damos_walk(). It aims to replace some damon_callback usages that access DAMOS schemes applied regions of ongoing kdamond with additional synchronizations. It receives a function pointer and asks kdamond to invoke it for any region that it tried to apply any DAMOS action within one scheme apply interval for every scheme of it. The function further waits until the kdamond finishes the invocations for every scheme, or cancels the request, and returns. The kdamond invokes the function as requested within the main loop. If it is deactivated by DAMOS watermarks or going out of the main loop, it marks the request as canceled, so that damos_walk() can wakeup and return. Link: https://lkml.kernel.org/r/20250103174400.54890-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index ac2d42a50751..2889de3526c3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -352,6 +352,31 @@ struct damos_filter { struct list_head list; }; +struct damon_ctx; +struct damos; + +/** + * struct damos_walk_control - Control damos_walk(). + * + * @walk_fn: Function to be called back for each region. + * @data: Data that will be passed to walk functions. + * + * Control damos_walk(), which requests specific kdamond to invoke the given + * function to each region that eligible to apply actions of the kdamond's + * schemes. Refer to damos_walk() for more details. + */ +struct damos_walk_control { + void (*walk_fn)(void *data, struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s); + void *data; +/* private: internal use only */ + /* informs if the kdamond finished handling of the walk request */ + struct completion completion; + /* informs if the walk is canceled. */ + bool canceled; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. 
@@ -415,6 +440,8 @@ struct damos { * @action */ unsigned long next_apply_sis; + /* informs if ongoing DAMOS walk for this scheme is finished */ + bool walk_completed; /* public: */ struct damos_quota quota; struct damos_watermarks wmarks; @@ -442,8 +469,6 @@ enum damon_ops_id { NR_DAMON_OPS, }; -struct damon_ctx; - /** * struct damon_operations - Monitoring operations for given use cases. * @@ -656,6 +681,9 @@ struct damon_ctx { struct damon_call_control *call_control; struct mutex call_control_lock; + struct damos_walk_control *walk_control; + struct mutex walk_control_lock; + /* public: */ struct task_struct *kdamond; struct mutex kdamond_lock; @@ -804,6 +832,7 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); +int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, unsigned long *start, unsigned long *end); -- cgit v1.2.3 From 626ffabe67c2359f3a88bb61fdc83a6280ef16e9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:46 -0800 Subject: mm/damon: clarify trying vs applying on damos_stat kernel-doc comment Patch series "mm/damon: enable page level properties based monitoring". TL; DR ====== This patch series enables access monitoring based on page level properties including their anonymousness, belonging cgroups and young-ness, by extending DAMOS stats and regions walk features with region-internal DAMOS filters. Background ========== DAMOS has initially developed for only access-aware system operations. But, efficient acces monitoring results querying is yet another major usage of today's DAMOS. DAMOS stats and regions walk, which exposes accumulated counts and per-region monitoring results that filtered by DAMOS parameters including target access pattern, quotas and DAMOS filters, are the key features for that usage. For tunings and investigations, it can be more useful if only the information can be exposed without making real system operational change. Special DAMOS action, DAMOS_STAT, was introduced for the purpose. DAMOS fundametally works with only access pattern information in region granularity. For some use cases, fixed and fine granularity information based on non access pattern properties can be useful, though. For example, on systems having swap devices that much faster than storage devices for files, DAMOS-based proactive reclaim need to be applied differently for anonymous pages and file-backed pages. DAMOS filters is a feature that makes it possible. It supports non access pattern information including page level properties such as anonymousness, belonging cgroups, and young-ness (whether the page has accessed since the last access check of it). The information can be useful for tuning and investigations. DAMOS stat exposes some of it via {nr,sz}_applied, but it is mixed with operation failures. Also, exposing the information without making system operation change is impossible, since DAMOS_STAT simply ignores the page level properties based DAMOS filters. Design ====== Expose the exact information for every DAMOS action including DAMOS_STAT by implementing below changes. Extend the interface for DAMON operations set layer, which contains the implementation of the page level filters, to report back the amount of memory that passed the region-internal DAMOS filters to the core layer. 
On the core layer, account the operations set layer reported stat with DAMOS stat for per-scheme monitoring. Also, pass the information to regions walk for per-region monitoring. In this way, DAMON API users can efficiently get the fine-grained information. For the user-space, make DAMON sysfs interface collects the information using the updated DAMON core API, and expose those to new per-scheme stats file and per-DAMOS-tried region properties file. Practical Usages ================ With this patch series, DAMON users can query how many bytes of regions of specific access temperature is backed by pages of specific type. The type can be any of DAMOS filter-supporting one, including anonymousness, belonging cgroups, and young-ness. For example, users can visualize access hotness-based page granulairty histogram for different cgroups, backing content type, or youngness. In future, it could be extended to more types such as whether it is THP, position on LRU lists, etc. This can be useful for estimating benefits of a new or an existing access-aware system optimizations without really committing the changes. Patches Sequence ================ The patches are constructed in four sub-sequences. First three patches (patches 1-3) update documents to have missing background knowledges and better structures for easily introducing followup changes. Following three patches (patches 4-6) change the operations set layer interface to report back the region-internal filter passed memory size, and make the operations set implementations support the changed symantic. Following five patches (patches 7-11) implement per-scheme accumulated stat for region-internal filter-passed memory size on core API (damos_stat) and DAMON sysfs interface. First two patches of those are for code change, and following three patches are for documentation. Finally, five patches (patches 12-16) implementing per-region region-internal filter-passed memory size follows. Similar to that for per-scheme stat, first two patches implement core-API and sysfs interface change. Then three patches for documentation update follow. This patch (of 16): DAMOS stat kernel-doc documentation is using terms that bit ambiguous. Without reading the code, understanding it correctly is not that easy. Add the clarification on the kernel-doc comment. Link: https://lkml.kernel.org/r/20250106193401.109161-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250106193401.109161-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2889de3526c3..b85eae388f5b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -287,6 +287,23 @@ struct damos_watermarks { * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + * + * "Tried an action to a region" in this context means the DAMOS core logic + * determined the region as eligible to apply the action. The access pattern + * (&struct damos_access_pattern), quotas (&struct damos_quota), watermarks + * (&struct damos_watermarks) and filters (&struct damos_filter) that handled + * on core logic can affect this. The core logic asks the operation set + * (&struct damon_operations) to apply the action to the region. 
+ * + * "Applied an action to a region" in this context means the operation set + * (&struct damon_operations) successfully applied the action to the region, at + * least to a part of the region. The filters (&struct damos_filter) that + * handled on operation set layer and type of the action and pages of the + * region can affect this. For example, if a filter is set to exclude + * anonymous pages and the region has only anonymous pages, the region will be + * failed at applying the action. If the action is &DAMOS_PAGEOUT and all + * pages of the region are already paged out, the region will be failed at + * applying the action. */ struct damos_stat { unsigned long nr_tried; -- cgit v1.2.3 From b5bbe9c08fd1519f96832b82256543a567ce2900 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:49 -0800 Subject: mm/damon: ask apply_scheme() to report filter-passed region-internal bytes Some DAMOS filter types including those for young page, anon page, and belonging memcg are handled by underlying DAMON operations set implementation, via damon_operations->apply_scheme() interface. How many bytes of the region have passed the filter can be useful for DAMOS scheme tuning and access pattern monitoring. Modify the interface to let the callback implementation reports back the number if possible. Link: https://lkml.kernel.org/r/20250106193401.109161-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index b85eae388f5b..da003173210f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -529,7 +529,8 @@ enum damon_ops_id { * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action * to the region and return bytes of the region that the action is successfully - * applied. + * applied. It should also report how many bytes of the region has passed + * filters (&struct damos_filter) that handled by itself. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -546,7 +547,7 @@ struct damon_operations { struct damos *scheme); unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, - struct damos *scheme); + struct damos *scheme, unsigned long *sz_filter_passed); bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; -- cgit v1.2.3 From 60fa9355a6c620f7b727d3fdb433fb6cf714a9b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:52 -0800 Subject: mm/damon/core: implement per-scheme ops-handled filter-passed bytes stat Implement a new per-DAMOS scheme statistic field, namely sz_ops_filter_passed, using the changed damon_operations->apply_scheme() interface. It counts total bytes of memory that given DAMOS action tried to be applied, and passed the operations layer handled region-internal filters of the scheme. DAMON API users can access it using DAMON-internal safe access features such as damon_call() and/or damos_walk(). 
Link: https://lkml.kernel.org/r/20250106193401.109161-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index da003173210f..2a93dbe06ecc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -286,6 +286,8 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @sz_ops_filter_passed: + * Total bytes that passed ops layer-handled DAMOS filters. * @qt_exceeds: Total number of times the quota of the scheme has exceeded. * * "Tried an action to a region" in this context means the DAMOS core logic @@ -310,6 +312,7 @@ struct damos_stat { unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long sz_ops_filter_passed; unsigned long qt_exceeds; }; -- cgit v1.2.3 From cfc33a7d2daca4455ef3ebae63a2e89bd9bb0ebe Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jan 2025 11:33:57 -0800 Subject: mm/damon/core: pass per-region filter-passed bytes to damos_walk_control->walk_fn() Total size of memory that passed DAMON operations set layer-handled DAMOS filters per scheme is provided to DAMON core API and ABI (sysfs interface) users. Having it per-region in non-accumulated way can provide it in finer granularity. Provide it to damos_walk() core API users, by passing the data to damos_walk_control->walk_fn(). Link: https://lkml.kernel.org/r/20250106193401.109161-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 2a93dbe06ecc..298b1a831e62 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -388,7 +388,7 @@ struct damos; struct damos_walk_control { void (*walk_fn)(void *data, struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, - struct damos *s); + struct damos *s, unsigned long sz_filter_passed); void *data; /* private: internal use only */ /* informs if the kdamond finished handling of the walk request */ -- cgit v1.2.3 From 63db8170bf34ce9e0763f87d993cf9b4c9002b09 Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Mon, 6 Jan 2025 04:06:59 -0800 Subject: mm/fake-numa: allow later numa node hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current fake-numa implementation prevents new Numa nodes to be later hot-plugged by drivers. A common symptom of this limitation is the "node was absent from the node_possible_map" message by associated warning in mm/memory_hotplug.c: add_memory_resource(). This comes from the lack of remapping in both pxm_to_node_map[] and node_to_pxm_map[] tables to take fake-numa nodes into account and thus triggers collisions with original and physical nodes only-mapping that had been determined from BIOS tables. This patch fixes this by doing the necessary node-ids translation in both pxm_to_node_map[]/node_to_pxm_map[] tables. node_distance[] table has also been fixed accordingly. 
Details: When trying to use fake-numa feature on our system where new Numa nodes are being "hot-plugged" upon driver load, this fails with the following type of message and warning with stack : node 8 was absent from the node_possible_map WARNING: CPU: 61 PID: 4259 at mm/memory_hotplug.c:1506 add_memory_resource+0x3dc/0x418 This issue prevents the use of the fake-NUMA debug feature with the system's full configuration, when it has proven to be sometimes extremely useful for performance testing of multi-tasked, memory-bound applications, as it enables better isolation of processes/ranks compared to fat NUMA nodes. Usual numactl output after driver has “hot-plugged”/unveiled some new Numa nodes with and without memory : $ numactl --hardware available: 9 nodes (0-8) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 490037 MB node 0 free: 484432 MB node 1 cpus: node 1 size: 97280 MB node 1 free: 97279 MB node 2 cpus: node 2 size: 0 MB node 2 free: 0 MB node 3 cpus: node 3 size: 0 MB node 3 free: 0 MB node 4 cpus: node 4 size: 0 MB node 4 free: 0 MB node 5 cpus: node 5 size: 0 MB node 5 free: 0 MB node 6 cpus: node 6 size: 0 MB node 6 free: 0 MB node 7 cpus: node 7 size: 0 MB node 7 free: 0 MB node 8 cpus: node 8 size: 0 MB node 8 free: 0 MB node distances: node 0 1 2 3 4 5 6 7 8 0: 10 80 80 80 80 80 80 80 80 1: 80 10 255 255 255 255 255 255 255 2: 80 255 10 255 255 255 255 255 255 3: 80 255 255 10 255 255 255 255 255 4: 80 255 255 255 10 255 255 255 255 5: 80 255 255 255 255 10 255 255 255 6: 80 255 255 255 255 255 10 255 255 7: 80 255 255 255 255 255 255 10 255 8: 80 255 255 255 255 255 255 255 10 With recent M.Rapoport set of fake-numa patches in mm-everything and using numa=fake=4 boot parameter : $ numactl --hardware available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 122518 MB node 0 free: 117141 MB node 1 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 1 size: 219911 MB node 1 free: 219751 MB node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 2 size: 122599 MB node 2 free: 122541 MB node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 3 size: 122479 MB node 3 free: 122408 MB node distances: node 0 1 2 3 0: 10 10 10 10 1: 10 10 10 10 2: 10 10 10 10 3: 10 10 10 10 With recent M.Rapoport set of fake-numa patches in mm-everything, this patch on top, using numa=fake=4 boot parameter : # numactl —hardware available: 12 nodes (0-11) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 0 size: 122518 MB node 0 free: 116429 MB node 1 cpus: 0 1 2 
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 1 size: 122631 MB node 1 free: 122576 MB node 2 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 2 size: 122599 MB node 2 free: 122544 MB node 3 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 node 3 size: 122479 MB node 3 free: 122419 MB node 4 cpus: node 4 size: 97280 MB node 4 free: 97279 MB node 5 cpus: node 5 size: 0 MB node 5 free: 0 MB node 6 cpus: node 6 size: 0 MB node 6 free: 0 MB node 7 cpus: node 7 size: 0 MB node 7 free: 0 MB node 8 cpus: node 8 size: 0 MB node 8 free: 0 MB node 9 cpus: node 9 size: 0 MB node 9 free: 0 MB node 10 cpus: node 10 size: 0 MB node 10 free: 0 MB node 11 cpus: node 11 size: 0 MB node 11 free: 0 MB node distances: node 0 1 2 3 4 5 6 7 8 9 10 11 0: 10 10 10 10 80 80 80 80 80 80 80 80 1: 10 10 10 10 80 80 80 80 80 80 80 80 2: 10 10 10 10 80 80 80 80 80 80 80 80 3: 10 10 10 10 80 80 80 80 80 80 80 80 4: 80 80 80 80 10 255 255 255 255 255 255 255 5: 80 80 80 80 255 10 255 255 255 255 255 255 6: 80 80 80 80 255 255 10 255 255 255 255 255 7: 80 80 80 80 255 255 255 10 255 255 255 255 8: 80 80 80 80 255 255 255 255 10 255 255 255 9: 80 80 80 80 255 255 255 255 255 10 255 255 10: 80 80 80 80 255 255 255 255 255 255 10 255 11: 80 80 80 80 255 255 255 255 255 255 255 10 Link: https://lkml.kernel.org/r/20250106120659.359610-2-bfaccini@nvidia.com Signed-off-by: Bruno Faccini Cc: David Hildenbrand Cc: John Hubbard Cc: Mike Rapoport (Microsoft) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/acpi/acpi_numa.h | 5 +++++ include/linux/numa_memblks.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/acpi/acpi_numa.h b/include/acpi/acpi_numa.h index b5f594754a9e..99b960bd473c 100644 --- a/include/acpi/acpi_numa.h +++ b/include/acpi/acpi_numa.h @@ -17,11 +17,16 @@ extern int node_to_pxm(int); extern int acpi_map_pxm_to_node(int); extern unsigned char acpi_srat_revision; extern void disable_srat(void); +extern int fix_pxm_node_maps(int max_nid); extern void bad_srat(void); extern int srat_disabled(void); #else /* CONFIG_ACPI_NUMA */ +static inline int fix_pxm_node_maps(int max_nid) +{ + return 0; +} static inline void disable_srat(void) { } diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index cfad6ce7e1bd..dd85613cdd86 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -29,7 +29,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi); int __init numa_memblks_init(int (*init_func)(void), bool memblock_force_top_down); +extern int numa_distance_cnt; + #ifdef CONFIG_NUMA_EMU +extern int emu_nid_to_phys[MAX_NUMNODES]; int numa_emu_cmdline(char *str); void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, unsigned int nr_emu_nids); -- cgit v1.2.3 From b2aad24b53333f1904a55d97e3fde2246ef05bb6 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Mon, 6 Jan 2025 10:11:25 +0800 Subject: mm/memmap: prevent double scanning of memmap by kmemleak kmemleak explicitly scans the mem_map through the valid struct page objects. 
However, memmap_alloc() was also adding this memory to the gray object list, causing it to be scanned twice. Remove memmap_alloc() from the scan list and add a comment to clarify the behavior. Link: https://lore.kernel.org/lkml/CAOm6qn=FVeTpH54wGDFMHuCOeYtvoTx30ktnv9-w3Nh8RMofEA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250106021126.1678334-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Reviewed-by: Catalin Marinas Cc: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 673d5cae7c81..d48b56c1e558 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -378,6 +378,10 @@ static inline int memblock_get_region_node(const struct memblock_region *r) /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 +/* + * MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies + * MEMBLOCK_ALLOC_ACCESSIBLE + */ #define MEMBLOCK_ALLOC_NOLEAKTRACE 1 /* We are using top down, so it is safe to use 0 here */ -- cgit v1.2.3 From 30cef82bc6e8975a360ec05b707f7fb194c875ed Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 7 Jan 2025 15:39:58 -0500 Subject: mm/hugetlb: rename avoid_reserve to cow_from_owner The old name "avoid_reserve" can be too generic and can be used wrongly in the new call sites that want to allocate a hugetlb folio. It's confusing on two things: (1) whether one can opt-in to avoid global reservation, and (2) whether it should take more than one count. In reality, this flag is only used in an extremely hacky path, in an extremely hacky way in hugetlb CoW path only, and always use with 1 saying "skip global reservation". Rename the flag to avoid future abuse of this flag, making it a boolean so as to reflect its true representation that it's not a counter. To make it even harder to abuse, add a comment above the function to explain it. 
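As for the kmemleak-related memblock change above, the allocation side boils down to roughly the following sketch (memblock_alloc_try_nid_raw() and MEMBLOCK_LOW_LIMIT are existing memblock interfaces; the wrapper name is made up for the example):

static void * __init memmap_alloc_sketch(phys_addr_t size, phys_addr_t align,
					 int nid)
{
	/*
	 * struct page arrays are already scanned by kmemleak through the
	 * page objects themselves, so skip the kmemleak registration;
	 * MEMBLOCK_ALLOC_NOLEAKTRACE also implies MEMBLOCK_ALLOC_ACCESSIBLE.
	 */
	return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
					  MEMBLOCK_ALLOC_NOLEAKTRACE, nid);
}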
Link: https://lkml.kernel.org/r/20250107204002.2683356-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Oscar Salvador Cc: Ackerley Tng Cc: Breno Leitao Cc: Muchun Song Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 10faf42ca96a..49ec2362ce92 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -683,7 +683,7 @@ struct huge_bootmem_page { int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); + unsigned long addr, bool cow_from_owner); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); @@ -1068,7 +1068,7 @@ static inline int replace_free_hugepage_folios(unsigned long start_pfn, static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, - int avoid_reserve) + bool cow_from_owner) { return NULL; } -- cgit v1.2.3 From c8b979530f27f90c0353a189b2faa6e50a0ea94a Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:37 -0500 Subject: mm: alloc_pages_bulk_noprof: drop page_list argument Patch series "mm: alloc_pages_bulk: small API refactor", v2. Today, alloc_pages_bulk_noprof() supports two arguments to return allocated pages: a linked list and an array. There are also higher level APIs for both. However, the linked list API has apparently never been used. So, this series removes it along with the list API and also refactors the remaining API naming for consistency. This patch (of 2): commit 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") added __alloc_pages_bulk() along with the page_list argument. The next commit 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the bulk page allocator") added the array-based argument. As it turns out, the page_list argument has no users in the current tree (if it ever had any). Dropping it allows for a slight simplification and eliminates some unnecessary checks, now that page_array is required. Also, note that the removal of the page_list argument was proposed before in the thread below, where Matthew Wilcox mentions that: """ Iterating a linked list is _expensive_. It is about 10x quicker to iterate an array than a linked list. 
""" (https://lore.kernel.org/linux-mm/20231025093254.xvomlctwhcuerzky@techsingularity.net) Link: https://lkml.kernel.org/r/cover.1734991165.git.luizcap@redhat.com Link: https://lkml.kernel.org/r/f1c75db91d08cafd211eca6a3b199b629d4ffe16.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c96d5d7f7b89..f8b33c5e7a14 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -212,7 +212,6 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) @@ -223,11 +222,8 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - #define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ - __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, @@ -236,7 +232,7 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } #define alloc_pages_bulk_array_node(...) \ -- cgit v1.2.3 From 6bf9b5b40af373690313f64a3935b2bf2e5d46d9 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 23 Dec 2024 17:00:38 -0500 Subject: mm: alloc_pages_bulk: rename API The previous commit removed the page_list argument from alloc_pages_bulk_noprof() along with the alloc_pages_bulk_list() function. Now that only the *_array() flavour of the API remains, we can do the following renaming (along with the _noprof() ones): alloc_pages_bulk_array -> alloc_pages_bulk alloc_pages_bulk_array_mempolicy -> alloc_pages_bulk_mempolicy alloc_pages_bulk_array_node -> alloc_pages_bulk_node Link: https://lkml.kernel.org/r/275a3bbc0be20fbe9002297d60045e67ab3d4ada.1734991165.git.luizcap@redhat.com Signed-off-by: Luiz Capitulino Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Yunsheng Lin Signed-off-by: Andrew Morton --- include/linux/gfp.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8b33c5e7a14..6bb1a5a7a4ae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -215,18 +215,18 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, struct page **page_array); #define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, +unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); -#define alloc_pages_bulk_array_mempolicy(...) 
\ - alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_mempolicy(...) \ + alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ +#define alloc_pages_bulk(_gfp, _nr_pages, _page_array) \ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array) static inline unsigned long -alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, +alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) @@ -235,8 +235,8 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array); } -#define alloc_pages_bulk_array_node(...) \ - alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) +#define alloc_pages_bulk_node(...) \ + alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__)) static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { -- cgit v1.2.3 From e20f52e8e3b7947e40bd40c6cdc69884c6df716c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:17 -0800 Subject: mm/damon: fixup damos_filter kernel-doc Patch series "mm/damon: extend DAMOS filters for inclusion", v2. DAMOS fitlers are exclusive filters. It only excludes memory of given criterias from the DAMOS action targets. This has below limitations. First, the name is not explicitly explaining the behavior. This actually resulted in users' confusions[1]. Secondly, combined uses of multiple filters provide only restriced coverages. For example, building a DAMOS scheme that applies the action to memory that belongs to cgroup A "or" cgroup B is impossible. A workaround would be using two schemes that fitlers out memory that not belong to cgroup A and cgroup B, respectively. It is cumbersome, and difficult to control quota-like per-scheme features in an orchestration. Monitoring of filters-passed memory statistic will also be complicated. Extend DAMOS filters to support not only exclusion (rejecting), but also inclusion (allowing) behavior. For this, add a new damos_filter struct field called 'allow' for DAMON kernel API users. The filter works as an inclusion or exclusion filter when it is set or unset, respectively. For DAMON user-space ABI users, add a DAMON sysfs file of same name under DAMOS filter sysfs directory. To prevent exposing a behavioral change to old users, set rejecting as the default behavior. Note that allow-filters work for only inclusion, not exclusion of memory that not satisfying the criteria. And the default behavior of DAMOS for memory that no filter has involved is that the action can be applied to those memory. Also, filters-passed memory statistics are for any memory that passed through the DAMOS filters check stage. These implies installing allow-filters at the endof the filter list is useless. Refer to the design doc change of this series for more details. [1] https://lore.kernel.org/20240320165619.71478-1-sj@kernel.org This patch (of 10): The comment is slightly wrong. DAMOS filters are not only for pages, but general bytes of memory. Also the description of 'matching' is bit confusing, since DAMOS filters do only filtering out. Update the comments to be less confusing. 
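Going back to the bulk allocator rename in the gfp.h hunk above, a caller-side sketch of the array-only API (simplified error handling; the helper name is made up, and @pages is assumed to be zero-initialised on entry):

static unsigned long grab_page_batch(struct page **pages, unsigned long want)
{
	unsigned long nr;

	/* Order-0 pages only; may return fewer pages than requested. */
	nr = alloc_pages_bulk(GFP_KERNEL, want, pages);

	/* Top up one page at a time if the batch came back short. */
	while (nr < want) {
		pages[nr] = alloc_page(GFP_KERNEL);
		if (!pages[nr])
			break;
		nr++;
	}
	return nr;	/* caller releases each entry with __free_page() */
}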
Link: https://lkml.kernel.org/r/20250109175126.57878-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250109175126.57878-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 298b1a831e62..72afba74ac6d 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -347,8 +347,8 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. - * @type: Type of the page. - * @matching: If the matching page should filtered out or in. + * @type: Type of the target memory. + * @matching: If the @type-matching memory should be filtered out. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @target_idx: Index of the &struct damon_target of @@ -357,9 +357,10 @@ enum damos_filter_type { * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each - * page of the region matches to this and avoid applying the action if so. - * Support of each filter type depends on the running &struct damon_operations - * and the type. Refer to &enum damos_filter_type for more detai. + * byte of the region matches to this given condition and avoid applying the + * action if so. Support of each filter type depends on the running &struct + * damon_operations and the type. Refer to &enum damos_filter_type for more + * details. */ struct damos_filter { enum damos_filter_type type; -- cgit v1.2.3 From fe6d7fdd62491524d11433b9ff8d3db5dde32700 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:18 -0800 Subject: mm/damon/core: add damos_filter->allow field DAMOS filters work as only exclusive (reject) filters. This makes it easy to be confused, and restrictive at combining multiple filters for covering various types of memory. Add a field named 'allow' to damos_filter. The field will be used to indicate whether the filter should work for inclusion or exclusion. To keep the old behavior, set it as 'false' (work as exclusive filter) by default, from damos_new_filter(). Following two commits will make the core and operations set layers, which handles damos_filter objects, respect the field, respectively. Link: https://lkml.kernel.org/r/20250109175126.57878-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 72afba74ac6d..8a2d104df5a3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -348,7 +348,8 @@ enum damos_filter_type { /** * struct damos_filter - DAMOS action target memory filter. * @type: Type of the target memory. - * @matching: If the @type-matching memory should be filtered out. + * @matching: Whether this is for @type-matching memory. + * @allow: Whether to include or exclude the @matching memory. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. 
* @target_idx: Index of the &struct damon_target of @@ -365,6 +366,7 @@ enum damos_filter_type { struct damos_filter { enum damos_filter_type type; bool matching; + bool allow; union { unsigned short memcg_id; struct damon_addr_range addr_range; -- cgit v1.2.3 From e2fbfedad03401a38b8c3b7fd52d8fdcd039d0bc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 9 Jan 2025 09:51:21 -0800 Subject: mm/damon: add 'allow' argument to damos_new_filter() DAMON API users should set damos_filter->allow manually to use a DAMOS allow-filter, since damos_new_filter() always unsets the field. It is cumbersome and easy to get wrong. Add an argument for setting the field to damos_new_filter(). Link: https://lkml.kernel.org/r/20250109175126.57878-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 8a2d104df5a3..0834d7ffcb84 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -801,7 +801,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); struct damos_filter *damos_new_filter(enum damos_filter_type type, - bool matching); + bool matching, bool allow); void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); -- cgit v1.2.3 From 07438779313caafe52ac1a1a6958d735a5938988 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Dec 2024 13:16:38 -0800 Subject: alloc_tag: avoid current->alloc_tag manipulations when profiling is disabled When memory allocation profiling is disabled there is no need to update current->alloc_tag and these manipulations add unnecessary overhead. Fix the overhead by skipping these extra updates. I ran comprehensive testing on Pixel 6 on Big, Medium and Little cores:

             Overhead before fixes        Overhead after fixes
             slab alloc   page alloc      slab alloc   page alloc
  Big           6.21%        5.32%           3.31%        4.93%
  Medium        4.51%        5.05%           3.79%        4.39%
  Little        7.62%        1.82%           6.68%        1.02%

This is an allocation microbenchmark doing allocations in a tight loop. It is not a really realistic scenario and is useful only for performance comparisons.
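Separately, here is a sketch of building a filter with the extended damos_new_filter() introduced above; DAMOS_FILTER_TYPE_ADDR and the addr_range member appear in the kernel-doc hunks earlier, while the start/end field names, the helper name and the error handling are assumptions:

static int install_addr_allow_filter(struct damos *scheme,
				     unsigned long start, unsigned long end)
{
	struct damos_filter *allow_f;

	/* Third argument: true = allow (include) matching memory, false = reject it. */
	allow_f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true, true);
	if (!allow_f)
		return -ENOMEM;

	allow_f->addr_range.start = start;	/* assumed field names */
	allow_f->addr_range.end = end;
	damos_add_filter(scheme, allow_f);

	return 0;
}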
Link: https://lkml.kernel.org/r/20241226211639.1357704-1-surenb@google.com Fixes: b951aaff5035 ("mm: enable page allocation tagging") Signed-off-by: Suren Baghdasaryan Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 0bbbe537c5f9..a946e0203e6d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} #define alloc_hooks_tag(_tag, _do_alloc) \ ({ \ - struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \ - typeof(_do_alloc) _res = _do_alloc; \ - alloc_tag_restore(_tag, _old); \ + typeof(_do_alloc) _res; \ + if (mem_alloc_profiling_enabled()) { \ + struct alloc_tag * __maybe_unused _old; \ + _old = alloc_tag_save(_tag); \ + _res = _do_alloc; \ + alloc_tag_restore(_tag, _old); \ + } else \ + _res = _do_alloc; \ _res; \ }) -- cgit v1.2.3 From 7277433096f6ce4a84a1620529ac4ba3e1041ee1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:22 +0800 Subject: mm, swap: remove old allocation path for HDD We are currently using different swap allocation algorithm for HDD and non-HDD. This leads to the existence of a different set of locks, and the code path is heavily bloated, causing difficulties for further optimization and maintenance. This commit removes all HDD swap allocation and related dead code, and uses the cluster allocation algorithm instead. The performance may drop temporarily, but this should be negligible: The main advantage of the legacy HDD allocation algorithm is that it tends to use continuous slots, but swap device gets fragmented quickly anyway, and the attempt to use continuous slots will fail easily. This commit also enables mTHP swap on HDD, which is expected to be beneficial, and following commits will adapt and optimize the cluster allocator for HDD. Link: https://lkml.kernel.org/r/20250113175732.48099-4-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Suggested-by: "Huang, Ying" Reviewed-by: Baoquan He Cc: Barry Song Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index f3e0ac20c2e8..3a71198a6957 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -309,9 +309,6 @@ struct swap_info_struct { unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 27701521beb5897d6b97e2f8c20de41e74cbcb7b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:24 +0800 Subject: mm, swap: clean up device availability check Remove highest_bit and lowest_bit. 
After the HDD allocation path has been removed, the only purpose of these two fields is to determine whether the device is full or not, which can instead be determined by checking the inuse_pages. Link: https://lkml.kernel.org/r/20250113175732.48099-6-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3a71198a6957..c0d49dad7a4b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -305,8 +305,6 @@ struct swap_info_struct { struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; - unsigned int lowest_bit; /* index of first free in swap_map */ - unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ -- cgit v1.2.3 From b228386cf237e659cdf5d8037a19db0b0a06f6b5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:25 +0800 Subject: mm, swap: clean up plist removal and adding When the swap device is full (inuse_pages == pages), it should be removed from the allocation available plist. If any slot is freed, the swap device should be added back to the plist. Additionally, during swapon or swapoff, the swap device is forcefully added or removed. Currently, the condition (inuse_pages == pages) is checked after every counter update, then remove or add the device accordingly. This is serialized by si->lock. This commit decouples it from the protection of si->lock and reworked plist removal and adding, making it possible to get rid of the hard dependency on si->lock in allocation path in later commits. To achieve this, simply using another lock is not an optimal approach, as the overhead is observable for a hot counter, and may cause complex locking issues. Thus, this commit manages to make it a lock-free atomic operation, by embedding the plist state into the second highest bit of the atomic counter. Simply making the counter an atomic will not work, if the update and plist status check are not performed atomically, we may miss an addition or removal. With the embedded info we can update the counter and check the plist status with single atomic operations, and avoid any extra overheads: If the counter is full (inuse_pages == pages) and the off-list bit is unset, we attempt to remove it from the plist. If the counter is not full (inuse_pages != pages) and the off-list bit is set, we attempt to add it to the plist. Removing, adding and bit update is serialized with a lock, which is a cold path. Ordinary counter updates will be lock-free. 
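A minimal sketch of the counter-with-embedded-flag idea described above; the macro and function names are illustrative, and the slow-path helpers are stubs standing in for the lock-protected plist manipulation:

/* Second-highest bit of the counter doubles as the "device is off the plist" flag. */
#define SWAP_USAGE_OFFLIST_BIT	(1L << (BITS_PER_LONG - 2))
#define SWAP_USAGE_COUNT_MASK	(SWAP_USAGE_OFFLIST_BIT - 1)

static void swap_usage_add(struct swap_info_struct *si, long nr)
{
	long val = atomic_long_add_return(nr, &si->inuse_pages);

	/*
	 * A single atomic op yields both the usage count and the list
	 * state; only the "just became full and still on the plist"
	 * transition takes the cold, lock-protected path that sets the
	 * bit and unlinks the device.
	 */
	if (unlikely((val & SWAP_USAGE_COUNT_MASK) == si->pages &&
		     !(val & SWAP_USAGE_OFFLIST_BIT)))
		swap_device_put_offlist(si);	/* hypothetical slow path */
}

static void swap_usage_sub(struct swap_info_struct *si, long nr)
{
	long val = atomic_long_sub_return(nr, &si->inuse_pages);

	/* Slots were freed while the device was off the plist: add it back. */
	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
		swap_device_put_onlist(si);	/* hypothetical slow path */
}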
Link: https://lkml.kernel.org/r/20250113175732.48099-7-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index c0d49dad7a4b..16dcf8bd1a4e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -306,7 +306,7 @@ struct swap_info_struct { /* list of cluster that are fragmented or contented */ unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ - unsigned int inuse_pages; /* number of those currently in use */ + atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ -- cgit v1.2.3 From 9a0ddeb7988095a5c21994c37005a45b240039ef Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:26 +0800 Subject: mm, swap: hold a reference during scan and cleanup flag usage The flag SWP_SCANNING was used as an indicator of whether a device is being scanned for allocation, and prevents swapoff. Combined with SWP_WRITEOK, they work as a set of barriers for a clean swapoff: 1. Swapoff clears SWP_WRITEOK, allocation requests will see ~SWP_WRITEOK and abort as it's serialized by si->lock. 2. Swapoff unuses all allocated entries. 3. Swapoff waits for SWP_SCANNING flag to be cleared, so ongoing allocations will stop, preventing UAF. 4. Now swapoff can free everything safely. This will make the allocation path have a hard dependency on si->lock. Allocation always have to acquire si->lock first for setting SWP_SCANNING and checking SWP_WRITEOK. This commit removes this flag, and just uses the existing per-CPU refcount instead to prevent UAF in step 3, which serves well for such usage without dependency on si->lock, and scales very well too. Just hold a reference during the whole scan and allocation process. Swapoff will kill and wait for the counter. And for preventing any allocation from happening after step 1 so the unuse in step 2 can ensure all slots are free, swapoff will acquire the ci->lock of each cluster one by one to ensure all allocations see ~SWP_WRITEOK and abort. This way these dependences on si->lock are gone. And worth noting we can't kill the refcount as the first step for swapoff as the unuse process have to acquire the refcount. Link: https://lkml.kernel.org/r/20250113175732.48099-8-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 16dcf8bd1a4e..1651174959c8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -219,7 +219,6 @@ enum { SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ /* add others here before... 
*/ - SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL -- cgit v1.2.3 From 3494d184706ff5e7d28481de0c841b039caa38b1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:27 +0800 Subject: mm, swap: use an enum to define all cluster flags and wrap flags changes Currently, we are only using flags to indicate which list the cluster is on. Using one bit for each list type might be a waste, as the list type grows, we will consume too many bits. Additionally, the current mixed usage of '&' and '==' is a bit confusing. Make it clean by using an enum to define all possible cluster statuses. Only an off-list cluster will have the NONE (0) flag. And use a wrapper to annotate and sanitize all flag settings and list movements. Link: https://lkml.kernel.org/r/20250113175732.48099-9-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1651174959c8..0e59cb158b15 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -256,10 +256,19 @@ struct swap_cluster_info { u8 order; struct list_head list; }; -#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ -#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; /* * The first page in the swap file is the swap header, which is always marked -- cgit v1.2.3 From 3b644773eefda88112d3ee5d57620f6e58fccfc6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:28 +0800 Subject: mm, swap: reduce contention on device lock Currently, swap locking is mainly composed of two locks: the cluster lock (ci->lock) and the device lock (si->lock). The cluster lock is much more fine-grained, so it is best to use ci->lock instead of si->lock as much as possible. We have cleaned up other hard dependencies on si->lock. Following the new cluster allocator design, most operations don't need to touch si->lock at all. In practice, we only need to take si->lock when moving clusters between lists. To achieve this, this commit reworks the locking pattern of all si->lock and ci->lock users, eliminates all usage of ci->lock inside si->lock, and introduces a new design to avoid touching si->lock unless needed. For minimal contention and easier understanding of the system, two ideas are introduced with the corresponding helpers: isolation and relocation. - Clusters will be `isolated` from the list when iterating the list to search for an allocatable cluster. This ensures other CPUs won't walk into the same cluster easily, and it releases si->lock after acquiring ci->lock, providing the only place that handles the inversion of two locks, and avoids contention. 
Iterating the cluster list almost always moves the cluster (free -> nonfull, nonfull -> frag, frag -> frag tail), but it doesn't know where the cluster should be moved to until scanning is done. So keeping the cluster off-list is a good option with low overhead. The off-list time window of a cluster is also minimal. In the worst case, one CPU will return the cluster after scanning the 512 entries on it, which we used to busy wait with a spin lock. This is done with the new helper `isolate_lock_cluster`. - Clusters will be `relocated` after allocation or freeing, according to their usage count and status. Allocations no longer hold si->lock now, and may drop ci->lock for reclaim, so the cluster could be moved to any location while no lock is held. Besides, isolation clears all flags when it takes the cluster off the list (the flags must be in sync with the list status, so cluster users don't need to touch si->lock for checking its list status). So the cluster has to be relocated to the right list according to its usage after allocation or freeing. Relocation is optional, if the cluster flags indicate it's already on the right list, it will skip touching the list or si->lock. This is done with `relocate_cluster` after allocation or with `[partial_]free_cluster` after freeing. This handled usage of all kinds of clusters in a clean way. Scanning and allocation by iterating the cluster list is handled by "isolate - - relocate". Scanning and allocation of per-CPU clusters will only involve " - relocate", as it knows which cluster to lock and use. Freeing will only involve "relocate". Each CPU will keep using its per-CPU cluster until the 512 entries are all consumed. Freeing also has to free 512 entries to trigger cluster movement in the best case, so si->lock is rarely touched. Testing with building the Linux kernel with defconfig showed huge improvement: tiem make -j96 / 768M memcg, 4K pages, 10G ZRAM, on Intel 8255C: Before: Sys time: 73578.30, Real time: 864.05 After: (-50.7% sys time, -44.8% real time) Sys time: 36227.49, Real time: 476.66 time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, on Intel 8255C: (avg of 4 test run) Before: Sys time: 74044.85, Real time: 846.51 hugepages-64kB/stats/swpout: 1735216 hugepages-64kB/stats/swpout_fallback: 430333 After: (-40.4% sys time, -37.1% real time) Sys time: 44160.56, Real time: 532.07 hugepages-64kB/stats/swpout: 1786288 hugepages-64kB/stats/swpout_fallback: 243384 time make -j32 / 512M memcg, 4K pages, 5G ZRAM, on AMD 7K62: Before: Sys time: 8098.21, Real time: 401.3 After: (-22.6% sys time, -12.8% real time ) Sys time: 6265.02, Real time: 349.83 The allocation success rate also slightly improved as we sanitized the usage of clusters with new defined helpers, previously dropping si->lock or ci->lock during scan will cause cluster order shuffle. Link: https://lkml.kernel.org/r/20250113175732.48099-10-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0e59cb158b15..5fe650beb77d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -290,6 +290,7 @@ enum swap_cluster_flags { * throughput. 
*/ struct percpu_cluster { + local_lock_t lock; /* Protect the percpu_cluster above */ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; @@ -312,7 +313,7 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; + atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ -- cgit v1.2.3 From e3ae2dec849ba8bc5649c2d0507e02bd4379da71 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:29 +0800 Subject: mm, swap: simplify percpu cluster updating Instead of using a returning argument, we can simply store the next cluster offset to the fixed percpu location, which reduce the stack usage and simplify the function: Object size: ./scripts/bloat-o-meter mm/swapfile.o mm/swapfile.o.new add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-271 (-271) Function old new delta get_swap_pages 2847 2733 -114 alloc_swap_scan_cluster 894 737 -157 Total: Before=30833, After=30562, chg -0.88% Stack usage: Before: swapfile.c:1190:5:get_swap_pages 240 static After: swapfile.c:1185:5:get_swap_pages 216 static Link: https://lkml.kernel.org/r/20250113175732.48099-11-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5fe650beb77d..75b2b0166cb1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -274,9 +274,9 @@ enum swap_cluster_flags { * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the * cluster to which it belongs being marked free. Therefore 0 is safe to use as - * a sentinel to indicate next is not valid in percpu_cluster. + * a sentinel to indicate an entry is not valid. */ -#define SWAP_NEXT_INVALID 0 +#define SWAP_ENTRY_INVALID 0 #ifdef CONFIG_THP_SWAP #define SWAP_NR_ORDERS (PMD_ORDER + 1) -- cgit v1.2.3 From bae8a4ef3efb56bb7e83bafd3c0856845aeaf605 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:31 +0800 Subject: mm, swap: use a global swap cluster for non-rotation devices Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the goal of the SWAP allocator is to avoid contention for clusters. It uses a per-CPU cluster design, and each CPU will use a different cluster as much as possible. However, HDDs are very sensitive to fragmentation, contention is trivial in comparison. Therefore, we use one global cluster instead. This ensures that each order will be written to the same cluster as much as possible, which helps make the I/O more continuous. This ensures that the performance of the cluster allocator is as good as that of the old allocator. 
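To show where the global cluster sits relative to the per-CPU ones, a rough userspace model of the device-type decision follows (names are invented and a pthread mutex stands in for the kernel spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_ORDERS	10
#define NR_CPUS		4

struct cluster {
	unsigned int next[NR_ORDERS];	/* likely next allocation offset per order */
};

struct swap_dev {
	bool rotational;
	struct cluster percpu[NR_CPUS];	/* SSD/ZRAM: one cluster per CPU */
	struct cluster global;		/* HDD: a single shared cluster */
	pthread_mutex_t global_lock;	/* serializes users of the global cluster */
};

static struct cluster *pick_cluster(struct swap_dev *si, int cpu)
{
	if (!si->rotational)
		return &si->percpu[cpu];	/* contention-free fast path */

	pthread_mutex_lock(&si->global_lock);	/* HDD: keep writes sequential */
	return &si->global;			/* caller unlocks when done */
}

int main(void)
{
	struct swap_dev hdd = { .rotational = true,
				.global_lock = PTHREAD_MUTEX_INITIALIZER };
	struct cluster *c = pick_cluster(&hdd, 0);

	printf("using the %s cluster\n", c == &hdd.global ? "global" : "per-cpu");
	pthread_mutex_unlock(&hdd.global_lock);
	return 0;
}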
Tests after this commit compared to those before this series: Tested using 'make -j32' with tinyconfig, a 1G memcg limit, and HDD swap: make -j32 with tinyconfig, using 1G memcg limit and HDD swap: Before this series: 114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k 2901232inputs+0outputs (238877major+4227640minor)pagefaults After this commit: 113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k 2548728inputs+0outputs (235471major+4238110minor)pagefaults [ryncsn@gmail.com: check kmalloc() return in setup_clusters] Link: https://lkml.kernel.org/r/CAMgjq7Au+o04ckHyT=iU-wVx9az=t0B-ZiC5E0bDqNrAtNOP-g@mail.gmail.com Link: https://lkml.kernel.org/r/20250113175732.48099-13-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 75b2b0166cb1..a5f475335aea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -317,6 +317,8 @@ struct swap_info_struct { unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ + struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ + spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ -- cgit v1.2.3 From 4f79384a25d57a59e142009e52f40ae1f25102fe Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:32 +0800 Subject: mm, swap_slots: remove slot cache for freeing path The slot cache for freeing path is mostly for reducing the overhead of si->lock. As we have basically eliminated the si->lock usage for freeing path, it can be removed. This helps simplify the code, and avoids swap entries from being hold in cache upon freeing. The delayed freeing of entries have been causing trouble for further optimizations for zswap [1] and in theory will also cause more fragmentation, and extra overhead. Test with build linux kernel showed both performance and fragmentation is better without the cache: tiem make -j96 / 768M memcg, 4K pages, 10G ZRAM, avg of 4 test run:: Before: Sys time: 36047.78, Real time: 472.43 After: (-7.6% sys time, -7.3% real time) Sys time: 33314.76, Real time: 437.67 time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, avg of 4 test run: Before: Sys time: 46859.04, Real time: 562.63 hugepages-64kB/stats/swpout: 1783392 hugepages-64kB/stats/swpout_fallback: 240875 After: (-23.3% sys time, -21.3% real time) Sys time: 35958.87, Real time: 442.69 hugepages-64kB/stats/swpout: 1866267 hugepages-64kB/stats/swpout_fallback: 158330 Sequential SWAP should be also slightly faster, tests didn't show a measurable difference though, at least no regression: Swapin 4G zero page on ZRAM (time in us): Before (avg. 1923756) 1912391 1927023 1927957 1916527 1918263 1914284 1934753 1940813 1921791 After (avg. 
1922290): 1919101 1925743 1916810 1917007 1923930 1935152 1917403 1923549 1921913 Link: https://lore.kernel.org/all/CAMgjq7ACohT_uerSz8E_994ZZCv709Zor+43hdmesW_59W1BWw@mail.gmail.com/[1] Link: https://lkml.kernel.org/r/20250113175732.48099-14-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap_slots.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h index 15adfb8c813a..840aec3523b2 100644 --- a/include/linux/swap_slots.h +++ b/include/linux/swap_slots.h @@ -16,15 +16,12 @@ struct swap_slots_cache { swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; /* protects slots_ret, n_ret */ - swp_entry_t *slots_ret; int n_ret; }; void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled; -- cgit v1.2.3 From f8d4a6cabb74f82c37ccb7c5e9dc3fdad50393d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 2 Jan 2025 12:10:52 +0000 Subject: mm: make mmap_region() internal Now that we have removed the one user of mmap_region() outside of mm, make it internal and add it to vma.c so it can be userland tested. This ensures that all external memory mappings are performed using the appropriate interfaces and allows us to modify memory mapping logic as we see fit. Additionally expand test stubs to allow for the mmap_region() code to compile and be userland testable. Link: https://lkml.kernel.org/r/de5a3c574d35c26237edf20a1d8652d7305709c9.1735819274.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3550cbeed488..8483e09aeb2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3363,9 +3363,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, -- cgit v1.2.3 From c6f239796b55dbc4225a6fca9f96232092b9df83 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 2 Jan 2025 15:25:28 +0800 Subject: mm/memblock: add memblock_alloc_or_panic interface Before SLUB initialization, various subsystems used memblock_alloc to allocate memory. In most cases, when memory allocation fails, an immediate panic is required. To simplify this behavior and reduce repetitive checks, introduce `memblock_alloc_or_panic`. This function ensures that memory allocation failures result in a panic automatically, improving code readability and consistency across subsystems that require this behavior. 
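The same pattern, shown as a self-contained userspace analogue rather than the memblock helper itself (the names below are invented; the real interface is the one added in the memblock.h hunk further down):

#include <stdio.h>
#include <stdlib.h>

/* Out-of-line slow path: allocate or abort, reporting the caller. */
static void *__alloc_or_panic(size_t size, const char *func)
{
	void *p = calloc(1, size);

	if (!p) {
		fprintf(stderr, "%s: failed to allocate %zu bytes\n", func, size);
		abort();
	}
	return p;
}

/* The macro folds in __func__, just like memblock_alloc_or_panic(). */
#define alloc_or_panic(size)	__alloc_or_panic(size, __func__)

int main(void)
{
	char *buf = alloc_or_panic(64);	/* no "if (!buf) panic(...)" at the call site */

	buf[0] = 0;
	free(buf);
	return 0;
}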
[guoweikang.kernel@gmail.com: arch/s390: save_area_alloc default failure behavior changed to panic] Link: https://lkml.kernel.org/r/20250109033136.2845676-1-guoweikang.kernel@gmail.com Link: https://lore.kernel.org/lkml/Z2fknmnNtiZbCc7x@kernel.org/ Link: https://lkml.kernel.org/r/20250102072528.650926-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Acked-by: Geert Uytterhoeven [m68k] Reviewed-by: Alexander Gordeev [s390] Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- include/linux/memblock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d48b56c1e558..e79eb6ac516f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -421,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func); + +#define memblock_alloc_or_panic(size, align) \ + __memblock_alloc_or_panic(size, align, __func__) + static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { -- cgit v1.2.3 From 798c0330c2ca078cc3e155e567c77c4d61345a38 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:34 -0700 Subject: mm/mglru: rework aging feedback The aging feedback is based on both the number of generations and the distribution of folios in each generation. The number of generations is currently the distance between max_seq and anon min_seq. This is because anon min_seq is not allowed to move past file min_seq. The rationale for that is that file is always evictable whereas anon is not. However, for use cases where anon is a lot cheaper than file: 1. Anon in the second oldest generation can be a better choice than file in the oldest generation. 2. A large amount of file in the oldest generation can skew the distribution, making should_run_aging() return false negative. Allow anon and file min_seq to move independently, and use solely the number of generations as the feedback for aging. Specifically, when both anon and file are evictable, anon min_seq can now be greater than file min_seq, and therefore the number of generations becomes the distance between max_seq and min(min_seq[0],min_seq[1]). And should_run_aging() returns true if and only if the number of generations is less than MAX_NR_GENS. As the first step to the final optimization, this change by itself should not have userspace-visiable effects beyond performance. The next twos patch will take advantage of this change; the last patch in this series will better distribute folios across MAX_NR_GENS. [yuzhao@google.com: restore behaviour for systems with swappiness == 200] Link: https://lkml.kernel.org/r/Z4S3-aJy5dj9tBTk@google.com Link: https://lkml.kernel.org/r/20241231043538.4075764-4-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: David Stevens Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..8245ecb0400b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -421,12 +421,11 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. 
The oldest generation numbers are - * stored in min_seq[] separately for anon and file types as clean file pages - * can be evicted regardless of swap constraints. - * - * Normally anon and file min_seq are in sync. But if swapping is constrained, - * e.g., out of swap space, file min_seq is allowed to advance and leave anon - * min_seq behind. + * stored in min_seq[] separately for anon and file types so that they can be + * incremented independently. Ideally min_seq[] are kept in sync when both anon + * and file types are evictable. However, to adapt to situations like extreme + * swappiness, they are allowed to be out of sync by at most + * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. @@ -446,8 +445,8 @@ struct lru_gen_folio { unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; - /* the first tier doesn't need protection, hence the minus one */ - unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can only be modified under the LRU lock */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; @@ -498,7 +497,7 @@ struct lru_gen_mm_walk { int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; - bool can_swap; + int swappiness; bool force_scan; }; -- cgit v1.2.3 From 4d5d14a01e2c9091b128fb46e1d07475e9a7bb72 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:37 -0700 Subject: mm/mglru: rework workingset protection With the aging feedback no longer considering the distribution of folios in each generation, rework workingset protection to better distribute folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and PG_referenced/LRU_REFS_FLAGS in a slightly different way. For folios accessed multiple times through file descriptors, make lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily promoted into the second oldest generation in the eviction path. And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only valid when PG_referenced is set. For folios accessed multiple times through page tables, folio_update_gen() from a page table walk or lru_gen_set_refs() from a rmap walk sets PG_referenced after the accessed bit is cleared for the first time. Thereafter, those two paths set PG_workingset and promote folios to the youngest generation. Like folio_inc_gen(), when folio_update_gen() does that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not used. For both of the cases, after PG_workingset is set on a folio, it remains until this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It can be set again if lru_gen_test_recent() returns true upon a refault. 
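A toy userspace model of that counting scheme is below; the bit layout and widths are invented for illustration and are intentionally much simpler than the real folio->flags encoding:

#include <stdio.h>

#define PG_REFERENCED	(1u << 0)
#define PG_WORKINGSET	(1u << 1)
#define REFS_SHIFT	2
#define REFS_WIDTH	2
#define REFS_MASK	(((1u << REFS_WIDTH) - 1) << REFS_SHIFT)

/* Model of the file-descriptor access path of lru_gen_inc_refs(). */
static void inc_refs(unsigned int *flags)
{
	if (!(*flags & PG_REFERENCED)) {		/* first access */
		*flags |= PG_REFERENCED;
		return;
	}
	if ((*flags & REFS_MASK) != REFS_MASK) {	/* count further accesses */
		*flags += 1u << REFS_SHIFT;
		return;
	}
	*flags |= PG_WORKINGSET;			/* all ref bits set: treat as workingset */
}

int main(void)
{
	unsigned int flags = 0;

	for (int i = 0; i < 6; i++)
		inc_refs(&flags);
	printf("workingset=%d\n", !!(flags & PG_WORKINGSET));
	return 0;
}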
When adding folios to the LRU lists, lru_gen_folio_seq() distributes them as follows: +---------------------------------+---------------------------------+ | Accessed thru page tables | Accessed thru file descriptors | +---------------------------------+---------------------------------+ | PG_active (set while isolated) | | +----------------+----------------+----------------+----------------+ | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | +---------------------------------+---------------------------------+ |<--------- MIN_NR_GENS --------->| | |<-------------------------- MAX_NR_GENS -------------------------->| After this patch, some typical client and server workloads showed improvements under heavy memory pressure. For example, Python TPC-C, which was used to benchmark a different approach [1] to better detect refault distances, showed a significant decrease in total refaults: Before After Change Time (seconds) 10801 10801 0% Executed (transactions) 41472 43663 +5% workingset_nodes 109070 120244 +10% workingset_refault_anon 5019627 7281831 +45% workingset_refault_file 1294678786 554855564 -57% workingset_refault_total 1299698413 562137395 -57% [1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/ Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Kairui Song Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: David Stevens Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 88 +++++++++++++++++++++++------------------------ include/linux/mmzone.h | 82 ++++++++++++++++++++++++++----------------- 2 files changed, 94 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 34e5097182a0..f9157a0c42a5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq) return seq % NR_HIST_GENS; } -static inline int lru_tier_from_refs(int refs) +static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); - /* see the comment in folio_lru_refs() */ - return order_base_2(refs + 1); + /* see the comment on MAX_NR_TIERS */ + return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); - bool workingset = flags & BIT(PG_workingset); + if (!(flags & BIT(PG_referenced))) + return 0; /* - * Return the number of accesses beyond PG_referenced, i.e., N-1 if the - * total number of accesses is N>1, since N=0,1 both map to the first - * tier. lru_tier_from_refs() will account for this off-by-one. Also see - * the comment on MAX_NR_TIERS. + * Return the total number of accesses including PG_referenced. Also see + * the comment on LRU_REFS_FLAGS. 
*/ - return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -} - -static inline void folio_clear_lru_refs(struct folio *folio) -{ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) @@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, + bool reclaiming) +{ + int gen; + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + /* + * +-----------------------------------+-----------------------------------+ + * | Accessed through page tables and | Accessed through file descriptors | + * | promoted by folio_update_gen() | and protected by folio_inc_gen() | + * +-----------------------------------+-----------------------------------+ + * | PG_active (set while isolated) | | + * +-----------------+-----------------+-----------------+-----------------+ + * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | + * +-----------------------------------+-----------------------------------+ + * |<---------- MIN_NR_GENS ---------->| | + * |<---------------------------- MAX_NR_GENS ---------------------------->| + */ + if (folio_test_active(folio)) + gen = MIN_NR_GENS - folio_test_workingset(folio); + else if (reclaiming) + gen = MAX_NR_GENS; + else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = MIN_NR_GENS; + else + gen = MAX_NR_GENS - folio_test_workingset(folio); + + return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; - unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; - /* - * There are four common cases for this page: - * 1. If it's hot, i.e., freshly faulted in, add it to the youngest - * generation, and it's protected over the rest below. - * 2. If it can't be evicted immediately, i.e., a dirty page pending - * writeback, add it to the second youngest generation. - * 3. If it should be evicted first, e.g., cold and clean from - * folio_rotate_reclaimable(), add it to the oldest generation. - * 4. Everything else falls between 2 & 3 above and is added to the - * second oldest generation if it's considered inactive, or the - * oldest generation otherwise. See lru_gen_is_active(). 
- */ - if (folio_test_active(folio)) - seq = lrugen->max_seq; - else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || - (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->max_seq - 1; - else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) - seq = lrugen->min_seq[type]; - else - seq = lrugen->min_seq[type] + 1; + seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - mask = LRU_GEN_MASK; - /* - * Don't clear PG_workingset here because it can affect PSI accounting - * if the activation is due to workingset refault. - */ - if (folio_test_active(folio)) - mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); - set_mask_bits(&folio->flags, mask, flags); + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8245ecb0400b..9540b41894da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -332,66 +332,88 @@ enum lruvec_flags { #endif /* !__GENERATING_BOUNDS_H */ /* - * Evictable pages are divided into multiple generations. The youngest and the + * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * - * A page is added to the youngest generation on faulting. The aging needs to - * check the accessed bit at least twice before handing this page over to the - * eviction. The first check takes care of the accessed bit set on the initial - * fault; the second check makes sure this page hasn't been used since then. - * This process, AKA second chance, requires a minimum of two generations, - * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive - * LRU, e.g., /proc/vmstat, these two generations are considered active; the - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). + * After a folio is faulted in, the aging needs to check the accessed bit at + * least twice before handing this folio over to the eviction. The first check + * clears the accessed bit from the initial fault; the second check makes sure + * this folio hasn't been used since then. This process, AKA second chance, + * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI + * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two + * generations are considered active; the rest of generations, if they exist, + * are considered inactive. See lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->folios[] so - * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). + * PG_active is always cleared while a folio is on one of lrugen->folios[] so + * that the sliding window needs not to worry about it. 
And it's set again when + * a folio considered active is isolated for non-reclaiming purposes, e.g., + * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits - * in folio->flags. + * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* - * Each generation is divided into multiple tiers. A page accessed N times - * through file descriptors is in tier order_base_2(N). A page in the first tier - * (N=0,1) is marked by PG_referenced unless it was faulted in through page - * tables or read ahead. A page in any other tier (N>1) is marked by - * PG_referenced and PG_workingset. This implies a minimum of two tiers is - * supported without using additional bits in folio->flags. + * Each generation is divided into multiple tiers. A folio accessed N times + * through file descriptors is in tier order_base_2(N). A folio in the first + * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by + * PG_workingset. A folio in any other tier (1flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, - * comparisons of refaulted/(evicted+protected) from the first tier and the - * rest infer whether pages accessed multiple times through file descriptors - * are statistically hot and thus worth protecting. + * comparisons of refaulted/(evicted+protected) from the first tier and the rest + * infer whether folios accessed multiple times through file descriptors are + * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in - * folio->flags. + * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H -struct lruvec; -struct page_vma_mapped_walk; - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* + * For folios accessed multiple times through file descriptors, + * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags + * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its + * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily + * promoted into the second oldest generation in the eviction path. And when + * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that + * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is + * only valid when PG_referenced is set. + * + * For folios accessed multiple times through page tables, folio_update_gen() + * from a page table walk or lru_gen_set_refs() from a rmap walk sets + * PG_referenced after the accessed bit is cleared for the first time. + * Thereafter, those two paths set PG_workingset and promote folios to the + * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears + * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. 
+ * + * For both cases above, after PG_workingset is set on a folio, it remains until + * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It + * can be set again if lru_gen_test_recent() returns true upon a refault. + */ +#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) + +struct lruvec; +struct page_vma_mapped_walk; + #ifdef CONFIG_LRU_GEN enum { @@ -406,8 +428,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) -- cgit v1.2.3 From d670c8e5302af8ccdf5f12242e58816420738bb5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Jan 2025 15:22:21 +0000 Subject: mm: remove PageTransTail() The last caller was removed in October. Also remove the FALSE definition of PageTransCompoundMap(); the normal definition was removed a few years ago. Link: https://lkml.kernel.org/r/20250109152245.1591914-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Acked-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 691506bdf2c5..330929b6e062 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -894,21 +894,9 @@ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } - -/* - * PageTransTail returns true for both transparent huge pages - * and hugetlbfs pages, so it should only be called when it's known - * that hugetlbfs pages aren't involved. - */ -static inline int PageTransTail(const struct page *page) -{ - return PageTail(page); -} #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) -TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) -TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -- cgit v1.2.3 From d783cc5913f17b2b5d9c51cb0904860ec97ed44d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Jan 2025 10:52:32 -0800 Subject: mm/damon: explain "effective quota" on kernel-doc comment The kernel-doc comment for 'struct damos_quota' describes how "effective quota" is calculated, but does not explain what it is. Actually there was an input[1] about it. Add the explanation on the comment. Also, fix a trivial typo on the comment block: s/empt/empty/ [1] https://github.com/damonitor/damo/issues/17#issuecomment-2497525043 Link: https://lkml.kernel.org/r/20250110185232.54907-6-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Honggyu Kim Cc: Yunjeong Mun Cc: Honggyu Kim Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0834d7ffcb84..af525252b853 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -193,11 +193,16 @@ struct damos_quota_goal { * size quota is set, DAMON tries to apply the action only up to &sz bytes * within &reset_interval. * - * Internally, the time quota is transformed to a size quota using estimated - * throughput of the scheme's action. DAMON then compares it against &sz and - * uses smaller one as the effective quota. 
+ * To convince the different types of quotas and goals, DAMON internally + * converts those into one single size quota called "effective quota". DAMON + * internally uses it as the only one real quota. The conversion is made as + * follows. * - * If @goals is not empt, DAMON calculates yet another size quota based on the + * The time quota is transformed to a size quota using estimated throughput of + * the scheme's action. DAMON then compares it against &sz and uses smaller + * one as the effective quota. + * + * If @goals is not empty, DAMON calculates yet another size quota based on the * goals using its internal feedback loop algorithm, for every @reset_interval. * Then, if the new size quota is smaller than the effective quota, it uses the * new size quota as the effective quota. -- cgit v1.2.3 From 3ab76c767bc783c122a8dfe105fbc10a0b029b42 Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 10 Jan 2025 17:40:34 +0800 Subject: ksm: add ksm involvement information for each process In /proc//ksm_stat, add two extra ksm involvement items including KSM_mergeable and KSM_merge_any. It helps administrators to better know the system's KSM behavior at process level. ksm_merge_any: yes/no whether the process'mm is added by prctl() into the candidate list of KSM or not, and fully enabled at process level. ksm_mergeable: yes/no whether any VMAs of the process'mm are currently applicable to KSM. Purpose ======= These two items are just to improve the observability of KSM at process level, so that users can know if a certain process has enabled KSM. For example, if without these two items, when we look at /proc//ksm_stat and there's no merging pages found, We are not sure whether it is because KSM was not enabled or because KSM did not successfully merge any pages. Although "mg" in /proc//smaps indicate VM_MERGEABLE, it's opaque and not very obvious for non professionals. [akpm@linux-foundation.org: wording tweaks, per David and akpm] Link: https://lkml.kernel.org/r/20250110174034304QOb8eDoqtFkp3_t8mqnqc@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Tested-by: Mario Casquero Cc: Wang Yaxin Cc: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 6a53ac4885bb..d73095b5cd96 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -93,6 +93,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); +bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -- cgit v1.2.3 From 8d91fed83cc12306cbb63efa6c473ffee117977a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:06 +0100 Subject: mm/huge_memory: convert has_hwpoisoned into a pure folio flag Patch series "mm: hugetlb+THP folio and migration cleanups", v2. Some cleanups around more folio conversion and migration handling that I collected working on random stuff. This patch (of 6): Let's stop setting it on pages, there is no need to anymore. 
Link: https://lkml.kernel.org/r/20250113131611.2554758-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Baolin Wang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 330929b6e062..616b57ddc3fe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -906,11 +906,9 @@ TESTPAGEFLAG_FALSE(TransCompound, transcompound) * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ -PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) - TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else -PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) +FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* -- cgit v1.2.3 From 4c640f128074e0d4459ecf072595a44df5c2ae18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:07 +0100 Subject: mm/hugetlb: rename isolate_hugetlb() to folio_isolate_hugetlb() Let's make the function name match "folio_isolate_lru()", and add some kernel doc. Link: https://lkml.kernel.org/r/20250113131611.2554758-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Baolin Wang Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 49ec2362ce92..c95ad5cd7894 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,7 +153,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_hugetlb(struct folio *folio, struct list_head *list); +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } -- cgit v1.2.3 From b235448e8cab7eea17d164efc7bf55505985ba65 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:09 +0100 Subject: mm/hugetlb: rename folio_putback_active_hugetlb() to folio_putback_hugetlb() Now that folio_putback_hugetlb() is only called on folios that were previously isolated through folio_isolate_hugetlb(), let's rename it to match folio_putback_lru(). Add some kernel doc to clarify how this function is supposed to be used. 
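The intended pairing of the two renamed helpers, sketched below; this is illustrative only, do_work_on() is a made-up placeholder and the snippet is not taken from an in-tree caller:

LIST_HEAD(isolated);

if (folio_isolate_hugetlb(folio, &isolated)) {
	/* the folio now sits on our private list, safe from concurrent freeing */
	if (do_work_on(&isolated))		/* hypothetical consumer, nonzero on failure */
		folio_putback_hugetlb(folio);	/* undo the isolation on failure */
}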
Link: https://lkml.kernel.org/r/20250113131611.2554758-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c95ad5cd7894..ec8c0ccc8f95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -157,7 +157,7 @@ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void folio_putback_active_hugetlb(struct folio *folio); +void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void folio_putback_active_hugetlb(struct folio *folio) +static inline void folio_putback_hugetlb(struct folio *folio) { } -- cgit v1.2.3 From cceba6f7e46c48deca433030d80fc34599fb9fd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:42 -0700 Subject: mm: add PG_dropbehind folio flag Add a folio flag that file IO can use to indicate that the cached IO being done should be dropped from the page cache upon completion. Link: https://lkml.kernel.org/r/20241220154831.1086649-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 616b57ddc3fe..36d283552f80 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -110,6 +110,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -562,6 +563,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) +FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. 
page_zone() is not diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index bb8a59c6caa2..3bc8656c8359 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -116,7 +116,8 @@ DEF_PAGEFLAG_NAME(head), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ - DEF_PAGEFLAG_NAME(unevictable) \ + DEF_PAGEFLAG_NAME(unevictable), \ + DEF_PAGEFLAG_NAME(dropbehind) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ -- cgit v1.2.3 From 77d075221ae777296e2b18a0a4f5fea6f75daf2c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:43 -0700 Subject: mm/readahead: add readahead_control->dropbehind member If ractl->dropbehind is set to true, then folios created are marked as dropbehind as well. Link: https://lkml.kernel.org/r/20241220154831.1086649-6-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fc2e1319c7bb..d53c49abead6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1358,6 +1358,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; bool _workingset; unsigned long _pflags; }; -- cgit v1.2.3 From b9f958d4f146bd11be33a5f2bc3ced50f86d6b23 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:45 -0700 Subject: fs: add RWF_DONTCACHE iocb and FOP_DONTCACHE file_operations flag If a file system supports uncached buffered IO, it may set FOP_DONTCACHE and enable support for RWF_DONTCACHE. If RWF_DONTCACHE is attempted without the file system supporting it, it'll get errored with -EOPNOTSUPP. Link: https://lkml.kernel.org/r/20241220154831.1086649-8-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 14 +++++++++++++- include/uapi/linux/fs.h | 6 +++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..6a838b5479a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -322,6 +322,7 @@ struct readahead_control; #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC +#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -356,7 +357,8 @@ struct readahead_control; { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ - { IOCB_ATOMIC, "ATOMIC"}, \ + { IOCB_ATOMIC, "ATOMIC" }, \ + { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -2127,6 +2129,8 @@ struct file_operations { #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) +/* File system supports uncached read/write buffered IO */ +#define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, @@ -3614,6 +3618,14 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } + if (flags & RWF_DONTCACHE) { + /* file system must support it */ + if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) + return -EOPNOTSUPP; + /* DAX mappings not supported */ + if (IS_DAX(ki->ki_filp->f_mapping->host)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..56a4f93a08f4 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -332,9 +332,13 @@ typedef int __bitwise __kernel_rwf_t; /* Atomic Write */ #define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) +/* buffered IO that drops the cache after reading or writing data */ +#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC) + RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ + RWF_DONTCACHE) #define PROCFS_IOCTL_MAGIC 'f' -- cgit v1.2.3 From dddc559f2e7cff9c6525150cd29ef3a4f6692b26 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:48 -0700 Subject: mm/filemap: add filemap_fdatawrite_range_kick() helper Works like filemap_fdatawrite_range(), except it's a non-integrity data writeback and hence only starts writeback on the specified range. Will help facilitate generically starting uncached writeback from generic_write_sync(), as header dependencies preclude doing this inline from fs.h. Link: https://lkml.kernel.org/r/20241220154831.1086649-11-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6a838b5479a6..653b5efa3d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,6 +2878,8 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); +int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, + loff_t end); static inline int file_write_and_wait(struct file *file) { -- cgit v1.2.3 From 1d4457576570627e1702614bc060b55d95b85e39 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:49 -0700 Subject: mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue When a buffered write submitted with IOCB_DONTCACHE has been successfully submitted, call filemap_fdatawrite_range_kick() to kick off the IO. File systems call generic_write_sync() for any successful buffered write submission, hence add the logic here rather than needing to modify the file system. Link: https://lkml.kernel.org/r/20241220154831.1086649-12-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 653b5efa3d3f..58a618853574 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2912,6 +2912,11 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; + } else if (iocb->ki_flags & IOCB_DONTCACHE) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, + iocb->ki_pos + count); } return count; -- cgit v1.2.3 From d94d23fdd7529f1f3218235d1e0a69e9856907b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:50 -0700 Subject: mm: add FGP_DONTCACHE folio creation flag Callers can pass this in for uncached folio creation, in which case if a folio is newly created it gets marked as uncached. If a folio exists for this index and lookup succeeds, then it will not get marked as uncached. If an !uncached lookup finds a cached folio, clear the flag. For that case, there are competeting uncached and cached users of the folio, and it should not get pruned. Link: https://lkml.kernel.org/r/20241220154831.1086649-13-axboe@kernel.dk Signed-off-by: Jens Axboe Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d53c49abead6..47bfc6b1b632 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -710,6 +710,7 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, * * %FGP_NOFS - __GFP_FS will get cleared in gfp. * * %FGP_NOWAIT - Don't block on the folio lock. * * %FGP_STABLE - Wait for the folio to be stable (finished writeback) + * * %FGP_DONTCACHE - Uncached buffered IO * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin() * implementation. 
*/ @@ -723,6 +724,7 @@ typedef unsigned int __bitwise fgf_t; #define FGP_NOWAIT ((__force fgf_t)0x00000020) #define FGP_FOR_MMAP ((__force fgf_t)0x00000040) #define FGP_STABLE ((__force fgf_t)0x00000080) +#define FGP_DONTCACHE ((__force fgf_t)0x00000100) #define FGF_GET_ORDER(fgf) (((__force unsigned)fgf) >> 26) /* top 6 bits */ #define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) -- cgit v1.2.3 From 3c7fd94205f86ad89f1d1d01dbfbc4b139860d8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 16 Jan 2025 10:27:30 -0800 Subject: seqlock: add missing parameter documentation for raw_seqcount_try_begin() Add missing documentation for raw_seqcount_try_begin() start parameter. Link: https://lkml.kernel.org/r/20250116182730.801497-1-surenb@google.com Fixes: dba4761a3e40 ("seqlock: add raw_seqcount_try_begin") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250116170522.23e884d5@canb.auug.org.au/ Signed-off-by: Suren Baghdasaryan Acked-by: Waiman Long Cc: Boqun Feng Cc: David Hildenbrand Cc: Ingo Molnar Cc: Liam Howlett Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 22c2c48b4265..b783a3a7ed62 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -322,6 +322,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) * raw_seqcount_try_begin() - begin a seqcount_t read critical section * w/o lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * @start: count to be passed to read_seqcount_retry() * * Similar to raw_seqcount_begin(), except it enables eliding the critical * section entirely if odd, instead of doing the speculation knowing it will -- cgit v1.2.3 From a145c848d69f9c6f32008d8319edaa133360dd74 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 8 Jan 2025 10:04:30 +0100 Subject: module: Extend the preempt disabled section in dereference_symbol_descriptor(). dereference_symbol_descriptor() needs to obtain the module pointer belonging to pointer in order to resolve that pointer. The returned mod pointer is obtained under RCU-sched/ preempt_disable() guarantees and needs to be used within this section to ensure that the module is not removed in the meantime. Extend the preempt_disable() section to also cover dereference_module_function_descriptor(). Fixes: 04b8eb7a4ccd9 ("symbol lookup: introduce dereference_symbol_descriptor()") Cc: James E.J. 
Bottomley Cc: Christophe Leroy Cc: Helge Deller Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Sergey Senozhatsky Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Reviewed-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250108090457.512198-2-bigeasy@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/kallsyms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index c3f075e8f60c..1c6a6c1704d8 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -57,10 +57,10 @@ static inline void *dereference_symbol_descriptor(void *ptr) preempt_disable(); mod = __module_address((unsigned long)ptr); - preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); + preempt_enable(); #endif return ptr; } -- cgit v1.2.3 From 38e3fe6595e1fa806c0450b2db666bc46325025e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 18:25:09 +0100 Subject: module: Handle 'struct module_version_attribute' as const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The structure is always read-only due to its placement in the read-only section __modver. Reflect this at its usage sites. Also prepare for the const handling of 'struct module_attribute' itself. Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241216-sysfs-const-attr-module-v1-2-3790b53e0abf@weissschuh.net Signed-off-by: Petr Pavlu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index b3a643435357..5001c166c74f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -275,7 +275,7 @@ extern typeof(name) __mod_device_table__##type##__##name \ #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ - static struct module_version_attribute __modver_attr \ + static const struct module_version_attribute __modver_attr \ __used __section("__modver") \ __aligned(__alignof__(struct module_version_attribute)) \ = { \ -- cgit v1.2.3 From f3227ffda07470848abe3cfa2039b21816ce3090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Dec 2024 18:25:10 +0100 Subject: module: Constify 'struct module_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These structs are never modified, move them to read-only memory. This makes the API clearer and also prepares for the constification of 'struct attribute' itself. While at it, also constify 'modinfo_attrs_count'. 
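A minimal illustration of an attribute built against the constified prototypes in the hunk below; the attribute name, its show helper and the use of __ATTR_RO() are assumptions made for this sketch, not part of the patch:

/* Hypothetical read-only module attribute using the const prototypes. */
static ssize_t example_show(const struct module_attribute *mattr,
			    struct module_kobject *mk, char *buf)
{
	/* mk->mod is NULL for built-in code, so report that case explicitly. */
	return sysfs_emit(buf, "%s\n", mk->mod ? mk->mod->name : "builtin");
}

static const struct module_attribute example_attr = __ATTR_RO(example);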
Signed-off-by: Thomas Weißschuh Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241216-sysfs-const-attr-module-v1-3-3790b53e0abf@weissschuh.net Signed-off-by: Petr Pavlu --- include/linux/module.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 5001c166c74f..37eb5d88f6eb 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -52,9 +52,9 @@ struct module_kobject { struct module_attribute { struct attribute attr; - ssize_t (*show)(struct module_attribute *, struct module_kobject *, + ssize_t (*show)(const struct module_attribute *, struct module_kobject *, char *); - ssize_t (*store)(struct module_attribute *, struct module_kobject *, + ssize_t (*store)(const struct module_attribute *, struct module_kobject *, const char *, size_t count); void (*setup)(struct module *, const char *); int (*test)(struct module *); @@ -67,10 +67,10 @@ struct module_version_attribute { const char *version; }; -extern ssize_t __modver_version_show(struct module_attribute *, +extern ssize_t __modver_version_show(const struct module_attribute *, struct module_kobject *, char *); -extern struct module_attribute module_uevent; +extern const struct module_attribute module_uevent; /* These are either module local, or the kernel's dummy ones. */ extern int init_module(void); -- cgit v1.2.3 From 26c0a2d93af55d30a46d5f45d3e9c42cde730168 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 26 Jan 2025 21:49:59 +0800 Subject: LoongArch: Fix warnings during S3 suspend The enable_gpe_wakeup() function calls acpi_enable_all_wakeup_gpes(), and the later one may call the preempt_schedule_common() function, resulting in a thread switch and causing the CPU to be in an interrupt enabled state after the enable_gpe_wakeup() function returns, leading to the warnings as follow. [ C0] WARNING: ... at kernel/time/timekeeping.c:845 ktime_get+0xbc/0xc8 [ C0] ... [ C0] Call Trace: [ C0] [<90000000002243b4>] show_stack+0x64/0x188 [ C0] [<900000000164673c>] dump_stack_lvl+0x60/0x88 [ C0] [<90000000002687e4>] __warn+0x8c/0x148 [ C0] [<90000000015e9978>] report_bug+0x1c0/0x2b0 [ C0] [<90000000016478e4>] do_bp+0x204/0x3b8 [ C0] [<90000000025b1924>] exception_handlers+0x1924/0x10000 [ C0] [<9000000000343bbc>] ktime_get+0xbc/0xc8 [ C0] [<9000000000354c08>] tick_sched_timer+0x30/0xb0 [ C0] [<90000000003408e0>] __hrtimer_run_queues+0x160/0x378 [ C0] [<9000000000341f14>] hrtimer_interrupt+0x144/0x388 [ C0] [<9000000000228348>] constant_timer_interrupt+0x38/0x48 [ C0] [<90000000002feba4>] __handle_irq_event_percpu+0x64/0x1e8 [ C0] [<90000000002fed48>] handle_irq_event_percpu+0x20/0x80 [ C0] [<9000000000306b9c>] handle_percpu_irq+0x5c/0x98 [ C0] [<90000000002fd4a0>] generic_handle_domain_irq+0x30/0x48 [ C0] [<9000000000d0c7b0>] handle_cpu_irq+0x70/0xa8 [ C0] [<9000000001646b30>] handle_loongarch_irq+0x30/0x48 [ C0] [<9000000001646bc8>] do_vint+0x80/0xe0 [ C0] [<90000000002aea1c>] finish_task_switch.isra.0+0x8c/0x2a8 [ C0] [<900000000164e34c>] __schedule+0x314/0xa48 [ C0] [<900000000164ead8>] schedule+0x58/0xf0 [ C0] [<9000000000294a2c>] worker_thread+0x224/0x498 [ C0] [<900000000029d2f0>] kthread+0xf8/0x108 [ C0] [<9000000000221f28>] ret_from_kernel_thread+0xc/0xa4 [ C0] [ C0] ---[ end trace 0000000000000000 ]--- The root cause is acpi_enable_all_wakeup_gpes() uses a mutex to protect acpi_hw_enable_all_wakeup_gpes(), and acpi_ut_acquire_mutex() may cause a thread switch. 
Since there is no longer concurrent execution during loongarch_acpi_suspend(), we can call acpi_hw_enable_all_wakeup_gpes() directly in enable_gpe_wakeup(). The solution is similar to commit 22db06337f590d01 ("ACPI: sleep: Avoid breaking S3 wakeup due to might_sleep()"). Fixes: 366bb35a8e48 ("LoongArch: Add suspend (ACPI S3) support") Signed-off-by: Qunqin Zhao Signed-off-by: Huacai Chen --- include/acpi/acpixf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index d076ebd19a61..78b24b090488 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -763,6 +763,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status *event_status)) ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_disable_all_gpes(void)) +ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_enable_all_wakeup_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void)) ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void)) -- cgit v1.2.3 From 819403c893551c5e93bf9087d334e01bcab5c6b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:36 +0100 Subject: fs/proc/vmcore: move vmcore definitions out of kcore.h These vmcore defines are not related to /proc/kcore, move them out. We'll move "struct vmcoredd_node" to vmcore.c, because it is only used internally. While "struct vmcore" is only used internally for now, we're planning on using it from inline functions in crash_dump.h next, so move it to crash_dump.h. While at it, rename "struct vmcore" to "struct vmcore_range", which is a more suitable name and will make the usage of it outside of vmcore.c clearer. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-6-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. 
Tsirkin --- include/linux/crash_dump.h | 7 +++++++ include/linux/kcore.h | 13 ------------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index acc55626afdc..788a45061f35 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -114,6 +114,13 @@ struct vmcore_cb { extern void register_vmcore_cb(struct vmcore_cb *cb); extern void unregister_vmcore_cb(struct vmcore_cb *cb); +struct vmcore_range { + struct list_head list; + unsigned long long paddr; + unsigned long long size; + loff_t offset; +}; + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 86c0f1d18998..9a2fa013c91d 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -20,19 +20,6 @@ struct kcore_list { int type; }; -struct vmcore { - struct list_head list; - unsigned long long paddr; - unsigned long long size; - loff_t offset; -}; - -struct vmcoredd_node { - struct list_head list; /* List of dumps */ - void *buf; /* Buffer containing device's dump */ - unsigned int size; /* Size of the buffer */ -}; - #ifdef CONFIG_PROC_KCORE void __init kclist_add(struct kcore_list *, void *, size_t, int type); -- cgit v1.2.3 From e017b1f4aa4eb887ee85fe13862206c0d31344b4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:37 +0100 Subject: fs/proc/vmcore: factor out allocating a vmcore range and adding it to a list Let's factor it out into include/linux/crash_dump.h, from where we can use it also outside of vmcore.c later. Acked-by: Baoquan He Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-7-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- include/linux/crash_dump.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 788a45061f35..9717912ce4d1 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -121,6 +121,20 @@ struct vmcore_range { loff_t offset; }; +/* Allocate a vmcore range and add it to the list. */ +static inline int vmcore_alloc_add_range(struct list_head *list, + unsigned long long paddr, unsigned long long size) +{ + struct vmcore_range *m = kzalloc(sizeof(*m), GFP_KERNEL); + + if (!m) + return -ENOMEM; + m->paddr = paddr; + m->size = size; + list_add_tail(&m->list, list); + return 0; +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ -- cgit v1.2.3 From e29e9acae06dc28ca8dbf3db976e09787e610dc8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:38 +0100 Subject: fs/proc/vmcore: factor out freeing a list of vmcore ranges Let's factor it out into include/linux/crash_dump.h, from where we can use it also outside of vmcore.c later. Acked-by: Baoquan He Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-8-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. 
Tsirkin --- include/linux/crash_dump.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 9717912ce4d1..5d61c7454fd6 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -135,6 +135,17 @@ static inline int vmcore_alloc_add_range(struct list_head *list, return 0; } +/* Free a list of vmcore ranges. */ +static inline void vmcore_free_ranges(struct list_head *list) +{ + struct vmcore_range *m, *tmp; + + list_for_each_entry_safe(m, tmp, list, list) { + list_del(&m->list); + kfree(m); + } +} + #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return false; } #endif /* CONFIG_CRASH_DUMP */ -- cgit v1.2.3 From 7ad4d1f6e6ef967cd24c6275d8d4056045c019c1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 4 Dec 2024 13:54:39 +0100 Subject: fs/proc/vmcore: introduce PROC_VMCORE_DEVICE_RAM to detect device RAM ranges in 2nd kernel s390 allocates+prepares the elfcore hdr in the dump (2nd) kernel, not in the crashed kernel. RAM provided by memory devices such as virtio-mem can only be detected using the device driver; when vmcore_init() is called, these device drivers are usually not loaded yet, or the devices did not get probed yet. Consequently, on s390 these RAM ranges will not be included in the crash dump, which makes the dump partially corrupt and is unfortunate. Instead of deferring the vmcore_init() call, to an (unclear?) later point, let's reuse the vmcore_cb infrastructure to obtain device RAM ranges as the device drivers probe the device and get access to this information. Then, we'll add these ranges to the vmcore, adding more PT_LOAD entries and updating the offsets+vmcore size. Use a separate Kconfig option to be set by an architecture to include this code only if the arch really needs it. Further, we'll make the config depend on the relevant drivers (i.e., virtio_mem) once they implement support (next). The alternative of having a PROVIDE_PROC_VMCORE_DEVICE_RAM config option was dropped for now for simplicity. The current target use case is s390, which only creates an elf64 elfcore, so focusing on elf64 is sufficient. Signed-off-by: David Hildenbrand Message-Id: <20241204125444.1734652-9-david@redhat.com> Acked-by: Andrew Morton Signed-off-by: Michael S. Tsirkin --- include/linux/crash_dump.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 5d61c7454fd6..2f2555e6407c 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -20,6 +20,8 @@ extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +void elfcorehdr_fill_device_ram_ptload_elf64(Elf64_Phdr *phdr, + unsigned long long paddr, unsigned long long size); extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); @@ -99,6 +101,12 @@ static inline void vmcore_unusable(void) * indicated in the vmcore instead. For example, a ballooned page * contains no data and reading from such a page will cause high * load in the hypervisor. 
+ * @get_device_ram: query RAM ranges that can only be detected by device + * drivers, such as the virtio-mem driver, so they can be included in + * the crash dump on architectures that allocate the elfcore hdr in the dump + * ("2nd") kernel. Indicated RAM ranges may contain holes to reduce the + * total number of ranges; such holes can be detected using the pfn_is_ram + * callback just like for other RAM. * @next: List head to manage registered callbacks internally; initialized by * register_vmcore_cb(). * @@ -109,6 +117,7 @@ static inline void vmcore_unusable(void) */ struct vmcore_cb { bool (*pfn_is_ram)(struct vmcore_cb *cb, unsigned long pfn); + int (*get_device_ram)(struct vmcore_cb *cb, struct list_head *list); struct list_head next; }; extern void register_vmcore_cb(struct vmcore_cb *cb); -- cgit v1.2.3 From 1629ee1078fc2ac38e324adb0e50c8b04ec714a2 Mon Sep 17 00:00:00 2001 From: Shijith Thotton Date: Fri, 3 Jan 2025 21:01:36 +0530 Subject: virtio-pci: define type and header for PCI vendor data Added macro definition for VIRTIO_PCI_CAP_VENDOR_CFG to identify the PCI vendor data type in the virtio_pci_cap structure. Defined a new struct virtio_pci_vndr_data for the vendor data capability header as per the specification. Acked-by: Jason Wang Signed-off-by: Shijith Thotton Message-Id: <20250103153226.1933479-3-sthotton@marvell.com> Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_pci.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 1beb317df1b9..8549d4571257 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -116,6 +116,8 @@ #define VIRTIO_PCI_CAP_PCI_CFG 5 /* Additional shared memory capability */ #define VIRTIO_PCI_CAP_SHARED_MEMORY_CFG 8 +/* PCI vendor data configuration */ +#define VIRTIO_PCI_CAP_VENDOR_CFG 9 /* This is the PCI capability header: */ struct virtio_pci_cap { @@ -130,6 +132,18 @@ struct virtio_pci_cap { __le32 length; /* Length of the structure, in bytes. */ }; +/* This is the PCI vendor data capability header: */ +struct virtio_pci_vndr_data { + __u8 cap_vndr; /* Generic PCI field: PCI_CAP_ID_VNDR */ + __u8 cap_next; /* Generic PCI field: next ptr. */ + __u8 cap_len; /* Generic PCI field: capability length */ + __u8 cfg_type; /* Identifies the structure. */ + __u16 vendor_id; /* Identifies the vendor-specific format. */ + /* For Vendor Definition */ + /* Pads structure to a multiple of 4 bytes */ + /* Reads must not have side effects */ +}; + struct virtio_pci_cap64 { struct virtio_pci_cap cap; __le32 offset_hi; /* Most sig 32 bits of offset */ -- cgit v1.2.3 From a0ec4fb63f5ce15732f8dadc63c931bdf9ff98b5 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 27 Nov 2024 08:57:31 +0200 Subject: virtio_pci: Add support for PCIe Function Level Reset Implement support for Function Level Reset (FLR) in virtio_pci devices. This change adds reset_prepare and reset_done callbacks, allowing drivers to properly handle FLR operations. Without this patch, performing and recovering from an FLR is not possible for virtio_pci devices. This implementation ensures proper FLR handling and recovery for both physical and virtual functions. The device reset can be triggered in case of error or manually via sysfs: echo 1 > /sys/bus/pci/devices/$PCI_ADDR/reset Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Message-Id: <1732690652-3065-2-git-send-email-israelr@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index dd88682e27e3..4d16c13d0df5 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -190,6 +190,8 @@ int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); #endif void virtio_reset_device(struct virtio_device *dev); +int virtio_device_reset_prepare(struct virtio_device *dev); +int virtio_device_reset_done(struct virtio_device *dev); size_t virtio_max_dma_size(const struct virtio_device *vdev); @@ -214,6 +216,10 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); * changes; may be called in interrupt context. * @freeze: optional function to call during suspend/hibernation. * @restore: optional function to call on resume. + * @reset_prepare: optional function to call when a transport specific reset + * occurs. + * @reset_done: optional function to call after transport specific reset + * operation has finished. */ struct virtio_driver { struct device_driver driver; @@ -229,6 +235,8 @@ struct virtio_driver { void (*config_changed)(struct virtio_device *dev); int (*freeze)(struct virtio_device *dev); int (*restore)(struct virtio_device *dev); + int (*reset_prepare)(struct virtio_device *dev); + int (*reset_done)(struct virtio_device *dev); }; #define drv_to_virtio(__drv) container_of_const(__drv, struct virtio_driver, driver) -- cgit v1.2.3 From 2f0805d7c08bea71c95561bfb3e45d93b05196b9 Mon Sep 17 00:00:00 2001 From: Liang Jie Date: Fri, 10 Jan 2025 18:05:24 +0800 Subject: ceph: streamline request head structures in MDS client The existence of the ceph_mds_request_head_old structure in the MDS client code is no longer required due to improvements in handling different MDS request header versions. This patch removes the now redundant ceph_mds_request_head_old structure and replaces its usage with the flexible and extensible ceph_mds_request_head structure. Changes include: - Modification of find_legacy_request_head to directly cast the pointer to ceph_mds_request_head_legacy without going through the old structure. - Update sizeof calculations in create_request_message to use offsetofend for consistency and future-proofing, rather than referencing the old structure. - Use of the structured ceph_mds_request_head directly instead of the old one. Additionally, this consolidation normalizes the handling of request_head_version v1 to align with versions v2 and v3, leading to a more consistent and maintainable codebase. These changes simplify the codebase and reduce potential confusion stemming from the existence of an obsolete structure. 
Signed-off-by: Liang Jie Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2d7d86f0290d..c7f2c63b3bc3 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -504,20 +504,6 @@ struct ceph_mds_request_head_legacy { #define CEPH_MDS_REQUEST_HEAD_VERSION 3 -struct ceph_mds_request_head_old { - __le16 version; /* struct version */ - __le64 oldest_client_tid; - __le32 mdsmap_epoch; /* on client */ - __le32 flags; /* CEPH_MDS_FLAG_* */ - __u8 num_retry, num_fwd; /* count retry, fwd attempts */ - __le16 num_releases; /* # include cap/lease release records */ - __le32 op; /* mds op code */ - __le32 caller_uid, caller_gid; - __le64 ino; /* use this ino for openc, mkdir, mknod, - etc. (if replaying) */ - union ceph_mds_request_args_ext args; -} __attribute__ ((packed)); - struct ceph_mds_request_head { __le16 version; /* struct version */ __le64 oldest_client_tid; -- cgit v1.2.3 From 09ebd028d6d70c7dc4b1c69212a18134ad2e0020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E8=87=B4=E9=82=A6=20=28XIE=20Zhibang=29?= Date: Thu, 23 Jan 2025 11:57:03 +0000 Subject: net: the appletalk subsystem no longer uses ndo_do_ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ndo_do_ioctl is no longer used by the appletalk subsystem after commit 45bd1c5ba758 ("net: appletalk: Drop aarp_send_probe_phase1()"). Signed-off-by: 谢致邦 (XIE Zhibang) Reviewed-by: Simon Horman Link: https://patch.msgid.link/tencent_4AC6ED413FEA8116B4253D3ED6947FDBCF08@qq.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8da4c61f97b9..2a59034a5fa2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1085,8 +1085,8 @@ struct netdev_net_notifier { * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the - * appletalk and ieee802154 subsystems but is no longer called by - * the device ioctl handler. + * ieee802154 subsystem but is no longer called by the device + * ioctl handler. * * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: -- cgit v1.2.3 From 67e4bb2ced0f2d8fbd414f932daea94ba63ae4c4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 23 Jan 2025 15:16:20 -0800 Subject: net: page_pool: don't try to stash the napi id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Page pool tried to cache the NAPI ID in page pool info to avoid having a dependency on the life cycle of the NAPI instance. Since the commit under Fixes, the NAPI ID is not populated until napi_enable() and there's a good chance that page pool is created before NAPI gets enabled. Protect the NAPI pointer with the existing page pool mutex, the reading path already holds it. napi_id itself we need to READ_ONCE(), it's protected by netdev_lock() which we are not holding in page pool.
Before this patch napi IDs were missing for mlx5: # ./cli.py --spec netlink/specs/netdev.yaml --dump page-pool-get [{'id': 144, 'ifindex': 2, 'inflight': 3072, 'inflight-mem': 12582912}, {'id': 143, 'ifindex': 2, 'inflight': 5568, 'inflight-mem': 22806528}, {'id': 142, 'ifindex': 2, 'inflight': 5120, 'inflight-mem': 20971520}, {'id': 141, 'ifindex': 2, 'inflight': 4992, 'inflight-mem': 20447232}, ... After: [{'id': 144, 'ifindex': 2, 'inflight': 3072, 'inflight-mem': 12582912, 'napi-id': 565}, {'id': 143, 'ifindex': 2, 'inflight': 4224, 'inflight-mem': 17301504, 'napi-id': 525}, {'id': 142, 'ifindex': 2, 'inflight': 4288, 'inflight-mem': 17563648, 'napi-id': 524}, ... Fixes: 86e25f40aa1e ("net: napi: Add napi_config") Reviewed-by: Mina Almasry Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250123231620.1086401-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index ed4cd114180a..7f405672b089 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -237,7 +237,6 @@ struct page_pool { struct { struct hlist_node list; u64 detach_time; - u32 napi_id; u32 id; } user; }; -- cgit v1.2.3 From 5be1fa8abd7b049f51e6e98e75a37eef5ae2c296 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 00:28:51 -0500 Subject: Pass parent directory inode and expected name to ->d_revalidate() ->d_revalidate() often needs to access dentry parent and name; that has to be done carefully, since the locking environment varies from caller to caller. We are not guaranteed that dentry in question will not be moved right under us - not unless the filesystem is such that nothing on it ever gets renamed. It can be dealt with, but that results in boilerplate code that isn't even needed - the callers normally have just found the dentry via dcache lookup and want to verify that it's in the right place; they already have the values of ->d_parent and ->d_name stable. There is a couple of exceptions (overlayfs and, to less extent, ecryptfs), but for the majority of calls that song and dance is not needed at all. It's easier to make ecryptfs and overlayfs find and pass those values if there's a ->d_revalidate() instance to be called, rather than doing that in the instances. This commit only changes the calling conventions; making use of supplied values is left to followups. NOTE: some instances need more than just the parent - things like CIFS may need to build an entire path from filesystem root, so they need more precautions than the usual boilerplate. This series doesn't do anything to that need - these filesystems have to keep their locking mechanisms (rename_lock loops, use of dentry_path_raw(), private rwsem a-la v9fs). One thing to keep in mind when using name is that name->name will normally point into the pathname being resolved; the filename in question occupies name->len bytes starting at name->name, and there is NUL somewhere after it, but it the next byte might very well be '/' rather than '\0'. Do not ignore name->len. 
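A hypothetical instance written against the new prototype in the hunk below makes the calling convention concrete; "examplefs", its d_fsdata layout and the cached name fields are invented for this sketch, and the early -ECHILD bail-out simply follows the usual LOOKUP_RCU convention:

static int examplefs_d_revalidate(struct inode *dir, const struct qstr *name,
				  struct dentry *dentry, unsigned int flags)
{
	struct examplefs_dentry_info *di = dentry->d_fsdata;	/* invented */

	if (flags & LOOKUP_RCU)
		return -ECHILD;

	/*
	 * name->name points into the pathname being resolved: the byte at
	 * name->name[name->len] may be '/', not '\0', so compare by length
	 * rather than treating it as a C string.
	 */
	if (name->len != di->remote_name_len ||
	    memcmp(name->name, di->remote_name, name->len))
		return 0;	/* stale name: force a fresh lookup */

	return 1;		/* @dir and @name still match; dentry is valid */
}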
Reviewed-by: Jeff Layton Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Al Viro --- include/linux/dcache.h | 3 ++- include/linux/fscrypt.h | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8bc567a35718..4a6bdadf2f29 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -144,7 +144,8 @@ enum d_real_type { }; struct dentry_operations { - int (*d_revalidate)(struct dentry *, unsigned int); + int (*d_revalidate)(struct inode *, const struct qstr *, + struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 772f822dc6b8..18855cb44b1c 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,7 +192,8 @@ struct fscrypt_operations { unsigned int *num_devs); }; -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); +int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) @@ -711,8 +712,8 @@ static inline u64 fscrypt_fname_siphash(const struct inode *dir, return 0; } -static inline int fscrypt_d_revalidate(struct dentry *dentry, - unsigned int flags) +static inline int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *dentry, unsigned int flags) { return 1; } -- cgit v1.2.3 From ffeeaada2bddb88078f16ba24f24ce8651c22d5b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Dec 2024 01:27:11 -0500 Subject: nfs: fix ->d_revalidate() UAF on ->d_name accesses Pass the stable name all the way down to ->rpc_ops->lookup() instances. Note that passing &dentry->d_name is safe in e.g. nfs_lookup() - it *is* stable there, as it is in ->create() et.al. dget_parent() in nfs_instantiate() should be redundant - it'd better be stable there; if it's not, we have more trouble, since ->d_name would also be unsafe in such case. nfs_submount() and nfs4_submount() may or may not require fixes - if they ever get moved on server with fhandle preserved, we are in trouble there... UAF window is fairly narrow here and exfiltration requires the ability to watch the traffic. Reviewed-by: Jeff Layton Signed-off-by: Al Viro --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 559273a0f16d..08b62bbf59f0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1785,7 +1785,7 @@ struct nfs_rpc_ops { struct nfs_fattr *, struct inode *); int (*setattr) (struct dentry *, struct nfs_fattr *, struct iattr *); - int (*lookup) (struct inode *, struct dentry *, + int (*lookup) (struct inode *, struct dentry *, const struct qstr *, struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); -- cgit v1.2.3 From 30d61efe118cad1a73ad2ad66a3298e4abdf9f41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jan 2025 21:33:17 -0500 Subject: 9p: fix ->rename_sem exclusion 9p wants to be able to build a path from given dentry to fs root and keep it valid over a blocking operation. 
->s_vfs_rename_mutex would be a natural candidate, but there are places where we need that and where we have no way to tell if ->s_vfs_rename_mutex is already held deeper in callchain. Moreover, it's only held for cross-directory renames; name changes within the same directory happen without it. Solution: * have d_move() done in ->rename() rather than in its caller * maintain a 9p-private rwsem (per-filesystem) * hold it exclusive over the relevant part of ->rename() * hold it shared over the places where we want the path. That almost works. FS_RENAME_DOES_D_MOVE is enough to put all d_move() and d_exchange() calls under filesystem's control. However, there's also __d_unalias(), which isn't covered by any of that. If ->lookup() hits a directory inode with preexisting dentry elsewhere (due to e.g. rename done on server behind our back), d_splice_alias() called by ->lookup() will move/rename that alias. Add a couple of optional methods, so that __d_unalias() would do if alias->d_op->d_unalias_trylock != NULL if (!alias->d_op->d_unalias_trylock(alias)) fail (resulting in -ESTALE from lookup) __d_move(...) if alias->d_op->d_unalias_unlock != NULL alias->d_unalias_unlock(alias) where it currently does __d_move(). 9p instances do down_write_trylock() and up_write() of ->rename_mutex. Signed-off-by: Al Viro --- include/linux/dcache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4a6bdadf2f29..9a1a30857763 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -159,6 +159,8 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, enum d_real_type type); + bool (*d_unalias_trylock)(const struct dentry *); + void (*d_unalias_unlock)(const struct dentry *); } ____cacheline_aligned; /* -- cgit v1.2.3 From c1feab95e0b2e9fce7e4f4b2739baf40d84543af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Jan 2025 22:51:04 -0500 Subject: add a string-to-qstr constructor Quite a few places want to build a struct qstr by given string; it would be convenient to have a primitive doing that, rather than open-coding it via QSTR_INIT(). The closest approximation was in bcachefs, but that expands to initializer list - {.len = strlen(string), .name = string}. It would be more useful to have it as compound literal - (struct qstr){.len = strlen(string), .name = string}. Unlike initializer list it's a valid expression. What's more, it's a valid lvalue - it's an equivalent of anonymous local variable with such initializer, so the things like path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); are valid. It can also be used as initializer, with identical effect - struct qstr x = (struct qstr){.name = s, .len = strlen(s)}; is equivalent to struct qstr anon_variable = {.name = s, .len = strlen(s)}; struct qstr x = anon_variable; // anon_variable is never used after that point and any even remotely sane compiler will manage to collapse that into struct qstr x = {.name = s, .len = strlen(s)}; What compound literals can't be used for is initialization of global variables, but those are covered by QSTR_INIT(). This commit lifts definition(s) of QSTR() into linux/dcache.h, converts it to compound literal (all bcachefs users are fine with that) and converts assorted open-coded instances to using that. 
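The same idiom can be exercised outside the kernel; the stand-alone sketch below uses a flattened stand-in for struct qstr (the kernel's version wraps len in a union) and libc strlen(), purely to show the expression/initializer duality described above:

#include <stdio.h>
#include <string.h>

struct qstr {
	const char *name;
	unsigned int len;
};

#define QSTR_INIT(n, l)	{ .len = (l), .name = (n) }
#define QSTR(n)		(struct qstr)QSTR_INIT(n, strlen(n))

static void print_qstr(const struct qstr *q)
{
	/* print exactly len bytes of name */
	printf("%.*s (%u bytes)\n", (int)q->len, q->name, q->len);
}

int main(void)
{
	struct qstr x = QSTR("as-initializer");	/* collapses to a plain designated init */

	print_qstr(&x);
	print_qstr(&QSTR("as-expression"));	/* the compound literal is an lvalue, so & is fine */
	return 0;
}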
Signed-off-by: Al Viro --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bff956f7b2b9..3d53a6014591 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -57,6 +57,7 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } +#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; -- cgit v1.2.3 From 3775fc538f535a7c5adaf11990c7932a0bd1f9eb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 28 Jan 2025 20:24:41 +0100 Subject: PM: sleep: core: Synchronize runtime PM status of parents and children Commit 6e176bf8d461 ("PM: sleep: core: Do not skip callbacks in the resume phase") overlooked the case in which the parent of a device with DPM_FLAG_SMART_SUSPEND set did not use that flag and could be runtime- suspended before a transition into a system-wide sleep state. In that case, if the child is resumed during the subsequent transition from that state into the working state, its runtime PM status will be set to RPM_ACTIVE, but the runtime PM status of the parent will not be updated accordingly, even though the parent will be resumed too, because of the dev_pm_skip_suspend() check in device_resume_noirq(). Address this problem by tracking the need to set the runtime PM status to RPM_ACTIVE during system-wide resume transitions for devices with DPM_FLAG_SMART_SUSPEND set and all of the devices depended on by them. Fixes: 6e176bf8d461 ("PM: sleep: core: Do not skip callbacks in the resume phase") Closes: https://lore.kernel.org/linux-pm/Z30p2Etwf3F2AUvD@hovoldconsulting.com/ Reported-by: Johan Hovold Tested-by: Manivannan Sadhasivam Signed-off-by: Rafael J. Wysocki Reviewed-by: Johan Hovold Tested-by: Johan Hovold Link: https://patch.msgid.link/12619233.O9o76ZdvQC@rjwysocki.net --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index 0627a795892b..0d2597a76dfc 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -679,6 +679,7 @@ struct dev_pm_info { bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ + bool set_active:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ #else bool should_wakeup:1; -- cgit v1.2.3 From fe6628608627424fb4a6d4c8d2235822457c5d9c Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 28 Jan 2025 20:04:13 +0530 Subject: block: get rid of request queue ->sysfs_dir_lock The request queue uses ->sysfs_dir_lock for protecting the addition/ deletion of kobject entries under sysfs while we register/unregister blk-mq. However kobject addition/deletion is already protected with kernfs/sysfs internal synchronization primitives. So use of q->sysfs_ dir_lock seems redundant. Moreover, q->sysfs_dir_lock is also used at few other callsites along with q->sysfs_lock for protecting the addition/deletion of kojects. One such example is when we register with sysfs a set of independent access ranges for a disk. Here as well we could get rid off q->sysfs_ dir_lock and only use q->sysfs_lock. The only variable which q->sysfs_dir_lock appears to protect is q-> mq_sysfs_init_done which is set/unset while registering/unregistering blk-mq with sysfs. But use of q->mq_sysfs_init_done could be easily replaced using queue registered bit QUEUE_FLAG_REGISTERED. 
So with this patch we remove q->sysfs_dir_lock from each callsite and replace q->mq_sysfs_init_done using QUEUE_FLAG_REGISTERED. Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20250128143436.874357-2-nilay@linux.ibm.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 76f0a4e7c2e5..248416ecd01c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,7 +561,6 @@ struct request_queue { struct list_head flush_list; struct mutex sysfs_lock; - struct mutex sysfs_dir_lock; struct mutex limits_lock; /* @@ -605,8 +604,6 @@ struct request_queue { * Serializes all debugfs metadata operations using the above dentries. */ struct mutex debugfs_mutex; - - bool mq_sysfs_init_done; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ -- cgit v1.2.3 From 27560b371ab82c1894d048aef0d113acb093f67f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 07:37:57 +0100 Subject: fs: pack struct kstat better Move the change_cookie and subvol up to avoid two 4 byte holes. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/linux/stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/stat.h b/include/linux/stat.h index 9d8382e23a9c..be7496a6a0dd 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -50,11 +50,11 @@ struct kstat { struct timespec64 btime; /* File creation time */ u64 blocks; u64 mnt_id; + u64 change_cookie; + u64 subvol; u32 dio_mem_align; u32 dio_offset_align; u32 dio_read_offset_align; - u64 change_cookie; - u64 subvol; u32 atomic_write_unit_min; u32 atomic_write_unit_max; u32 atomic_write_segments_max; -- cgit v1.2.3 From 1e1a9cecfab3f22ebef0a976f849c87be8d03c1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 31 Jan 2025 13:03:47 +0100 Subject: block: force noio scope in blk_mq_freeze_queue When block drivers or the core block code perform allocations with a frozen queue, this could try to recurse into the block device to reclaim memory and deadlock. Thus all allocations done by a process that froze a queue need to be done without __GFP_IO and __GFP_FS. Instead of tying to track all of them down, force a noio scope as part of freezing the queue. Note that nvme is a bit of a mess here due to the non-owner freezes, and they will be addressed separately. 
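Schematically, a caller of the interface in the hunk below now captures and restores the scope; this fragment only illustrates the resulting calling convention, with q standing for whatever request_queue the driver owns:

	unsigned int memflags;

	memflags = blk_mq_freeze_queue(q);	/* freeze q and enter a noio scope */
	/*
	 * Allocations made while frozen implicitly lack __GFP_IO/__GFP_FS,
	 * so they cannot recurse into the frozen device and deadlock.
	 */
	blk_mq_unfreeze_queue(q, memflags);	/* unfreeze q, restore the old scope */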
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250131120352.1315351-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a0a9007cc1e3..9ebb53f031cd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -900,8 +900,22 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); -void blk_mq_freeze_queue(struct request_queue *q); -void blk_mq_unfreeze_queue(struct request_queue *q); +void blk_mq_freeze_queue_nomemsave(struct request_queue *q); +void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); +static inline unsigned int __must_check +blk_mq_freeze_queue(struct request_queue *q) +{ + unsigned int memflags = memalloc_noio_save(); + + blk_mq_freeze_queue_nomemsave(q); + return memflags; +} +static inline void +blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags) +{ + blk_mq_unfreeze_queue_nomemrestore(q); + memalloc_noio_restore(memflags); +} void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, -- cgit v1.2.3 From 1c7b17cf0594f33c898004ac1b5576c032f266e2 Mon Sep 17 00:00:00 2001 From: liuye Date: Tue, 19 Nov 2024 14:08:42 +0800 Subject: mm/vmscan: fix hard LOCKUP in function isolate_lru_folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following hard lockup in isolate_lru_folios() during memory reclaim. If the LRU mostly contains ineligible folios this may trigger watchdog. watchdog: Watchdog detected hard LOCKUP on cpu 173 RIP: 0010:native_queued_spin_lock_slowpath+0x255/0x2a0 Call Trace: _raw_spin_lock_irqsave+0x31/0x40 folio_lruvec_lock_irqsave+0x5f/0x90 folio_batch_move_lru+0x91/0x150 lru_add_drain_per_cpu+0x1c/0x40 process_one_work+0x17d/0x350 worker_thread+0x27b/0x3a0 kthread+0xe8/0x120 ret_from_fork+0x34/0x50 ret_from_fork_asm+0x1b/0x30 lruvec->lru_lock owner: PID: 2865 TASK: ffff888139214d40 CPU: 40 COMMAND: "kswapd0" #0 [fffffe0000945e60] crash_nmi_callback at ffffffffa567a555 #1 [fffffe0000945e68] nmi_handle at ffffffffa563b171 #2 [fffffe0000945eb0] default_do_nmi at ffffffffa6575920 #3 [fffffe0000945ed0] exc_nmi at ffffffffa6575af4 #4 [fffffe0000945ef0] end_repeat_nmi at ffffffffa6601dde [exception RIP: isolate_lru_folios+403] RIP: ffffffffa597df53 RSP: ffffc90006fb7c28 RFLAGS: 00000002 RAX: 0000000000000001 RBX: ffffc90006fb7c60 RCX: ffffea04a2196f88 RDX: ffffc90006fb7c60 RSI: ffffc90006fb7c60 RDI: ffffea04a2197048 RBP: ffff88812cbd3010 R8: ffffea04a2197008 R9: 0000000000000001 R10: 0000000000000000 R11: 0000000000000001 R12: ffffea04a2197008 R13: ffffea04a2197048 R14: ffffc90006fb7de8 R15: 0000000003e3e937 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #5 [ffffc90006fb7c28] isolate_lru_folios at ffffffffa597df53 #6 [ffffc90006fb7cf8] shrink_active_list at ffffffffa597f788 #7 [ffffc90006fb7da8] balance_pgdat at ffffffffa5986db0 #8 [ffffc90006fb7ec0] kswapd at ffffffffa5987354 #9 [ffffc90006fb7ef8] kthread at ffffffffa5748238 crash> Scenario: User processe are requesting a large amount of memory and keep page active. Then a module continuously requests memory from ZONE_DMA32 area. 
Memory reclaim will be triggered due to ZONE_DMA32 watermark alarm reached. However pages in the LRU(active_anon) list are mostly from the ZONE_NORMAL area. Reproduce: Terminal 1: Construct to continuously increase pages active(anon). mkdir /tmp/memory mount -t tmpfs -o size=1024000M tmpfs /tmp/memory dd if=/dev/zero of=/tmp/memory/block bs=4M tail /tmp/memory/block Terminal 2: vmstat -a 1 active will increase. procs ---memory--- ---swap-- ---io---- -system-- ---cpu--- ... r b swpd free inact active si so bi bo 1 0 0 1445623076 45898836 83646008 0 0 0 1 0 0 1445623076 43450228 86094616 0 0 0 1 0 0 1445623076 41003480 88541364 0 0 0 1 0 0 1445623076 38557088 90987756 0 0 0 1 0 0 1445623076 36109688 93435156 0 0 0 1 0 0 1445619552 33663256 95881632 0 0 0 1 0 0 1445619804 31217140 98327792 0 0 0 1 0 0 1445619804 28769988 100774944 0 0 0 1 0 0 1445619804 26322348 103222584 0 0 0 1 0 0 1445619804 23875592 105669340 0 0 0 cat /proc/meminfo | head Active(anon) increase. MemTotal: 1579941036 kB MemFree: 1445618500 kB MemAvailable: 1453013224 kB Buffers: 6516 kB Cached: 128653956 kB SwapCached: 0 kB Active: 118110812 kB Inactive: 11436620 kB Active(anon): 115345744 kB Inactive(anon): 945292 kB When the Active(anon) is 115345744 kB, insmod module triggers the ZONE_DMA32 watermark. perf record -e vmscan:mm_vmscan_lru_isolate -aR perf script isolate_mode=0 classzone=1 order=1 nr_requested=32 nr_scanned=2 nr_skipped=2 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=1 nr_requested=32 nr_scanned=0 nr_skipped=0 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=28835844 nr_skipped=28835844 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=1 nr_requested=32 nr_scanned=28835844 nr_skipped=28835844 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=29 nr_skipped=29 nr_taken=0 lru=active_anon isolate_mode=0 classzone=1 order=0 nr_requested=32 nr_scanned=0 nr_skipped=0 nr_taken=0 lru=active_anon See nr_scanned=28835844. 28835844 * 4k = 115343376KB approximately equal to 115345744 kB. If increase Active(anon) to 1000G then insmod module triggers the ZONE_DMA32 watermark. hard lockup will occur. In my device nr_scanned = 0000000003e3e937 when hard lockup. Convert to memory size 0x0000000003e3e937 * 4KB = 261072092 KB. [ffffc90006fb7c28] isolate_lru_folios at ffffffffa597df53 ffffc90006fb7c30: 0000000000000020 0000000000000000 ffffc90006fb7c40: ffffc90006fb7d40 ffff88812cbd3000 ffffc90006fb7c50: ffffc90006fb7d30 0000000106fb7de8 ffffc90006fb7c60: ffffea04a2197008 ffffea0006ed4a48 ffffc90006fb7c70: 0000000000000000 0000000000000000 ffffc90006fb7c80: 0000000000000000 0000000000000000 ffffc90006fb7c90: 0000000000000000 0000000000000000 ffffc90006fb7ca0: 0000000000000000 0000000003e3e937 ffffc90006fb7cb0: 0000000000000000 0000000000000000 ffffc90006fb7cc0: 8d7c0b56b7874b00 ffff88812cbd3000 About the Fixes: Why did it take eight years to be discovered? The problem requires the following conditions to occur: 1. The device memory should be large enough. 2. Pages in the LRU(active_anon) list are mostly from the ZONE_NORMAL area. 3. The memory in ZONE_DMA32 needs to reach the watermark. If the memory is not large enough, or if the usage design of ZONE_DMA32 area memory is reasonable, this problem is difficult to detect. notes: The problem is most likely to occur in ZONE_DMA32 and ZONE_NORMAL, but other suitable scenarios may also trigger the problem. 
Link: https://lkml.kernel.org/r/20241119060842.274072-1-liuye@kylinos.cn Fixes: b2e18757f2c9 ("mm, vmscan: begin reclaiming pages on a per-node basis") Signed-off-by: liuye Cc: Hugh Dickins Cc: Mel Gorman Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index a5f475335aea..b13b72645db3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -222,6 +222,7 @@ enum { }; #define SWAP_CLUSTER_MAX 32UL +#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* Bit flag in swap_map */ -- cgit v1.2.3
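The consumer of the new cap lives in mm/vmscan.c and is not part of the hunk above; the fragment below is only a schematic sketch of the intent, with the loop shape and the folios_skipped list assumed for illustration: once the number of skipped (ineligible-zone) pages crosses SWAP_CLUSTER_MAX_SKIPPED, the isolation pass bails out instead of spinning under lruvec->lru_lock until the hard-lockup detector fires.

	unsigned long skipped = 0;

	while (scan < nr_to_scan && !list_empty(src)) {
		struct folio *folio = lru_to_folio(src);

		if (folio_zonenum(folio) > sc->reclaim_idx) {
			/* ineligible zone: set the folio aside, do not isolate it */
			list_move(&folio->lru, &folios_skipped);
			skipped += folio_nr_pages(folio);
			if (skipped > SWAP_CLUSTER_MAX_SKIPPED)
				break;	/* bounded skipping, lru_lock gets released */
			continue;
		}
		/* ... normal isolation of an eligible folio ... */
	}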