From 4314a44564eb1565349fed7a4192344c5f46fc85 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:12 +0800 Subject: bpf: Fix out-of-bounds read in bpf_patch_call_args() The interpreters_args array only accommodates stack depths up to MAX_BPF_STACK (512 bytes). However, do_misc_fixups() may allow a larger stack depth if JIT is requested. If JIT compilation later fails and falls back to the interpreter, the verifier invokes bpf_patch_call_args() with this oversized stack depth. This causes a load-time out-of-bounds (OOB) read when calculating the interpreter function pointer index. Fix this by changing bpf_patch_call_args() to return an int and explicitly rejecting the JIT fallback (returning -EINVAL) if the stack depth exceeds MAX_BPF_STACK. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/20260506094714.419842-2-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 01e203964892..52b30e9ea431 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2917,7 +2917,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON -void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); #endif struct btf *bpf_get_btf_vmlinux(void); -- cgit v1.2.3 From 58a8f3e2501dc14b8e00e883d6aaf0600a239da7 Mon Sep 17 00:00:00 2001 From: Yazhou Tang Date: Wed, 6 May 2026 17:47:13 +0800 Subject: bpf: Fix s16 truncation for large bpf-to-bpf call offsets Currently, the BPF instruction set allows bpf-to-bpf calls (or internal calls, pseudo calls) to use a 32-bit imm field to represent the relative jump offset. However, when JIT is disabled or falls back to the interpreter, the verifier invokes bpf_patch_call_args() to rewrite the call instruction. In this function, the 32-bit imm is downcast to s16 and stored in the off field. void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } If the original imm exceeds the s16 range (i.e., a jump offset greater than 32767 instructions), this downcast silently truncates the offset, resulting in an incorrect call target. Fix this by: 1. In bpf_patch_call_args(), keeping the imm field unchanged and using the off field to store the index of the interpreter function. 2. In ___bpf_prog_run() for the JMP_CALL_ARGS case, retrieving the interpreter function pointer from the interpreters_args array using the off field as the index, and passing the original imm to calculate the last argument of the interpreter function. After these changes, the truncation issue is resolved, and __bpf_call_base_args is also no longer needed and can be removed, which makes the code cleaner. Performance: In ___bpf_prog_run() for the JMP_CALL_ARGS case, changing the retrieval of the interpreter function pointer from pointer addition to direct array indexing improves performance. The possible reason is that the latter has better instruction-level parallelism. See the v5 discussion [1] for more details. [1] https://lore.kernel.org/bpf/f120c3c4-6999-414a-b514-518bb64b4758@zju.edu.cn/ To avoid requiring bpftool changes, keep the new imm/off encoding internal and restore the legacy xlated dump layout in bpf_insn_prepare_dump(). For bpf-to-bpf call offsets that do not fit in s16, export off as 0 instead of a truncated and misleading value. Fixes: 1ea47e01ad6e ("bpf: add support for bpf_call to interpreter") Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Suggested-by: Xu Kuohai Suggested-by: Puranjay Mohan Co-developed-by: Tianci Cao Signed-off-by: Tianci Cao Co-developed-by: Shenghao Yuan Signed-off-by: Shenghao Yuan Signed-off-by: Yazhou Tang Link: https://lore.kernel.org/r/20260506094714.419842-3-tangyazhou@zju.edu.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/filter.h | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 52b30e9ea431..cd191c5fdb0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2918,6 +2918,12 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 ua #ifndef CONFIG_BPF_JIT_ALWAYS_ON int bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +s32 bpf_call_args_imm(s16 idx); +#else +static inline s32 bpf_call_args_imm(s16 idx) +{ + return 0; +} #endif struct btf *bpf_get_btf_vmlinux(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index 1ec6d5ba64cc..88a241aac36a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1151,9 +1151,6 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#define __bpf_call_base_args \ - ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ - (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); -- cgit v1.2.3 From b5782e2d462c028096f922abca46318cec890670 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:40 +0100 Subject: netfs: Fix missing barriers when accessing stream->subrequests locklessly The list of subrequests attached to stream->subrequests is accessed without locks by netfs_collect_read_results() and netfs_collect_write_results(), and then they access subreq->flags without taking a barrier after getting the subreq pointer from the list. Relatedly, the functions that build the list don't use any sort of write barrier when constructing the list to make sure that the NETFS_SREQ_IN_PROGRESS flag is perceived to be set first if no lock is taken. Fix this by: (1) Add a new list_add_tail_release() function that uses a release barrier to set the pointer to the new member of the list. (2) Add a new list_first_entry_or_null_acquire() function that uses an acquire barrier to read the pointer to the first member in a list (or return NULL). (3) Use list_add_tail_release() when adding a subreq to ->subrequests. (4) Use list_first_entry_or_null_acquire() when initially accessing the front of the list (when an item is removed, the pointer to the new front iterm is obtained under the same lock). Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Link: https://sashiko.dev/#/patchset/20260326104544.509518-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-4-dhowells@redhat.com cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/list.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index 00ea8e5fb88b..09d979976b3b 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -191,6 +191,29 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } +/** + * list_add_tail_release - add a new entry with release barrier + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head, using a release barrier to set + * the ->next pointer that points to it. This is useful for implementing + * queues, in particular one that the elements will be walked through forwards + * locklessly. + */ +static inline void list_add_tail_release(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev = head->prev; + + if (__list_add_valid(new, prev, head)) { + new->next = head; + new->prev = prev; + head->prev = new; + smp_store_release(&prev->next, new); + } +} + /* * Delete a list entry by making the prev/next entries * point to each other. @@ -644,6 +667,20 @@ static inline void list_splice_tail_init(struct list_head *list, pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ }) +/** + * list_first_entry_or_null_acquire - get the first element from a list with barrier + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_head within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null_acquire(ptr, type, member) ({ \ + struct list_head *head__ = (ptr); \ + struct list_head *pos__ = smp_load_acquire(&head__->next); \ + pos__ != head__ ? list_entry(pos__, type, member) : NULL; \ +}) + /** * list_last_entry_or_null - get the last element from a list * @ptr: the list head to take the element from. -- cgit v1.2.3 From 2c8f4742bb76117d735f92a3932d85239b16c494 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:42 +0100 Subject: netfs: Fix potential for tearing in ->remote_i_size and ->zero_point Fix potential tearing in using ->remote_i_size and ->zero_point by copying i_size_read() and i_size_write() and using the same seqcount as for i_size. We need to make sure that netfslib and the filesystems that use it always hold i_lock whilst updating any of the sizes to prevent i_size_seqcount from getting corrupted. Fixes: 4058f742105e ("netfs: Keep track of the actual remote file size") Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-6-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 293 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 284 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ba17ac5bf356..4fd1d796ad73 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -62,8 +62,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif struct mutex wb_lock; /* Writeback serialisation */ - loff_t remote_i_size; /* Size of the remote file */ - loff_t zero_point; /* Size after which we assume there's no data + loff_t _remote_i_size; /* Size of the remote file */ + loff_t _zero_point; /* Size after which we assume there's no data * on the server */ atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; @@ -474,6 +474,254 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) return container_of(inode, struct netfs_inode, inode); } +/** + * netfs_read_remote_i_size - Read remote_i_size safely + * @inode: The inode to access + * + * Read remote_i_size safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_remote_i_size(const struct inode *inode) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long remote_i_size; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + remote_i_size = ictx->_remote_i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + remote_i_size = ictx->_remote_i_size; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + remote_i_size = smp_load_acquire(&ictx->_remote_i_size); +#endif + return remote_i_size; +} + +/* + * netfs_write_remote_i_size - Set remote_i_size safely + * @inode: The inode to access + * @remote_i_size: The new value for the size of the file on the server + * + * Set remote_i_size safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_remote_i_size(), netfs_write_remote_i_size() does + * need locking around it (normally i_rwsem), otherwise on 32bit/SMP an update + * of i_size_seqcount can be lost, resulting in subsequent i_size_read() calls + * spinning forever. + */ +static inline void netfs_write_remote_i_size(struct inode *inode, + unsigned long long remote_i_size) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_remote_i_size = remote_i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_remote_i_size = remote_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_remote_i_size, remote_i_size); +#endif +} + +/** + * netfs_read_zero_point - Read zero_point safely + * @inode: The inode to access + * + * Read zero_point safely without the potential for tearing on 32-bit + * arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline unsigned long long netfs_read_zero_point(const struct inode *inode) +{ + struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); + unsigned long long zero_point; + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + zero_point = smp_load_acquire(&ictx->_zero_point); +#endif + return zero_point; +} + +/* + * netfs_write_zero_point - Set zero_point safely + * @inode: The inode to access + * @zero_point: The new value for the point beyond which the server has no data + * + * Set zero_point safely without the potential for tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_zero_point(struct inode *inode, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_zero_point() to + * ensure changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + +/** + * netfs_read_sizes - Read remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: Where to return the local file size. + * @remote_i_size: Where to return the size of the file on the server + * @zero_point: Where to return the the point beyond which the server has no data + * + * Read remote_i_size and zero_point safely without the potential for tearing + * on 32-bit arches. + * + * NOTE: in a 32bit arch with a preemptable kernel and an UP compile the + * i_size_read/write must be atomic with respect to the local cpu (unlike with + * preempt disabled), but they don't need to be atomic with respect to other + * cpus like in true SMP (so they need either to either locally disable irq + * around the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles and 64bit + * archs it makes no difference if preempt is enabled or not. + */ +static inline void netfs_read_sizes(const struct inode *inode, + unsigned long long *i_size, + unsigned long long *remote_i_size, + unsigned long long *zero_point) +{ + const struct netfs_inode *ictx = container_of(inode, struct netfs_inode, inode); +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + *i_size = inode->i_size; + *remote_i_size = ictx->_remote_i_size; + *zero_point = ictx->_zero_point; + preempt_enable(); +#else + /* Pairs with smp_store_release() in i_size_write() */ + *i_size = smp_load_acquire(&inode->i_size); + /* Pairs with smp_store_release() in netfs_write_remote_i_size() */ + *remote_i_size = smp_load_acquire(&ictx->_remote_i_size); + /* Pairs with smp_store_release() in netfs_write_zero_point() */ + *zero_point = smp_load_acquire(&ictx->_zero_point); +#endif +} + +/* + * netfs_write_sizes - Set i_size, remote_i_size and zero_point safely + * @inode: The inode to access + * @i_size: The new value for the local size of the file + * @remote_i_size: The new value for the size of the file on the server + * @zero_point: The new value for the point beyond which the server has no data + * + * Set both remote_i_size and zero_point safely without the potential for + * tearing on 32-bit arches. + * + * Context: The caller must hold inode->i_lock. + * + * NOTE: unlike netfs_read_zero_point(), netfs_write_zero_point() does need + * locking around it (normally i_rwsem), otherwise on 32bit/SMP an update of + * i_size_seqcount can be lost, resulting in subsequent read calls spinning + * forever. + */ +static inline void netfs_write_sizes(struct inode *inode, + unsigned long long i_size, + unsigned long long remote_i_size, + unsigned long long zero_point) +{ + struct netfs_inode *ictx = netfs_inode(inode); + +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + inode->i_size = i_size; + ictx->_remote_i_size = remote_i_size; + ictx->_zero_point = zero_point; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in i_size_read(), + * netfs_read_remote_i_size() and netfs_read_zero_point() to ensure + * changes related to inode size (such as page contents) are visible + * before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); + smp_store_release(&ictx->_remote_i_size, remote_i_size); + smp_store_release(&ictx->_zero_point, zero_point); +#endif +} + /** * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise @@ -488,8 +736,8 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, bool use_zero_point) { ctx->ops = ops; - ctx->remote_i_size = i_size_read(&ctx->inode); - ctx->zero_point = LLONG_MAX; + ctx->_remote_i_size = i_size_read(&ctx->inode); + ctx->_zero_point = LLONG_MAX; ctx->flags = 0; atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) @@ -498,7 +746,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, mutex_init(&ctx->wb_lock); /* ->releasepage() drives zero_point */ if (use_zero_point) { - ctx->zero_point = ctx->remote_i_size; + ctx->_zero_point = ctx->_remote_i_size; mapping_set_release_always(ctx->inode.i_mapping); } } @@ -511,13 +759,40 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, * * Inform the netfs lib that a file got resized so that it can adjust its state. */ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, +static inline void netfs_resize_file(struct netfs_inode *ictx, + unsigned long long new_i_size, bool changed_on_server) { +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + struct inode *inode = &ictx->inode; + + preempt_disable(); + write_seqcount_begin(&inode->i_size_seqcount); + if (changed_on_server) + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + write_seqcount_end(&inode->i_size_seqcount); + preempt_enable(); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); if (changed_on_server) - ctx->remote_i_size = new_i_size; - if (new_i_size < ctx->zero_point) - ctx->zero_point = new_i_size; + ictx->_remote_i_size = new_i_size; + if (new_i_size < ictx->_zero_point) + ictx->_zero_point = new_i_size; + preempt_enable(); +#else + /* + * Pairs with smp_load_acquire() in netfs_read_remote_i_size and + * netfs_read_zero_point() to ensure changes related to inode size + * (such as page contents) are visible before we see the changed inode + * size. + */ + if (changed_on_server) + smp_store_release(&ictx->_remote_i_size, new_i_size); + if (new_i_size < ictx->_zero_point) + smp_store_release(&ictx->_zero_point, new_i_size); +#endif } /** -- cgit v1.2.3 From 156ac2ec2ee77c44c4eb7439d6d165247ba12247 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:48 +0100 Subject: netfs: Fix netfs_invalidate_folio() to clear dirty bit if all changes gone If a streaming write is made, this will leave the relevant modified folio in a not-uptodate, but dirty state with a netfs_folio struct hung off of folio->private indicating the dirty range. Subsequently truncating the file such that the dirty data in the folio is removed, but the first part of the folio theoretically remains will cause the netfs_folio struct to be discarded... but will leave the dirty flag set. If the folio is then read via mmap(), netfs_read_folio() will see that the page is dirty and jump to netfs_read_gaps() to fill in the missing bits. netfs_read_gaps(), however, expects there to be a netfs_folio struct present and can oops because truncate removed it. Fix this by calling folio_cancel_dirty() in netfs_invalidate_folio() in the event that all the dirty data in the folio is erased (as nfs does). Also add some tracepoints to log modifications to a dirty page. This can be reproduced with something like: dd if=/dev/zero of=/xfstest.test/foo bs=1M count=1 umount /xfstest.test mount /xfstest.test xfs_io -c "w 0xbbbf 0xf96c" \ -c "truncate 0xbbbf" \ -c "mmap -r 0xb000 0x11000" \ -c "mr 0xb000 0x11000" \ /xfstest.test/foo with fscaching disabled (otherwise streaming writes are suppressed) and a change to netfs_perform_write() to disallow streaming writes if the fd is open O_RDWR: if (//(file->f_mode & FMODE_READ) || <--- comment this out netfs_is_cache_enabled(ctx)) { It should be reproducible even without this change, but if prevents the above trivial xfs_io command from reproducing it. Note that the initial dd is important: the file must start out sufficiently large that the zero-point logic doesn't just clear the gaps because it knows there's nothing in the file to read yet. Unmounting and mounting is needed to clear the pagecache (there are other ways to do that that may also work). This was initially reproduced with the generic/522 xfstest on some patches that remove the FMODE_READ restriction. Fixes: 9ebff83e6481 ("netfs: Prep to use folio->private for write grouping and streaming write") Reported-by: Marc Dionne Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-12-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 8c936fc575d5..0b702f74aefe 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -194,6 +194,10 @@ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ + EM(netfs_folio_trace_invalidate_all, "inval-all") \ + EM(netfs_folio_trace_invalidate_front, "inval-front") \ + EM(netfs_folio_trace_invalidate_middle, "inval-mid") \ + EM(netfs_folio_trace_invalidate_tail, "inval-tail") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \ EM(netfs_folio_trace_kill_g, "kill-g") \ -- cgit v1.2.3 From 7b4dcf1b9455a6e52ac7478b4057dbe10359576d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:50 +0100 Subject: netfs: Fix streaming write being overwritten In order to avoid reading whilst writing, netfslib will allow "streaming writes" in which dirty data is stored directly into folios without reading them first. Such folios are marked dirty but may not be marked uptodate. If a folio is entirely written by a streaming write, uptodate will be set, otherwise it will have a netfs_folio struct attached to ->private recording the dirty region. In the event that a partially written streaming write page is to be overwritten entirely by a single write(), netfs_perform_write() will try to copy over it, but doesn't discard the netfs_folio if it succeeds; further, it doesn't correctly handle a partial copy that overwrites some of the dirty data. Fix this by the following: (1) If the folio is successfully overwritten, free the netfs_folio struct before marking the page uptodate. (2) If the copy to the folio partially fails, but short of the dirty data, just ignore the copy. (3) If the copy partially fails and overwrites some of the dirty data, accept the copy, update the netfs_folio struct to record the new data. If the folio is now filled, free the netfs_folio and set uptodate, otherwise return a partial write. Found with: fsx -q -N 1000000 -p 10000 -o 128000 -l 600000 \ /xfstest.test/junk --replay-ops=junk.fsxops using the following as junk.fsxops: truncate 0x0 0 0x927c0 write 0x63fb8 0x53c8 0 copy_range 0xb704 0x19b9 0x24429 0x79380 write 0x2402b 0x144a2 0x90660 * write 0x204d5 0x140a0 0x927c0 * copy_range 0x1f72c 0x137d0 0x7a906 0x927c0 * read 0x00000 0x20000 0x9157c read 0x20000 0x20000 0x9157c read 0x40000 0x20000 0x9157c read 0x60000 0x20000 0x9157c read 0x7e1a0 0xcfb9 0x9157c on cifs with the default cache option. It shows folio 0x24 misbehaving if the FMODE_READ check is commented out in netfs_perform_write(): if (//(file->f_mode & FMODE_READ) || netfs_is_cache_enabled(ctx)) { and no fscache. This was initially found with the generic/522 xfstest. Fixes: 8f52de0077ba ("netfs: Reduce number of conditional branches in netfs_perform_write()") Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-14-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 0b702f74aefe..aa9940ba307b 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -177,6 +177,9 @@ EM(netfs_folio_is_uptodate, "mod-uptodate") \ EM(netfs_just_prefetch, "mod-prefetch") \ EM(netfs_whole_folio_modify, "mod-whole-f") \ + EM(netfs_whole_folio_modify_efault, "mod-whole-f!") \ + EM(netfs_whole_folio_modify_filled, "mod-whole-f+") \ + EM(netfs_whole_folio_modify_filled_efault, "mod-whole-f+!") \ EM(netfs_modify_and_clear, "mod-n-clear") \ EM(netfs_streaming_write, "mod-streamw") \ EM(netfs_streaming_write_cont, "mod-streamw+") \ -- cgit v1.2.3 From dbe556972100fabb8e5a1b3d2163831ff07b1e8e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:56 +0100 Subject: netfs: Fix potential UAF in netfs_unlock_abandoned_read_pages() netfs_unlock_abandoned_read_pages(rreq) accesses the index of the folios it is wanting to unlock and compares that to rreq->no_unlock_folio so that it doesn't unlock a folio being read for netfs_perform_write() or netfs_write_begin(). However, given that netfs_unlock_abandoned_read_pages() is called _after_ NETFS_RREQ_IN_PROGRESS is cleared, the one folio that it's not allowed to dereference is the one specified by ->no_unlock_folio as ownership immediately reverts to the caller. Fix this by storing the folio pointer instead and using that rather than the index. Also fix netfs_unlock_read_folio() where the same applies. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-20-dhowells@redhat.com cc: Paulo Alcantara cc: Viacheslav Dubeyko cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4fd1d796ad73..243c0f737938 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -252,7 +252,7 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long abandon_to; /* Position to abandon folios to */ - pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + const struct folio *no_unlock_folio; /* Don't unlock this folio after read */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; unsigned int rsize; /* Maximum read size (0 for none) */ -- cgit v1.2.3 From ccde2ac757c713535b224233a296de40efe5212d Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 12 May 2026 13:33:58 +0100 Subject: netfs: Fix folio->private handling in netfs_perform_write() Under some circumstances, netfs_perform_write() doesn't correctly manipulate folio->private between NULL, NETFS_FOLIO_COPY_TO_CACHE, pointing to a group and pointing to a netfs_folio struct, leading to potential multiple attachments of private data with associated folio ref leaks and also leaks of netfs_folio structs or netfs_group refs. Fix this by consolidating the place at which a folio is marked uptodate in one place and having that look at what's attached to folio->private and decide how to clean it up and then set the new group. Also, the content shouldn't be flushed if group is NULL, even if a group is specified in the netfs_group parameter, as that would be the case for a new folio. A filesystem should always specify netfs_group or never specify netfs_group. The Sashiko auto-review tool noted that it was theoretically possible that the fpos >= ctx->zero_point section might leak if it modified a streaming write folio. This is unlikely, but with a network filesystem, third party changes can happen. It also pointed out that __netfs_set_group() would leak if called multiple times on the same folio from the "whole folio modify section". Fixes: 8f52de0077ba ("netfs: Reduce number of conditional branches in netfs_perform_write()") Closes: https://sashiko.dev/#/patchset/20260414082004.3756080-1-dhowells%40redhat.com Signed-off-by: David Howells Link: https://patch.msgid.link/20260512123404.719402-22-dhowells@redhat.com cc: Paulo Alcantara cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- include/trace/events/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index aa9940ba307b..082cb03c6131 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -181,6 +181,7 @@ EM(netfs_whole_folio_modify_filled, "mod-whole-f+") \ EM(netfs_whole_folio_modify_filled_efault, "mod-whole-f+!") \ EM(netfs_modify_and_clear, "mod-n-clear") \ + EM(netfs_modify_and_clear_rm_finfo, "mod-n-clear+") \ EM(netfs_streaming_write, "mod-streamw") \ EM(netfs_streaming_write_cont, "mod-streamw+") \ EM(netfs_flush_content, "flush") \ -- cgit v1.2.3 From 620072fd783290ad92c2d445a47b0a61b161f352 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Apr 2026 12:31:17 -0700 Subject: mm/damon: fix damos_stat tracepoint format for sz_applied The print format is wrongly marking sz_applied as sz_tried. Fix it. Link: https://lore.kernel.org/20260426193119.88095-1-sj@kernel.org Fixes: 804c26b961da ("mm/damon/core: add trace point for damos stat per apply interval") Signed-off-by: SeongJae Park Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: # 7.0.x Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 24fc402ab3c8..7e25f4469b81 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -41,7 +41,7 @@ TRACE_EVENT(damos_stat_after_apply_interval, ), TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " - "nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu " + "nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu " "qt_exceeds=%lu nr_snapshots=%lu", __entry->context_idx, __entry->scheme_idx, __entry->nr_tried, __entry->sz_tried, -- cgit v1.2.3 From 6a288a4ddb4a994490505ab5f41c445f8e6b6467 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Tue, 21 Apr 2026 17:39:07 +0200 Subject: mm/page_alloc: fix initialization of tags of the huge zero folio with init_on_free __GFP_ZEROTAGS semantics are currently a bit weird, but effectively this flag is only ever set alongside __GFP_ZERO and __GFP_SKIP_KASAN. If we run with init_on_free, we will zero out pages during __free_pages_prepare(), to skip zeroing on the allocation path. However, when allocating with __GFP_ZEROTAG set, post_alloc_hook() will consequently not only skip clearing page content, but also skip clearing tag memory. Not clearing tags through __GFP_ZEROTAGS is irrelevant for most pages that will get mapped to user space through set_pte_at() later: set_pte_at() and friends will detect that the tags have not been initialized yet (PG_mte_tagged not set), and initialize them. However, for the huge zero folio, which will be mapped through a PMD marked as special, this initialization will not be performed, ending up exposing whatever tags were still set for the pages. The docs (Documentation/arch/arm64/memory-tagging-extension.rst) state that allocation tags are set to 0 when a page is first mapped to user space. That no longer holds with the huge zero folio when init_on_free is enabled. Fix it by decoupling __GFP_ZEROTAGS from __GFP_ZERO, passing to tag_clear_highpages() whether we want to also clear page content. Invert the meaning of the tag_clear_highpages() return value to have clearer semantics. Reproduced with the huge zero folio by modifying the check_buffer_fill arm64/mte selftest to use a 2 MiB area, after making sure that pages have a non-0 tag set when freeing (note that, during boot, we will not actually initialize tags, but only set KASAN_TAG_KERNEL in the page flags). $ ./check_buffer_fill 1..20 ... not ok 17 Check initial tags with private mapping, sync error mode and mmap memory not ok 18 Check initial tags with private mapping, sync error mode and mmap/mprotect memory ... This code needs more cleanups; we'll tackle that next, like decoupling __GFP_ZEROTAGS from __GFP_SKIP_KASAN. [akpm@linux-foundation.org: s/__GPF_ZERO/__GFP_ZERO/, per David] Link: https://lore.kernel.org/20260421-zerotags-v2-1-05cb1035482e@kernel.org Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Catalin Marinas Tested-by: Lance Yang Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Will Deacon Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 10 +++++----- include/linux/highmem.h | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6c75df30a281..cd4972a7c97c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -273,11 +273,11 @@ enum { * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that - * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting - * memory tags at the same time as zeroing memory has minimal additional - * performance impact. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time. Setting memory tags at + * the same time as zeroing memory (e.g., with __GFP_ZERO) has minimal + * additional performance impact. However, __GFP_ZEROTAGS also zeroes the tags + * even if memory is not getting zeroed at allocation time (e.g., + * with init_on_free). * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by diff --git a/include/linux/highmem.h b/include/linux/highmem.h index af03db851a1d..d7aac9de1c8a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -347,10 +347,11 @@ static inline void clear_highpage_kasan_tagged(struct page *page) #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -/* Return false to let people know we did not initialize the pages */ -static inline bool tag_clear_highpages(struct page *page, int numpages) +/* Returns true if the caller has to initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages, + bool clear_pages) { - return false; + return clear_pages; } #endif -- cgit v1.2.3 From e83f5e24da741fa9405aeeff00b08c5ee7c37b88 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Wed, 6 May 2026 19:43:30 +0800 Subject: Bluetooth: serialize accept_q access bt_sock_poll() walks the accept queue without synchronization, while child teardown can unlink the same socket and drop its last reference. The unsynchronized accept queue walk has existed since the initial Bluetooth import. Protect accept_q with a dedicated lock for queue updates and polling. Also rework bt_accept_dequeue() to take temporary child references under the queue lock before dropping it and locking the child socket. Fixes: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Reported-by: Jann Horn Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Signed-off-by: Jiexun Wang Reviewed-by: Jann Horn Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 69eed69f7f26..3faea66b1979 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -398,6 +398,7 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src); struct bt_sock { struct sock sk; struct list_head accept_q; + spinlock_t accept_q_lock; /* protects accept_q */ struct sock *parent; unsigned long flags; void (*skb_msg_name)(struct sk_buff *, void *, int *); -- cgit v1.2.3 From 5522d65d81a711c60a9969d37a485d48d0ad1496 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 10 May 2026 13:46:05 +0300 Subject: ipvs: avoid possible loop in ip_vs_dst_event on resizing Sashiko points out that unprivileged user can frequently call ip_vs_flush() or ip_vs_del_service() to trigger svc_table_changes updates that can lead to infinite loop in ip_vs_dst_event(). This can also happen if the user triggers frequent table resizing without deleting all services. We should also consider the possible effects if the user triggers many NETDEV_DOWN events. One way to solve it is to hold svc_resize_sem in ip_vs_dst_event() but this can block the dev notifier during the whole resizing process. Instead, use new rw_semaphore svc_replace_sem to protect just the svc_table replacement which is a short code section. Then hold svc_replace_sem in ip_vs_dst_event() to serialize with replacing the svc_table. As result, loop is avoided as there is no need to repeat the table walking from the start. By this way changes in svc_table_changes can happen only when all services are removed and all dev references dropped which allows us to abort the table walking. As IP_VS_WORK_SVC_NORESIZE is the flag used to stop the svc_resize_work under service_mutex, we should check only this flag often but not while under service_mutex. To remove the mutex_trylock() for service_mutex in the second phase where the resizer installs the new table after rehashing, we will avoid holding the service_mutex there. As result, the code in configuration context which is under service_mutex should access ipvs->svc_table under RCU because it can be replaced at anytime and released after a RCU grace period. As for ip_vs_zero_all(), it needs different solution as a table walker which can escape single RCU read-side critical section: to hold the svc_replace_sem to prevent table to be replaced. In ip_vs_status_show() prefer to hold svc_replace_sem to avoid many loops, just detect if the svc_table is removed. Prefer the newly attached table for the u_thresh/l_thresh checks to know when to grow/shrink while adding or deleting services because the new table size is based on the latest parameters. Link: https://sashiko.dev/#/patchset/20260505001648.360569-1-pablo%40netfilter.org Fixes: 840aac3d900d ("ipvs: use resizable hash table for services") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 02762ce73a0c..a02e569813d2 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1186,8 +1186,9 @@ struct netns_ipvs { struct timer_list dest_trash_timer; /* expiration timer */ struct mutex service_mutex; /* service reconfig */ struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct rw_semaphore svc_replace_sem; /* svc_table replace */ struct delayed_work svc_resize_work; /* resize svc_table */ - atomic_t svc_table_changes;/* ++ on new table */ + atomic_t svc_table_changes;/* ++ on table changes */ /* Service counters */ atomic_t num_services[IP_VS_AF_MAX]; /* Services */ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ -- cgit v1.2.3 From b2870fc21601db9133bc70c48c603b487614fa3b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 14 May 2026 16:46:38 +0200 Subject: netfilter: br_netfilter: Reallocate headroom if necessary in neigh_hh_bridge() neigh_hh_bridge() assumes the skb always has sufficient headroom to copy the aligned L2 header. This assumption can trigger the crash reported below using the following netfilter setup: $modprobe br_netfilter $sysctl -w net.bridge.bridge-nf-call-iptables=1 $root@OpenWrt:~# nft list ruleset table ip nat { chain prerouting { type nat hook prerouting priority dstnat; policy accept; ip daddr 192.168.83.123 dnat to 192.168.83.120 } } - iperf3 client (192.168.83.119) --> bridge (192.168.83.118) --> iperf3 server (192.168.83.120) the iperf3 client is sending packet for 192.168.83.123 to the bridge device. [ 1579.036575] Unable to handle kernel write to read-only memory at virtual address ffffff8004d76ffe [ 1579.045482] Mem abort info: [ 1579.048273] ESR = 0x000000009600004f [ 1579.052024] EC = 0x25: DABT (current EL), IL = 32 bits [ 1579.057363] SET = 0, FnV = 0 [ 1579.060417] EA = 0, S1PTW = 0 [ 1579.063550] FSC = 0x0f: level 3 permission fault [ 1579.068345] Data abort info: [ 1579.071224] ISV = 0, ISS = 0x0000004f, ISS2 = 0x00000000 [ 1579.076720] CM = 0, WnR = 1, TnD = 0, TagAccess = 0 [ 1579.081770] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 1579.087092] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000080dc4000 [ 1579.093794] [ffffff8004d76ffe] pgd=180000009ffff003, p4d=180000009ffff003, pud=180000009ffff003, pmd=180000009ffe3003, pte=0060000084d76787 [ 1579.106343] Internal error: Oops: 000000009600004f [#1] SMP [ 1579.193824] CPU: 0 UID: 0 PID: 235 Comm: napi/qdma_eth-3 Tainted: G O 6.12.57 #0 [ 1579.202614] Tainted: [O]=OOT_MODULE [ 1579.206102] Hardware name: Airoha AN7581 Evaluation Board (DT) [ 1579.211929] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 1579.218889] pc : br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter] [ 1579.225859] lr : br_nf_pre_routing_finish_bridge+0x18c/0xcc8 [br_netfilter] [ 1579.232822] sp : ffffffc0817cba20 [ 1579.236128] x29: ffffffc0817cba20 x28: 0000000000000000 x27: ffffff8002b89000 [ 1579.243273] x26: ffffff8004d7700e x25: 0000000000000008 x24: 0000000000000000 [ 1579.250416] x23: ffffffc08179d4c0 x22: 0000000000000000 x21: ffffffc08179d4c0 [ 1579.257561] x20: ffffff8004d9b800 x19: ffffff8015010000 x18: 0000000000000014 [ 1579.264704] x17: ffffffbf9e930000 x16: ffffffc0817c8000 x15: 0000000000000070 [ 1579.271848] x14: 0000000000000080 x13: 0000000000000001 x12: 0000000000000000 [ 1579.278993] x11: ffffffc0798caae0 x10: ffffff8014db6fd8 x9 : 0000000000000000 [ 1579.286136] x8 : 0000000000000003 x7 : ffffffc08171f628 x6 : 000000001a3b83d3 [ 1579.293281] x5 : 0000000000000000 x4 : 1beb76f22fee0000 x3 : ffffff8004d7700e [ 1579.300425] x2 : 0000000000000000 x1 : ffffff8004d9b8bc x0 : ffffff80026ed000 [ 1579.307570] Call trace: [ 1579.310018] br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter] [ 1579.316632] br_nf_hook_thresh+0xd4/0x14bc [br_netfilter] [ 1579.322032] br_nf_hook_thresh+0x250/0x14bc [br_netfilter] [ 1579.327517] br_nf_hook_thresh+0x76c/0x14bc [br_netfilter] [ 1579.333003] br_handle_frame+0x180/0x480 [ 1579.336935] __netif_receive_skb_core.constprop.0+0x540/0xf40 [ 1579.342682] __netif_receive_skb_one_core+0x28/0x50 [ 1579.347561] process_backlog+0x98/0x1e0 [ 1579.351398] __napi_poll+0x34/0x1c4 [ 1579.354887] net_rx_action+0x178/0x330 [ 1579.358638] handle_softirqs+0x108/0x2d4 [ 1579.362560] __do_softirq+0x10/0x18 [ 1579.366051] ____do_softirq+0xc/0x20 [ 1579.369627] call_on_irq_stack+0x30/0x4c [ 1579.373550] do_softirq_own_stack+0x18/0x20 [ 1579.377734] do_softirq+0x4c/0x60 [ 1579.381050] __local_bh_enable_ip+0x88/0x98 [ 1579.385234] napi_threaded_poll_loop+0x188/0x21c [ 1579.389853] napi_threaded_poll+0x70/0x80 [ 1579.393863] kthread+0xd8/0xdc [ 1579.396918] ret_from_fork+0x10/0x20 [ 1579.400499] Code: 88dffc22 3707ffc2 f9406663 f9406684 (f81f0064) [ 1579.406589] ---[ end trace 0000000000000000 ]--- [ 1579.411209] Kernel panic - not syncing: Oops: Fatal exception in interrupt [ 1579.418083] SMP: stopping secondary CPUs [ 1579.422012] Kernel Offset: disabled Fix the issue reallocating the skb headroom if necessary in neigh_hh_bridge routine. Fixes: e179e6322ac33 ("netfilter: bridge-netfilter: Fix MAC header handling with IP DNAT") Reviewed-by: Ido Schimmel Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- include/net/neighbour.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 2dfee6d4258a..8860cc2175fc 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -489,11 +489,15 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { - unsigned int seq, hh_alen; + unsigned int seq, hh_alen = HH_DATA_ALIGN(ETH_HLEN); + int err; + + err = skb_cow_head(skb, hh_alen); + if (err) + return err; do { seq = read_seqbegin(&hh->hh_lock); - hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; -- cgit v1.2.3 From e196115ec330a18de415bdb9f5071aa9f08e53ce Mon Sep 17 00:00:00 2001 From: Haoze Xie Date: Fri, 15 May 2026 11:19:02 +0800 Subject: netfilter: nf_queue: hold bridge skb->dev while queued br_pass_frame_up() rewrites skb->dev from the ingress port to the bridge master before queueing bridge LOCAL_IN packets. NFQUEUE only holds references on state.in/out and bridge physdevs, so a queued bridge packet can retain a freed bridge master in skb->dev until reinjection. When the verdict is reinjected later, br_netif_receive_skb() re-enters the receive path with skb->dev still pointing at the freed bridge master, triggering a use-after-free. Store skb->dev in the queue entry, hold a reference on it for the queue lifetime, and use the saved device when dropping queued packets during NETDEV_DOWN handling. Fixes: ac2863445686 ("netfilter: bridge: add nf_afinfo to enable queuing to userspace") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Haoze Xie Signed-off-by: Ren Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index d17035d14d96..3978c3174cdb 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -14,6 +14,7 @@ struct nf_queue_entry { struct list_head list; struct rhash_head hash_node; struct sk_buff *skb; + struct net_device *skb_dev; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -- cgit v1.2.3 From 3d562d35a044ae798cab421c65a116f8cedfa5d4 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 17 May 2026 09:55:28 +0200 Subject: bpf: Check global subprog exception paths Global subprogs are verified independently and are not descended into when their callers are symbolically executed. This means a caller can hold references or locks across a global subprog call that may throw, while the verifier only checks the non-exceptional return path at the call site. Record whether a subprog might throw in the CFG summary pass, alongside the existing might_sleep and packet-data-changing summaries, and propagate that effect through reachable callees. When a global subprog is marked as possibly throwing, push the normal continuation and validate the exceptional path immediately at the call site, avoiding a synthetic exception state and associated special case in the pruning checks. Fixes: f18b03fabaa9 ("bpf: Implement BPF exceptions") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20260517075530.3461166-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b148f816f25b..185b2aa43a42 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -729,6 +729,7 @@ struct bpf_subprog_info { */ s16 fastcall_stack_off; bool has_tail_call: 1; + bool might_throw: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; bool is_cb: 1; @@ -1308,6 +1309,7 @@ void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_subprog_is_global(const struct bpf_verifier_env *env, int subprog); int bpf_find_subprog(struct bpf_verifier_env *env, int off); +bool bpf_is_throw_kfunc(struct bpf_insn *insn); int bpf_compute_const_regs(struct bpf_verifier_env *env); int bpf_prune_dead_branches(struct bpf_verifier_env *env); int bpf_check_cfg(struct bpf_verifier_env *env); -- cgit v1.2.3 From f233124fb36cd57ef09f96d517a38ab4b902e15e Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:01 +0200 Subject: ata: libata-scsi: do not use the deferred QC feature on PMPs with CBS When using Port Multipliers (PMPs) with Command-Based Switching (CBS), you can only issue commands to one link at a time. For PMPs with CBS, there is already code to handle commands being sent to different links in sata_pmp_qc_defer_cmd_switch() using ap->excl_link. sata_sil24 also makes use of ap->excl_link. A user on the list reported that commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") broke PMPs with CBS. The commit introduced code that stores a deferred qc in ap->deferred_qc, to later be issued via a workqueue. It turns out that this change is incompatible with the existing ap->excl_link handling used by PMPs with CBS. Thus, modify sata_pmp_qc_defer_cmd_switch() and sil24_qc_defer() to return ATA_DEFER_LINK_EXCL, and make sure that the deferred QC handling via workqueue is not used for this return value. This way, PMPs with CBS will work once again. Note that the starvation referenced in commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") can only happen on libsas ports, and libsas does not support Port Multipliers, thus there is no harm of reverting back to the previous way of deferring commands for PMPs with CBS. Non-libsas ports connected to anything but a PMP with CBS (e.g. a normal drive or a PMP with FBS) will continue using the deferred workqueue, since it does result in lower completion latencies for non-NCQ commands, even though the workqueue is not strictly needed to avoid starvation for non-libsas ports. If we want to modify the scope of the workqueue issuing to also handle PMPs with CBS, then we should ensure that we can save both NCQ and non-NCQ commands in ap->deferred_qc, while also removing the existing PMP CBS handling using ap->excl_link, such that we don't duplicate features. While at it, also add a comment explaining how the ap->excl_link mechanism works. Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reported-by: Tommy Kelly Closes: https://lore.kernel.org/linux-ide/ce09cc21-a8e9-4845-b205-35411e22fba9@tkel.ly/ Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 5c085ef4eda7..360776016b50 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -371,6 +371,7 @@ enum { /* return values for ->qc_defer */ ATA_DEFER_LINK = 1, ATA_DEFER_PORT = 2, + ATA_DEFER_LINK_EXCL = 3, /* desc_len for ata_eh_info and context */ ATA_EH_DESC_LEN = 80, -- cgit v1.2.3 From 759e8756da00aa115d504a18155b1d1ee1cc12e8 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 14 May 2026 09:39:02 +0200 Subject: ata: libata-scsi: do not needlessly defer commands when using PMP with FBS The ACS specification does not allow a non-NCQ command to be issued while an NCQ command is outstanding. Commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") introduced a feature where a deferred non-NCQ command gets issued from a workqueue. The design stores a single non-NCQ command per port. However, when using Port Multipliers (PMPs), specifically PMPs that support FIS-Based Switching (FBS), non-NCQ and NCQ commands can be mixed on the same port, just not for the same link, see e.g. ata_std_qc_defer() which is, and always has operated on a per-link basis. Therefore, move the deferred_qc from struct ata_port to struct ata_link. This way, when using a PMP with FBS, we will not needlessly defer commands to all other links, just because one link issued a non-NCQ command while having an NCQ command outstanding. Only commands for that specific link will be deferred. This is in line with how PMPs with FBS worked before commit 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation"). Fixes: 0ea84089dbf6 ("ata: libata-scsi: avoid Non-NCQ command starvation") Tested-by: Tommy Kelly Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- include/linux/libata.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 360776016b50..127229fbd1a6 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -855,6 +855,9 @@ struct ata_link { unsigned int sata_spd; /* current SATA PHY speed */ enum ata_lpm_policy lpm_policy; + struct work_struct deferred_qc_work; + struct ata_queued_cmd *deferred_qc; + /* record runtime error info, protected by host_set lock */ struct ata_eh_info eh_info; /* EH context */ @@ -900,9 +903,6 @@ struct ata_port { u64 qc_active; int nr_active_links; /* #links with active qcs */ - struct work_struct deferred_qc_work; - struct ata_queued_cmd *deferred_qc; - struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ -- cgit v1.2.3 From 379e8f1ca5e919b130b40d8115d92a536e5f8d7a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 18 May 2026 13:41:45 +0200 Subject: drm/gem: Make the GEM LRU lock part of drm_device Recently, a few races have been discovered in the GEM LRU logic, all of them caused by the fact the LRU lock is accessed through gem->lru->lock, and that very same lock also protects changes to gem->lru, leading to situations where gem->lru needs to first be accessed without the lock held, to then get the lru to access the lock through and finally take the lock and do the expected operation. Currently, the only driver making use of this API (MSM) declares a device-wide lock, and the user we're about to add (panthor) will do the same. There's no evidence that we will ever have a driver that wants different pools of LRUs protected by different locks under the same drm_device. So we're better off moving this lock to drm_device and always locking it through obj->dev->gem_lru_mutex, or directly through dev->gem_lru_mutex. If anyone ever needs more fine-grained locking, this can be revisited to pass some drm_gem_lru_pool object representing the pool of LRUs under a specific lock, but for now, the per-device lock seems to be enough. Fixes: e7c2af13f811 ("drm/gem: Add LRU/shrinker helper") Reported-by: Chia-I Wu Closes: https://gitlab.freedesktop.org/panfrost/linux/-/work_items/86 Reviewed-by: Rob Clark Reviewed-by: Liviu Dudau Reviewed-by: Steven Price Reviewed-by: Chia-I Wu Link: https://patch.msgid.link/20260518-panthor-shrinker-fixes-v4-1-1920234470d5@collabora.com Signed-off-by: Boris Brezillon --- include/drm/drm_device.h | 7 +++++++ include/drm/drm_gem.h | 20 +++++++++----------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h index bc78fb77cc27..768a8dae83c5 100644 --- a/include/drm/drm_device.h +++ b/include/drm/drm_device.h @@ -375,6 +375,13 @@ struct drm_device { * Root directory for debugfs files. */ struct dentry *debugfs_root; + + /** + * @gem_lru_mutex: + * + * Lock protecting movement of GEM objects between LRUs. + */ + struct mutex gem_lru_mutex; }; void drm_dev_set_dma_dev(struct drm_device *dev, struct device *dma_dev); diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 86f5846154f7..8a704f6a65c1 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -245,17 +245,11 @@ struct drm_gem_object_funcs { * for lockless &shrinker.count_objects, and provides * &drm_gem_lru_scan for driver's &shrinker.scan_objects * implementation. + * + * Any access to this kind of object must be done with + * drm_device::gem_lru_mutex held. */ struct drm_gem_lru { - /** - * @lock: - * - * Lock protecting movement of GEM objects between LRUs. All - * LRUs that the object can move between should be protected - * by the same lock. - */ - struct mutex *lock; - /** * @count: * @@ -453,6 +447,9 @@ struct drm_gem_object { * @lru: * * The current LRU list that the GEM object is on. + * + * Access to this field must be done with drm_device::gem_lru_mutex + * held. */ struct drm_gem_lru *lru; }; @@ -610,12 +607,13 @@ void drm_gem_unlock_reservations(struct drm_gem_object **objs, int count, int drm_gem_dumb_map_offset(struct drm_file *file, struct drm_device *dev, u32 handle, u64 *offset); -void drm_gem_lru_init(struct drm_gem_lru *lru, struct mutex *lock); +void drm_gem_lru_init(struct drm_gem_lru *lru); void drm_gem_lru_remove(struct drm_gem_object *obj); void drm_gem_lru_move_tail_locked(struct drm_gem_lru *lru, struct drm_gem_object *obj); void drm_gem_lru_move_tail(struct drm_gem_lru *lru, struct drm_gem_object *obj); unsigned long -drm_gem_lru_scan(struct drm_gem_lru *lru, +drm_gem_lru_scan(struct drm_device *dev, + struct drm_gem_lru *lru, unsigned int nr_to_scan, unsigned long *remaining, bool (*shrink)(struct drm_gem_object *obj, struct ww_acquire_ctx *ticket), -- cgit v1.2.3 From 8817005efbdfdf5d4e4814cb5dc52b53d12917d7 Mon Sep 17 00:00:00 2001 From: Qing Ming Date: Sat, 16 May 2026 15:08:49 +0800 Subject: cgroup/rstat: validate cpu before css_rstat_cpu() access css_rstat_updated() is exposed as a BPF kfunc and accepts a caller-provided cpu argument. The function uses cpu for per-cpu rstat lookups without checking whether it refers to a valid possible CPU. A BPF iter/cgroup program with CAP_BPF and CAP_PERFMON can pass an invalid cpu value. On an unfixed UBSCAN_BOUNDS test kernel, cpu == 0x7fffffff triggers: UBSAN: array-index-out-of-bounds in kernel/cgroup/rstat.c:31:9 index 2147483647 is out of range for type 'long unsigned int [64]' Call Trace: css_rstat_updated bpf_iter_run_prog cgroup_iter_seq_show bpf_seq_read Add cpu validation to the BPF-facing css_rstat_updated() kfunc and move the common implementation to __css_rstat_updated() for in-kernel callers. Fixes: a319185be9f5 ("cgroup: bpf: enable bpf programs to integrate with rstat") Signed-off-by: Qing Ming Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e52160e85af4..e011dc43fcf1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -776,6 +776,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ +void __css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); -- cgit v1.2.3 From 8939562b16052c75b908d3c5f968bffb526fc6e9 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 18 May 2026 15:02:08 +0800 Subject: efi: efi.h: Remove extra semicolon Remove extra semicolons from comments. Signed-off-by: Rong Tao Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 72e76ec54641..ccbc35479684 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -61,7 +61,7 @@ typedef void *efi_handle_t; /* * The UEFI spec and EDK2 reference implementation both define EFI_GUID as - * struct { u32 a; u16; b; u16 c; u8 d[8]; }; and so the implied alignment + * struct { u32 a; u16 b; u16 c; u8 d[8]; }; and so the implied alignment * is 32 bits not 8 bits like our guid_t. In some cases (i.e., on 32-bit ARM), * this means that firmware services invoked by the kernel may assume that * efi_guid_t* arguments are 32-bit aligned, and use memory accessors that -- cgit v1.2.3 From 7122ff96068a03595bde2fbafaca82ca2ed8084e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 13 May 2026 12:00:16 -0300 Subject: RDMA/core: Do not read wild stack memory in uverbs_get_handler_fn() Sashiko points out the legacy write path in ib_uverbs_write() does allocate a struct uverbs_attr_bundle, but it doesn't wrap it in a bundle_priv so downcasting here isn't safe. Instead lift the method_elm out of the bundle_priv and use it for the debug function. The legacy write path will leave it set as NULL since the write method_elm uses a different type. Cc: stable@vger.kernel.org Fixes: 1de9287ece44 ("RDMA: Add ib_copy_validate_udata_in()") Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- include/rdma/uverbs_ioctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index e2af17da3e32..c89428030d61 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -635,6 +635,7 @@ struct uverbs_attr_bundle { struct ib_uverbs_file *ufile; struct ib_ucontext *context; struct ib_uobject *uobject; + const struct uverbs_api_ioctl_method *method_elm; DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN); ); struct uverbs_attr attrs[]; -- cgit v1.2.3 From 0cb5a74faa3bdcfa3b18735d554e12c0f615e35d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Mon, 18 May 2026 15:44:57 +0200 Subject: net: airoha: Fix NPU RX DMA descriptor bits In an internal review from Airoha, it was notice that the RX DMA descriptor bits and mask are wrong. These values probably refer to an old NPU firmware never published. The previous value works correctly but it was reported that in some specific condition in mixed scenario with both Ethernet and WiFi offload it's possible that RX DMA descriptor signal wrong value with the problem to the RX ring or packets getting dropped. To handle these specific scenario, apply the new suggested bits mask from Airoha. Correct functionality of both AN7581 NPU and MT7996 variant were verified and confirmed working. Fixes: a7fc8c641cab ("net: airoha: Fix npu rx DMA definitions") Signed-off-by: Christian Marangi Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260518134530.3683-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index d01ef4a6b3d7..7589fccfeef6 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -71,9 +71,9 @@ static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, #define NPU_RX1_DESC_NUM 512 /* CTRL */ -#define NPU_RX_DMA_DESC_LAST_MASK BIT(27) -#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(26, 14) -#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(13, 1) +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) #define NPU_RX_DMA_DESC_DONE_MASK BIT(0) /* INFO */ #define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 29) -- cgit v1.2.3 From b8d7519352ba8c6df83259295d4a3bad093cae90 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 15 May 2026 15:13:25 -0700 Subject: net: shaper: rework the VALID marking (again) Recent commit changed the semantics from NOT_VALID to VALID. I didn't realize that the flags are not stored atomically with the entry in XArray. There's still a race of reader observing a VALID mark for a slot, getting interrupted, writer replacing the entry with a different one, reader continuing, fetching the entry which is now a different pointer than the pointer for which VALID was meant. The biggest consequence of this is that we may see a UAF since net_shaper_rollback() assumed that entries without VALID can be freed without observing RCU. Looks like the XArray marks are buying us nothing at this point. Let's convert the code to an explicit valid field. The smp_load_acquire() / smp_store_release() barriers are marginally cleaner. Reported-by: Sashiko Fixes: 93954b40f6a4 ("net-shapers: implement NL set and delete operations") Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260515221325.1685455-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/net_shaper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h index 5c3f49b52fe9..3939b816b001 100644 --- a/include/net/net_shaper.h +++ b/include/net/net_shaper.h @@ -53,6 +53,7 @@ struct net_shaper { /* private: */ u32 leaves; /* accounted only for NODE scope */ + bool valid; struct rcu_head rcu; }; -- cgit v1.2.3 From 2b50aceafe6606ea52ed42aadd1b4d44a188aade Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 16 May 2026 00:05:13 +0100 Subject: crypto/krb5, rxrpc: Fix lack of pre-decrypt/pre-verify length checks Change the krb5 crypto library to provide facilities to precheck the length of the message about to be decrypted or verified. Fix AF_RXRPC to make use of this to validate DATA packets secured with RxGK. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Closes: https://sashiko.dev/#/patchset/20260511160753.607296-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Herbert Xu cc: Simon Horman cc: Chuck Lever cc: linux-afs@lists.infradead.org Reviewed-by: Jeffrey Altman Tested-by: Marc Dionne Link: https://patch.msgid.link/20260515230516.2718212-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/crypto/krb5.h | 9 ++++++--- include/trace/events/rxrpc.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 71dd38f59be1..aac3ecf88467 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -121,9 +121,12 @@ size_t crypto_krb5_how_much_buffer(const struct krb5_enctype *krb5, size_t crypto_krb5_how_much_data(const struct krb5_enctype *krb5, enum krb5_crypto_mode mode, size_t *_buffer_size, size_t *_offset); -void crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, - enum krb5_crypto_mode mode, - size_t *_offset, size_t *_len); +int crypto_krb5_where_is_the_data(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t *_offset, size_t *_len); +int crypto_krb5_check_data_len(const struct krb5_enctype *krb5, + enum krb5_crypto_mode mode, + size_t len, size_t min_content); struct crypto_aead *crypto_krb5_prepare_encryption(const struct krb5_enctype *krb5, const struct krb5_buffer *TK, u32 usage, gfp_t gfp); diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 573f2df3a2c9..704a10de6670 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -71,6 +71,7 @@ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ /* RxGK security errors */ \ + EM(rxgk_abort_1_short_header, "rxgk1-short-hdr") \ EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ -- cgit v1.2.3 From 1bbf0ced1d9db73ac7893c2187f3459288603e0d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2026 08:46:11 +0000 Subject: tcp: fix stale per-CPU tcp_tw_isn leak enabling ISN prediction Blamed commit moved the TIME_WAIT-derived ISN from the skb control block to a per-CPU variable, assuming the value would always be consumed by tcp_conn_request() for the same packet that wrote it. That assumption is violated by multiple drop paths between the producer (__this_cpu_write(tcp_tw_isn, isn) in tcp_v{4,6}_rcv()) and the consumer (tcp_conn_request()): - min_ttl / min_hopcount check - xfrm policy check - tcp_inbound_hash() MD5/AO mismatch - tcp_filter() eBPF/SO_ATTACH_FILTER drop - th->syn && th->fin discard in tcp_rcv_state_process() TCP_LISTEN - psp_sk_rx_policy_check() in tcp_v{4,6}_do_rcv() - tcp_checksum_complete() in tcp_v{4,6}_do_rcv() - tcp_v{4,6}_cookie_check() returning NULL When a packet is dropped on any of these paths, tcp_tw_isn is left set. The next SYN processed on the same CPU then consumes the non zero value in tcp_conn_request(), receiving a potentially predictable ISN. This patch moves back tcp_tw_isn to skb->cb[], getting rid of the per-cpu variable. Note that tcp_v{4,6}_fill_cb() do not set it. Very litle impact on overall code size/complexity: $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 2/1 up/down: 8/-15 (-7) Function old new delta tcp_v6_rcv 3038 3042 +4 tcp_v4_rcv 3035 3039 +4 tcp_conn_request 2938 2923 -15 Total: Before=24436060, After=24436053, chg -0.00% Fixes: 41eecbd712b7 ("tcp: replace TCP_SKB_CB(skb)->tcp_tw_isn with a per-cpu field") Reported-by: Chris Mason Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260519084611.2485277-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ecbadcb3a744..98848db62894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -65,8 +65,6 @@ static inline void tcp_orphan_count_dec(void) this_cpu_dec(tcp_orphan_count); } -DECLARE_PER_CPU(u32, tcp_tw_isn); - void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) @@ -1102,10 +1100,13 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : + /* Notes : + * tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ + u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; -- cgit v1.2.3 From a494d3c8d5392bcdff83c2a593df0c160ff9f322 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 30 Apr 2026 12:28:16 +0900 Subject: ring-buffer: Flush and stop persistent ring buffer on panic On real hardware, panic and machine reboot may not flush hardware cache to memory. This means the persistent ring buffer, which relies on a coherent state of memory, may not have its events written to the buffer and they may be lost. Moreover, there may be inconsistency with the counters which are used for validation of the integrity of the persistent ring buffer which may cause all data to be discarded. To avoid this issue, stop recording of the ring buffer on panic and flush the cache of the ring buffer's memory. Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance") Cc: stable@vger.kernel.org Cc: Will Deacon Cc: Mathieu Desnoyers Cc: Ian Rogers Link: https://patch.msgid.link/177751969602.2136606.12031934362587643488.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Acked-by: Catalin Marinas Acked-by: Geert Uytterhoeven Signed-off-by: Steven Rostedt --- include/asm-generic/ring_buffer.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/asm-generic/ring_buffer.h (limited to 'include') diff --git a/include/asm-generic/ring_buffer.h b/include/asm-generic/ring_buffer.h new file mode 100644 index 000000000000..201d2aee1005 --- /dev/null +++ b/include/asm-generic/ring_buffer.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Generic arch dependent ring_buffer macros. + */ +#ifndef __ASM_GENERIC_RING_BUFFER_H__ +#define __ASM_GENERIC_RING_BUFFER_H__ + +#include + +/* Flush cache on ring buffer range if needed. Do nothing by default. */ +#define arch_ring_buffer_flush_range(start, end) do { } while (0) + +#endif /* __ASM_GENERIC_RING_BUFFER_H__ */ -- cgit v1.2.3 From 215c90ee656114f5e8c32408228d97082f8e0eef Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 6 May 2026 13:57:00 +0200 Subject: device property: set fwnode->secondary to NULL in fwnode_init() If a firmware node is allocated on the stack (for instance: temporary software node whose life-time we control) or on the heap - but using a non-zeroing allocation function - and initialized using fwnode_init(), its secondary pointer will contain uninitalized memory which likely will be neither NULL nor IS_ERR() and so may end up being dereferenced (for example: in dev_to_swnode()). Set fwnode->secondary to NULL on initialization. Cc: stable Fixes: 01bb86b380a3 ("driver core: Add fwnode_init()") Signed-off-by: Bartosz Golaszewski Reviewed-by: Rafael J. Wysocki (Intel) Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Link: https://patch.msgid.link/20260506115701.23035-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 80b38fbf2121..31df7608737e 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -208,6 +208,7 @@ struct fwnode_operations { static inline void fwnode_init(struct fwnode_handle *fwnode, const struct fwnode_operations *ops) { + fwnode->secondary = NULL; fwnode->ops = ops; INIT_LIST_HEAD(&fwnode->consumers); INIT_LIST_HEAD(&fwnode->suppliers); -- cgit v1.2.3