From b40cc5adaa80e1471095a62d78233b611d7a558c Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:43 +0800 Subject: bpf, sockmap: Fix incorrect copied_seq calculation A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. The issue is that when reading from ingress_msg, we update tp->copied_seq by default. However, if the data is not from its own protocol stack, tcp->rcv_nxt is not increased. Later, if we convert this socket to a native socket, reading from this socket may fail because copied_seq might be significantly larger than rcv_nxt. This fix also addresses the syzkaller-reported bug referenced in the Closes tag. This patch marks the skmsg objects in ingress_msg. When reading, we update copied_seq only if the data is from its own protocol stack. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Closes: https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983 Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Reviewed-by: Jakub Sitnicki Reviewed-by: John Fastabend Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20260124113314.113584-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 49847888c287..dfdc158ab88c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -141,6 +141,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); +int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags, int *copied_from_self); bool sk_msg_is_readable(struct sock *sk); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) -- cgit v1.2.3 From 929e30f9312514902133c45e51c79088421ab084 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Sat, 24 Jan 2026 19:32:44 +0800 Subject: bpf, sockmap: Fix FIONREAD for sockmap A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new msg_tot_len field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. Note that we intentionally do not include sk_receive_queue data in the FIONREAD result. Data in sk_receive_queue has not yet been processed by the BPF verdict program, and may be redirected to other sockets or dropped. Including it would create semantic ambiguity since this data may never be readable by the user. Unix and VSOCK sockets have similar issues, but fixing them is outside the scope of this patch as it would require more intrusive changes. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20260124113314.113584-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dfdc158ab88c..829b281d6c9c 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -97,6 +97,8 @@ struct sk_psock { struct sk_buff_head ingress_skb; struct list_head ingress_msg; spinlock_t ingress_lock; + /** @msg_tot_len: Total bytes queued in ingress_msg list. */ + u32 msg_tot_len; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -321,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } +static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock) +{ + /* Used by ioctl to read msg_tot_len only; lock-free for performance */ + return READ_ONCE(psock->msg_tot_len); +} + +static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff) +{ + /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock(). + * ingress_lock should be held to prevent concurrent updates to msg_tot_len + */ + WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff); +} + +static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_msg_len_add_locked(psock, diff); + spin_unlock_bh(&psock->ingress_lock); +} + static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { @@ -329,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); + sk_psock_msg_len_add_locked(psock, msg->sg.size); ret = true; } else { sk_msg_free(psock->sk, msg); @@ -345,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); - if (msg) + if (msg) { list_del(&msg->list); + sk_psock_msg_len_add_locked(psock, -msg->sg.size); + } spin_unlock_bh(&psock->ingress_lock); return msg; } +static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock) +{ + return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); +} + static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) { struct sk_msg *msg; spin_lock_bh(&psock->ingress_lock); - msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + msg = sk_psock_peek_msg_locked(psock); spin_unlock_bh(&psock->ingress_lock); return msg; } @@ -523,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return !!psock->saved_data_ready; } +/* for tcp only, sk is locked */ +static inline ssize_t sk_psock_msg_inq(struct sock *sk) +{ + struct sk_psock *psock; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + inq = sk_psock_get_msg_len_nolock(psock); + sk_psock_put(sk, psock); + } + return inq; +} + +/* for udp only, sk is not locked */ +static inline ssize_t sk_msg_first_len(struct sock *sk) +{ + struct sk_psock *psock; + struct sk_msg *msg; + ssize_t inq = 0; + + psock = sk_psock_get(sk); + if (likely(psock)) { + spin_lock_bh(&psock->ingress_lock); + msg = sk_psock_peek_msg_locked(psock); + if (msg) + inq = msg->sg.size; + spin_unlock_bh(&psock->ingress_lock); + sk_psock_put(sk, psock); + } + return inq; +} + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) #define BPF_F_STRPARSER (1UL << 1) -- cgit v1.2.3 From 69050f8d6d075dc01af7a5f2f550a8067510366f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Feb 2026 23:49:23 -0800 Subject: treewide: Replace kmalloc with kmalloc_obj for non-scalar types This is the result of running the Coccinelle script from scripts/coccinelle/api/kmalloc_objs.cocci. The script is designed to avoid scalar types (which need careful case-by-case checking), and instead replace kmalloc-family calls that allocate struct or union object instances: Single allocations: kmalloc(sizeof(TYPE), ...) are replaced with: kmalloc_obj(TYPE, ...) Array allocations: kmalloc_array(COUNT, sizeof(TYPE), ...) are replaced with: kmalloc_objs(TYPE, COUNT, ...) Flex array allocations: kmalloc(struct_size(PTR, FAM, COUNT), ...) are replaced with: kmalloc_flex(*PTR, FAM, COUNT, ...) (where TYPE may also be *VAR) The resulting allocations no longer return "void *", instead returning "TYPE *". Signed-off-by: Kees Cook --- include/linux/skmsg.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 829b281d6c9c..a8513e401760 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -460,8 +460,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, * intentional to enforce typesafety. */ #define sk_psock_init_link() \ - ((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link), \ - GFP_ATOMIC | __GFP_NOWARN)) + ((struct sk_psock_link *) kzalloc_obj(struct sk_psock_link, GFP_ATOMIC | __GFP_NOWARN)) static inline void sk_psock_free_link(struct sk_psock_link *link) { -- cgit v1.2.3 From 7a70c15bd1449f1eb30991772edce37b41e496fb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 21 Feb 2026 00:12:19 -0800 Subject: kmalloc_obj: Clean up after treewide replacements Coccinelle doesn't handle re-indenting line escapes. Fix the 2 places where these got misaligned. Remove 2 now-redundant type casts, found with: $ git grep -P 'struct (\S+).*\)\s*k\S+alloc_(objs?|flex)\(struct \1' Signed-off-by: Kees Cook --- include/linux/skmsg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/skmsg.h') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index a8513e401760..19f4f253b4f9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -460,7 +460,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, * intentional to enforce typesafety. */ #define sk_psock_init_link() \ - ((struct sk_psock_link *) kzalloc_obj(struct sk_psock_link, GFP_ATOMIC | __GFP_NOWARN)) + kzalloc_obj(struct sk_psock_link, GFP_ATOMIC | __GFP_NOWARN) static inline void sk_psock_free_link(struct sk_psock_link *link) { -- cgit v1.2.3