From 1abe267f173eae7ae76cf56232292e9641eb652f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Mar 2024 14:40:32 +0000 Subject: net: add sk_wake_async_rcu() helper While looking at UDP receive performance, I saw sk_wake_async() was no longer inlined. This matters at least on AMD Zen1-4 platforms (see SRSO) This might be because rcu_read_lock() and rcu_read_unlock() are no longer nops in recent kernels ? Add sk_wake_async_rcu() variant, which must be called from contexts already holding rcu lock. As SOCK_FASYNC is deprecated in modern days, use unlikely() to give a hint to the compiler. sk_wake_async_rcu() is properly inlined from __udp_enqueue_schedule_skb() and sock_def_readable(). Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index f57bfd8a2ad2..2253eefe2848 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) } } +static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) +{ + if (unlikely(sock_flag(sk, SOCK_FASYNC))) + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); +} + /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at -- cgit v1.2.3 From 9f06f87fef689d28588cde8c7ebb00a67da34026 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 3 Apr 2024 13:21:39 -0700 Subject: net: skbuff: generalize the skb->decrypted bit The ->decrypted bit can be reused for other crypto protocols. Remove the direct dependency on TLS, add helpers to clean up the ifdefs leaking out everywhere. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/sock.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 2253eefe2848..a495330c5c49 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2835,12 +2835,10 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); -#ifdef CONFIG_TLS_DEVICE - } else if (unlikely(skb->decrypted)) { + } else if (unlikely(skb_is_decrypted(skb))) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; -#endif } #endif -- cgit v1.2.3 From f3d93817fba30a8d3508fa990405039c0820dca3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Apr 2024 13:40:24 +0000 Subject: net: add Move some proto memory definitions out of Very few files need them, and following patch will include from Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240429134025.1233626-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 78 ------------------------------------------------------ 1 file changed, 78 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 48bcc845202f..0450494a1766 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1371,75 +1371,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk, #endif } -static inline bool sk_has_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure != NULL; -} - -static inline bool sk_under_global_memory_pressure(const struct sock *sk) -{ - return sk->sk_prot->memory_pressure && - !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline bool sk_under_memory_pressure(const struct sock *sk) -{ - if (!sk->sk_prot->memory_pressure) - return false; - - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) - return true; - - return !!READ_ONCE(*sk->sk_prot->memory_pressure); -} - -static inline long -proto_memory_allocated(const struct proto *prot) -{ - return max(0L, atomic_long_read(prot->memory_allocated)); -} - -static inline long -sk_memory_allocated(const struct sock *sk) -{ - return proto_memory_allocated(sk->sk_prot); -} - -/* 1 MB per cpu, in page units */ -#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT)) -extern int sysctl_mem_pcpu_rsv; - -static inline void proto_memory_pcpu_drain(struct proto *proto) -{ - int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0); - - if (val) - atomic_long_add(val, proto->memory_allocated); -} - -static inline void -sk_memory_allocated_add(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - -static inline void -sk_memory_allocated_sub(const struct sock *sk, int val) -{ - struct proto *proto = sk->sk_prot; - - val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val); - - if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv))) - proto_memory_pcpu_drain(proto); -} - #define SK_ALLOC_PERCPU_COUNTER_BATCH 16 static inline void sk_sockets_allocated_dec(struct sock *sk) @@ -1466,15 +1397,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot) return percpu_counter_sum_positive(prot->sockets_allocated); } -static inline bool -proto_memory_pressure(struct proto *prot) -{ - if (!prot->memory_pressure) - return false; - return !!READ_ONCE(*prot->memory_pressure); -} - - #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { -- cgit v1.2.3 From 92ef0fd55ac80dfc2e4654edfe5d1ddfa6e070fe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 May 2024 09:20:08 -0600 Subject: net: change proto and proto_ops accept type Rather than pass in flags, error pointer, and whether this is a kernel invocation or not, add a struct proto_accept_arg struct as the argument. This then holds all of these arguments, and prepares accept for being able to pass back more information. No functional changes in this patch. Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/net/sock.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 0450494a1766..217079b3e3e8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1194,6 +1194,12 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) size - offsetof(struct sock, sk_node.pprev)); } +struct proto_accept_arg { + int flags; + int err; + bool kern; +}; + /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ @@ -1208,8 +1214,8 @@ struct proto { int addr_len); int (*disconnect)(struct sock *sk, int flags); - struct sock * (*accept)(struct sock *sk, int flags, int *err, - bool kern); + struct sock * (*accept)(struct sock *sk, + struct proto_accept_arg *arg); int (*ioctl)(struct sock *sk, int cmd, int *karg); @@ -1804,7 +1810,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); -int sock_no_accept(struct socket *, struct socket *, int, bool); +int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); -- cgit v1.2.3 From 7951e36ac620a9ba1bae0ac0ddd62d2e82285725 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 May 2024 09:35:01 -0600 Subject: net: pass back whether socket was empty post accept This adds an 'is_empty' argument to struct proto_accept_arg, which can be used to pass back information on whether or not the given socket has more connections to accept post the one just accepted. To utilize this information, the caller should initialize the 'is_empty' field to, eg, -1 and then check for 0/1 after the accept. If the field has been set, the caller knows whether there are more pending connections or not. If the field remains -1 after the accept call, the protocol doesn't support passing back this information. This patch wires it up for ipv4/6 TCP. Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/net/sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 217079b3e3e8..5f4d0629348f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1197,6 +1197,7 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto_accept_arg { int flags; int err; + int is_empty; bool kern; }; -- cgit v1.2.3 From 92f1655aa2b2294d0b49925f3b875a634bd3b59e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 May 2024 11:43:53 +0000 Subject: net: fix __dst_negative_advice() race __dst_negative_advice() does not enforce proper RCU rules when sk->dst_cache must be cleared, leading to possible UAF. RCU rules are that we must first clear sk->sk_dst_cache, then call dst_release(old_dst). Note that sk_dst_reset(sk) is implementing this protocol correctly, while __dst_negative_advice() uses the wrong order. Given that ip6_negative_advice() has special logic against RTF_CACHE, this means each of the three ->negative_advice() existing methods must perform the sk_dst_reset() themselves. Note the check against NULL dst is centralized in __dst_negative_advice(), there is no need to duplicate it in various callbacks. Many thanks to Clement Lecigne for tracking this issue. This old bug became visible after the blamed commit, using UDP sockets. Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets") Reported-by: Clement Lecigne Diagnosed-by: Clement Lecigne Signed-off-by: Eric Dumazet Cc: Tom Herbert Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/net/sock.h') diff --git a/include/net/sock.h b/include/net/sock.h index 5f4d0629348f..953c8dc4e259 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2063,17 +2063,10 @@ sk_dst_get(const struct sock *sk) static inline void __dst_negative_advice(struct sock *sk) { - struct dst_entry *ndst, *dst = __sk_dst_get(sk); + struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->ops->negative_advice) { - ndst = dst->ops->negative_advice(dst); - - if (ndst != dst) { - rcu_assign_pointer(sk->sk_dst_cache, ndst); - sk_tx_queue_clear(sk); - WRITE_ONCE(sk->sk_dst_pending_confirm, 0); - } - } + if (dst && dst->ops->negative_advice) + dst->ops->negative_advice(sk, dst); } static inline void dst_negative_advice(struct sock *sk) -- cgit v1.2.3