From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:23 -0700 Subject: skmsg: Introduce a spinlock to protect ingress_msg Currently we rely on lock_sock to protect ingress_msg, it is too big for this, we can actually just use a spinlock to protect this list like protecting other skb queues. __tcp_bpf_recvmsg() is still special because of peeking, it still has to use lock_sock. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 6c09d94be2e9..f2d45a73b2b2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -89,6 +89,7 @@ struct sk_psock { #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; + spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + spin_lock_bh(&psock->ingress_lock); list_add_tail(&msg->list, &psock->ingress_msg); + spin_unlock_bh(&psock->ingress_lock); +} + +static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + if (msg) + list_del(&msg->list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + struct sk_msg *ret; + + spin_lock_bh(&psock->ingress_lock); + if (list_is_last(&msg->list, &psock->ingress_msg)) + ret = NULL; + else + ret = list_next_entry(msg, list); + spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) @@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock) return psock ? list_empty(&psock->ingress_msg) : true; } +static inline void kfree_sk_msg(struct sk_msg *msg) +{ + if (msg->skb) + consume_skb(msg->skb); + kfree(msg); +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; -- cgit v1.2.3 From 0739cd28f2645e814586c7536ba5da9825cb8029 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:24 -0700 Subject: net: Introduce skb_send_sock() for sock_map We only have skb_send_sock_locked() which requires callers to use lock_sock(). Introduce a variant skb_send_sock() which locks on its own, callers do not need to lock it any more. This will save us from adding a ->sendmsg_locked for each protocol. To reuse the code, pass function pointers to __skb_send_sock() and build skb_send_sock() and skb_send_sock_locked() on top. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c8def85fcc22..dbf820a50a39 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, -- cgit v1.2.3 From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:25 -0700 Subject: skmsg: Avoid lock_sock() in sk_psock_backlog() We do not have to lock the sock to avoid losing sk_socket, instead we can purge all the ingress queues when we close the socket. Sending or receiving packets after orphaning socket makes no sense. We do purge these queues when psock refcnt reaches zero but here we want to purge them explicitly in sock_map_close(). There are also some nasty race conditions on testing bit SK_PSOCK_TX_ENABLED and queuing/canceling the psock work, we can expand psock->ingress_lock a bit to protect them too. As noticed by John, we still have to lock the psock->work, because the same work item could be running concurrently on different CPU's. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index f2d45a73b2b2..7382c4b518d7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; + struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; union { @@ -347,6 +348,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) } struct sk_psock *sk_psock_init(struct sock *sk, int node); +void sk_psock_stop(struct sk_psock *psock, bool wait); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); -- cgit v1.2.3 From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:26 -0700 Subject: skmsg: Use rcu work for destroying psock The RCU callback sk_psock_destroy() only queues work psock->gc, so we can just switch to rcu work to simplify the code. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 7382c4b518d7..e7aba150539d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -102,10 +102,7 @@ struct sk_psock { struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; - union { - struct rcu_head rcu; - struct work_struct gc; - }; + struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, -- cgit v1.2.3 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ include/uapi/linux/bpf.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:31 -0700 Subject: sock: Introduce sk->sk_prot->psock_update_sk_prot() Currently sockmap calls into each protocol to update the struct proto and replace it. This certainly won't work when the protocol is implemented as a module, for example, AF_UNIX. Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each protocol can implement its own way to replace the struct proto. This also helps get rid of symbol dependencies on CONFIG_INET. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 18 +++--------------- include/net/sock.h | 3 +++ include/net/tcp.h | 1 + include/net/udp.h | 1 + 4 files changed, 8 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c83dbc2d81d9..5e800ddc2dc6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + int (*psock_update_sk_prot)(struct sock *sk, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -395,25 +396,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock) } } -static inline void sk_psock_update_proto(struct sock *sk, - struct sk_psock *psock, - struct proto *ops) -{ - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, ops); -} - static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - if (inet_csk_has_ulp(sk)) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - } else { - sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); - } + if (psock->psock_update_sk_prot) + psock->psock_update_sk_prot(sk, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 0b6266fd6bf6..8b4155e756c2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1184,6 +1184,9 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_BPF_SYSCALL + int (*psock_update_sk_prot)(struct sock *sk, bool restore); +#endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h index 075de26f449d..2efa4e5ea23d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2203,6 +2203,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int tcp_bpf_update_proto(struct sock *sk, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index d4d064c59232..df7cc1edc200 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -518,6 +518,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int udp_bpf_update_proto(struct sock *sk, bool restore); #endif #endif /* _UDP_H */ -- cgit v1.2.3 From d7f571188ecf25c244789b883c878ec7c64b5b08 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:32 -0700 Subject: udp: Implement ->read_sock() for sockmap This is similar to tcp_read_sock(), except we do not need to worry about connections, we just need to retrieve skb from UDP receive queue. Note, the return value of ->read_sock() is unused in sk_psock_verdict_data_ready(), and UDP still does not support splice() due to lack of ->splice_read(), so users can not reach udp_read_sock() directly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-12-xiyou.wangcong@gmail.com --- include/net/udp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index df7cc1edc200..347b62a753c3 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() -- cgit v1.2.3 From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:33 -0700 Subject: skmsg: Extract __tcp_bpf_recvmsg() and tcp_bpf_wait_data() Although these two functions are only used by TCP, they are not specific to TCP at all, both operate on skmsg and ingress_msg, so fit in net/core/skmsg.c very well. And we will need them for non-TCP, so rename and move them to skmsg.c and export them to modules. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 4 ++++ include/net/tcp.h | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 5e800ddc2dc6..f78e90a04a69 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -125,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err); +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2efa4e5ea23d..31b1696c62ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2209,8 +2209,6 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) -- cgit v1.2.3