From 55f6c98e3674ce16038a1949c3f9ca5a9a99f289 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put peer record rxrpc_put_peer() calls trace_rxrpc_peer() after it has done the decrement of the refcount - which looks at the debug_id in the peer record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. This can cause the following symptoms: BUG: KASAN: use-after-free in __rxrpc_put_peer net/rxrpc/peer_object.c:411 [inline] BUG: KASAN: use-after-free in rxrpc_put_peer+0x685/0x6a0 net/rxrpc/peer_object.c:435 Read of size 8 at addr ffff888097ec0058 by task syz-executor823/24216 Fixes: 1159d4b496f5 ("rxrpc: Add a tracepoint to track rxrpc_peer refcounting") Reported-by: syzbot+b9be979c55f2bea8ed30@syzkaller.appspotmail.com Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index edc5c887a44c..45556fe771c3 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -519,10 +519,10 @@ TRACE_EVENT(rxrpc_local, ); TRACE_EVENT(rxrpc_peer, - TP_PROTO(struct rxrpc_peer *peer, enum rxrpc_peer_trace op, + TP_PROTO(unsigned int peer_debug_id, enum rxrpc_peer_trace op, int usage, const void *where), - TP_ARGS(peer, op, usage, where), + TP_ARGS(peer_debug_id, op, usage, where), TP_STRUCT__entry( __field(unsigned int, peer ) @@ -532,7 +532,7 @@ TRACE_EVENT(rxrpc_peer, ), TP_fast_assign( - __entry->peer = peer->debug_id; + __entry->peer = peer_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; -- cgit v1.2.3 From 4c1295dccc0afe0905b6ca4c62ade7f2406f2cfb Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put connection record rxrpc_put_*conn() calls trace_rxrpc_conn() after they have done the decrement of the refcount - which looks at the debug_id in the connection record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. Fixes: 363deeab6d0f ("rxrpc: Add connection tracepoint and client conn state tracepoint") Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 45556fe771c3..38a97e890cb6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -546,10 +546,10 @@ TRACE_EVENT(rxrpc_peer, ); TRACE_EVENT(rxrpc_conn, - TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, + TP_PROTO(unsigned int conn_debug_id, enum rxrpc_conn_trace op, int usage, const void *where), - TP_ARGS(conn, op, usage, where), + TP_ARGS(conn_debug_id, op, usage, where), TP_STRUCT__entry( __field(unsigned int, conn ) @@ -559,7 +559,7 @@ TRACE_EVENT(rxrpc_conn, ), TP_fast_assign( - __entry->conn = conn->debug_id; + __entry->conn = conn_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; -- cgit v1.2.3 From 48c9e0ec7cbbb7370448f859ccc8e3b7eb69e755 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 7 Oct 2019 10:58:29 +0100 Subject: rxrpc: Fix trace-after-put looking at the put call record rxrpc_put_call() calls trace_rxrpc_call() after it has done the decrement of the refcount - which looks at the debug_id in the call record. But unless the refcount was reduced to zero, we no longer have the right to look in the record and, indeed, it may be deleted by some other thread. Fix this by getting the debug_id out before decrementing the refcount and then passing that into the tracepoint. Fixes: e34d4234b0b7 ("rxrpc: Trace rxrpc_call usage") Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 38a97e890cb6..191fe447f990 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -606,10 +606,10 @@ TRACE_EVENT(rxrpc_client, ); TRACE_EVENT(rxrpc_call, - TP_PROTO(struct rxrpc_call *call, enum rxrpc_call_trace op, + TP_PROTO(unsigned int call_debug_id, enum rxrpc_call_trace op, int usage, const void *where, const void *aux), - TP_ARGS(call, op, usage, where, aux), + TP_ARGS(call_debug_id, op, usage, where, aux), TP_STRUCT__entry( __field(unsigned int, call ) @@ -620,7 +620,7 @@ TRACE_EVENT(rxrpc_call, ), TP_fast_assign( - __entry->call = call->debug_id; + __entry->call = call_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; -- cgit v1.2.3 From dc0c18ed229cdcca283dd78fefa38273ec37a42c Mon Sep 17 00:00:00 2001 From: Aaron Komisar Date: Wed, 2 Oct 2019 13:59:07 +0000 Subject: mac80211: fix scan when operating on DFS channels in ETSI domains In non-ETSI regulatory domains scan is blocked when operating channel is a DFS channel. For ETSI, however, once DFS channel is marked as available after the CAC, this channel will remain available (for some time) even after leaving this channel. Therefore a scan can be done without any impact on the availability of the DFS channel as no new CAC is required after the scan. Enable scan in mac80211 in these cases. Signed-off-by: Aaron Komisar Link: https://lore.kernel.org/r/1570024728-17284-1-git-send-email-aaron.komisar@tandemg.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ff45c3e1abff..4ab2c49423dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5549,6 +5549,14 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, */ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); +/** + * regulatory_pre_cac_allowed - check if pre-CAC allowed in the current regdom + * @wiphy: wiphy for which pre-CAC capability is checked. + * + * Pre-CAC is allowed only in some regdomains (notable ETSI). + */ +bool regulatory_pre_cac_allowed(struct wiphy *wiphy); + /** * DOC: Internal regulatory db functions * -- cgit v1.2.3 From b74555de21acd791f12c4a1aeaf653dd7ac21133 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 6 Oct 2019 14:24:25 -0700 Subject: llc: fix sk_buff leak in llc_conn_service() syzbot reported: BUG: memory leak unreferenced object 0xffff88811eb3de00 (size 224): comm "syz-executor559", pid 7315, jiffies 4294943019 (age 10.300s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 a0 38 24 81 88 ff ff 00 c0 f2 15 81 88 ff ff ..8$............ backtrace: [<000000008d1c66a1>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<000000008d1c66a1>] slab_post_alloc_hook mm/slab.h:439 [inline] [<000000008d1c66a1>] slab_alloc_node mm/slab.c:3269 [inline] [<000000008d1c66a1>] kmem_cache_alloc_node+0x153/0x2a0 mm/slab.c:3579 [<00000000447d9496>] __alloc_skb+0x6e/0x210 net/core/skbuff.c:198 [<000000000cdbf82f>] alloc_skb include/linux/skbuff.h:1058 [inline] [<000000000cdbf82f>] llc_alloc_frame+0x66/0x110 net/llc/llc_sap.c:54 [<000000002418b52e>] llc_conn_ac_send_sabme_cmd_p_set_x+0x2f/0x140 net/llc/llc_c_ac.c:777 [<000000001372ae17>] llc_exec_conn_trans_actions net/llc/llc_conn.c:475 [inline] [<000000001372ae17>] llc_conn_service net/llc/llc_conn.c:400 [inline] [<000000001372ae17>] llc_conn_state_process+0x1ac/0x640 net/llc/llc_conn.c:75 [<00000000f27e53c1>] llc_establish_connection+0x110/0x170 net/llc/llc_if.c:109 [<00000000291b2ca0>] llc_ui_connect+0x10e/0x370 net/llc/af_llc.c:477 [<000000000f9c740b>] __sys_connect+0x11d/0x170 net/socket.c:1840 [...] The bug is that most callers of llc_conn_send_pdu() assume it consumes a reference to the skb, when actually due to commit b85ab56c3f81 ("llc: properly handle dev_queue_xmit() return value") it doesn't. Revert most of that commit, and instead make the few places that need llc_conn_send_pdu() to *not* consume a reference call skb_get() before. Fixes: b85ab56c3f81 ("llc: properly handle dev_queue_xmit() return value") Reported-by: syzbot+6b825a6494a04cc0e3f7@syzkaller.appspotmail.com Signed-off-by: Eric Biggers Signed-off-by: Jakub Kicinski --- include/net/llc_conn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index df528a623548..ea985aa7a6c5 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -104,7 +104,7 @@ void llc_sk_reset(struct sock *sk); /* Access to a connection */ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb); -int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb); void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); -- cgit v1.2.3 From 819be8108fded0b9e710bbbf81193e52f7bab2f7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:09:23 +0800 Subject: sctp: add chunks to sk_backlog when the newsk sk_socket is not set This patch is to fix a NULL-ptr deref in selinux_socket_connect_helper: [...] kasan: GPF could be caused by NULL-ptr deref or user memory access [...] RIP: 0010:selinux_socket_connect_helper+0x94/0x460 [...] Call Trace: [...] selinux_sctp_bind_connect+0x16a/0x1d0 [...] security_sctp_bind_connect+0x58/0x90 [...] sctp_process_asconf+0xa52/0xfd0 [sctp] [...] sctp_sf_do_asconf+0x785/0x980 [sctp] [...] sctp_do_sm+0x175/0x5a0 [sctp] [...] sctp_assoc_bh_rcv+0x285/0x5b0 [sctp] [...] sctp_backlog_rcv+0x482/0x910 [sctp] [...] __release_sock+0x11e/0x310 [...] release_sock+0x4f/0x180 [...] sctp_accept+0x3f9/0x5a0 [sctp] [...] inet_accept+0xe7/0x720 It was caused by that the 'newsk' sk_socket was not set before going to security sctp hook when processing asconf chunk with SCTP_PARAM_ADD_IP or SCTP_PARAM_SET_PRIMARY: inet_accept()-> sctp_accept(): lock_sock(): lock listening 'sk' do_softirq(): sctp_rcv(): <-- [1] asconf chunk arrives and enqueued in 'sk' backlog sctp_sock_migrate(): set asoc's sk to 'newsk' release_sock(): sctp_backlog_rcv(): lock 'newsk' sctp_process_asconf() <-- [2] unlock 'newsk' sock_graft(): set sk_socket <-- [3] As it shows, at [1] the asconf chunk would be put into the listening 'sk' backlog, as accept() was holding its sock lock. Then at [2] asconf would get processed with 'newsk' as asoc's sk had been set to 'newsk'. However, 'newsk' sk_socket is not set until [3], while selinux_sctp_bind_connect() would deref it, then kernel crashed. Here to fix it by adding the chunk to sk_backlog until newsk sk_socket is set when .accept() is done. Note that sk->sk_socket can be NULL when the sock is closed, so SOCK_DEAD flag is also needed to check in sctp_newsk_ready(). Thanks to Ondrej for reviewing the code. Fixes: d452930fd3b9 ("selinux: Add SCTP support") Reported-by: Ying Xu Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/net/sctp/sctp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 5d60f13d2347..3ab5c6bbb90b 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -610,4 +610,9 @@ static inline __u32 sctp_min_frag_point(struct sctp_sock *sp, __u16 datasize) return sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, datasize); } +static inline bool sctp_newsk_ready(const struct sock *sk) +{ + return sock_flag(sk, SOCK_DEAD) || sk->sk_socket; +} + #endif /* __net_sctp_h__ */ -- cgit v1.2.3 From 60b173ca3d1cd1782bd0096dc17298ec242f6fb1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 14:51:20 -0700 Subject: net: add {READ|WRITE}_ONCE() annotations on ->rskq_accept_head reqsk_queue_empty() is called from inet_csk_listen_poll() while other cpus might write ->rskq_accept_head value. Use {READ|WRITE}_ONCE() to avoid compiler tricks and potential KCSAN splats. Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index fd178d58fa84..cf8b33213bbc 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -185,7 +185,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { - return queue->rskq_accept_head == NULL; + return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, @@ -197,7 +197,7 @@ static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); - queue->rskq_accept_head = req->dl_next; + WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } -- cgit v1.2.3 From 1f142c17d19a5618d5a633195a46f2c8be9bf232 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:10:15 -0700 Subject: tcp: annotate lockless access to tcp_memory_pressure tcp_memory_pressure is read without holding any lock, and its value could be changed on other cpus. Use READ_ONCE() to annotate these lockless reads. The write side is already using atomic ops. Fixes: b8da51ebb1aa ("tcp: introduce tcp_under_memory_pressure()") Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index c9a3f9688223..88e63d64c698 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -258,7 +258,7 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk) mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; - return tcp_memory_pressure; + return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints -- cgit v1.2.3 From eac66402d1c342f07ff38f8d631ff95eb7ad3220 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:32:35 -0700 Subject: net: annotate sk->sk_rcvlowat lockless reads sock_rcvlowat() or int_sk_rcvlowat() might be called without the socket lock for example from tcp_poll(). Use READ_ONCE() to document the fact that other cpus might change sk->sk_rcvlowat under us and avoid KCSAN splats. Use WRITE_ONCE() on write sides too. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2c53f1a1d905..79f54e1f8827 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2271,7 +2271,9 @@ static inline long sock_sndtimeo(const struct sock *sk, bool noblock) static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { - return (waitall ? len : min_t(int, sk->sk_rcvlowat, len)) ? : 1; + int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); + + return v ?: 1; } /* Alas, with timeout socket operations are not restartable. -- cgit v1.2.3 From 70c2655849a25431f31b505a07fe0c861e5e41fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2019 15:41:03 -0700 Subject: net: silence KCSAN warnings about sk->sk_backlog.len reads sk->sk_backlog.len can be written by BH handlers, and read from process contexts in a lockless way. Note the write side should also use WRITE_ONCE() or a variant. We need some agreement about the best way to do this. syzbot reported : BUG: KCSAN: data-race in tcp_add_backlog / tcp_grow_window.isra.0 write to 0xffff88812665f32c of 4 bytes by interrupt on cpu 1: sk_add_backlog include/net/sock.h:934 [inline] tcp_add_backlog+0x4a0/0xcc0 net/ipv4/tcp_ipv4.c:1737 tcp_v4_rcv+0x1aba/0x1bf0 net/ipv4/tcp_ipv4.c:1925 ip_protocol_deliver_rcu+0x51/0x470 net/ipv4/ip_input.c:204 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:442 [inline] ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413 NF_HOOK include/linux/netfilter.h:305 [inline] NF_HOOK include/linux/netfilter.h:299 [inline] ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:5004 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5118 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5208 napi_skb_finish net/core/dev.c:5671 [inline] napi_gro_receive+0x28f/0x330 net/core/dev.c:5704 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061 virtnet_receive drivers/net/virtio_net.c:1323 [inline] virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428 napi_poll net/core/dev.c:6352 [inline] net_rx_action+0x3ae/0xa50 net/core/dev.c:6418 read to 0xffff88812665f32c of 4 bytes by task 7292 on cpu 0: tcp_space include/net/tcp.h:1373 [inline] tcp_grow_window.isra.0+0x6b/0x480 net/ipv4/tcp_input.c:413 tcp_event_data_recv+0x68f/0x990 net/ipv4/tcp_input.c:717 tcp_rcv_established+0xbfe/0xf50 net/ipv4/tcp_input.c:5618 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1542 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2427 release_sock+0x61/0x160 net/core/sock.c:2943 tcp_recvmsg+0x63b/0x1a30 net/ipv4/tcp.c:2181 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838 sock_recvmsg_nosec net/socket.c:871 [inline] sock_recvmsg net/socket.c:889 [inline] sock_recvmsg+0x92/0xb0 net/socket.c:885 sock_read_iter+0x15f/0x1e0 net/socket.c:967 call_read_iter include/linux/fs.h:1864 [inline] new_sync_read+0x389/0x4f0 fs/read_write.c:414 __vfs_read+0xb1/0xc0 fs/read_write.c:427 vfs_read fs/read_write.c:461 [inline] vfs_read+0x143/0x2c0 fs/read_write.c:446 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 7292 Comm: syz-fuzzer Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 88e63d64c698..35f6f7e0fdc2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1380,7 +1380,8 @@ static inline int tcp_win_from_space(const struct sock *sk, int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, sk->sk_rcvbuf - sk->sk_backlog.len - + return tcp_win_from_space(sk, sk->sk_rcvbuf - + READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } -- cgit v1.2.3 From d983ea6f16b835dcde2ee9a58a1e764ce68bfccc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:38 -0700 Subject: tcp: add rcu protection around tp->fastopen_rsk Both tcp_v4_err() and tcp_v6_err() do the following operations while they do not own the socket lock : fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; The problem is that without appropriate barrier, the compiler might reload tp->fastopen_rsk and trigger a NULL deref. request sockets are protected by RCU, we can simply add the missing annotations and barriers to solve the issue. Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 99617e528ea2..668e25a76d69 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -393,7 +393,7 @@ struct tcp_sock { /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. */ - struct request_sock *fastopen_rsk; + struct request_sock __rcu *fastopen_rsk; u32 *saved_syn; }; @@ -447,8 +447,8 @@ static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) static inline bool tcp_passive_fastopen(const struct sock *sk) { - return (sk->sk_state == TCP_SYN_RECV && - tcp_sk(sk)->fastopen_rsk != NULL); + return sk->sk_state == TCP_SYN_RECV && + rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) -- cgit v1.2.3 From 0f31746452e6793ad6271337438af8f4defb8940 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:41 -0700 Subject: tcp: annotate tp->write_seq lockless reads There are few places where we fetch tp->write_seq while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 35f6f7e0fdc2..8e7c3f6801a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1917,7 +1917,7 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = tp->write_seq - tp->snd_nxt; + u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt; return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } -- cgit v1.2.3 From e0d694d638dba768b47be31c22e1a9b4f862f561 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:42 -0700 Subject: tcp: annotate tp->snd_nxt lockless reads There are few places where we fetch tp->snd_nxt while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 8e7c3f6801a9..e1d08f69fd39 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1917,7 +1917,8 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt; + u32 notsent_bytes = READ_ONCE(tp->write_seq) - + READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } -- cgit v1.2.3 From ebb3b78db7bf842270a46fd4fe7cc45c78fa5ed6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:44 -0700 Subject: tcp: annotate sk->sk_rcvbuf lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_rcvbuf while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that other transports probably need similar fixes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++-- include/trace/events/sock.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e1d08f69fd39..ab4eb5eb5d07 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1380,14 +1380,14 @@ static inline int tcp_win_from_space(const struct sock *sk, int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, sk->sk_rcvbuf - + return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { - return tcp_win_from_space(sk, sk->sk_rcvbuf); + return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } extern void tcp_openreq_init_rwin(struct request_sock *req, diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index a0c4b8a30966..f720c32e7dfd 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -82,7 +82,7 @@ TRACE_EVENT(sock_rcvqueue_full, TP_fast_assign( __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->truesize = skb->truesize; - __entry->sk_rcvbuf = sk->sk_rcvbuf; + __entry->sk_rcvbuf = READ_ONCE(sk->sk_rcvbuf); ), TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d", -- cgit v1.2.3 From e292f05e0df73f9fcc93329663936e1ded97a988 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:45 -0700 Subject: tcp: annotate sk->sk_sndbuf lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_sndbuf while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. Note that other transports probably need similar fixes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 79f54e1f8827..3d1e7502333e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -883,7 +883,7 @@ static inline int sk_stream_min_wspace(const struct sock *sk) static inline int sk_stream_wspace(const struct sock *sk) { - return sk->sk_sndbuf - sk->sk_wmem_queued; + return READ_ONCE(sk->sk_sndbuf) - sk->sk_wmem_queued; } void sk_stream_write_space(struct sock *sk); @@ -1207,7 +1207,7 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { - if (sk->sk_wmem_queued >= sk->sk_sndbuf) + if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? @@ -2220,10 +2220,14 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band) static inline void sk_stream_moderate_sndbuf(struct sock *sk) { - if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { - sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); - sk->sk_sndbuf = max_t(u32, sk->sk_sndbuf, SOCK_MIN_SNDBUF); - } + u32 val; + + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + return; + + val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); + + WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, @@ -2251,7 +2255,7 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); */ static inline bool sock_writeable(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); + return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) -- cgit v1.2.3 From ab4e846a82d0ae00176de19f2db3c5c64f8eb5f2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Oct 2019 20:17:46 -0700 Subject: tcp: annotate sk->sk_wmem_queued lockless reads For the sake of tcp_poll(), there are few places where we fetch sk->sk_wmem_queued while this field can change from IRQ or other cpu. We need to add READ_ONCE() annotations, and also make sure write sides use corresponding WRITE_ONCE() to avoid store-tearing. sk_wmem_queued_add() helper is added so that we can in the future convert to ADD_ONCE() or equivalent if/when available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 15 ++++++++++----- include/trace/events/sock.h | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 3d1e7502333e..f69b58bff7e5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -878,12 +878,17 @@ static inline bool sk_acceptq_is_full(const struct sock *sk) */ static inline int sk_stream_min_wspace(const struct sock *sk) { - return sk->sk_wmem_queued >> 1; + return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { - return READ_ONCE(sk->sk_sndbuf) - sk->sk_wmem_queued; + return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); +} + +static inline void sk_wmem_queued_add(struct sock *sk, int val) +{ + WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } void sk_stream_write_space(struct sock *sk); @@ -1207,7 +1212,7 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { - if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) + if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? @@ -1467,7 +1472,7 @@ DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - sk->sk_wmem_queued -= skb->truesize; + sk_wmem_queued_add(sk, -skb->truesize); sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && !sk->sk_tx_skb_cache && !skb_cloned(skb)) { @@ -2014,7 +2019,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro skb->len += copy; skb->data_len += copy; skb->truesize += copy; - sk->sk_wmem_queued += copy; + sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; } diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index f720c32e7dfd..51fe9f6719eb 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -115,7 +115,7 @@ TRACE_EVENT(sock_exceed_buf_limit, __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc); __entry->sysctl_wmem = sk_get_wmem0(sk, prot); __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc); - __entry->wmem_queued = sk->sk_wmem_queued; + __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued); __entry->kind = kind; ), -- cgit v1.2.3 From fa4e0f8855fcba600e0be2575ee29c69166f74bd Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Sat, 12 Oct 2019 13:55:07 +0200 Subject: net/sched: fix corrupted L2 header with MPLS 'push' and 'pop' actions the following script: # tc qdisc add dev eth0 clsact # tc filter add dev eth0 egress protocol ip matchall \ > action mpls push protocol mpls_uc label 0x355aa bos 1 causes corruption of all IP packets transmitted by eth0. On TC egress, we can't rely on the value of skb->mac_len, because it's 0 and a MPLS 'push' operation will result in an overwrite of the first 4 octets in the packet L2 header (e.g. the Destination Address if eth0 is an Ethernet); the same error pattern is present also in the MPLS 'pop' operation. Fix this error in act_mpls data plane, computing 'mac_len' as the difference between the network header and the mac header (when not at TC ingress), and use it in MPLS 'push'/'pop' core functions. v2: unbreak 'make htmldocs' because of missing documentation of 'mac_len' in skb_mpls_pop(), reported by kbuild test robot CC: Lorenzo Bianconi Fixes: 2a2ea50870ba ("net: sched: add mpls manipulation actions to TC") Reviewed-by: Simon Horman Acked-by: John Hurley Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4351577b14d7..7914fdaf4226 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3510,8 +3510,9 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); -int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto); -int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto); +int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, + int mac_len); +int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len); int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse); int skb_mpls_dec_ttl(struct sk_buff *skb); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, -- cgit v1.2.3 From 1d951ba3da67bbc7a9b0e05987e09552c2060e18 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Wed, 16 Oct 2019 15:35:07 +0200 Subject: net: phy: micrel: Update KSZ87xx PHY name The KSZ8795 PHY ID is in fact used by KSZ8794/KSZ8795/KSZ8765 switches. Update the PHY ID and name to reflect that, as this family of switches is commonly refered to as KSZ87xx Signed-off-by: Marek Vasut Cc: Andrew Lunn Cc: David S. Miller Cc: Florian Fainelli Cc: George McCollister Cc: Heiner Kallweit Cc: Sean Nyekjaer Cc: Tristram Ha Cc: Woojung Huh Signed-off-by: David S. Miller --- include/linux/micrel_phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index ad24554f11f9..75f880c25bb8 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -31,7 +31,7 @@ #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -#define PHY_ID_KSZ8795 0x00221550 +#define PHY_ID_KSZ87XX 0x00221550 #define PHY_ID_KSZ9477 0x00221631 -- cgit v1.2.3 From 2a06b8982f8f2f40d03a3daf634676386bd84dbc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Oct 2019 15:20:05 -0700 Subject: net: reorder 'struct net' fields to avoid false sharing Intel test robot reported a ~7% regression on TCP_CRR tests that they bisected to the cited commit. Indeed, every time a new TCP socket is created or deleted, the atomic counter net->count is touched (via get_net(net) and put_net(net) calls) So cpus might have to reload a contended cache line in net_hash_mix(net) calls. We need to reorder 'struct net' fields to move @hash_mix in a read mostly cache line. We move in the first cache line fields that can be dirtied often. We probably will have to address in a followup patch the __randomize_layout that was added in linux-4.13, since this might break our placement choices. Fixes: 355b98553789 ("netns: provide pure entropy for net_hash_mix()") Signed-off-by: Eric Dumazet Reported-by: kernel test robot Signed-off-by: David S. Miller --- include/net/net_namespace.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f8712bbeb2e0..4c2cd9378699 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -52,6 +52,9 @@ struct bpf_prog; #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { + /* First cache line can be often dirtied. + * Do not place here read-mostly fields. + */ refcount_t passive; /* To decide when the network * namespace should be freed. */ @@ -60,7 +63,13 @@ struct net { */ spinlock_t rules_mod_lock; - u32 hash_mix; + unsigned int dev_unreg_count; + + unsigned int dev_base_seq; /* protected by rtnl_mutex */ + int ifindex; + + spinlock_t nsid_lock; + atomic_t fnhe_genid; struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit @@ -76,11 +85,11 @@ struct net { #endif struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; - spinlock_t nsid_lock; struct idr netns_ids; struct ns_common ns; + struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; @@ -93,17 +102,18 @@ struct net { struct uevent_sock *uevent_sock; /* uevent socket */ - struct list_head dev_base_head; struct hlist_head *dev_name_head; struct hlist_head *dev_index_head; - unsigned int dev_base_seq; /* protected by rtnl_mutex */ - int ifindex; - unsigned int dev_unreg_count; + /* Note that @hash_mix can be read millions times per second, + * it is critical that it is on a read_mostly cache line. + */ + u32 hash_mix; + + struct net_device *loopback_dev; /* The loopback */ /* core fib_rules */ struct list_head rules_ops; - struct net_device *loopback_dev; /* The loopback */ struct netns_core core; struct netns_mib mib; struct netns_packet packet; @@ -171,7 +181,6 @@ struct net { struct sock *crypto_nlsk; #endif struct sock *diag_nlsk; - atomic_t fnhe_genid; } __randomize_layout; #include -- cgit v1.2.3