From 430cac487400494c19a8b85299e979bb07b4671f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 4 Apr 2023 15:12:16 +0200 Subject: xfrm: don't check the default policy if the policy allows the packet The current code doesn't let a simple "allow" policy counteract a default policy blocking all incoming packets: ip x p setdefault in block ip x p a src 192.168.2.1/32 dst 192.168.2.2/32 dir in action allow At this stage, we have an allow policy (with or without transforms) for this packet. It doesn't matter what the default policy says, since the policy we looked up lets the packet through. The case of a blocking policy is already handled separately, so we can remove this check. Fixes: 2d151d39073a ("xfrm: Add possibility to set the default to block if we have no policy") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5c61ec04b839..62be042f2ebc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3712,12 +3712,6 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } xfrm_nr = ti; - if (net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK && - !xfrm_nr) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); - goto reject; - } - if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; -- cgit v1.2.3 From 94b95dfaa814f565d92f5a65f0ff12a483095522 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Apr 2023 15:19:07 +0300 Subject: xfrm: release all offloaded policy memory Failure to add offloaded policy will cause to the following error once user will try to reload driver. Unregister_netdevice: waiting for eth3 to become free. Usage count = 2 This was caused by xfrm_dev_policy_add() which increments reference to net_device. That reference was supposed to be decremented in xfrm_dev_policy_free(). However the latter wasn't called. unregister_netdevice: waiting for eth3 to become free. Usage count = 2 leaked reference. xfrm_dev_policy_add+0xff/0x3d0 xfrm_policy_construct+0x352/0x420 xfrm_add_policy+0x179/0x320 xfrm_user_rcv_msg+0x1d2/0x3d0 netlink_rcv_skb+0xe0/0x210 xfrm_netlink_rcv+0x45/0x50 netlink_unicast+0x346/0x490 netlink_sendmsg+0x3b0/0x6c0 sock_sendmsg+0x73/0xc0 sock_write_iter+0x13b/0x1f0 vfs_write+0x528/0x5d0 ksys_write+0x120/0x150 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: 919e43fad516 ("xfrm: add an interface to offload policy") Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 103af2b3e986..af8fbcbfbe69 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1978,6 +1978,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) { xfrm_dev_policy_delete(xp); + xfrm_dev_policy_free(xp); security_xfrm_policy_free(xp->security); kfree(xp); return err; -- cgit v1.2.3 From ec8f32ad9a65a8cbb465b69e154aaec9d2fe45c4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Apr 2023 15:19:08 +0300 Subject: xfrm: Fix leak of dev tracker At the stage of direction checks, the netdev reference tracker is already initialized, but released with wrong *_put() call. Fixes: 919e43fad516 ("xfrm: add an interface to offload policy") Signed-off-by: Leon Romanovsky Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 95f1436bf6a2..e2ca50bfca24 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -378,7 +378,7 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, break; default: xdo->dev = NULL; - dev_put(dev); + netdev_put(dev, &xdo->dev_tracker); NL_SET_ERR_MSG(extack, "Unrecognized offload direction"); return -EINVAL; } -- cgit v1.2.3 From 5fc46f94219d1d103ffb5f0832be9da674d85a73 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Tue, 25 Apr 2023 09:46:18 +0200 Subject: Revert "Fix XFRM-I support for nested ESP tunnels" This reverts commit b0355dbbf13c0052931dd14c38c789efed64d3de. The reverted commit clears the secpath on packets received via xfrm interfaces to support nested IPsec tunnels. This breaks Netfilter policy matching using xt_policy in the FORWARD chain, as the secpath is missing during forwarding. Additionally, Benedict Wong reports that it breaks Transport-in-Tunnel mode. Fix this regression by reverting the commit until we have a better approach for nested IPsec tunnels. Fixes: b0355dbbf13c ("Fix XFRM-I support for nested ESP tunnels") Link: https://lore.kernel.org/netdev/20230412085615.124791-1-martin@strongswan.org/ Signed-off-by: Martin Willi Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface_core.c | 54 ++++-------------------------------------- net/xfrm/xfrm_policy.c | 3 --- 2 files changed, 4 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 35279c220bd7..1f99dc469027 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -310,52 +310,6 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) skb->mark = 0; } -static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type, unsigned short family) -{ - struct sec_path *sp; - - sp = skb_sec_path(skb); - if (sp && (sp->len || sp->olen) && - !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) - goto discard; - - XFRM_SPI_SKB_CB(skb)->family = family; - if (family == AF_INET) { - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; - } else { - XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); - XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; - } - - return xfrm_input(skb, nexthdr, spi, encap_type); -discard: - kfree_skb(skb); - return 0; -} - -static int xfrmi4_rcv(struct sk_buff *skb) -{ - return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); -} - -static int xfrmi6_rcv(struct sk_buff *skb) -{ - return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], - 0, 0, AF_INET6); -} - -static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) -{ - return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); -} - -static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) -{ - return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); -} - static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; @@ -991,8 +945,8 @@ static struct pernet_operations xfrmi_net_ops = { }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrmi6_rcv, - .input_handler = xfrmi6_input, + .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -1042,8 +996,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrmi4_rcv, - .input_handler = xfrmi4_input, + .handler = xfrm4_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 62be042f2ebc..21a3a1cd3d6d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3739,9 +3739,6 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, goto reject; } - if (if_id) - secpath_reset(skb); - xfrm_pols_put(pols, npols); return 1; } -- cgit v1.2.3 From af97b7dfb0d4636d58f2341346fffce30c6c2259 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 1 May 2023 16:04:08 +0200 Subject: SUNRPC: Avoid relying on crypto API to derive CBC-CTS output IV Scott reports SUNRPC self-test failures regarding the output IV on arm64 when using the SIMD accelerated implementation of AES in CBC mode with ciphertext stealing ("cts(cbc(aes))" in crypto API speak). These failures are the result of the fact that, while RFC 3962 does specify what the output IV should be and includes test vectors for it, the general concept of an output IV is poorly defined, and generally, not specified by the various algorithms implemented by the crypto API. Only algorithms that support transparent chaining (e.g., CBC mode on a block boundary) have requirements on the output IV, but ciphertext stealing (CTS) is fundamentally about how to encapsulate CBC in a way where the length of the entire message may not be an integral multiple of the cipher block size, and the concept of an output IV does not exist here because it has no defined purpose past the end of the message. The generic CTS template takes advantage of this chaining capability of the CBC implementations, and as a result, happens to return an output IV, simply because it passes its IV buffer directly to the encapsulated CBC implementation, which operates on full blocks only, and always returns an IV. This output IV happens to match how RFC 3962 defines it, even though the CTS template itself does not contain any output IV logic whatsoever, and, for this reason, lacks any test vectors that exercise this accidental output IV generation. The arm64 SIMD implementation of cts(cbc(aes)) does not use the generic CTS template at all, but instead, implements the CBC mode and ciphertext stealing directly, and therefore does not encapsule a CBC implementation that returns an output IV in the same way. The arm64 SIMD implementation complies with the specification and passes all internal tests, but when invoked by the SUNRPC code, fails to produce the expected output IV and causes its selftests to fail. Given that the output IV is defined as the penultimate block (where the final block may smaller than the block size), we can quite easily derive it in the caller by copying the appropriate slice of ciphertext after encryption. Cc: Trond Myklebust Cc: Anna Schumaker Cc: Chuck Lever Cc: Jeff Layton Reported-by: Scott Mayhew Tested-by: Scott Mayhew Signed-off-by: Ard Biesheuvel Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 212c5d57465a..9734e1d9f991 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -639,6 +639,16 @@ gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf, ret = write_bytes_to_xdr_buf(buf, offset, data, len); +#if IS_ENABLED(CONFIG_KUNIT) + /* + * CBC-CTS does not define an output IV but RFC 3962 defines it as the + * penultimate block of ciphertext, so copy that into the IV buffer + * before returning. + */ + if (encrypt) + memcpy(iv, data, crypto_sync_skcipher_ivsize(cipher)); +#endif + out: kfree(data); return ret; -- cgit v1.2.3 From 29cd2927fb914cc53b5ba4f67d2b74695c994ba4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 2 May 2023 14:59:10 -0400 Subject: SUNRPC: Fix encoding of accepted but unsuccessful RPC replies Jiri Slaby says: > I bisected to this ... as it breaks nfs3-only servers in 6.3. > I.e. /etc/nfs.conf containing: > [nfsd] > vers4=no > > The client sees: > mount("10.0.2.15:/tmp", "/mnt", "nfs", 0, "vers=4.2,addr=10.0.2.15,clientad"...) = -1 EIO (Input/output error) > write(2, "mount.nfs: mount system call fai"..., 45 > mount.nfs: mount system call failed for /mnt > > And the kernel says: > nfs4_discover_server_trunking unhandled error -5. Exiting with error EIO Reported-by: Jiri Slaby Link: https://bugzilla.suse.com/show_bug.cgi?id=1210995 Fixes: 4bcf0343e8a6 ("SUNRPC: Set rq_accept_statp inside ->accept methods") Tested-by: Jiri Slaby Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 26367cf4c17a..43e32129a583 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1416,7 +1416,7 @@ err_bad_rpc: /* Only RPCv2 supported */ xdr_stream_encode_u32(xdr, RPC_VERSION); xdr_stream_encode_u32(xdr, RPC_VERSION); - goto sendit; + return 1; /* don't wrap */ err_bad_auth: dprintk("svc: authentication failed (%d)\n", @@ -1432,7 +1432,7 @@ err_bad_auth: err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); serv->sv_stats->rpcbadfmt++; - xdr_stream_encode_u32(xdr, RPC_PROG_UNAVAIL); + *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; err_bad_vers: @@ -1440,7 +1440,12 @@ err_bad_vers: rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); serv->sv_stats->rpcbadfmt++; - xdr_stream_encode_u32(xdr, RPC_PROG_MISMATCH); + *rqstp->rq_accept_statp = rpc_prog_mismatch; + + /* + * svc_authenticate() has already added the verifier and + * advanced the stream just past rq_accept_statp. + */ xdr_stream_encode_u32(xdr, process.mismatch.lovers); xdr_stream_encode_u32(xdr, process.mismatch.hivers); goto sendit; @@ -1449,19 +1454,19 @@ err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); serv->sv_stats->rpcbadfmt++; - xdr_stream_encode_u32(xdr, RPC_PROC_UNAVAIL); + *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; err_garbage_args: svc_printk(rqstp, "failed to decode RPC header\n"); serv->sv_stats->rpcbadfmt++; - xdr_stream_encode_u32(xdr, RPC_GARBAGE_ARGS); + *rqstp->rq_accept_statp = rpc_garbage_args; goto sendit; err_system_err: serv->sv_stats->rpcbadfmt++; - xdr_stream_encode_u32(xdr, RPC_SYSTEM_ERR); + *rqstp->rq_accept_statp = rpc_system_err; goto sendit; } -- cgit v1.2.3 From 319050d4302ee9b216fb8cea847a9c4427628a98 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 4 May 2023 13:43:33 -0400 Subject: SUNRPC: Fix error handling in svc_setup_socket() Dan points out that sock_alloc_file() releases @sock on error, but so do all of svc_setup_socket's callers, resulting in a double- release if sock_alloc_file() returns an error. Rather than allocating a struct file for all new sockets, allocate one only for sockets created during a TCP accept. For the moment, those are the only ones that will ever be used with RPC-with-TLS. Reported-by: Dan Carpenter Fixes: ae0d77708aae ("SUNRPC: Ensure server-side sockets have a sock->file") Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index a51c9b989d58..9989194446a5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -895,6 +895,9 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } + if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL))) + return NULL; + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin); @@ -935,7 +938,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) return &newsvsk->sk_xprt; failed: - sock_release(newsock); + sockfd_put(newsock); return NULL; } @@ -1430,7 +1433,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, int flags) { - struct file *filp = NULL; struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); @@ -1439,14 +1441,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, if (!svsk) return ERR_PTR(-ENOMEM); - if (!sock->file) { - filp = sock_alloc_file(sock, O_NONBLOCK, NULL); - if (IS_ERR(filp)) { - kfree(svsk); - return ERR_CAST(filp); - } - } - inet = sock->sk; if (pmap_register) { @@ -1456,8 +1450,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { - if (filp) - fput(filp); kfree(svsk); return ERR_PTR(err); } -- cgit v1.2.3 From 424f8416bb39936df6365442d651ee729b283460 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 May 2023 17:06:18 +0000 Subject: net: skb_partial_csum_set() fix against transport header magic value skb->transport_header uses the special 0xFFFF value to mark if the transport header was set or not. We must prevent callers to accidentaly set skb->transport_header to 0xFFFF. Note that only fuzzers can possibly do this today. syzbot reported: WARNING: CPU: 0 PID: 2340 at include/linux/skbuff.h:2847 skb_transport_offset include/linux/skbuff.h:2956 [inline] WARNING: CPU: 0 PID: 2340 at include/linux/skbuff.h:2847 virtio_net_hdr_to_skb+0xbcc/0x10c0 include/linux/virtio_net.h:103 Modules linked in: CPU: 0 PID: 2340 Comm: syz-executor.0 Not tainted 6.3.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023 RIP: 0010:skb_transport_header include/linux/skbuff.h:2847 [inline] RIP: 0010:skb_transport_offset include/linux/skbuff.h:2956 [inline] RIP: 0010:virtio_net_hdr_to_skb+0xbcc/0x10c0 include/linux/virtio_net.h:103 Code: 41 39 df 0f 82 c3 04 00 00 48 8b 7c 24 10 44 89 e6 e8 08 6e 59 ff 48 85 c0 74 54 e8 ce 36 7e fc e9 37 f8 ff ff e8 c4 36 7e fc <0f> 0b e9 93 f8 ff ff 44 89 f7 44 89 e6 e8 32 38 7e fc 45 39 e6 0f RSP: 0018:ffffc90004497880 EFLAGS: 00010293 RAX: ffffffff84fea55c RBX: 000000000000ffff RCX: ffff888120be2100 RDX: 0000000000000000 RSI: 000000000000ffff RDI: 000000000000ffff RBP: ffffc90004497990 R08: ffffffff84fe9de5 R09: 0000000000000034 R10: ffffea00048ebd80 R11: 0000000000000034 R12: ffff88811dc2d9c8 R13: dffffc0000000000 R14: ffff88811dc2d9ae R15: 1ffff11023b85b35 FS: 00007f9211a59700(0000) GS:ffff8881f6c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200002c0 CR3: 00000001215a5000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: packet_snd net/packet/af_packet.c:3076 [inline] packet_sendmsg+0x4590/0x61a0 net/packet/af_packet.c:3115 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] __sys_sendto+0x472/0x630 net/socket.c:2144 __do_sys_sendto net/socket.c:2156 [inline] __se_sys_sendto net/socket.c:2152 [inline] __x64_sys_sendto+0xe5/0x100 net/socket.c:2152 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2f/0x50 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f9210c8c169 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 f1 19 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9211a59168 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007f9210dabf80 RCX: 00007f9210c8c169 RDX: 000000000000ffed RSI: 00000000200000c0 RDI: 0000000000000003 RBP: 00007f9210ce7ca1 R08: 0000000020000540 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe135d65cf R14: 00007f9211a59300 R15: 0000000000022000 Fixes: 66e4c8d95008 ("net: warn if transport header was not set") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26a586007d8b..515ec5cdc79c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5298,7 +5298,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); u32 csum_start = skb_headroom(skb) + (u32)start; - if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", start, off, skb_headroom(skb), skb_headlen(skb)); return false; @@ -5306,7 +5306,7 @@ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = csum_start; skb->csum_offset = off; - skb_set_transport_header(skb, start); + skb->transport_header = csum_start; return true; } EXPORT_SYMBOL_GPL(skb_partial_csum_set); -- cgit v1.2.3 From 3d776e31c841ba2f69895d2255a49320bec7cea6 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Tue, 9 May 2023 10:59:58 +0200 Subject: xfrm: Reject optional tunnel/BEET mode templates in outbound policies xfrm_state_find() uses `encap_family` of the current template with the passed local and remote addresses to find a matching state. If an optional tunnel or BEET mode template is skipped in a mixed-family scenario, there could be a mismatch causing an out-of-bounds read as the addresses were not replaced to match the family of the next template. While there are theoretical use cases for optional templates in outbound policies, the only practical one is to skip IPComp states in inbound policies if uncompressed packets are received that are handled by an implicitly created IPIP state instead. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Tobias Brunner Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index af8fbcbfbe69..6794b9dea27a 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1768,7 +1768,7 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, } static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, - struct netlink_ext_ack *extack) + int dir, struct netlink_ext_ack *extack) { u16 prev_family; int i; @@ -1794,6 +1794,10 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, switch (ut[i].mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + if (ut[i].optional && dir == XFRM_POLICY_OUT) { + NL_SET_ERR_MSG(extack, "Mode in optional template not allowed in outbound policy"); + return -EINVAL; + } break; default: if (ut[i].family != prev_family) { @@ -1831,7 +1835,7 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, } static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs, - struct netlink_ext_ack *extack) + int dir, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_TMPL]; @@ -1842,7 +1846,7 @@ static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs, int nr = nla_len(rt) / sizeof(*utmpl); int err; - err = validate_tmpl(nr, utmpl, pol->family, extack); + err = validate_tmpl(nr, utmpl, pol->family, dir, extack); if (err) return err; @@ -1919,7 +1923,7 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, if (err) goto error; - if (!(err = copy_from_user_tmpl(xp, attrs, extack))) + if (!(err = copy_from_user_tmpl(xp, attrs, p->dir, extack))) err = copy_from_user_sec_ctx(xp, attrs); if (err) goto error; @@ -3498,7 +3502,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, return NULL; nr = ((len - sizeof(*p)) / sizeof(*ut)); - if (validate_tmpl(nr, ut, p->sel.family, NULL)) + if (validate_tmpl(nr, ut, p->sel.family, p->dir, NULL)) return NULL; if (p->dir > XFRM_POLICY_OUT) -- cgit v1.2.3 From cf3128a7aca55b2eefb68281d44749c683bdc96f Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Tue, 9 May 2023 11:00:06 +0200 Subject: af_key: Reject optional tunnel/BEET mode templates in outbound policies xfrm_state_find() uses `encap_family` of the current template with the passed local and remote addresses to find a matching state. If an optional tunnel or BEET mode template is skipped in a mixed-family scenario, there could be a mismatch causing an out-of-bounds read as the addresses were not replaced to match the family of the next template. While there are theoretical use cases for optional templates in outbound policies, the only practical one is to skip IPComp states in inbound policies if uncompressed packets are received that are handled by an implicitly created IPIP state instead. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Tobias Brunner Acked-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/key/af_key.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index a815f5ab4c49..31ab12fd720a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1940,7 +1940,8 @@ static u32 gen_reqid(struct net *net) } static int -parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) +parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, + struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; @@ -1958,9 +1959,12 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; - if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) + if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { + if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && + pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) + return -EINVAL; t->optional = 1; - else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { + } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; @@ -2002,7 +2006,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; - if ((err = parse_ipsecrequest(xp, rq)) < 0) + if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); -- cgit v1.2.3 From 8680407b6f8f5fba59e8f1d63c869abc280f04df Mon Sep 17 00:00:00 2001 From: Benedict Wong Date: Wed, 10 May 2023 01:14:14 +0000 Subject: xfrm: Check if_id in inbound policy/secpath match This change ensures that if configured in the policy, the if_id set in the policy and secpath states match during the inbound policy check. Without this, there is potential for ambiguity where entries in the secpath differing by only the if_id could be mismatched. Notably, this is checked in the outbound direction when resolving templates to SAs, but not on the inbound path when matching SAs and policies. Test: Tested against Android kernel unit tests & CTS Signed-off-by: Benedict Wong Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 21a3a1cd3d6d..6d15788b5123 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3312,7 +3312,7 @@ xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, - unsigned short family) + unsigned short family, u32 if_id) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); @@ -3323,7 +3323,8 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, (tmpl->allalgs || (tmpl->aalgos & (1<props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && - xfrm_state_addr_cmp(tmpl, x, family)); + xfrm_state_addr_cmp(tmpl, x, family)) && + (if_id == 0 || if_id == x->if_id); } /* @@ -3335,7 +3336,7 @@ xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, - unsigned short family) + unsigned short family, u32 if_id) { int idx = start; @@ -3345,7 +3346,7 @@ xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int star } else start = -1; for (; idx < sp->len; idx++) { - if (xfrm_state_ok(tmpl, sp->xvec[idx], family)) + if (xfrm_state_ok(tmpl, sp->xvec[idx], family, if_id)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (start == -1) @@ -3724,7 +3725,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, * are implied between each two transformations. */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { - k = xfrm_policy_ok(tpp[i], sp, k, family); + k = xfrm_policy_ok(tpp[i], sp, k, family, if_id); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ -- cgit v1.2.3 From dc1c9fd4a8bbe1e06add9053010b652449bfe411 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 May 2023 14:20:21 +0200 Subject: netfilter: nf_tables: always release netdev hooks from notifier This reverts "netfilter: nf_tables: skip netdev events generated on netns removal". The problem is that when a veth device is released, the veth release callback will also queue the peer netns device for removal. Its possible that the peer netns is also slated for removal. In this case, the device memory is already released before the pre_exit hook of the peer netns runs: BUG: KASAN: slab-use-after-free in nf_hook_entry_head+0x1b8/0x1d0 Read of size 8 at addr ffff88812c0124f0 by task kworker/u8:1/45 Workqueue: netns cleanup_net Call Trace: nf_hook_entry_head+0x1b8/0x1d0 __nf_unregister_net_hook+0x76/0x510 nft_netdev_unregister_hooks+0xa0/0x220 __nft_release_hook+0x184/0x490 nf_tables_pre_exit_net+0x12f/0x1b0 .. Order is: 1. First netns is released, veth_dellink() queues peer netns device for removal 2. peer netns is queued for removal 3. peer netns device is released, unreg event is triggered 4. unreg event is ignored because netns is going down 5. pre_exit hook calls nft_netdev_unregister_hooks but device memory might be free'd already. Fixes: 68a3765c659f ("netfilter: nf_tables: skip netdev events generated on netns removal") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_chain_filter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index c3563f0be269..680fe557686e 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -344,6 +344,12 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } + /* UNREGISTER events are also happening on netns exit. + * + * Although nf_tables core releases all tables/chains, only this event + * handler provides guarantee that hook->ops.dev is still accessible, + * so we cannot skip exiting net namespaces. + */ __nft_release_basechain(ctx); } @@ -362,9 +368,6 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - if (!check_net(ctx.net)) - return NOTIFY_DONE; - nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { -- cgit v1.2.3 From e72eeab542dbf4f544e389e64fa13b82a1b6d003 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 May 2023 14:55:02 +0200 Subject: netfilter: conntrack: fix possible bug_on with enable_hooks=1 I received a bug report (no reproducer so far) where we trip over 712 rcu_read_lock(); 713 ct_hook = rcu_dereference(nf_ct_hook); 714 BUG_ON(ct_hook == NULL); // here In nf_conntrack_destroy(). First turn this BUG_ON into a WARN. I think it was triggered via enable_hooks=1 flag. When this flag is turned on, the conntrack hooks are registered before nf_ct_hook pointer gets assigned. This opens a short window where packets enter the conntrack machinery, can have skb->_nfct set up and a subsequent kfree_skb might occur before nf_ct_hook is set. Call nf_conntrack_init_end() to set nf_ct_hook before we register the pernet ops. Fixes: ba3fbe663635 ("netfilter: nf_conntrack: provide modparam to always register conntrack hooks") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 6 ++++-- net/netfilter/nf_conntrack_standalone.c | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f0783e42108b..5f76ae86a656 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -711,9 +711,11 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct) rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); - BUG_ON(ct_hook == NULL); - ct_hook->destroy(nfct); + if (ct_hook) + ct_hook->destroy(nfct); rcu_read_unlock(); + + WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 57f6724c99a7..169e16fc2bce 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1218,11 +1218,12 @@ static int __init nf_conntrack_standalone_init(void) nf_conntrack_htable_size_user = nf_conntrack_htable_size; #endif + nf_conntrack_init_end(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out_pernet; - nf_conntrack_init_end(); return 0; out_pernet: -- cgit v1.2.3 From a939d14919b799e6fff8a9c80296ca229ba2f8a4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 16:56:34 +0000 Subject: netlink: annotate accesses to nlk->cb_running Both netlink_recvmsg() and netlink_native_seq_show() read nlk->cb_running locklessly. Use READ_ONCE() there. Add corresponding WRITE_ONCE() to netlink_dump() and __netlink_dump_start() syzbot reported: BUG: KCSAN: data-race in __netlink_dump_start / netlink_recvmsg write to 0xffff88813ea4db59 of 1 bytes by task 28219 on cpu 0: __netlink_dump_start+0x3af/0x4d0 net/netlink/af_netlink.c:2399 netlink_dump_start include/linux/netlink.h:308 [inline] rtnetlink_rcv_msg+0x70f/0x8c0 net/core/rtnetlink.c:6130 netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2577 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6192 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x56f/0x640 net/netlink/af_netlink.c:1365 netlink_sendmsg+0x665/0x770 net/netlink/af_netlink.c:1942 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] sock_write_iter+0x1aa/0x230 net/socket.c:1138 call_write_iter include/linux/fs.h:1851 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x463/0x760 fs/read_write.c:584 ksys_write+0xeb/0x1a0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x42/0x50 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88813ea4db59 of 1 bytes by task 28222 on cpu 1: netlink_recvmsg+0x3b4/0x730 net/netlink/af_netlink.c:2022 sock_recvmsg_nosec+0x4c/0x80 net/socket.c:1017 ____sys_recvmsg+0x2db/0x310 net/socket.c:2718 ___sys_recvmsg net/socket.c:2762 [inline] do_recvmmsg+0x2e5/0x710 net/socket.c:2856 __sys_recvmmsg net/socket.c:2935 [inline] __do_sys_recvmmsg net/socket.c:2958 [inline] __se_sys_recvmmsg net/socket.c:2951 [inline] __x64_sys_recvmmsg+0xe2/0x160 net/socket.c:2951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00 -> 0x01 Fixes: 16b304f3404f ("netlink: Eliminate kmalloc in netlink dump operation.") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7ef8b9a1e30c..c87804112d0c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1990,7 +1990,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, skb_free_datagram(sk, skb); - if (nlk->cb_running && + if (READ_ONCE(nlk->cb_running) && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { @@ -2302,7 +2302,7 @@ static int netlink_dump(struct sock *sk) if (cb->done) cb->done(cb); - nlk->cb_running = false; + WRITE_ONCE(nlk->cb_running, false); module = cb->module; skb = cb->skb; mutex_unlock(nlk->cb_mutex); @@ -2365,7 +2365,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, goto error_put; } - nlk->cb_running = true; + WRITE_ONCE(nlk->cb_running, true); nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); @@ -2703,7 +2703,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - nlk->cb_running, + READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) -- cgit v1.2.3 From e05a5f510f26607616fecdd4ac136310c8bea56b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 16:35:53 +0000 Subject: net: annotate sk->sk_err write from do_recvmmsg() do_recvmmsg() can write to sk->sk_err from multiple threads. As said before, many other points reading or writing sk_err need annotations. Fixes: 34b88a68f26a ("net: Fix use after free in the recvmmsg exit path") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index a7b4b37d86df..b7e01d0fe082 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2911,7 +2911,7 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, * error to return on the next call or if the * app asks about it using getsockopt(SO_ERROR). */ - sock->sk->sk_err = -err; + WRITE_ONCE(sock->sk->sk_err, -err); } out_put: fput_light(sock->file, fput_needed); -- cgit v1.2.3 From d0ac89f6f9879fae316c155de77b5173b3e2c9c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 18:29:48 +0000 Subject: net: deal with most data-races in sk_wait_event() __condition is evaluated twice in sk_wait_event() macro. First invocation is lockless, and reads can race with writes, as spotted by syzbot. BUG: KCSAN: data-race in sk_stream_wait_connect / tcp_disconnect write to 0xffff88812d83d6a0 of 4 bytes by task 9065 on cpu 1: tcp_disconnect+0x2cd/0xdb0 inet_shutdown+0x19e/0x1f0 net/ipv4/af_inet.c:911 __sys_shutdown_sock net/socket.c:2343 [inline] __sys_shutdown net/socket.c:2355 [inline] __do_sys_shutdown net/socket.c:2363 [inline] __se_sys_shutdown+0xf8/0x140 net/socket.c:2361 __x64_sys_shutdown+0x31/0x40 net/socket.c:2361 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88812d83d6a0 of 4 bytes by task 9040 on cpu 0: sk_stream_wait_connect+0x1de/0x3a0 net/core/stream.c:75 tcp_sendmsg_locked+0x2e4/0x2120 net/ipv4/tcp.c:1266 tcp_sendmsg+0x30/0x50 net/ipv4/tcp.c:1484 inet6_sendmsg+0x63/0x80 net/ipv6/af_inet6.c:651 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] __sys_sendto+0x246/0x300 net/socket.c:2142 __do_sys_sendto net/socket.c:2154 [inline] __se_sys_sendto net/socket.c:2150 [inline] __x64_sys_sendto+0x78/0x90 net/socket.c:2150 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00000000 -> 0x00000068 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/stream.c | 12 ++++++------ net/ipv4/tcp_bpf.c | 2 +- net/llc/af_llc.c | 8 +++++--- net/smc/smc_close.c | 4 ++-- net/smc/smc_rx.c | 4 ++-- net/smc/smc_tx.c | 4 ++-- net/tipc/socket.c | 4 ++-- net/tls/tls_main.c | 3 ++- 8 files changed, 22 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/stream.c b/net/core/stream.c index 434446ab14c5..f5c4e47df165 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -73,8 +73,8 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, - !sk->sk_err && - !((1 << sk->sk_state) & + !READ_ONCE(sk->sk_err) && + !((1 << READ_ONCE(sk->sk_state)) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending--; @@ -87,9 +87,9 @@ EXPORT_SYMBOL(sk_stream_wait_connect); * sk_stream_closing - Return 1 if we still have things to send in our buffers. * @sk: socket to verify */ -static inline int sk_stream_closing(struct sock *sk) +static int sk_stream_closing(const struct sock *sk) { - return (1 << sk->sk_state) & + return (1 << READ_ONCE(sk->sk_state)) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); } @@ -142,8 +142,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + sk_wait_event(sk, ¤t_timeo, READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || (sk_stream_memory_free(sk) && !vm_wait), &wait); sk->sk_write_pending--; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ebf917511937..2e9547467edb 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -168,7 +168,7 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); + !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index da7fe94bea2e..9ffbc667be6c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -583,7 +583,8 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) add_wait_queue(sk_sleep(sk), &wait); while (1) { - if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait)) + if (sk_wait_event(sk, &timeout, + READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) @@ -603,7 +604,8 @@ static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) add_wait_queue(sk_sleep(sk), &wait); while (1) { - if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait)) + if (sk_wait_event(sk, &timeout, + READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; @@ -622,7 +624,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) while (1) { rc = 0; if (sk_wait_event(sk, &timeout, - (sk->sk_shutdown & RCV_SHUTDOWN) || + (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 31db7438857c..dbdf03e8aa5b 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -67,8 +67,8 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - sk->sk_err == ECONNABORTED || - sk->sk_err == ECONNRESET || + READ_ONCE(sk->sk_err) == ECONNABORTED || + READ_ONCE(sk->sk_err) == ECONNRESET || smc->conn.killed, &wait); if (rc) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 4380d32f5a5f..9a2f3638d161 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -267,9 +267,9 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, - sk->sk_err || + READ_ONCE(sk->sk_err) || cflags->peer_conn_abort || - sk->sk_shutdown & RCV_SHUTDOWN || + READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || fcrit(conn), &wait); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f4b6a71ac488..45128443f1f1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -113,8 +113,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, - sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 37edfe10f8c6..dd73d71c02a9 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -314,9 +314,9 @@ static void tsk_rej_rx_queue(struct sock *sk, int error) tipc_sk_respond(sk, skb, error); } -static bool tipc_sk_connected(struct sock *sk) +static bool tipc_sk_connected(const struct sock *sk) { - return sk->sk_state == TIPC_ESTABLISHED; + return READ_ONCE(sk->sk_state) == TIPC_ESTABLISHED; } /* tipc_sk_type_connectionless - check if the socket is datagram socket diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index b32c112984dd..f2e7302a4d96 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -111,7 +111,8 @@ int wait_on_pending_writer(struct sock *sk, long *timeo) break; } - if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait)) + if (sk_wait_event(sk, timeo, + !READ_ONCE(sk->sk_write_pending), &wait)) break; } remove_wait_queue(sk_sleep(sk), &wait); -- cgit v1.2.3 From 4063384ef762cc5946fc7a3f89879e76c6ec51e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 13:18:57 +0000 Subject: net: add vlan_get_protocol_and_depth() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before blamed commit, pskb_may_pull() was used instead of skb_header_pointer() in __vlan_get_protocol() and friends. Few callers depended on skb->head being populated with MAC header, syzbot caught one of them (skb_mac_gso_segment()) Add vlan_get_protocol_and_depth() to make the intent clearer and use it where sensible. This is a more generic fix than commit e9d3f80935b6 ("net/af_packet: make sure to pull mac header") which was dealing with a similar issue. kernel BUG at include/linux/skbuff.h:2655 ! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1441 Comm: syz-executor199 Not tainted 6.1.24-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023 RIP: 0010:__skb_pull include/linux/skbuff.h:2655 [inline] RIP: 0010:skb_mac_gso_segment+0x68f/0x6a0 net/core/gro.c:136 Code: fd 48 8b 5c 24 10 44 89 6b 70 48 c7 c7 c0 ae 0d 86 44 89 e6 e8 a1 91 d0 00 48 c7 c7 00 af 0d 86 48 89 de 31 d2 e8 d1 4a e9 ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 RSP: 0018:ffffc90001bd7520 EFLAGS: 00010286 RAX: ffffffff8469736a RBX: ffff88810f31dac0 RCX: ffff888115a18b00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc90001bd75e8 R08: ffffffff84697183 R09: fffff5200037adf9 R10: 0000000000000000 R11: dffffc0000000001 R12: 0000000000000012 R13: 000000000000fee5 R14: 0000000000005865 R15: 000000000000fed7 FS: 000055555633f300(0000) GS:ffff8881f6a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 0000000116fea000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] __skb_gso_segment+0x32d/0x4c0 net/core/dev.c:3419 [] skb_gso_segment include/linux/netdevice.h:4819 [inline] [] validate_xmit_skb+0x3aa/0xee0 net/core/dev.c:3725 [] __dev_queue_xmit+0x1332/0x3300 net/core/dev.c:4313 [] dev_queue_xmit+0x17/0x20 include/linux/netdevice.h:3029 [] packet_snd net/packet/af_packet.c:3111 [inline] [] packet_sendmsg+0x49d2/0x6470 net/packet/af_packet.c:3142 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] __sys_sendto+0x472/0x5f0 net/socket.c:2139 [] __do_sys_sendto net/socket.c:2151 [inline] [] __se_sys_sendto net/socket.c:2147 [inline] [] __x64_sys_sendto+0xe5/0x100 net/socket.c:2147 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x2f/0x50 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 469aceddfa3e ("vlan: consolidate VLAN parsing code and limit max parsing depth") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Toke Høiland-Jørgensen Cc: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 2 +- net/core/dev.c | 2 +- net/packet/af_packet.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 57744704ff69..84d6dd5e5b1a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -42,7 +42,7 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb eth_type_vlan(skb->protocol)) { int depth; - if (!__vlan_get_protocol(skb, skb->protocol, &depth)) + if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth)) goto drop; skb_set_network_header(skb, depth); diff --git a/net/core/dev.c b/net/core/dev.c index 735096d42c1d..b3c13e041935 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3335,7 +3335,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - return __vlan_get_protocol(skb, type, depth); + return vlan_get_protocol_and_depth(skb, type, depth); } /* openvswitch calls this on rx path, so we need a different check. diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 640d94e34635..94c6a1ffa459 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1934,10 +1934,8 @@ static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) { - if (pskb_may_pull(skb, depth)) - skb_set_network_header(skb, depth); - } + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } -- cgit v1.2.3 From e14cadfd80d76f01bfaa1a8d745b1db19b57d6be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 20:36:56 +0000 Subject: tcp: add annotations around sk->sk_shutdown accesses Now sk->sk_shutdown is no longer a bitfield, we can add standard READ_ONCE()/WRITE_ONCE() annotations to silence KCSAN reports like the following: BUG: KCSAN: data-race in tcp_disconnect / tcp_poll write to 0xffff88814588582c of 1 bytes by task 3404 on cpu 1: tcp_disconnect+0x4d6/0xdb0 net/ipv4/tcp.c:3121 __inet_stream_connect+0x5dd/0x6e0 net/ipv4/af_inet.c:715 inet_stream_connect+0x48/0x70 net/ipv4/af_inet.c:727 __sys_connect_file net/socket.c:2001 [inline] __sys_connect+0x19b/0x1b0 net/socket.c:2018 __do_sys_connect net/socket.c:2028 [inline] __se_sys_connect net/socket.c:2025 [inline] __x64_sys_connect+0x41/0x50 net/socket.c:2025 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff88814588582c of 1 bytes by task 3374 on cpu 0: tcp_poll+0x2e6/0x7d0 net/ipv4/tcp.c:562 sock_poll+0x253/0x270 net/socket.c:1383 vfs_poll include/linux/poll.h:88 [inline] io_poll_check_events io_uring/poll.c:281 [inline] io_poll_task_func+0x15a/0x820 io_uring/poll.c:333 handle_tw_list io_uring/io_uring.c:1184 [inline] tctx_task_work+0x1fe/0x4d0 io_uring/io_uring.c:1246 task_work_run+0x123/0x160 kernel/task_work.c:179 get_signal+0xe64/0xff0 kernel/signal.c:2635 arch_do_signal_or_restart+0x89/0x2a0 arch/x86/kernel/signal.c:306 exit_to_user_mode_loop+0x6f/0xe0 kernel/entry/common.c:168 exit_to_user_mode_prepare+0x6c/0xb0 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:286 [inline] syscall_exit_to_user_mode+0x26/0x140 kernel/entry/common.c:297 do_syscall_64+0x4d/0xc0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x03 -> 0x00 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 14 ++++++++------ net/ipv4/tcp_input.c | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 940062e08f57..c4aab3aacbd8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -894,7 +894,7 @@ int inet_shutdown(struct socket *sock, int how) EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ fallthrough; default: - sk->sk_shutdown |= how; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 20db115c38c4..4d6392c16b7a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -498,6 +498,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); + u8 shutdown; int state; sock_poll_wait(file, sock, wait); @@ -540,9 +541,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ - if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ @@ -559,7 +561,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; - if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (!(shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ @@ -2867,7 +2869,7 @@ void __tcp_close(struct sock *sk, long timeout) int data_was_unread = 0; int state; - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); @@ -3119,7 +3121,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet_bhash2_reset_saddr(sk); - sk->sk_shutdown = 0; + WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -4649,7 +4651,7 @@ void tcp_done(struct sock *sk) if (req) reqsk_fastopen_remove(sk, req, false); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a057330d6f59..61b6710f337a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4362,7 +4362,7 @@ void tcp_fin(struct sock *sk) inet_csk_schedule_ack(sk); - sk->sk_shutdown |= RCV_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); sock_set_flag(sk, SOCK_DONE); switch (sk->sk_state) { @@ -6599,7 +6599,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; tcp_set_state(sk, TCP_FIN_WAIT2); - sk->sk_shutdown |= SEND_SHUTDOWN; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); sk_dst_confirm(sk); -- cgit v1.2.3 From 5bca1d081f44c9443e61841842ce4e9179d327b6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 17:31:31 +0000 Subject: net: datagram: fix data-races in datagram_poll() datagram_poll() runs locklessly, we should add READ_ONCE() annotations while reading sk->sk_err, sk->sk_shutdown and sk->sk_state. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230509173131.3263780-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/datagram.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index 5662dff3d381..176eb5834746 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -807,18 +807,21 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; /* exceptional events? */ - if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) + if (READ_ONCE(sk->sk_err) || + !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + shutdown = READ_ONCE(sk->sk_shutdown); + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ @@ -827,10 +830,12 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, /* Connection-based need to check for termination and startup */ if (connection_based(sk)) { - if (sk->sk_state == TCP_CLOSE) + int state = READ_ONCE(sk->sk_state); + + if (state == TCP_CLOSE) mask |= EPOLLHUP; /* connection hasn't started yet? */ - if (sk->sk_state == TCP_SYN_SENT) + if (state == TCP_SYN_SENT) return mask; } -- cgit v1.2.3 From 679ed006d416ea0cecfe24a99d365d1dea69c683 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 May 2023 17:34:55 -0700 Subject: af_unix: Fix a data race of sk->sk_receive_queue->qlen. KCSAN found a data race of sk->sk_receive_queue->qlen where recvmsg() updates qlen under the queue lock and sendmsg() checks qlen under unix_state_sock(), not the queue lock, so the reader side needs READ_ONCE(). BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_wait_for_peer write (marked) to 0xffff888019fe7c68 of 4 bytes by task 49792 on cpu 0: __skb_unlink include/linux/skbuff.h:2347 [inline] __skb_try_recv_from_queue+0x3de/0x470 net/core/datagram.c:197 __skb_try_recv_datagram+0xf7/0x390 net/core/datagram.c:263 __unix_dgram_recvmsg+0x109/0x8a0 net/unix/af_unix.c:2452 unix_dgram_recvmsg+0x94/0xa0 net/unix/af_unix.c:2549 sock_recvmsg_nosec net/socket.c:1019 [inline] ____sys_recvmsg+0x3a3/0x3b0 net/socket.c:2720 ___sys_recvmsg+0xc8/0x150 net/socket.c:2764 do_recvmmsg+0x182/0x560 net/socket.c:2858 __sys_recvmmsg net/socket.c:2937 [inline] __do_sys_recvmmsg net/socket.c:2960 [inline] __se_sys_recvmmsg net/socket.c:2953 [inline] __x64_sys_recvmmsg+0x153/0x170 net/socket.c:2953 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc read to 0xffff888019fe7c68 of 4 bytes by task 49793 on cpu 1: skb_queue_len include/linux/skbuff.h:2127 [inline] unix_recvq_full net/unix/af_unix.c:229 [inline] unix_wait_for_peer+0x154/0x1a0 net/unix/af_unix.c:1445 unix_dgram_sendmsg+0x13bc/0x14b0 net/unix/af_unix.c:2048 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg+0x148/0x160 net/socket.c:747 ____sys_sendmsg+0x20e/0x620 net/socket.c:2503 ___sys_sendmsg+0xc6/0x140 net/socket.c:2557 __sys_sendmmsg+0x11d/0x370 net/socket.c:2643 __do_sys_sendmmsg net/socket.c:2672 [inline] __se_sys_sendmmsg net/socket.c:2669 [inline] __x64_sys_sendmmsg+0x58/0x70 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc value changed: 0x0000000b -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 49793 Comm: syz-executor.0 Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Michal Kubiak Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fb31e8a4409e..08102e728b15 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1442,7 +1442,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) sched = !sock_flag(other, SOCK_DEAD) && !(other->sk_shutdown & RCV_SHUTDOWN) && - unix_recvq_full(other); + unix_recvq_full_lockless(other); unix_state_unlock(other); -- cgit v1.2.3 From e1d09c2c2f5793474556b60f83900e088d0d366d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 9 May 2023 17:34:56 -0700 Subject: af_unix: Fix data races around sk->sk_shutdown. KCSAN found a data race around sk->sk_shutdown where unix_release_sock() and unix_shutdown() update it under unix_state_lock(), OTOH unix_poll() and unix_dgram_poll() read it locklessly. We need to annotate the writes and reads with WRITE_ONCE() and READ_ONCE(). BUG: KCSAN: data-race in unix_poll / unix_release_sock write to 0xffff88800d0f8aec of 1 bytes by task 264 on cpu 0: unix_release_sock+0x75c/0x910 net/unix/af_unix.c:631 unix_release+0x59/0x80 net/unix/af_unix.c:1042 __sock_release+0x7d/0x170 net/socket.c:653 sock_close+0x19/0x30 net/socket.c:1397 __fput+0x179/0x5e0 fs/file_table.c:321 ____fput+0x15/0x20 fs/file_table.c:349 task_work_run+0x116/0x1a0 kernel/task_work.c:179 resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0x174/0x180 kernel/entry/common.c:204 __syscall_exit_to_user_mode_work kernel/entry/common.c:286 [inline] syscall_exit_to_user_mode+0x1a/0x30 kernel/entry/common.c:297 do_syscall_64+0x4b/0x90 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x72/0xdc read to 0xffff88800d0f8aec of 1 bytes by task 222 on cpu 1: unix_poll+0xa3/0x2a0 net/unix/af_unix.c:3170 sock_poll+0xcf/0x2b0 net/socket.c:1385 vfs_poll include/linux/poll.h:88 [inline] ep_item_poll.isra.0+0x78/0xc0 fs/eventpoll.c:855 ep_send_events fs/eventpoll.c:1694 [inline] ep_poll fs/eventpoll.c:1823 [inline] do_epoll_wait+0x6c4/0xea0 fs/eventpoll.c:2258 __do_sys_epoll_wait fs/eventpoll.c:2270 [inline] __se_sys_epoll_wait fs/eventpoll.c:2265 [inline] __x64_sys_epoll_wait+0xcc/0x190 fs/eventpoll.c:2265 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x72/0xdc value changed: 0x00 -> 0x03 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 222 Comm: dbus-broker Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Fixes: 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM sockets") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Michal Kubiak Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 08102e728b15..cc695c9f09ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -603,7 +603,7 @@ static void unix_release_sock(struct sock *sk, int embrion) /* Clear state */ unix_state_lock(sk); sock_orphan(sk); - sk->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); path = u->path; u->path.dentry = NULL; u->path.mnt = NULL; @@ -628,7 +628,7 @@ static void unix_release_sock(struct sock *sk, int embrion) if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); /* No more writes */ - skpair->sk_shutdown = SHUTDOWN_MASK; + WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); @@ -3008,7 +3008,7 @@ static int unix_shutdown(struct socket *sock, int mode) ++mode; unix_state_lock(sk); - sk->sk_shutdown |= mode; + WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); other = unix_peer(sk); if (other) sock_hold(other); @@ -3028,7 +3028,7 @@ static int unix_shutdown(struct socket *sock, int mode) if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; unix_state_lock(other); - other->sk_shutdown |= peer_mode; + WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); unix_state_unlock(other); other->sk_state_change(other); if (peer_mode == SHUTDOWN_MASK) @@ -3160,16 +3160,18 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa { struct sock *sk = sock->sk; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; + shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; /* readable? */ @@ -3203,9 +3205,11 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int writable; __poll_t mask; + u8 shutdown; sock_poll_wait(file, sock, wait); mask = 0; + shutdown = READ_ONCE(sk->sk_shutdown); /* exceptional events? */ if (READ_ONCE(sk->sk_err) || @@ -3213,9 +3217,9 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); - if (sk->sk_shutdown & RCV_SHUTDOWN) + if (shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; - if (sk->sk_shutdown == SHUTDOWN_MASK) + if (shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; /* readable? */ -- cgit v1.2.3 From e93c9378e33f68b61ea9318580d841caa22fb9ea Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 10 May 2023 16:46:21 +0200 Subject: devlink: change per-devlink netdev notifier to static one The commit 565b4824c39f ("devlink: change port event netdev notifier from per-net to global") changed original per-net notifier to be per-devlink instance. That fixed the issue of non-receiving events of netdev uninit if that moved to a different namespace. That worked fine in -net tree. However, later on when commit ee75f1fc44dd ("net/mlx5e: Create separate devlink instance for ethernet auxiliary device") and commit 72ed5d5624af ("net/mlx5: Suspend auxiliary devices only in case of PCI device suspend") were merged, a deadlock was introduced when removing a namespace with devlink instance with another nested instance. Here there is the bad flow example resulting in deadlock with mlx5: net_cleanup_work -> cleanup_net (takes down_read(&pernet_ops_rwsem) -> devlink_pernet_pre_exit() -> devlink_reload() -> mlx5_devlink_reload_down() -> mlx5_unload_one_devl_locked() -> mlx5_detach_device() -> del_adev() -> mlx5e_remove() -> mlx5e_destroy_devlink() -> devlink_free() -> unregister_netdevice_notifier() (takes down_write(&pernet_ops_rwsem) Steps to reproduce: $ modprobe mlx5_core $ ip netns add ns1 $ devlink dev reload pci/0000:08:00.0 netns ns1 $ ip netns del ns1 Resolve this by converting the notifier from per-devlink instance to a static one registered during init phase and leaving it registered forever. Use this notifier for all devlink port instances created later on. Note what a tree needs this fix only in case all of the cited fixes commits are present. Reported-by: Moshe Shemesh Fixes: 565b4824c39f ("devlink: change port event netdev notifier from per-net to global") Fixes: ee75f1fc44dd ("net/mlx5e: Create separate devlink instance for ethernet auxiliary device") Fixes: 72ed5d5624af ("net/mlx5: Suspend auxiliary devices only in case of PCI device suspend") Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230510144621.932017-1-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- net/devlink/core.c | 16 +++++++--------- net/devlink/devl_internal.h | 1 - net/devlink/leftover.c | 5 ++--- 3 files changed, 9 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/devlink/core.c b/net/devlink/core.c index 777b091ef74d..0e58eee44bdb 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -204,11 +204,6 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, if (ret < 0) goto err_xa_alloc; - devlink->netdevice_nb.notifier_call = devlink_port_netdevice_event; - ret = register_netdevice_notifier(&devlink->netdevice_nb); - if (ret) - goto err_register_netdevice_notifier; - devlink->dev = dev; devlink->ops = ops; xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); @@ -233,8 +228,6 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, return devlink; -err_register_netdevice_notifier: - xa_erase(&devlinks, devlink->index); err_xa_alloc: kfree(devlink); return NULL; @@ -266,8 +259,6 @@ void devlink_free(struct devlink *devlink) xa_destroy(&devlink->params); xa_destroy(&devlink->ports); - WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb)); - xa_erase(&devlinks, devlink->index); devlink_put(devlink); @@ -303,6 +294,10 @@ static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; +static struct notifier_block devlink_port_netdevice_nb __net_initdata = { + .notifier_call = devlink_port_netdevice_event, +}; + static int __init devlink_init(void) { int err; @@ -311,6 +306,9 @@ static int __init devlink_init(void) if (err) goto out; err = register_pernet_subsys(&devlink_pernet_ops); + if (err) + goto out; + err = register_netdevice_notifier(&devlink_port_netdevice_nb); out: WARN_ON(err); diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index e133f423294a..62921b2eb0d3 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -50,7 +50,6 @@ struct devlink { u8 reload_failed:1; refcount_t refcount; struct rcu_work rwork; - struct notifier_block netdevice_nb; char priv[] __aligned(NETDEV_ALIGN); }; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index dffca2f9bfa7..cd0254968076 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -7073,10 +7073,9 @@ int devlink_port_netdevice_event(struct notifier_block *nb, struct devlink_port *devlink_port = netdev->devlink_port; struct devlink *devlink; - devlink = container_of(nb, struct devlink, netdevice_nb); - - if (!devlink_port || devlink_port->devlink != devlink) + if (!devlink_port) return NOTIFY_OK; + devlink = devlink_port->devlink; switch (event) { case NETDEV_POST_INIT: -- cgit v1.2.3 From ef1148d4487438a3408d6face2a8360d91b4af70 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 May 2023 15:46:46 +0000 Subject: ipv6: remove nexthop_fib6_nh_bh() After blamed commit, nexthop_fib6_nh_bh() and nexthop_fib6_nh() are the same. Delete nexthop_fib6_nh_bh(), and convert /proc/net/ipv6_route to standard rcu to avoid this splat: [ 5723.180080] WARNING: suspicious RCU usage [ 5723.180083] ----------------------------- [ 5723.180084] include/net/nexthop.h:516 suspicious rcu_dereference_check() usage! [ 5723.180086] other info that might help us debug this: [ 5723.180087] rcu_scheduler_active = 2, debug_locks = 1 [ 5723.180089] 2 locks held by cat/55856: [ 5723.180091] #0: ffff9440a582afa8 (&p->lock){+.+.}-{3:3}, at: seq_read_iter (fs/seq_file.c:188) [ 5723.180100] #1: ffffffffaac07040 (rcu_read_lock_bh){....}-{1:2}, at: rcu_lock_acquire (include/linux/rcupdate.h:326) [ 5723.180109] stack backtrace: [ 5723.180111] CPU: 14 PID: 55856 Comm: cat Tainted: G S I 6.3.0-dbx-DEV #528 [ 5723.180115] Call Trace: [ 5723.180117] [ 5723.180119] dump_stack_lvl (lib/dump_stack.c:107) [ 5723.180124] dump_stack (lib/dump_stack.c:114) [ 5723.180126] lockdep_rcu_suspicious (include/linux/context_tracking.h:122) [ 5723.180132] ipv6_route_seq_show (include/net/nexthop.h:?) [ 5723.180135] ? ipv6_route_seq_next (net/ipv6/ip6_fib.c:2605) [ 5723.180140] seq_read_iter (fs/seq_file.c:272) [ 5723.180145] seq_read (fs/seq_file.c:163) [ 5723.180151] proc_reg_read (fs/proc/inode.c:316 fs/proc/inode.c:328) [ 5723.180155] vfs_read (fs/read_write.c:468) [ 5723.180160] ? up_read (kernel/locking/rwsem.c:1617) [ 5723.180164] ksys_read (fs/read_write.c:613) [ 5723.180168] __x64_sys_read (fs/read_write.c:621) [ 5723.180170] do_syscall_64 (arch/x86/entry/common.c:?) [ 5723.180174] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) [ 5723.180177] RIP: 0033:0x7fa455677d2a Fixes: 09eed1192cec ("neighbour: switch to standard rcu, instead of rcu_bh") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20230510154646.370659-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2438da5ff6da..bac768d36cc1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2491,7 +2491,7 @@ static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) const struct net_device *dev; if (rt->nh) - fib6_nh = nexthop_fib6_nh_bh(rt->nh); + fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); @@ -2556,14 +2556,14 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; - node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist)); + node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { - node = rcu_dereference_bh( + node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); @@ -2593,7 +2593,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); + n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; @@ -2619,12 +2619,12 @@ iter_table: } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU_BH) + __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; - rcu_read_lock_bh(); + rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; @@ -2645,7 +2645,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) - __releases(RCU_BH) + __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; @@ -2653,7 +2653,7 @@ static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); - rcu_read_unlock_bh(); + rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) -- cgit v1.2.3 From 6d4486efe9c69626cab423456169e250a5cd3af5 Mon Sep 17 00:00:00 2001 From: Zhuang Shengen Date: Thu, 11 May 2023 19:34:30 +0800 Subject: vsock: avoid to close connected socket after the timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When client and server establish a connection through vsock, the client send a request to the server to initiate the connection, then start a timer to wait for the server's response. When the server's RESPONSE message arrives, the timer also times out and exits. The server's RESPONSE message is processed first, and the connection is established. However, the client's timer also times out, the original processing logic of the client is to directly set the state of this vsock to CLOSE and return ETIMEDOUT. It will not notify the server when the port is released, causing the server port remain. when client's vsock_connect timeout,it should check sk state is ESTABLISHED or not. if sk state is ESTABLISHED, it means the connection is established, the client should not set the sk state to CLOSE Note: I encountered this issue on kernel-4.18, which can be fixed by this patch. Then I checked the latest code in the community and found similar issue. Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Signed-off-by: Zhuang Shengen Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 413407bb646c..efb8a0937a13 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1462,7 +1462,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, vsock_transport_cancel_pkt(vsk); vsock_remove_connected(vsk); goto out_wait; - } else if (timeout == 0) { + } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { err = -ETIMEDOUT; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; -- cgit v1.2.3 From 1e306ec49a1f206fd2cc89a42fac6e6f592a8cc1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 May 2023 11:47:49 +0000 Subject: tcp: fix possible sk_priority leak in tcp_v4_send_reset() When tcp_v4_send_reset() is called with @sk == NULL, we do not change ctl_sk->sk_priority, which could have been set from a prior invocation. Change tcp_v4_send_reset() to set sk_priority and sk_mark fields before calling ip_send_unicast_reply(). This means tcp_v4_send_reset() and tcp_v4_send_ack() no longer have to clear ctl_sk->sk_mark after their call to ip_send_unicast_reply(). Fixes: f6c0f5d209fa ("tcp: honor SO_PRIORITY in TIME_WAIT state") Signed-off-by: Eric Dumazet Cc: Antoine Tenart Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 39bda2b1066e..06d2573685ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -829,6 +829,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); xfrm_sk_clone_policy(ctl_sk, sk); + } else { + ctl_sk->sk_mark = 0; + ctl_sk->sk_priority = 0; } ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, @@ -836,7 +839,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) &arg, arg.iov[0].iov_len, transmit_time); - ctl_sk->sk_mark = 0; xfrm_sk_free_policy(ctl_sk); sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); @@ -935,7 +937,6 @@ static void tcp_v4_send_ack(const struct sock *sk, &arg, arg.iov[0].iov_len, transmit_time); - ctl_sk->sk_mark = 0; sock_net_set(ctl_sk, &init_net); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); local_bh_enable(); -- cgit v1.2.3 From d80fc101d2eb9b3188c228d61223890aeea480a4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 11 May 2023 19:22:11 -0400 Subject: erspan: get the proto with the md version for collect_md In commit 20704bd1633d ("erspan: build the header with the right proto according to erspan_ver"), it gets the proto with t->parms.erspan_ver, but t->parms.erspan_ver is not used by collect_md branch, and instead it should get the proto with md->version for collect_md. Thanks to Kevin for pointing this out. Fixes: 20704bd1633d ("erspan: build the header with the right proto according to erspan_ver") Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support") Reported-by: Kevin Traynor Signed-off-by: Xin Long Reviewed-by: Simon Horman Reviewed-by: William Tu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index a4ecfc9d2593..da80974ad23a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1015,12 +1015,14 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, ntohl(tun_id), ntohl(md->u.index), truncate, false); + proto = htons(ETH_P_ERSPAN); } else if (md->version == 2) { erspan_build_header_v2(skb, ntohl(tun_id), md->u.md2.dir, get_hwid(&md->u.md2), truncate, false); + proto = htons(ETH_P_ERSPAN2); } else { goto tx_err; } @@ -1043,24 +1045,25 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, break; } - if (t->parms.erspan_ver == 1) + if (t->parms.erspan_ver == 1) { erspan_build_header(skb, ntohl(t->parms.o_key), t->parms.index, truncate, false); - else if (t->parms.erspan_ver == 2) + proto = htons(ETH_P_ERSPAN); + } else if (t->parms.erspan_ver == 2) { erspan_build_header_v2(skb, ntohl(t->parms.o_key), t->parms.dir, t->parms.hwid, truncate, false); - else + proto = htons(ETH_P_ERSPAN2); + } else { goto tx_err; + } fl6.daddr = t->parms.raddr; } /* Push GRE header. */ - proto = (t->parms.erspan_ver == 1) ? htons(ETH_P_ERSPAN) - : htons(ETH_P_ERSPAN2); gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ -- cgit v1.2.3 From eb8d3a2c809abd73ab0a060fe971d6b9019aa3c1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 May 2023 09:41:49 +1000 Subject: SUNRPC: double free xprt_ctxt while still in use When an RPC request is deferred, the rq_xprt_ctxt pointer is moved out of the svc_rqst into the svc_deferred_req. When the deferred request is revisited, the pointer is copied into the new svc_rqst - and also remains in the svc_deferred_req. In the (rare?) case that the request is deferred a second time, the old svc_deferred_req is reused - it still has all the correct content. However in that case the rq_xprt_ctxt pointer is NOT cleared so that when xpo_release_xprt is called, the ctxt is freed (UDP) or possible added to a free list (RDMA). When the deferred request is revisited for a second time, it will reference this ctxt which may be invalid, and the free the object a second time which is likely to oops. So change svc_defer() to *always* clear rq_xprt_ctxt, and assert that the value is now stored in the svc_deferred_req. Fixes: 773f91b2cf3f ("SUNRPC: Fix NFSD's request deferral on RDMA transports") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 84e5d7d31481..5fd94f6bdc75 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1223,13 +1223,14 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; dr->xprt_ctxt = rqstp->rq_xprt_ctxt; - rqstp->rq_xprt_ctxt = NULL; /* back up head to the start of the buffer and copy */ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, dr->argslen << 2); } + WARN_ON_ONCE(rqstp->rq_xprt_ctxt != dr->xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; -- cgit v1.2.3 From 948f072ada23e0a504c5e4d7d71d4c83bd0785ec Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 May 2023 09:42:47 +1000 Subject: SUNRPC: always free ctxt when freeing deferred request Since the ->xprt_ctxt pointer was added to svc_deferred_req, it has not been sufficient to use kfree() to free a deferred request. We may need to free the ctxt as well. As freeing the ctxt is all that ->xpo_release_rqst() does, we repurpose it to explicit do that even when the ctxt is not stored in an rqst. So we now have ->xpo_release_ctxt() which is given an xprt and a ctxt, which may have been taken either from an rqst or from a dreq. The caller is now responsible for clearing that pointer after the call to ->xpo_release_ctxt. We also clear dr->xprt_ctxt when the ctxt is moved into a new rqst when revisiting a deferred request. This ensures there is only one pointer to the ctxt, so the risk of double freeing in future is reduced. The new code in svc_xprt_release which releases both the ctxt and any rq_deferred depends on this. Fixes: 773f91b2cf3f ("SUNRPC: Fix NFSD's request deferral on RDMA transports") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc_xprt.c | 23 +++++++++++++++++------ net/sunrpc/svcsock.c | 30 ++++++++++++++++-------------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 11 +++++------ net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 4 files changed, 39 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fd94f6bdc75..13a14897bc17 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -532,13 +532,23 @@ void svc_reserve(struct svc_rqst *rqstp, int space) } EXPORT_SYMBOL_GPL(svc_reserve); +static void free_deferred(struct svc_xprt *xprt, struct svc_deferred_req *dr) +{ + if (!dr) + return; + + xprt->xpt_ops->xpo_release_ctxt(xprt, dr->xprt_ctxt); + kfree(dr); +} + static void svc_xprt_release(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; - xprt->xpt_ops->xpo_release_rqst(rqstp); + xprt->xpt_ops->xpo_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; - kfree(rqstp->rq_deferred); + free_deferred(xprt, rqstp->rq_deferred); rqstp->rq_deferred = NULL; svc_rqst_release_pages(rqstp); @@ -1054,7 +1064,7 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_unlock_bh(&serv->sv_lock); while ((dr = svc_deferred_dequeue(xprt)) != NULL) - kfree(dr); + free_deferred(xprt, dr); call_xpt_users(xprt); svc_xprt_put(xprt); @@ -1176,8 +1186,8 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) { spin_unlock(&xprt->xpt_lock); trace_svc_defer_drop(dr); + free_deferred(xprt, dr); svc_xprt_put(xprt); - kfree(dr); return; } dr->xprt = NULL; @@ -1222,14 +1232,13 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->addrlen = rqstp->rq_addrlen; dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; - dr->xprt_ctxt = rqstp->rq_xprt_ctxt; /* back up head to the start of the buffer and copy */ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, dr->argslen << 2); } - WARN_ON_ONCE(rqstp->rq_xprt_ctxt != dr->xprt_ctxt); + dr->xprt_ctxt = rqstp->rq_xprt_ctxt; rqstp->rq_xprt_ctxt = NULL; trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); @@ -1263,6 +1272,8 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; + + dr->xprt_ctxt = NULL; svc_xprt_received(rqstp->rq_xprt); return dr->argslen << 2; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9989194446a5..63fe7a338992 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -121,27 +121,27 @@ static void svc_reclassify_socket(struct socket *sock) #endif /** - * svc_tcp_release_rqst - Release transport-related resources - * @rqstp: request structure with resources to be released + * svc_tcp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * */ -static void svc_tcp_release_rqst(struct svc_rqst *rqstp) +static void svc_tcp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { } /** - * svc_udp_release_rqst - Release transport-related resources - * @rqstp: request structure with resources to be released + * svc_udp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * */ -static void svc_udp_release_rqst(struct svc_rqst *rqstp) +static void svc_udp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; + struct sk_buff *skb = ctxt; - if (skb) { - rqstp->rq_xprt_ctxt = NULL; + if (skb) consume_skb(skb); - } } union svc_pktinfo_u { @@ -696,7 +696,8 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) unsigned int sent; int err; - svc_udp_release_rqst(rqstp); + svc_udp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; svc_set_cmsg_data(rqstp, cmh); @@ -768,7 +769,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_result_payload = svc_sock_result_payload, - .xpo_release_rqst = svc_udp_release_rqst, + .xpo_release_ctxt = svc_udp_release_ctxt, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_udp_has_wspace, @@ -1301,7 +1302,8 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) unsigned int sent; int err; - svc_tcp_release_rqst(rqstp); + svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; atomic_inc(&svsk->sk_sendqlen); mutex_lock(&xprt->xpt_mutex); @@ -1346,7 +1348,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_result_payload = svc_sock_result_payload, - .xpo_release_rqst = svc_tcp_release_rqst, + .xpo_release_ctxt = svc_tcp_release_ctxt, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_tcp_has_wspace, diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 1c658fa43063..a22fe7587fa6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -239,21 +239,20 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, } /** - * svc_rdma_release_rqst - Release transport-specific per-rqst resources - * @rqstp: svc_rqst being released + * svc_rdma_release_ctxt - Release transport-specific per-rqst resources + * @xprt: the transport which owned the context + * @vctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * * Ensure that the recv_ctxt is released whether or not a Reply * was sent. For example, the client could close the connection, * or svc_process could drop an RPC, before the Reply is sent. */ -void svc_rdma_release_rqst(struct svc_rqst *rqstp) +void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt) { - struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; - struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_rdma_recv_ctxt *ctxt = vctxt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - rqstp->rq_xprt_ctxt = NULL; if (ctxt) svc_rdma_recv_ctxt_put(rdma, ctxt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 416b298f74dd..ca04f7a6a085 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, .xpo_result_payload = svc_rdma_result_payload, - .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_release_ctxt = svc_rdma_release_ctxt, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, .xpo_has_wspace = svc_rdma_has_wspace, -- cgit v1.2.3 From 07a27305938559fb35f7a46fb90a5e37728bdee6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 14 May 2023 15:51:48 -0400 Subject: SUNRPC: Fix trace_svc_register() call site The trace event recorded incorrect values for the registered family, protocol, and port because the arguments are in the wrong order. Fixes: b4af59328c25 ("SUNRPC: Trace server-side rpcbind registration events") Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 43e32129a583..79967b6925bd 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1052,7 +1052,7 @@ static int __svc_register(struct net *net, const char *progname, #endif } - trace_svc_register(progname, version, protocol, port, family, error); + trace_svc_register(progname, version, family, protocol, port, error); return error; } -- cgit v1.2.3 From c83b49383b595be50647f0c764a48c78b5f3c4f8 Mon Sep 17 00:00:00 2001 From: Dong Chenchen Date: Thu, 11 May 2023 20:54:40 +0800 Subject: net: nsh: Use correct mac_offset to unwind gso skb in nsh_gso_segment() As the call trace shows, skb_panic was caused by wrong skb->mac_header in nsh_gso_segment(): invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 3 PID: 2737 Comm: syz Not tainted 6.3.0-next-20230505 #1 RIP: 0010:skb_panic+0xda/0xe0 call Trace: skb_push+0x91/0xa0 nsh_gso_segment+0x4f3/0x570 skb_mac_gso_segment+0x19e/0x270 __skb_gso_segment+0x1e8/0x3c0 validate_xmit_skb+0x452/0x890 validate_xmit_skb_list+0x99/0xd0 sch_direct_xmit+0x294/0x7c0 __dev_queue_xmit+0x16f0/0x1d70 packet_xmit+0x185/0x210 packet_snd+0xc15/0x1170 packet_sendmsg+0x7b/0xa0 sock_sendmsg+0x14f/0x160 The root cause is: nsh_gso_segment() use skb->network_header - nhoff to reset mac_header in skb_gso_error_unwind() if inner-layer protocol gso fails. However, skb->network_header may be reset by inner-layer protocol gso function e.g. mpls_gso_segment. skb->mac_header reset by the inaccurate network_header will be larger than skb headroom. nsh_gso_segment nhoff = skb->network_header - skb->mac_header; __skb_pull(skb,nsh_len) skb_mac_gso_segment mpls_gso_segment skb_reset_network_header(skb);//skb->network_header+=nsh_len return -EINVAL; skb_gso_error_unwind skb_push(skb, nsh_len); skb->mac_header = skb->network_header - nhoff; // skb->mac_header > skb->headroom, cause skb_push panic Use correct mac_offset to restore mac_header and get rid of nhoff. Fixes: c411ed854584 ("nsh: add GSO support") Reported-by: syzbot+632b5d9964208bfef8c0@syzkaller.appspotmail.com Suggested-by: Eric Dumazet Signed-off-by: Dong Chenchen Signed-off-by: David S. Miller --- net/nsh/nsh.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index e9ca007718b7..0f23e5e8e03e 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -77,13 +77,12 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; unsigned int nsh_len, mac_len; __be16 proto; - int nhoff; skb_reset_network_header(skb); - nhoff = skb->network_header - skb->mac_header; mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) @@ -108,15 +107,14 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, htons(ETH_P_NSH), nsh_len, - skb->network_header - nhoff, - mac_len); + mac_offset, mac_len); goto out; } for (skb = segs; skb; skb = skb->next) { skb->protocol = htons(ETH_P_NSH); __skb_push(skb, nsh_len); - skb_set_mac_header(skb, -nhoff); + skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } -- cgit v1.2.3 From 3ae6d66b605be604644d4bb5708a7ffd9cf1abe8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 14 May 2023 15:52:27 -0400 Subject: tipc: add tipc_bearer_min_mtu to calculate min mtu As different media may requires different min mtu, and even the same media with different net family requires different min mtu, add tipc_bearer_min_mtu() to calculate min mtu accordingly. This API will be used to check the new mtu when doing the link mtu negotiation in the next patch. Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 13 +++++++++++++ net/tipc/bearer.h | 3 +++ net/tipc/udp_media.c | 5 +++-- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 35cac7733fd3..0e9a29e1536b 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -541,6 +541,19 @@ int tipc_bearer_mtu(struct net *net, u32 bearer_id) return mtu; } +int tipc_bearer_min_mtu(struct net *net, u32 bearer_id) +{ + int mtu = TIPC_MIN_BEARER_MTU; + struct tipc_bearer *b; + + rcu_read_lock(); + b = bearer_get(net, bearer_id); + if (b) + mtu += b->encap_hlen; + rcu_read_unlock(); + return mtu; +} + /* tipc_bearer_xmit_skb - sends buffer to destination over bearer */ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 490ad6e5f7a3..bd0cc5c287ef 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -146,6 +146,7 @@ struct tipc_media { * @identity: array index of this bearer within TIPC bearer array * @disc: ptr to link setup request * @net_plane: network plane ('A' through 'H') currently associated with bearer + * @encap_hlen: encap headers length * @up: bearer up flag (bit 0) * @refcnt: tipc_bearer reference counter * @@ -170,6 +171,7 @@ struct tipc_bearer { u32 identity; struct tipc_discoverer *disc; char net_plane; + u16 encap_hlen; unsigned long up; refcount_t refcnt; }; @@ -232,6 +234,7 @@ int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); int tipc_bearer_mtu(struct net *net, u32 bearer_id); +int tipc_bearer_min_mtu(struct net *net, u32 bearer_id); bool tipc_bearer_bcast_support(struct net *net, u32 bearer_id); void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index c2bb818704c8..0a85244fd618 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -738,8 +738,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, udp_conf.local_ip.s_addr = local.ipv4.s_addr; udp_conf.use_udp_checksums = false; ub->ifindex = dev->ifindex; - if (tipc_mtu_bad(dev, sizeof(struct iphdr) + - sizeof(struct udphdr))) { + b->encap_hlen = sizeof(struct iphdr) + sizeof(struct udphdr); + if (tipc_mtu_bad(dev, b->encap_hlen)) { err = -EINVAL; goto err; } @@ -760,6 +760,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, else udp_conf.local_ip6 = local.ipv6; ub->ifindex = dev->ifindex; + b->encap_hlen = sizeof(struct ipv6hdr) + sizeof(struct udphdr); b->mtu = 1280; #endif } else { -- cgit v1.2.3 From 56077b56cd3fb78e1c8619e29581ba25a5c55e86 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 14 May 2023 15:52:28 -0400 Subject: tipc: do not update mtu if msg_max is too small in mtu negotiation When doing link mtu negotiation, a malicious peer may send Activate msg with a very small mtu, e.g. 4 in Shuang's testing, without checking for the minimum mtu, l->mtu will be set to 4 in tipc_link_proto_rcv(), then n->links[bearer_id].mtu is set to 4294967228, which is a overflow of '4 - INT_H_SIZE - EMSG_OVERHEAD' in tipc_link_mss(). With tipc_link.mtu = 4, tipc_link_xmit() kept printing the warning: tipc: Too large msg, purging xmit list 1 5 0 40 4! tipc: Too large msg, purging xmit list 1 15 0 60 4! And with tipc_link_entry.mtu 4294967228, a huge skb was allocated in named_distribute(), and when purging it in tipc_link_xmit(), a crash was even caused: general protection fault, probably for non-canonical address 0x2100001011000dd: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded Not tainted 6.3.0.neta #19 RIP: 0010:kfree_skb_list_reason+0x7e/0x1f0 Call Trace: skb_release_data+0xf9/0x1d0 kfree_skb_reason+0x40/0x100 tipc_link_xmit+0x57a/0x740 [tipc] tipc_node_xmit+0x16c/0x5c0 [tipc] tipc_named_node_up+0x27f/0x2c0 [tipc] tipc_node_write_unlock+0x149/0x170 [tipc] tipc_rcv+0x608/0x740 [tipc] tipc_udp_recv+0xdc/0x1f0 [tipc] udp_queue_rcv_one_skb+0x33e/0x620 udp_unicast_rcv_skb.isra.72+0x75/0x90 __udp4_lib_rcv+0x56d/0xc20 ip_protocol_deliver_rcu+0x100/0x2d0 This patch fixes it by checking the new mtu against tipc_bearer_min_mtu(), and not updating mtu if it is too small. Fixes: ed193ece2649 ("tipc: simplify link mtu negotiation") Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index b3ce24823f50..2eff1c7949cb 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2200,7 +2200,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; bool reply = msg_probe(hdr), retransmitted = false; - u32 dlen = msg_data_sz(hdr), glen = 0; + u32 dlen = msg_data_sz(hdr), glen = 0, msg_max; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); @@ -2239,6 +2239,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, switch (mtyp) { case RESET_MSG: case ACTIVATE_MSG: + msg_max = msg_max_pkt(hdr); + if (msg_max < tipc_bearer_min_mtu(l->net, l->bearer_id)) + break; /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) @@ -2283,8 +2286,8 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, l->peer_session = msg_session(hdr); l->in_session = true; l->peer_bearer_id = msg_bearer_id(hdr); - if (l->mtu > msg_max_pkt(hdr)) - l->mtu = msg_max_pkt(hdr); + if (l->mtu > msg_max) + l->mtu = msg_max; break; case STATE_MSG: -- cgit v1.2.3 From 35a089b5d793d2bfd2cc7cfa6104545184de2ce7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 14 May 2023 15:52:29 -0400 Subject: tipc: check the bearer min mtu properly when setting it by netlink Checking the bearer min mtu with tipc_udp_mtu_bad() only works for IPv4 UDP bearer, and IPv6 UDP bearer has a different value for the min mtu. This patch checks with encap_hlen + TIPC_MIN_BEARER_MTU for min mtu, which works for both IPv4 and IPv6 UDP bearer. Note that tipc_udp_mtu_bad() is still used to check media min mtu in __tipc_nl_media_set(), as m->mtu currently is only used by the IPv4 UDP bearer as its default mtu value. Fixes: 682cd3cf946b ("tipc: confgiure and apply UDP bearer MTU on running links") Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 0e9a29e1536b..53881406e200 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1151,8 +1151,8 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } #ifdef CONFIG_TIPC_MEDIA_UDP - if (tipc_udp_mtu_bad(nla_get_u32 - (props[TIPC_NLA_PROP_MTU]))) { + if (nla_get_u32(props[TIPC_NLA_PROP_MTU]) < + b->encap_hlen + TIPC_MIN_BEARER_MTU) { NL_SET_ERR_MSG(info->extack, "MTU value is out-of-range"); return -EINVAL; -- cgit v1.2.3 From db2773d65b02aed319a93efdfb958087771d4e19 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 6 Apr 2023 13:08:45 +0200 Subject: can: isotp: recvmsg(): allow MSG_CMSG_COMPAT flag The control message provided by isotp support MSG_CMSG_COMPAT but blocked recvmsg() syscalls that have set this flag, i.e. on 32bit user space on 64 bit kernels. Link: https://github.com/hartkopp/can-isotp/issues/59 Cc: Oleksij Rempel Suggested-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Fixes: 42bf50a1795a ("can: isotp: support MSG_TRUNC flag when reading from socket") Link: https://lore.kernel.org/20230505110308.81087-2-mkl@pengutronix.de Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index a750259cb79c..84f9aba02901 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1139,7 +1139,7 @@ static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct isotp_sock *so = isotp_sk(sk); int ret = 0; - if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK)) + if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT)) return -EINVAL; if (!so->bound) -- cgit v1.2.3 From 1db080cbdbab28752bbb1c86d64daf96253a5da1 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 6 Apr 2023 13:08:45 +0200 Subject: can: j1939: recvmsg(): allow MSG_CMSG_COMPAT flag The control message provided by J1939 support MSG_CMSG_COMPAT but blocked recvmsg() syscalls that have set this flag, i.e. on 32bit user space on 64 bit kernels. Link: https://github.com/hartkopp/can-isotp/issues/59 Cc: Oleksij Rempel Suggested-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/20230505110308.81087-3-mkl@pengutronix.de Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 7e90f9e61d9b..1790469b2580 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -798,7 +798,7 @@ static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, struct j1939_sk_buff_cb *skcb; int ret = 0; - if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE)) + if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) -- cgit v1.2.3 From 1b6b4ed01493b7ea2205ab83c49198f7d13ca9d2 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 24 Apr 2023 10:32:24 +0300 Subject: wifi: cfg80211: Drop entries with invalid BSSIDs in RNR Ignore AP information for entries that include an invalid BSSID in the TBTT information field, e.g., all zeros BSSIDs. Fixes: c8cb5b854b40 ("nl80211/cfg80211: support 6 GHz scanning") Signed-off-by: Ilan Peer Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230424103224.5e65d04d1448.Ic10c8577ae4a85272c407106c9d0a2ecb5372743@changeid Signed-off-by: Johannes Berg --- net/wireless/scan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a1382255fab3..c501db7bbdb3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2023 Intel Corporation */ #include #include @@ -540,6 +540,10 @@ static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, /* skip the TBTT offset */ pos++; + /* ignore entries with invalid BSSID */ + if (!is_valid_ether_addr(pos)) + return -EINVAL; + memcpy(entry->bssid, pos, ETH_ALEN); pos += ETH_ALEN; -- cgit v1.2.3 From ef6e1997da63ad0ac3fe33153fec9524c9ae56c9 Mon Sep 17 00:00:00 2001 From: Mirsad Goran Todorovac Date: Tue, 25 Apr 2023 18:40:08 +0200 Subject: wifi: mac80211: fortify the spinlock against deadlock by interrupt In the function ieee80211_tx_dequeue() there is a particular locking sequence: begin: spin_lock(&local->queue_stop_reason_lock); q_stopped = local->queue_stop_reasons[q]; spin_unlock(&local->queue_stop_reason_lock); However small the chance (increased by ftracetest), an asynchronous interrupt can occur in between of spin_lock() and spin_unlock(), and the interrupt routine will attempt to lock the same &local->queue_stop_reason_lock again. This will cause a costly reset of the CPU and the wifi device or an altogether hang in the single CPU and single core scenario. The only remaining spin_lock(&local->queue_stop_reason_lock) that did not disable interrupts was patched, which should prevent any deadlocks on the same CPU/core and the same wifi device. This is the probable trace of the deadlock: kernel: ================================ kernel: WARNING: inconsistent lock state kernel: 6.3.0-rc6-mt-20230401-00001-gf86822a1170f #4 Tainted: G W kernel: -------------------------------- kernel: inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. kernel: kworker/5:0/25656 [HC0[0]:SC0[0]:HE1:SE1] takes: kernel: ffff9d6190779478 (&local->queue_stop_reason_lock){+.?.}-{2:2}, at: return_to_handler+0x0/0x40 kernel: {IN-SOFTIRQ-W} state was registered at: kernel: lock_acquire+0xc7/0x2d0 kernel: _raw_spin_lock+0x36/0x50 kernel: ieee80211_tx_dequeue+0xb4/0x1330 [mac80211] kernel: iwl_mvm_mac_itxq_xmit+0xae/0x210 [iwlmvm] kernel: iwl_mvm_mac_wake_tx_queue+0x2d/0xd0 [iwlmvm] kernel: ieee80211_queue_skb+0x450/0x730 [mac80211] kernel: __ieee80211_xmit_fast.constprop.66+0x834/0xa50 [mac80211] kernel: __ieee80211_subif_start_xmit+0x217/0x530 [mac80211] kernel: ieee80211_subif_start_xmit+0x60/0x580 [mac80211] kernel: dev_hard_start_xmit+0xb5/0x260 kernel: __dev_queue_xmit+0xdbe/0x1200 kernel: neigh_resolve_output+0x166/0x260 kernel: ip_finish_output2+0x216/0xb80 kernel: __ip_finish_output+0x2a4/0x4d0 kernel: ip_finish_output+0x2d/0xd0 kernel: ip_output+0x82/0x2b0 kernel: ip_local_out+0xec/0x110 kernel: igmpv3_sendpack+0x5c/0x90 kernel: igmp_ifc_timer_expire+0x26e/0x4e0 kernel: call_timer_fn+0xa5/0x230 kernel: run_timer_softirq+0x27f/0x550 kernel: __do_softirq+0xb4/0x3a4 kernel: irq_exit_rcu+0x9b/0xc0 kernel: sysvec_apic_timer_interrupt+0x80/0xa0 kernel: asm_sysvec_apic_timer_interrupt+0x1f/0x30 kernel: _raw_spin_unlock_irqrestore+0x3f/0x70 kernel: free_to_partial_list+0x3d6/0x590 kernel: __slab_free+0x1b7/0x310 kernel: kmem_cache_free+0x52d/0x550 kernel: putname+0x5d/0x70 kernel: do_sys_openat2+0x1d7/0x310 kernel: do_sys_open+0x51/0x80 kernel: __x64_sys_openat+0x24/0x30 kernel: do_syscall_64+0x5c/0x90 kernel: entry_SYSCALL_64_after_hwframe+0x72/0xdc kernel: irq event stamp: 5120729 kernel: hardirqs last enabled at (5120729): [] trace_graph_return+0xd6/0x120 kernel: hardirqs last disabled at (5120728): [] trace_graph_return+0xf0/0x120 kernel: softirqs last enabled at (5069900): [] return_to_handler+0x0/0x40 kernel: softirqs last disabled at (5067555): [] return_to_handler+0x0/0x40 kernel: other info that might help us debug this: kernel: Possible unsafe locking scenario: kernel: CPU0 kernel: ---- kernel: lock(&local->queue_stop_reason_lock); kernel: kernel: lock(&local->queue_stop_reason_lock); kernel: *** DEADLOCK *** kernel: 8 locks held by kworker/5:0/25656: kernel: #0: ffff9d618009d138 ((wq_completion)events_freezable){+.+.}-{0:0}, at: process_one_work+0x1ca/0x530 kernel: #1: ffffb1ef4637fe68 ((work_completion)(&local->restart_work)){+.+.}-{0:0}, at: process_one_work+0x1ce/0x530 kernel: #2: ffffffff9f166548 (rtnl_mutex){+.+.}-{3:3}, at: return_to_handler+0x0/0x40 kernel: #3: ffff9d6190778728 (&rdev->wiphy.mtx){+.+.}-{3:3}, at: return_to_handler+0x0/0x40 kernel: #4: ffff9d619077b480 (&mvm->mutex){+.+.}-{3:3}, at: return_to_handler+0x0/0x40 kernel: #5: ffff9d61907bacd8 (&trans_pcie->mutex){+.+.}-{3:3}, at: return_to_handler+0x0/0x40 kernel: #6: ffffffff9ef9cda0 (rcu_read_lock){....}-{1:2}, at: iwl_mvm_queue_state_change+0x59/0x3a0 [iwlmvm] kernel: #7: ffffffff9ef9cda0 (rcu_read_lock){....}-{1:2}, at: iwl_mvm_mac_itxq_xmit+0x42/0x210 [iwlmvm] kernel: stack backtrace: kernel: CPU: 5 PID: 25656 Comm: kworker/5:0 Tainted: G W 6.3.0-rc6-mt-20230401-00001-gf86822a1170f #4 kernel: Hardware name: LENOVO 82H8/LNVNB161216, BIOS GGCN51WW 11/16/2022 kernel: Workqueue: events_freezable ieee80211_restart_work [mac80211] kernel: Call Trace: kernel: kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: dump_stack_lvl+0x5f/0xa0 kernel: dump_stack+0x14/0x20 kernel: print_usage_bug.part.46+0x208/0x2a0 kernel: mark_lock.part.47+0x605/0x630 kernel: ? sched_clock+0xd/0x20 kernel: ? trace_clock_local+0x14/0x30 kernel: ? __rb_reserve_next+0x5f/0x490 kernel: ? _raw_spin_lock+0x1b/0x50 kernel: __lock_acquire+0x464/0x1990 kernel: ? mark_held_locks+0x4e/0x80 kernel: lock_acquire+0xc7/0x2d0 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: ? ftrace_return_to_handler+0x8b/0x100 kernel: ? preempt_count_add+0x4/0x70 kernel: _raw_spin_lock+0x36/0x50 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: ieee80211_tx_dequeue+0xb4/0x1330 [mac80211] kernel: ? prepare_ftrace_return+0xc5/0x190 kernel: ? ftrace_graph_func+0x16/0x20 kernel: ? 0xffffffffc02ab0b1 kernel: ? lock_acquire+0xc7/0x2d0 kernel: ? iwl_mvm_mac_itxq_xmit+0x42/0x210 [iwlmvm] kernel: ? ieee80211_tx_dequeue+0x9/0x1330 [mac80211] kernel: ? __rcu_read_lock+0x4/0x40 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_mvm_mac_itxq_xmit+0xae/0x210 [iwlmvm] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_mvm_queue_state_change+0x311/0x3a0 [iwlmvm] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_mvm_wake_sw_queue+0x17/0x20 [iwlmvm] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_txq_gen2_unmap+0x1c9/0x1f0 [iwlwifi] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_txq_gen2_free+0x55/0x130 [iwlwifi] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_txq_gen2_tx_free+0x63/0x80 [iwlwifi] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: _iwl_trans_pcie_gen2_stop_device+0x3f3/0x5b0 [iwlwifi] kernel: ? _iwl_trans_pcie_gen2_stop_device+0x9/0x5b0 [iwlwifi] kernel: ? mutex_lock_nested+0x4/0x30 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_trans_pcie_gen2_stop_device+0x5f/0x90 [iwlwifi] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_mvm_stop_device+0x78/0xd0 [iwlmvm] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: __iwl_mvm_mac_start+0x114/0x210 [iwlmvm] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: iwl_mvm_mac_start+0x76/0x150 [iwlmvm] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: drv_start+0x79/0x180 [mac80211] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: ieee80211_reconfig+0x1523/0x1ce0 [mac80211] kernel: ? synchronize_net+0x4/0x50 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: ieee80211_restart_work+0x108/0x170 [mac80211] kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: process_one_work+0x250/0x530 kernel: ? ftrace_regs_caller_end+0x66/0x66 kernel: worker_thread+0x48/0x3a0 kernel: ? __pfx_worker_thread+0x10/0x10 kernel: kthread+0x10f/0x140 kernel: ? __pfx_kthread+0x10/0x10 kernel: ret_from_fork+0x29/0x50 kernel: Fixes: 4444bc2116ae ("wifi: mac80211: Proper mark iTXQs for resumption") Link: https://lore.kernel.org/all/1f58a0d1-d2b9-d851-73c3-93fcc607501c@alu.unizg.hr/ Reported-by: Mirsad Goran Todorovac Cc: Gregory Greenman Cc: Johannes Berg Link: https://lore.kernel.org/all/cdc80531-f25f-6f9d-b15f-25e16130b53a@alu.unizg.hr/ Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Leon Romanovsky Cc: Alexander Wetzel Signed-off-by: Mirsad Goran Todorovac Reviewed-by: Leon Romanovsky Reviewed-by: tag, or it goes automatically? Link: https://lore.kernel.org/r/20230425164005.25272-1-mirsad.todorovac@alu.unizg.hr Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1a3327407552..0d9fbc8458fd 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3791,6 +3791,7 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, ieee80211_tx_result r; struct ieee80211_vif *vif = txq->vif; int q = vif->hw_queue[txq->ac]; + unsigned long flags; bool q_stopped; WARN_ON_ONCE(softirq_count() == 0); @@ -3799,9 +3800,9 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, return NULL; begin: - spin_lock(&local->queue_stop_reason_lock); + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); q_stopped = local->queue_stop_reasons[q]; - spin_unlock(&local->queue_stop_reason_lock); + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); if (unlikely(q_stopped)) { /* mark for waking later */ -- cgit v1.2.3 From 13ad2b1eeacd48ec0f31f55964e6dc7dfc2c0299 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 24 Apr 2023 19:42:04 +0200 Subject: wifi: mac80211: Fix puncturing bitmap handling in __ieee80211_csa_finalize() 'changed' can be OR'ed with BSS_CHANGED_EHT_PUNCTURING which is larger than an u32. So, turn 'changed' into an u64 and update ieee80211_set_after_csa_beacon() accordingly. In the commit in Fixes, only ieee80211_start_ap() was updated. Fixes: 2cc25e4b2a04 ("wifi: mac80211: configure puncturing bitmap") Signed-off-by: Christophe JAILLET Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/e84a3f80fe536787f7a2c7180507efc36cd14f95.1682358088.git.christophe.jaillet@wanadoo.fr Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7317e4a5d1ff..c5e5f783f137 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3589,7 +3589,7 @@ void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif, bool block_t EXPORT_SYMBOL(ieee80211_channel_switch_disconnect); static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, - u32 *changed) + u64 *changed) { int err; @@ -3632,7 +3632,7 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; - u32 changed = 0; + u64 changed = 0; int err; sdata_assert_lock(sdata); -- cgit v1.2.3 From 248e4776514bf70236e6b1a54c65aa5324c8b1eb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 May 2023 16:45:01 +0300 Subject: wifi: mac80211: fix min center freq offset tracing We need to set the correct trace variable, otherwise we're overwriting something else instead and the right one that we print later is not initialized. Fixes: b6011960f392 ("mac80211: handle channel frequency offset") Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230504134511.828474-2-gregory.greenman@intel.com Signed-off-by: Johannes Berg --- net/mac80211/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index de5d69f21306..db0d0132c58c 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -67,7 +67,7 @@ __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ - __entry->freq1_offset = (c)->freq1_offset; \ + __entry->min_freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; #define MIN_CHANDEF_PR_FMT " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz" #define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ -- cgit v1.2.3 From a23d7f5b2fbda114de60c4b53311e052281d7533 Mon Sep 17 00:00:00 2001 From: Michael Lee Date: Thu, 4 May 2023 16:04:41 +0800 Subject: wifi: mac80211: Abort running color change when stopping the AP When stopping the AP, there might be a color change in progress. It should be deactivated here, or the driver might later finalize a color change on a stopped AP. Fixes: 5f9404abdf2a (mac80211: add support for BSS color change) Signed-off-by: Michael Lee Link: https://lore.kernel.org/r/20230504080441.22958-1-michael-cy.lee@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c5e5f783f137..86b2036d73ff 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1578,9 +1578,10 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); - /* abort any running channel switch */ + /* abort any running channel switch or color change */ mutex_lock(&local->mtx); link_conf->csa_active = false; + link_conf->color_change_active = false; if (link->csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); -- cgit v1.2.3 From 860e1b43da94551cd1e73adc36b3c64cc3e5dc01 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 May 2023 16:45:02 +0300 Subject: wifi: mac80211: simplify chanctx allocation There's no need to call ieee80211_recalc_chanctx_min_def() since it cannot and won't call the driver anyway; just use _ieee80211_recalc_chanctx_min_def() instead. Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230504134511.828474-3-gregory.greenman@intel.com Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index dbc34fbe7c8f..d23d1a7b4cc3 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -638,7 +638,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; ctx->conf.radar_enabled = false; - ieee80211_recalc_chanctx_min_def(local, ctx); + _ieee80211_recalc_chanctx_min_def(local, ctx); return ctx; } -- cgit v1.2.3 From b72a455a2409fd94d6d9b4eb51d659a88213243b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 May 2023 16:45:03 +0300 Subject: wifi: mac80211: consider reserved chanctx for mindef When a chanctx is reserved for a new vif and we recalculate the minimal definition for it, we need to consider the new interface it's being reserved for before we assign it, so it can be used directly with the correct min channel width. Fix the code to - optionally - consider that, and use that option just before doing the reassignment. Also, when considering channel context reservations, we should only consider the one link we're currently working with. Change the boolean argument to a link pointer to do that. Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230504134511.828474-4-gregory.greenman@intel.com Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 72 ++++++++++++++++++++++++++++------------------ net/mac80211/ieee80211_i.h | 3 +- net/mac80211/util.c | 2 +- 3 files changed, 47 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index d23d1a7b4cc3..1b182cf9d661 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -258,7 +258,8 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata, static enum nl80211_chan_width ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, - struct ieee80211_chanctx_conf *conf) + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for) { enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct ieee80211_vif *vif = &sdata->vif; @@ -267,13 +268,14 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) { enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT; - struct ieee80211_bss_conf *link_conf = - rcu_dereference(sdata->vif.link_conf[link_id]); + struct ieee80211_link_data *link = + rcu_dereference(sdata->link[link_id]); - if (!link_conf) + if (!link) continue; - if (rcu_access_pointer(link_conf->chanctx_conf) != conf) + if (link != rsvd_for && + rcu_access_pointer(link->conf->chanctx_conf) != &ctx->conf) continue; switch (vif->type) { @@ -287,7 +289,7 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, * point, so take the width from the chandef, but * account also for TDLS peers */ - width = max(link_conf->chandef.width, + width = max(link->conf->chandef.width, ieee80211_get_max_required_bw(sdata, link_id)); break; case NL80211_IFTYPE_P2P_DEVICE: @@ -296,7 +298,7 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_OCB: - width = link_conf->chandef.width; + width = link->conf->chandef.width; break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_UNSPECIFIED: @@ -316,7 +318,8 @@ ieee80211_get_chanctx_vif_max_required_bw(struct ieee80211_sub_if_data *sdata, static enum nl80211_chan_width ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, - struct ieee80211_chanctx_conf *conf) + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for) { struct ieee80211_sub_if_data *sdata; enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; @@ -328,7 +331,8 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, if (!ieee80211_sdata_running(sdata)) continue; - width = ieee80211_get_chanctx_vif_max_required_bw(sdata, conf); + width = ieee80211_get_chanctx_vif_max_required_bw(sdata, ctx, + rsvd_for); max_bw = max(max_bw, width); } @@ -336,8 +340,8 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, /* use the configured bandwidth in case of monitor interface */ sdata = rcu_dereference(local->monitor_sdata); if (sdata && - rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == conf) - max_bw = max(max_bw, conf->def.width); + rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &ctx->conf) + max_bw = max(max_bw, ctx->conf.def.width); rcu_read_unlock(); @@ -349,8 +353,10 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, * the max of min required widths of all the interfaces bound to this * channel context. */ -static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) +static u32 +_ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; @@ -370,7 +376,7 @@ static u32 _ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, return 0; } - max_bw = ieee80211_get_chanctx_max_required_bw(local, &ctx->conf); + max_bw = ieee80211_get_chanctx_max_required_bw(local, ctx, rsvd_for); /* downgrade chandef up to max_bw */ min_def = ctx->conf.def; @@ -448,9 +454,10 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, * channel context. */ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for) { - u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx); + u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); if (!changed) return; @@ -464,10 +471,11 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ieee80211_chan_bw_change(local, ctx, false); } -static void ieee80211_change_chanctx(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_chanctx *old_ctx, - const struct cfg80211_chan_def *chandef) +static void _ieee80211_change_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_chanctx *old_ctx, + const struct cfg80211_chan_def *chandef, + struct ieee80211_link_data *rsvd_for) { u32 changed; @@ -492,7 +500,7 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, ieee80211_chan_bw_change(local, old_ctx, true); if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) { - ieee80211_recalc_chanctx_min_def(local, ctx); + ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); return; } @@ -502,7 +510,7 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, /* check if min chanctx also changed */ changed = IEEE80211_CHANCTX_CHANGE_WIDTH | - _ieee80211_recalc_chanctx_min_def(local, ctx); + _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for); drv_change_chanctx(local, ctx, changed); if (!local->use_chanctx) { @@ -514,6 +522,14 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local, ieee80211_chan_bw_change(local, old_ctx, false); } +static void ieee80211_change_chanctx(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_chanctx *old_ctx, + const struct cfg80211_chan_def *chandef) +{ + _ieee80211_change_chanctx(local, ctx, old_ctx, chandef, NULL); +} + static struct ieee80211_chanctx * ieee80211_find_chanctx(struct ieee80211_local *local, const struct cfg80211_chan_def *chandef, @@ -638,7 +654,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, ctx->conf.rx_chains_dynamic = 1; ctx->mode = mode; ctx->conf.radar_enabled = false; - _ieee80211_recalc_chanctx_min_def(local, ctx); + _ieee80211_recalc_chanctx_min_def(local, ctx, NULL); return ctx; } @@ -873,12 +889,12 @@ out: ieee80211_recalc_chanctx_chantype(local, curr_ctx); ieee80211_recalc_smps_chanctx(local, curr_ctx); ieee80211_recalc_radar_chanctx(local, curr_ctx); - ieee80211_recalc_chanctx_min_def(local, curr_ctx); + ieee80211_recalc_chanctx_min_def(local, curr_ctx, NULL); } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { ieee80211_recalc_txpower(sdata, false); - ieee80211_recalc_chanctx_min_def(local, new_ctx); + ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); } if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && @@ -1270,7 +1286,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) ieee80211_link_update_chandef(link, &link->reserved_chandef); - ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef); + _ieee80211_change_chanctx(local, new_ctx, old_ctx, chandef, link); vif_chsw[0].vif = &sdata->vif; vif_chsw[0].old_ctx = &old_ctx->conf; @@ -1300,7 +1316,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx); - ieee80211_recalc_chanctx_min_def(local, new_ctx); + ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); @@ -1665,7 +1681,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); - ieee80211_recalc_chanctx_min_def(local, ctx); + ieee80211_recalc_chanctx_min_def(local, ctx, NULL); list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, reserved_chanctx_list) { diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a0a7839cb961..b0372e76f373 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2537,7 +2537,8 @@ int ieee80211_chanctx_refcount(struct ieee80211_local *local, void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx); + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for); bool ieee80211_is_radar_required(struct ieee80211_local *local); void ieee80211_dfs_cac_timer(unsigned long data); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 1527d6aafc14..4bf76150925d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3015,7 +3015,7 @@ void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); - ieee80211_recalc_chanctx_min_def(local, chanctx); + ieee80211_recalc_chanctx_min_def(local, chanctx, NULL); } unlock: mutex_unlock(&local->chanctx_mtx); -- cgit v1.2.3 From 04312de4ced4b152749614e8179f3978a20a992f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 May 2023 16:45:04 +0300 Subject: wifi: mac80211: recalc chanctx mindef before assigning When we allocate a new channel context, or find an existing one that is compatible, we currently assign it to a link before its mindef is updated. This leads to strange situations, especially in link switching where you switch to an 80 MHz link and expect it to be active immediately, but the mindef is still configured to 20 MHz while assigning. Also, it's strange that the chandef passed to the assign method's argument is wider than the one in the context. Fix this by calculating the mindef with the new link considered before calling the driver. In particular, this fixes an iwlwifi problem during link switch where the firmware would assert because the (link) station that was added for the AP is configured to transmit at a bandwidth that's wider than the channel context that it's configured on. Signed-off-by: Johannes Berg Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230504134511.828474-5-gregory.greenman@intel.com Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 1b182cf9d661..77c90ed8f5d7 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -871,6 +871,9 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, } if (new_ctx) { + /* recalc considering the link we'll use it for now */ + ieee80211_recalc_chanctx_min_def(local, new_ctx, link); + ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); if (ret) goto out; -- cgit v1.2.3 From d6352dae0903fe8beae4c007dc320e9e9f1fed45 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 15 May 2023 19:29:25 +0300 Subject: devlink: Fix crash with CONFIG_NET_NS=n '__net_initdata' becomes a no-op with CONFIG_NET_NS=y, but when this option is disabled it becomes '__initdata', which means the data can be freed after the initialization phase. This annotation is obviously incorrect for the devlink net device notifier block which is still registered after the initialization phase [1]. Fix this crash by removing the '__net_initdata' annotation. [1] general protection fault, probably for non-canonical address 0xcccccccccccccccc: 0000 [#1] PREEMPT SMP CPU: 3 PID: 117 Comm: (udev-worker) Not tainted 6.4.0-rc1-custom-gdf0acdc59b09 #64 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc37 04/01/2014 RIP: 0010:notifier_call_chain+0x58/0xc0 [...] Call Trace: dev_set_mac_address+0x85/0x120 dev_set_mac_address_user+0x30/0x50 do_setlink+0x219/0x1270 rtnl_setlink+0xf7/0x1a0 rtnetlink_rcv_msg+0x142/0x390 netlink_rcv_skb+0x58/0x100 netlink_unicast+0x188/0x270 netlink_sendmsg+0x214/0x470 __sys_sendto+0x12f/0x1a0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x38/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: e93c9378e33f ("devlink: change per-devlink netdev notifier to static one") Reported-by: Marek Szyprowski Closes: https://lore.kernel.org/netdev/600ddf9e-589a-2aa0-7b69-a438f833ca10@samsung.com/ Tested-by: Marek Szyprowski Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230515162925.1144416-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/devlink/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/devlink/core.c b/net/devlink/core.c index 0e58eee44bdb..c23ebabadc52 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -294,7 +294,7 @@ static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; -static struct notifier_block devlink_port_netdevice_nb __net_initdata = { +static struct notifier_block devlink_port_netdevice_nb = { .notifier_call = devlink_port_netdevice_event, }; -- cgit v1.2.3 From dacab578c7c6cd06c50c89dfa36b0e0f10decd4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 May 2023 14:23:42 +0000 Subject: vlan: fix a potential uninit-value in vlan_dev_hard_start_xmit() syzbot triggered the following splat [1], sending an empty message through pppoe_sendmsg(). When VLAN_FLAG_REORDER_HDR flag is set, vlan_dev_hard_header() does not push extra bytes for the VLAN header, because vlan is offloaded. Unfortunately vlan_dev_hard_start_xmit() first reads veth->h_vlan_proto before testing (vlan->flags & VLAN_FLAG_REORDER_HDR). We need to swap the two conditions. [1] BUG: KMSAN: uninit-value in vlan_dev_hard_start_xmit+0x171/0x7f0 net/8021q/vlan_dev.c:111 vlan_dev_hard_start_xmit+0x171/0x7f0 net/8021q/vlan_dev.c:111 __netdev_start_xmit include/linux/netdevice.h:4883 [inline] netdev_start_xmit include/linux/netdevice.h:4897 [inline] xmit_one net/core/dev.c:3580 [inline] dev_hard_start_xmit+0x253/0xa20 net/core/dev.c:3596 __dev_queue_xmit+0x3c7f/0x5ac0 net/core/dev.c:4246 dev_queue_xmit include/linux/netdevice.h:3053 [inline] pppoe_sendmsg+0xa93/0xb80 drivers/net/ppp/pppoe.c:900 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] ____sys_sendmsg+0xa24/0xe40 net/socket.c:2501 ___sys_sendmsg+0x2a1/0x3f0 net/socket.c:2555 __sys_sendmmsg+0x411/0xa50 net/socket.c:2641 __do_sys_sendmmsg net/socket.c:2670 [inline] __se_sys_sendmmsg net/socket.c:2667 [inline] __x64_sys_sendmmsg+0xbc/0x120 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook+0x12d/0xb60 mm/slab.h:774 slab_alloc_node mm/slub.c:3452 [inline] kmem_cache_alloc_node+0x543/0xab0 mm/slub.c:3497 kmalloc_reserve+0x148/0x470 net/core/skbuff.c:520 __alloc_skb+0x3a7/0x850 net/core/skbuff.c:606 alloc_skb include/linux/skbuff.h:1277 [inline] sock_wmalloc+0xfe/0x1a0 net/core/sock.c:2583 pppoe_sendmsg+0x3af/0xb80 drivers/net/ppp/pppoe.c:867 sock_sendmsg_nosec net/socket.c:724 [inline] sock_sendmsg net/socket.c:747 [inline] ____sys_sendmsg+0xa24/0xe40 net/socket.c:2501 ___sys_sendmsg+0x2a1/0x3f0 net/socket.c:2555 __sys_sendmmsg+0x411/0xa50 net/socket.c:2641 __do_sys_sendmmsg net/socket.c:2670 [inline] __se_sys_sendmmsg net/socket.c:2667 [inline] __x64_sys_sendmmsg+0xbc/0x120 net/socket.c:2667 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd CPU: 0 PID: 29770 Comm: syz-executor.0 Not tainted 6.3.0-rc6-syzkaller-gc478e5b17829 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/30/2023 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 870e4935d6e6..b90781b9ece6 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -109,8 +109,8 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ - if (veth->h_vlan_proto != vlan->vlan_proto || - vlan->flags & VLAN_FLAG_REORDER_HDR) { + if (vlan->flags & VLAN_FLAG_REORDER_HDR || + veth->h_vlan_proto != vlan->vlan_proto) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); -- cgit v1.2.3 From 224a876e37543eee111bf9b6aa4935080e619335 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 14 May 2023 10:00:10 -0400 Subject: netfilter: conntrack: define variables exp_nat_nla_policy and any_addr with CONFIG_NF_NAT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc with W=1 and ! CONFIG_NF_NAT net/netfilter/nf_conntrack_netlink.c:3463:32: error: ‘exp_nat_nla_policy’ defined but not used [-Werror=unused-const-variable=] 3463 | static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { | ^~~~~~~~~~~~~~~~~~ net/netfilter/nf_conntrack_netlink.c:2979:33: error: ‘any_addr’ defined but not used [-Werror=unused-const-variable=] 2979 | static const union nf_inet_addr any_addr; | ^~~~~~~~ These variables use is controlled by CONFIG_NF_NAT, so should their definitions. Signed-off-by: Tom Rix Reviewed-by: Simon Horman Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d40544cd61a6..69c8c8c7e9b8 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2976,7 +2976,9 @@ nla_put_failure: return -1; } +#if IS_ENABLED(CONFIG_NF_NAT) static const union nf_inet_addr any_addr; +#endif static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp) { @@ -3460,10 +3462,12 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, return 0; } +#if IS_ENABLED(CONFIG_NF_NAT) static const struct nla_policy exp_nat_nla_policy[CTA_EXPECT_NAT_MAX+1] = { [CTA_EXPECT_NAT_DIR] = { .type = NLA_U32 }, [CTA_EXPECT_NAT_TUPLE] = { .type = NLA_NESTED }, }; +#endif static int ctnetlink_parse_expect_nat(const struct nlattr *attr, -- cgit v1.2.3 From e3c361b8acd636f5fe80c02849ca175201edf10c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 May 2023 14:15:15 +0200 Subject: netfilter: nf_tables: fix nft_trans type confusion nft_trans_FOO objects all share a common nft_trans base structure, but trailing fields depend on the real object size. Access is only safe after trans->msg_type check. Check for rule type first. Found by code inspection. Fixes: 1a94e38d254b ("netfilter: nf_tables: add NFTA_RULE_ID attribute") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 59fb8320ab4d..dc5675962de4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3865,12 +3865,10 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { - struct nft_rule *rule = nft_trans_rule(trans); - if (trans->msg_type == NFT_MSG_NEWRULE && trans->ctx.chain == chain && id == nft_trans_rule_id(trans)) - return rule; + return nft_trans_rule(trans); } return ERR_PTR(-ENOENT); } -- cgit v1.2.3 From 61ae320a29b0540c16931816299eb86bf2b66c08 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 May 2023 22:39:30 +0200 Subject: netfilter: nft_set_rbtree: fix null deref on element insertion There is no guarantee that rb_prev() will not return NULL in nft_rbtree_gc_elem(): general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] nft_add_set_elem+0x14b0/0x2990 nf_tables_newsetelem+0x528/0xb30 Furthermore, there is a possible use-after-free while iterating, 'node' can be free'd so we need to cache the next value to use. Fixes: c9e6978e2725 ("netfilter: nft_set_rbtree: Switch to node list walk for overlap detection") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_rbtree.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 19ea4d3c3553..2f114aa10f1a 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -221,7 +221,7 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, { struct nft_set *set = (struct nft_set *)__set; struct rb_node *prev = rb_prev(&rbe->node); - struct nft_rbtree_elem *rbe_prev; + struct nft_rbtree_elem *rbe_prev = NULL; struct nft_set_gc_batch *gcb; gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); @@ -229,17 +229,21 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, return -ENOMEM; /* search for expired end interval coming before this element. */ - do { + while (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); if (nft_rbtree_interval_end(rbe_prev)) break; prev = rb_prev(prev); - } while (prev != NULL); + } + + if (rbe_prev) { + rb_erase(&rbe_prev->node, &priv->root); + atomic_dec(&set->nelems); + } - rb_erase(&rbe_prev->node, &priv->root); rb_erase(&rbe->node, &priv->root); - atomic_sub(2, &set->nelems); + atomic_dec(&set->nelems); nft_set_gc_batch_add(gcb, rbe); nft_set_gc_batch_complete(gcb); @@ -268,7 +272,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_set_ext **ext) { struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; - struct rb_node *node, *parent, **p, *first = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); int d, err; @@ -307,7 +311,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, * Values stored in the tree are in reversed order, starting from * highest to lowest value. */ - for (node = first; node != NULL; node = rb_next(node)) { + for (node = first; node != NULL; node = next) { + next = rb_next(node); + rbe = rb_entry(node, struct nft_rbtree_elem, node); if (!nft_set_elem_active(&rbe->ext, genmask)) -- cgit v1.2.3 From fb1b7be9b16c1f4626969ba4e95a97da2a452b41 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 21:45:34 +0200 Subject: atm: hide unused procfs functions When CONFIG_PROC_FS is disabled, the function declarations for some procfs functions are hidden, but the definitions are still build, as shown by this compiler warning: net/atm/resources.c:403:7: error: no previous prototype for 'atm_dev_seq_start' [-Werror=missing-prototypes] net/atm/resources.c:409:6: error: no previous prototype for 'atm_dev_seq_stop' [-Werror=missing-prototypes] net/atm/resources.c:414:7: error: no previous prototype for 'atm_dev_seq_next' [-Werror=missing-prototypes] Add another #ifdef to leave these out of the build. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230516194625.549249-2-arnd@kernel.org Signed-off-by: Jakub Kicinski --- net/atm/resources.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/atm/resources.c b/net/atm/resources.c index 2b2d33eeaf20..995d29e7fb13 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -400,6 +400,7 @@ done: return error; } +#ifdef CONFIG_PROC_FS void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) { mutex_lock(&atm_dev_mutex); @@ -415,3 +416,4 @@ void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_list_next(v, &atm_devs, pos); } +#endif -- cgit v1.2.3 From 89dcd87ce534a3a7f267cfd58505803006f51301 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 21:45:35 +0200 Subject: bridge: always declare tunnel functions When CONFIG_BRIDGE_VLAN_FILTERING is disabled, two functions are still defined but have no prototype or caller. This causes a W=1 warning for the missing prototypes: net/bridge/br_netlink_tunnel.c:29:6: error: no previous prototype for 'vlan_tunid_inrange' [-Werror=missing-prototypes] net/bridge/br_netlink_tunnel.c:199:5: error: no previous prototype for 'br_vlan_tunnel_info' [-Werror=missing-prototypes] The functions are already contitional on CONFIG_BRIDGE_VLAN_FILTERING, and I coulnd't easily figure out the right set of #ifdefs, so just move the declarations out of the #ifdef to avoid the warning, at a small cost in code size over a more elaborate fix. Fixes: 188c67dd1906 ("net: bridge: vlan options: add support for tunnel id dumping") Fixes: 569da0822808 ("net: bridge: vlan options: add support for tunnel mapping set/del") Signed-off-by: Arnd Bergmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230516194625.549249-3-arnd@kernel.org Signed-off-by: Jakub Kicinski --- net/bridge/br_private_tunnel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h index 2b053289f016..efb096025151 100644 --- a/net/bridge/br_private_tunnel.h +++ b/net/bridge/br_private_tunnel.h @@ -27,6 +27,10 @@ int br_process_vlan_tunnel_info(const struct net_bridge *br, int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg); int br_fill_vlan_tunnel_info(struct sk_buff *skb, struct net_bridge_vlan_group *vg); +bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *v_last); +int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, + u16 vid, u32 tun_id, bool *changed); #ifdef CONFIG_BRIDGE_VLAN_FILTERING /* br_vlan_tunnel.c */ @@ -43,10 +47,6 @@ void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan_group *vg); int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan); -bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, - const struct net_bridge_vlan *v_last); -int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd, - u16 vid, u32 tun_id, bool *changed); #else static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { -- cgit v1.2.3 From b3a03b540e3cf62a255213d084d76d71c02793d5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:36 -0700 Subject: tls: rx: device: fix checking decryption status skb->len covers the entire skb, including the frag_list. In fact we're guaranteed that rxm->full_len <= skb->len, so since the change under Fixes we were not checking decrypt status of any skb but the first. Note that the skb_pagelen() added here may feel a bit costly, but it's removed by subsequent fixes, anyway. Reported-by: Tariq Toukan Fixes: 86b259f6f888 ("tls: rx: device: bound the frag walk") Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a7cc4f9faac2..3b87c7b04ac8 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1012,7 +1012,7 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) struct sk_buff *skb_iter; int left; - left = rxm->full_len - skb->len; + left = rxm->full_len + rxm->offset - skb_pagelen(skb); /* Check if all the data is decrypted already */ skb_iter = skb_shinfo(skb)->frag_list; while (skb_iter && left > 0) { -- cgit v1.2.3 From 210620ae44a83f25220450bbfcc22e6fe986b25f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:37 -0700 Subject: tls: rx: strp: set the skb->len of detached / CoW'ed skbs alloc_skb_with_frags() fills in page frag sizes but does not set skb->len and skb->data_len. Set those correctly otherwise device offload will most likely generate an empty skb and hit the BUG() at the end of __skb_nsg(). Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_strp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 955ac3e0bf4d..24016c865e00 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -56,6 +56,8 @@ static struct sk_buff *tls_strp_msg_make_copy(struct tls_strparser *strp) offset += skb_frag_size(frag); } + skb->len = strp->stm.full_len; + skb->data_len = strp->stm.full_len; skb_copy_header(skb, strp->anchor); rxm = strp_msg(skb); rxm->offset = 0; -- cgit v1.2.3 From 14c4be92ebb3e36e392aa9dd8f314038a9f96f3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:38 -0700 Subject: tls: rx: strp: force mixed decrypted records into copy mode If a record is partially decrypted we'll have to CoW it, anyway, so go into copy mode and allocate a writable skb right away. This will make subsequent fix simpler because we won't have to teach tls_strp_msg_make_copy() how to copy skbs while preserving decrypt status. Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_strp.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 24016c865e00..2b6fa9855999 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -317,15 +317,19 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) return 0; } -static bool tls_strp_check_no_dup(struct tls_strparser *strp) +static bool tls_strp_check_queue_ok(struct tls_strparser *strp) { unsigned int len = strp->stm.offset + strp->stm.full_len; - struct sk_buff *skb; + struct sk_buff *first, *skb; u32 seq; - skb = skb_shinfo(strp->anchor)->frag_list; - seq = TCP_SKB_CB(skb)->seq; + first = skb_shinfo(strp->anchor)->frag_list; + skb = first; + seq = TCP_SKB_CB(first)->seq; + /* Make sure there's no duplicate data in the queue, + * and the decrypted status matches. + */ while (skb->len < len) { seq += skb->len; len -= skb->len; @@ -333,6 +337,8 @@ static bool tls_strp_check_no_dup(struct tls_strparser *strp) if (TCP_SKB_CB(skb)->seq != seq) return false; + if (skb_cmp_decrypted(first, skb)) + return false; } return true; @@ -413,7 +419,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp) return tls_strp_read_copy(strp, true); } - if (!tls_strp_check_no_dup(strp)) + if (!tls_strp_check_queue_ok(strp)) return tls_strp_read_copy(strp, false); strp->msg_ready = 1; -- cgit v1.2.3 From 8b0c0dc9fbbd01e58a573a41c38885f9e4c17696 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:39 -0700 Subject: tls: rx: strp: fix determining record length in copy mode We call tls_rx_msg_size(skb) before doing skb->len += chunk. So the tls_rx_msg_size() code will see old skb->len, most likely leading to an over-read. Worst case we will over read an entire record, next iteration will try to trim the skb but may end up turning frag len negative or discarding the subsequent record (since we already told TCP we've read it during previous read but now we'll trim it out of the skb). Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_strp.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 2b6fa9855999..e2e48217e7ac 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -210,19 +210,28 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, skb_frag_size(frag), chunk)); - sz = tls_rx_msg_size(strp, strp->anchor); + skb->len += chunk; + skb->data_len += chunk; + skb_frag_size_add(frag, chunk); + + sz = tls_rx_msg_size(strp, skb); if (sz < 0) { desc->error = sz; return 0; } /* We may have over-read, sz == 0 is guaranteed under-read */ - if (sz > 0) - chunk = min_t(size_t, chunk, sz - skb->len); + if (unlikely(sz && sz < skb->len)) { + int over = skb->len - sz; + + WARN_ON_ONCE(over > chunk); + skb->len -= over; + skb->data_len -= over; + skb_frag_size_add(frag, -over); + + chunk -= over; + } - skb->len += chunk; - skb->data_len += chunk; - skb_frag_size_add(frag, chunk); frag++; len -= chunk; offset += chunk; -- cgit v1.2.3 From c1c607b1e5d5477d82ca6a86a05a4f10907b33ee Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:40 -0700 Subject: tls: rx: strp: factor out copying skb data We'll need to copy input skbs individually in the next patch. Factor that code out (without assuming we're copying a full record). Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_strp.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index e2e48217e7ac..61fbf84baf9e 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -34,31 +34,44 @@ static void tls_strp_anchor_free(struct tls_strparser *strp) strp->anchor = NULL; } -/* Create a new skb with the contents of input copied to its page frags */ -static struct sk_buff *tls_strp_msg_make_copy(struct tls_strparser *strp) +static struct sk_buff * +tls_strp_skb_copy(struct tls_strparser *strp, struct sk_buff *in_skb, + int offset, int len) { - struct strp_msg *rxm; struct sk_buff *skb; - int i, err, offset; + int i, err; - skb = alloc_skb_with_frags(0, strp->stm.full_len, TLS_PAGE_ORDER, + skb = alloc_skb_with_frags(0, len, TLS_PAGE_ORDER, &err, strp->sk->sk_allocation); if (!skb) return NULL; - offset = strp->stm.offset; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - WARN_ON_ONCE(skb_copy_bits(strp->anchor, offset, + WARN_ON_ONCE(skb_copy_bits(in_skb, offset, skb_frag_address(frag), skb_frag_size(frag))); offset += skb_frag_size(frag); } - skb->len = strp->stm.full_len; - skb->data_len = strp->stm.full_len; - skb_copy_header(skb, strp->anchor); + skb->len = len; + skb->data_len = len; + skb_copy_header(skb, in_skb); + return skb; +} + +/* Create a new skb with the contents of input copied to its page frags */ +static struct sk_buff *tls_strp_msg_make_copy(struct tls_strparser *strp) +{ + struct strp_msg *rxm; + struct sk_buff *skb; + + skb = tls_strp_skb_copy(strp, strp->anchor, strp->stm.offset, + strp->stm.full_len); + if (!skb) + return NULL; + rxm = strp_msg(skb); rxm->offset = 0; return skb; -- cgit v1.2.3 From eca9bfafee3a0487e59c59201ae14c7594ba940a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:41 -0700 Subject: tls: rx: strp: preserve decryption status of skbs when needed When receive buffer is small we try to copy out the data from TCP into a skb maintained by TLS to prevent connection from stalling. Unfortunately if a single record is made up of a mix of decrypted and non-decrypted skbs combining them into a single skb leads to loss of decryption status, resulting in decryption errors or data corruption. Similarly when trying to use TCP receive queue directly we need to make sure that all the skbs within the record have the same status. If we don't the mixed status will be detected correctly but we'll CoW the anchor, again collapsing it into a single paged skb without decrypted status preserved. So the "fixup" code will not know which parts of skb to re-encrypt. Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls.h | 5 +++ net/tls/tls_device.c | 22 ++++------ net/tls/tls_strp.c | 117 +++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 113 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/tls/tls.h b/net/tls/tls.h index 804c3880d028..0672acab2773 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -167,6 +167,11 @@ static inline bool tls_strp_msg_ready(struct tls_sw_context_rx *ctx) return ctx->strp.msg_ready; } +static inline bool tls_strp_msg_mixed_decrypted(struct tls_sw_context_rx *ctx) +{ + return ctx->strp.mixed_decrypted; +} + #ifdef CONFIG_TLS_DEVICE int tls_device_init(void); void tls_device_cleanup(void); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 3b87c7b04ac8..bf69c9d6d06c 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1007,20 +1007,14 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb = tls_strp_msg(sw_ctx); struct strp_msg *rxm = strp_msg(skb); - int is_decrypted = skb->decrypted; - int is_encrypted = !is_decrypted; - struct sk_buff *skb_iter; - int left; - - left = rxm->full_len + rxm->offset - skb_pagelen(skb); - /* Check if all the data is decrypted already */ - skb_iter = skb_shinfo(skb)->frag_list; - while (skb_iter && left > 0) { - is_decrypted &= skb_iter->decrypted; - is_encrypted &= !skb_iter->decrypted; - - left -= skb_iter->len; - skb_iter = skb_iter->next; + int is_decrypted, is_encrypted; + + if (!tls_strp_msg_mixed_decrypted(sw_ctx)) { + is_decrypted = skb->decrypted; + is_encrypted = !is_decrypted; + } else { + is_decrypted = 0; + is_encrypted = 0; } trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len, diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 61fbf84baf9e..da95abbb7ea3 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -29,7 +29,8 @@ static void tls_strp_anchor_free(struct tls_strparser *strp) struct skb_shared_info *shinfo = skb_shinfo(strp->anchor); DEBUG_NET_WARN_ON_ONCE(atomic_read(&shinfo->dataref) != 1); - shinfo->frag_list = NULL; + if (!strp->copy_mode) + shinfo->frag_list = NULL; consume_skb(strp->anchor); strp->anchor = NULL; } @@ -195,22 +196,22 @@ static void tls_strp_flush_anchor_copy(struct tls_strparser *strp) for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], false); shinfo->nr_frags = 0; + if (strp->copy_mode) { + kfree_skb_list(shinfo->frag_list); + shinfo->frag_list = NULL; + } strp->copy_mode = 0; + strp->mixed_decrypted = 0; } -static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, - unsigned int offset, size_t in_len) +static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, + struct sk_buff *in_skb, unsigned int offset, + size_t in_len) { - struct tls_strparser *strp = (struct tls_strparser *)desc->arg.data; - struct sk_buff *skb; - skb_frag_t *frag; size_t len, chunk; + skb_frag_t *frag; int sz; - if (strp->msg_ready) - return 0; - - skb = strp->anchor; frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; len = in_len; @@ -228,10 +229,8 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, skb_frag_size_add(frag, chunk); sz = tls_rx_msg_size(strp, skb); - if (sz < 0) { - desc->error = sz; - return 0; - } + if (sz < 0) + return sz; /* We may have over-read, sz == 0 is guaranteed under-read */ if (unlikely(sz && sz < skb->len)) { @@ -271,15 +270,99 @@ static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, offset += chunk; } - if (strp->stm.full_len == skb->len) { +read_done: + return in_len - len; +} + +static int tls_strp_copyin_skb(struct tls_strparser *strp, struct sk_buff *skb, + struct sk_buff *in_skb, unsigned int offset, + size_t in_len) +{ + struct sk_buff *nskb, *first, *last; + struct skb_shared_info *shinfo; + size_t chunk; + int sz; + + if (strp->stm.full_len) + chunk = strp->stm.full_len - skb->len; + else + chunk = TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; + chunk = min(chunk, in_len); + + nskb = tls_strp_skb_copy(strp, in_skb, offset, chunk); + if (!nskb) + return -ENOMEM; + + shinfo = skb_shinfo(skb); + if (!shinfo->frag_list) { + shinfo->frag_list = nskb; + nskb->prev = nskb; + } else { + first = shinfo->frag_list; + last = first->prev; + last->next = nskb; + first->prev = nskb; + } + + skb->len += chunk; + skb->data_len += chunk; + + if (!strp->stm.full_len) { + sz = tls_rx_msg_size(strp, skb); + if (sz < 0) + return sz; + + /* We may have over-read, sz == 0 is guaranteed under-read */ + if (unlikely(sz && sz < skb->len)) { + int over = skb->len - sz; + + WARN_ON_ONCE(over > chunk); + skb->len -= over; + skb->data_len -= over; + __pskb_trim(nskb, nskb->len - over); + + chunk -= over; + } + + strp->stm.full_len = sz; + } + + return chunk; +} + +static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, + unsigned int offset, size_t in_len) +{ + struct tls_strparser *strp = (struct tls_strparser *)desc->arg.data; + struct sk_buff *skb; + int ret; + + if (strp->msg_ready) + return 0; + + skb = strp->anchor; + if (!skb->len) + skb_copy_decrypted(skb, in_skb); + else + strp->mixed_decrypted |= !!skb_cmp_decrypted(skb, in_skb); + + if (IS_ENABLED(CONFIG_TLS_DEVICE) && strp->mixed_decrypted) + ret = tls_strp_copyin_skb(strp, skb, in_skb, offset, in_len); + else + ret = tls_strp_copyin_frag(strp, skb, in_skb, offset, in_len); + if (ret < 0) { + desc->error = ret; + ret = 0; + } + + if (strp->stm.full_len && strp->stm.full_len == skb->len) { desc->count = 0; strp->msg_ready = 1; tls_rx_msg_ready(strp); } -read_done: - return in_len - len; + return ret; } static int tls_strp_read_copyin(struct tls_strparser *strp) -- cgit v1.2.3 From 74836ec828fe17b63f2006fdbf53311d691396bf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:42 -0700 Subject: tls: rx: strp: don't use GFP_KERNEL in softirq context When receive buffer is small, or the TCP rx queue looks too complicated to bother using it directly - we allocate a new skb and copy data into it. We already use sk->sk_allocation... but nothing actually sets it to GFP_ATOMIC on the ->sk_data_ready() path. Users of HW offload are far more likely to experience problems due to scheduling while atomic. "Copy mode" is very rarely triggered with SW crypto. Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 635b8bf6b937..6e6a7c37d685 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2304,10 +2304,14 @@ static void tls_data_ready(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_psock *psock; + gfp_t alloc_save; trace_sk_data_ready(sk); + alloc_save = sk->sk_allocation; + sk->sk_allocation = GFP_ATOMIC; tls_strp_data_ready(&ctx->strp); + sk->sk_allocation = alloc_save; psock = sk_psock_get(sk); if (psock) { -- cgit v1.2.3 From 35112271672ae98f45df7875244a4e33aa215e31 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Thu, 18 May 2023 13:14:55 +0800 Subject: net/smc: Reset connection when trying to use SMCRv2 fails. We found a crash when using SMCRv2 with 2 Mellanox ConnectX-4. It can be reproduced by: - smc_run nginx - smc_run wrk -t 32 -c 500 -d 30 http://: BUG: kernel NULL pointer dereference, address: 0000000000000014 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000108713067 P4D 8000000108713067 PUD 151127067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 4 PID: 2441 Comm: kworker/4:249 Kdump: loaded Tainted: G W E 6.4.0-rc1+ #42 Workqueue: smc_hs_wq smc_listen_work [smc] RIP: 0010:smc_clc_send_confirm_accept+0x284/0x580 [smc] RSP: 0018:ffffb8294b2d7c78 EFLAGS: 00010a06 RAX: ffff8f1873238880 RBX: ffffb8294b2d7dc8 RCX: 0000000000000000 RDX: 00000000000000b4 RSI: 0000000000000001 RDI: 0000000000b40c00 RBP: ffffb8294b2d7db8 R08: ffff8f1815c5860c R09: 0000000000000000 R10: 0000000000000400 R11: 0000000000000000 R12: ffff8f1846f56180 R13: ffff8f1815c5860c R14: 0000000000000001 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8f1aefd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000014 CR3: 00000001027a0001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? mlx5_ib_map_mr_sg+0xa1/0xd0 [mlx5_ib] ? smcr_buf_map_link+0x24b/0x290 [smc] ? __smc_buf_create+0x4ee/0x9b0 [smc] smc_clc_send_accept+0x4c/0xb0 [smc] smc_listen_work+0x346/0x650 [smc] ? __schedule+0x279/0x820 process_one_work+0x1e5/0x3f0 worker_thread+0x4d/0x2f0 ? __pfx_worker_thread+0x10/0x10 kthread+0xe5/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2c/0x50 During the CLC handshake, server sequentially tries available SMCRv2 and SMCRv1 devices in smc_listen_work(). If an SMCRv2 device is found. SMCv2 based link group and link will be assigned to the connection. Then assumed that some buffer assignment errors happen later in the CLC handshake, such as RMB registration failure, server will give up SMCRv2 and try SMCRv1 device instead. But the resources assigned to the connection won't be reset. When server tries SMCRv1 device, the connection creation process will be executed again. Since conn->lnk has been assigned when trying SMCRv2, it will not be set to the correct SMCRv1 link in smcr_lgr_conn_assign_link(). So in such situation, conn->lgr points to correct SMCRv1 link group but conn->lnk points to the SMCRv2 link mistakenly. Then in smc_clc_send_confirm_accept(), conn->rmb_desc->mr[link->link_idx] will be accessed. Since the link->link_idx is not correct, the related MR may not have been initialized, so crash happens. | Try SMCRv2 device first | |-> conn->lgr: assign existed SMCRv2 link group; | |-> conn->link: assign existed SMCRv2 link (link_idx may be 1 in SMC_LGR_SYMMETRIC); | |-> sndbuf & RMB creation fails, quit; | | Try SMCRv1 device then | |-> conn->lgr: create SMCRv1 link group and assign; | |-> conn->link: keep SMCRv2 link mistakenly; | |-> sndbuf & RMB creation succeed, only RMB->mr[link_idx = 0] | initialized. | | Then smc_clc_send_confirm_accept() accesses | conn->rmb_desc->mr[conn->link->link_idx, which is 1], then crash. v This patch tries to fix this by cleaning conn->lnk before assigning link. In addition, it is better to reset the connection and clean the resources assigned if trying SMCRv2 failed in buffer creation or registration. Fixes: e49300a6bf62 ("net/smc: add listen processing for SMC-Rv2") Link: https://lore.kernel.org/r/20220523055056.2078994-1-liuyacan@corp.netease.com/ Signed-off-by: Wen Gu Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/af_smc.c | 9 +++++++-- net/smc/smc_core.c | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 50c38b624f77..538e9c6ec8c9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2000,8 +2000,10 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, return rc; /* create send buffer and rmb */ - if (smc_buf_create(new_smc, false)) + if (smc_buf_create(new_smc, false)) { + smc_conn_abort(new_smc, ini->first_contact_local); return SMC_CLC_DECL_MEM; + } return 0; } @@ -2217,8 +2219,11 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); - if (!rc) + if (!rc) { rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (rc) + smc_conn_abort(new_smc, ini->first_contact_local); + } if (!rc) return; ini->smcr_version = smcr_version; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 454356771cda..3f465faf2b68 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -127,6 +127,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) int i, j; /* do link balancing */ + conn->lnk = NULL; /* reset conn->lnk first */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; -- cgit v1.2.3 From d180891fba995bd54e25b089b1ec98d134873586 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 10 May 2023 12:28:00 -0400 Subject: SUNRPC: Don't change task->tk_status after the call to rpc_exit_task Some calls to rpc_exit_task() may deliberately change the value of task->tk_status, for instance because it gets checked by the RPC call's rpc_release() callback. That makes it wrong to reset the value to task->tk_rpc_status. In particular this causes a bug where the rpc_call_done() callback tries to fail over a set of pNFS/flexfiles writes to a different IP address, but the reset of task->tk_status causes nfs_commit_release_pages() to immediately mark the file as having a fatal error. Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()") Cc: stable@vger.kernel.org # 6.1.x Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/sched.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index c8321de341ee..6debf4fd42d4 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -927,11 +927,10 @@ static void __rpc_execute(struct rpc_task *task) */ do_action = task->tk_action; /* Tasks with an RPC error status should exit */ - if (do_action != rpc_exit_task && + if (do_action && do_action != rpc_exit_task && (status = READ_ONCE(task->tk_rpc_status)) != 0) { task->tk_status = status; - if (do_action != NULL) - do_action = rpc_exit_task; + do_action = rpc_exit_task; } /* Callbacks override all actions */ if (task->tk_callback) { -- cgit v1.2.3 From ca1fd42e7dbfcb34890ffbf1f2f4b356776dab6f Mon Sep 17 00:00:00 2001 From: Ruihan Li Date: Wed, 3 May 2023 21:39:34 +0800 Subject: Bluetooth: Fix potential double free caused by hci_conn_unlink The hci_conn_unlink function is being called by hci_conn_del, which means it should not call hci_conn_del with the input parameter conn again. If it does, conn may have already been released when hci_conn_unlink returns, leading to potential UAF and double-free issues. This patch resolves the problem by modifying hci_conn_unlink to release only conn's child links when necessary, but never release conn itself. Reported-by: syzbot+690b90b14f14f43f4688@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-bluetooth/000000000000484a8205faafe216@google.com/ Fixes: 06149746e720 ("Bluetooth: hci_conn: Add support for linking multiple hcon") Signed-off-by: Ruihan Li Signed-off-by: Luiz Augusto von Dentz Reported-by: syzbot+690b90b14f14f43f4688@syzkaller.appspotmail.com Reported-by: Luiz Augusto von Dentz Reported-by: syzbot+8bb72f86fc823817bc5d@syzkaller.appspotmail.com --- net/bluetooth/hci_conn.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 640b951bf40a..70e1655a9df6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1083,8 +1083,18 @@ static void hci_conn_unlink(struct hci_conn *conn) if (!conn->parent) { struct hci_link *link, *t; - list_for_each_entry_safe(link, t, &conn->link_list, list) - hci_conn_unlink(link->conn); + list_for_each_entry_safe(link, t, &conn->link_list, list) { + struct hci_conn *child = link->conn; + + hci_conn_unlink(child); + + /* Due to race, SCO connection might be not established + * yet at this point. Delete it now, otherwise it is + * possible for it to be stuck and can't be deleted. + */ + if (child->handle == HCI_CONN_HANDLE_UNSET) + hci_conn_del(child); + } return; } @@ -1100,13 +1110,6 @@ static void hci_conn_unlink(struct hci_conn *conn) kfree(conn->link); conn->link = NULL; - - /* Due to race, SCO connection might be not established - * yet at this point. Delete it now, otherwise it is - * possible for it to be stuck and can't be deleted. - */ - if (conn->handle == HCI_CONN_HANDLE_UNSET) - hci_conn_del(conn); } int hci_conn_del(struct hci_conn *conn) -- cgit v1.2.3 From 2910431ab0e500dfc5df12299bb15eef0f30b43e Mon Sep 17 00:00:00 2001 From: Ruihan Li Date: Wed, 3 May 2023 21:39:35 +0800 Subject: Bluetooth: Refcnt drop must be placed last in hci_conn_unlink If hci_conn_put(conn->parent) reduces conn->parent's reference count to zero, it can immediately deallocate conn->parent. At the same time, conn->link->list has its head in conn->parent, causing use-after-free problems in the latter list_del_rcu(&conn->link->list). This problem can be easily solved by reordering the two operations, i.e., first performing the list removal with list_del_rcu and then decreasing the refcnt with hci_conn_put. Reported-by: Luiz Augusto von Dentz Closes: https://lore.kernel.org/linux-bluetooth/CABBYNZ+1kce8_RJrLNOXd_8=Mdpb=2bx4Nto-hFORk=qiOkoCg@mail.gmail.com/ Fixes: 06149746e720 ("Bluetooth: hci_conn: Add support for linking multiple hcon") Signed-off-by: Ruihan Li Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 70e1655a9df6..44d0643fc681 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1102,12 +1102,12 @@ static void hci_conn_unlink(struct hci_conn *conn) if (!conn->link) return; - hci_conn_put(conn->parent); - conn->parent = NULL; - list_del_rcu(&conn->link->list); synchronize_rcu(); + hci_conn_put(conn->parent); + conn->parent = NULL; + kfree(conn->link); conn->link = NULL; } -- cgit v1.2.3 From a2ac591cb4d83e1f2d4b4adb3c14b2c79764650a Mon Sep 17 00:00:00 2001 From: Ruihan Li Date: Wed, 3 May 2023 21:39:36 +0800 Subject: Bluetooth: Fix UAF in hci_conn_hash_flush again Commit 06149746e720 ("Bluetooth: hci_conn: Add support for linking multiple hcon") reintroduced a previously fixed bug [1] ("KASAN: slab-use-after-free Read in hci_conn_hash_flush"). This bug was originally fixed by commit 5dc7d23e167e ("Bluetooth: hci_conn: Fix possible UAF"). The hci_conn_unlink function was added to avoid invalidating the link traversal caused by successive hci_conn_del operations releasing extra connections. However, currently hci_conn_unlink itself also releases extra connections, resulted in the reintroduced bug. This patch follows a more robust solution for cleaning up all connections, by repeatedly removing the first connection until there are none left. This approach does not rely on the inner workings of hci_conn_del and ensures proper cleanup of all connections. Meanwhile, we need to make sure that hci_conn_del never fails. Indeed it doesn't, as it now always returns zero. To make this a bit clearer, this patch also changes its return type to void. Reported-by: syzbot+8bb72f86fc823817bc5d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-bluetooth/000000000000aa920505f60d25ad@google.com/ Fixes: 06149746e720 ("Bluetooth: hci_conn: Add support for linking multiple hcon") Signed-off-by: Ruihan Li Co-developed-by: Luiz Augusto von Dentz Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 44d0643fc681..ce588359b290 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1088,6 +1088,14 @@ static void hci_conn_unlink(struct hci_conn *conn) hci_conn_unlink(child); + /* If hdev is down it means + * hci_dev_close_sync/hci_conn_hash_flush is in progress + * and links don't need to be cleanup as all connections + * would be cleanup. + */ + if (!test_bit(HCI_UP, &hdev->flags)) + continue; + /* Due to race, SCO connection might be not established * yet at this point. Delete it now, otherwise it is * possible for it to be stuck and can't be deleted. @@ -1112,7 +1120,7 @@ static void hci_conn_unlink(struct hci_conn *conn) conn->link = NULL; } -int hci_conn_del(struct hci_conn *conn) +void hci_conn_del(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -1163,8 +1171,6 @@ int hci_conn_del(struct hci_conn *conn) * rest of hci_conn_del. */ hci_conn_cleanup(conn); - - return 0; } struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) @@ -2465,22 +2471,27 @@ timer: /* Drop all connection on the device */ void hci_conn_hash_flush(struct hci_dev *hdev) { - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c, *n; + struct list_head *head = &hdev->conn_hash.list; + struct hci_conn *conn; BT_DBG("hdev %s", hdev->name); - list_for_each_entry_safe(c, n, &h->list, list) { - c->state = BT_CLOSED; - - hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM); + /* We should not traverse the list here, because hci_conn_del + * can remove extra links, which may cause the list traversal + * to hit items that have already been released. + */ + while ((conn = list_first_entry_or_null(head, + struct hci_conn, + list)) != NULL) { + conn->state = BT_CLOSED; + hci_disconn_cfm(conn, HCI_ERROR_LOCAL_HOST_TERM); /* Unlink before deleting otherwise it is possible that * hci_conn_del removes the link which may cause the list to * contain items already freed. */ - hci_conn_unlink(c); - hci_conn_del(c); + hci_conn_unlink(conn); + hci_conn_del(conn); } } -- cgit v1.2.3 From a2904d2825536aa896a149a9174d11b0958e7095 Mon Sep 17 00:00:00 2001 From: Ruihan Li Date: Wed, 3 May 2023 21:39:37 +0800 Subject: Bluetooth: Unlink CISes when LE disconnects in hci_conn_del Currently, hci_conn_del calls hci_conn_unlink for BR/EDR, (e)SCO, and CIS connections, i.e., everything except LE connections. However, if (e)SCO connections are unlinked when BR/EDR disconnects, CIS connections should also be unlinked when LE disconnects. In terms of disconnection behavior, CIS and (e)SCO connections are not too different. One peculiarity of CIS is that when CIS connections are disconnected, the CIS handle isn't deleted, as per [BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E] 7.1.6 Disconnect command: All SCO, eSCO, and CIS connections on a physical link should be disconnected before the ACL connection on the same physical connection is disconnected. If it does not, they will be implicitly disconnected as part of the ACL disconnection. ... Note: As specified in Section 7.7.5, on the Central, the handle for a CIS remains valid even after disconnection and, therefore, the Host can recreate a disconnected CIS at a later point in time using the same connection handle. Since hci_conn_link invokes both hci_conn_get and hci_conn_hold, hci_conn_unlink should perform both hci_conn_put and hci_conn_drop as well. However, currently it performs only hci_conn_put. This patch makes hci_conn_unlink call hci_conn_drop as well, which simplifies the logic in hci_conn_del a bit and may benefit future users of hci_conn_unlink. But it is noted that this change additionally implies that hci_conn_unlink can queue disc_work on conn itself, with the following call stack: hci_conn_unlink(conn) [conn->parent == NULL] -> hci_conn_unlink(child) [child->parent == conn] -> hci_conn_drop(child->parent) -> queue_delayed_work(&conn->disc_work) Queued disc_work after hci_conn_del can be spurious, so during the process of hci_conn_del, it is necessary to make the call to cancel_delayed_work(&conn->disc_work) after invoking hci_conn_unlink. Signed-off-by: Ruihan Li Co-developed-by: Luiz Augusto von Dentz Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ce588359b290..f75ef12f18f7 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1100,7 +1100,9 @@ static void hci_conn_unlink(struct hci_conn *conn) * yet at this point. Delete it now, otherwise it is * possible for it to be stuck and can't be deleted. */ - if (child->handle == HCI_CONN_HANDLE_UNSET) + if ((child->type == SCO_LINK || + child->type == ESCO_LINK) && + child->handle == HCI_CONN_HANDLE_UNSET) hci_conn_del(child); } @@ -1113,6 +1115,7 @@ static void hci_conn_unlink(struct hci_conn *conn) list_del_rcu(&conn->link->list); synchronize_rcu(); + hci_conn_drop(conn->parent); hci_conn_put(conn->parent); conn->parent = NULL; @@ -1126,12 +1129,13 @@ void hci_conn_del(struct hci_conn *conn) BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle); + hci_conn_unlink(conn); + cancel_delayed_work_sync(&conn->disc_work); cancel_delayed_work_sync(&conn->auto_accept_work); cancel_delayed_work_sync(&conn->idle_work); if (conn->type == ACL_LINK) { - hci_conn_unlink(conn); /* Unacked frames */ hdev->acl_cnt += conn->sent; } else if (conn->type == LE_LINK) { @@ -1142,13 +1146,6 @@ void hci_conn_del(struct hci_conn *conn) else hdev->acl_cnt += conn->sent; } else { - struct hci_conn *acl = conn->parent; - - if (acl) { - hci_conn_unlink(conn); - hci_conn_drop(acl); - } - /* Unacked ISO frames */ if (conn->type == ISO_LINK) { if (hdev->iso_pkts) @@ -2485,12 +2482,6 @@ void hci_conn_hash_flush(struct hci_dev *hdev) list)) != NULL) { conn->state = BT_CLOSED; hci_disconn_cfm(conn, HCI_ERROR_LOCAL_HOST_TERM); - - /* Unlink before deleting otherwise it is possible that - * hci_conn_del removes the link which may cause the list to - * contain items already freed. - */ - hci_conn_unlink(conn); hci_conn_del(conn); } } -- cgit v1.2.3 From 6ca328e985cd995dfd1d5de44046e6074f853fbb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 18 May 2023 16:03:00 -0400 Subject: sctp: fix an issue that plpmtu can never go to complete state When doing plpmtu probe, the probe size is growing every time when it receives the ACK during the Search state until the probe fails. When the failure occurs, pl.probe_high is set and it goes to the Complete state. However, if the link pmtu is huge, like 65535 in loopback_dev, the probe eventually keeps using SCTP_MAX_PLPMTU as the probe size and never fails. Because of that, pl.probe_high can not be set, and the plpmtu probe can never go to the Complete state. Fix it by setting pl.probe_high to SCTP_MAX_PLPMTU when the probe size grows to SCTP_MAX_PLPMTU in sctp_transport_pl_recv(). Also, not allow the probe size greater than SCTP_MAX_PLPMTU in the Complete state. Fixes: b87641aff9e7 ("sctp: do state transition when a probe succeeds on HB ACK recv path") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/transport.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 2f66a2006517..2abe45af98e7 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -324,9 +324,12 @@ bool sctp_transport_pl_recv(struct sctp_transport *t) t->pl.probe_size += SCTP_PL_BIG_STEP; } else if (t->pl.state == SCTP_PL_SEARCH) { if (!t->pl.probe_high) { - t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, - SCTP_MAX_PLPMTU); - return false; + if (t->pl.probe_size < SCTP_MAX_PLPMTU) { + t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP, + SCTP_MAX_PLPMTU); + return false; + } + t->pl.probe_high = SCTP_MAX_PLPMTU; } t->pl.probe_size += SCTP_PL_MIN_STEP; if (t->pl.probe_size >= t->pl.probe_high) { @@ -341,7 +344,7 @@ bool sctp_transport_pl_recv(struct sctp_transport *t) } else if (t->pl.state == SCTP_PL_COMPLETE) { /* Raise probe_size again after 30 * interval in Search Complete */ t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */ - t->pl.probe_size += SCTP_PL_MIN_STEP; + t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_MIN_STEP, SCTP_MAX_PLPMTU); } return t->pl.state == SCTP_PL_COMPLETE; -- cgit v1.2.3 From b21c7ba6d9a5532add3827a3b49f49cbc0cb9779 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 19 May 2023 13:12:50 -0400 Subject: net/handshake: Squelch allocation warning during Kunit test The "handshake_req_alloc excessive privsize" kunit test is intended to check what happens when the maximum privsize is exceeded. The WARN_ON_ONCE_GFP at mm/page_alloc.c:4744 can be disabled safely for this test. Reported-by: Linux Kernel Functional Testing Fixes: 88232ec1ec5e ("net/handshake: Add Kunit tests for the handshake consumer API") Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/168451636052.47152.9600443326570457947.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- net/handshake/handshake-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index e6adc5dec11a..6193e46ee6d9 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -102,7 +102,7 @@ struct handshake_req_alloc_test_param handshake_req_alloc_params[] = { { .desc = "handshake_req_alloc excessive privsize", .proto = &handshake_req_alloc_proto_6, - .gfp = GFP_KERNEL, + .gfp = GFP_KERNEL | __GFP_NOWARN, .expect_success = false, }, { -- cgit v1.2.3 From 18c40a1cc1d990c51381ef48cd93fdb31d5cd903 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 19 May 2023 13:08:24 -0400 Subject: net/handshake: Fix sock->file allocation sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); ^^^^ ^^^^ sock_alloc_file() calls release_sock() on error but the left hand side of the assignment dereferences "sock". This isn't the bug and I didn't report this earlier because there is an assert that it doesn't fail. net/handshake/handshake-test.c:221 handshake_req_submit_test4() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:233 handshake_req_submit_test4() warn: 'req' was already freed. net/handshake/handshake-test.c:254 handshake_req_submit_test5() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:290 handshake_req_submit_test6() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:321 handshake_req_cancel_test1() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:355 handshake_req_cancel_test2() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:367 handshake_req_cancel_test2() warn: 'req' was already freed. net/handshake/handshake-test.c:395 handshake_req_cancel_test3() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:407 handshake_req_cancel_test3() warn: 'req' was already freed. net/handshake/handshake-test.c:451 handshake_req_destroy_test1() error: dereferencing freed memory 'sock' net/handshake/handshake-test.c:463 handshake_req_destroy_test1() warn: 'req' was already freed. Reported-by: Dan Carpenter Fixes: 88232ec1ec5e ("net/handshake: Add Kunit tests for the handshake consumer API") Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/168451609436.45209.15407022385441542980.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski --- net/handshake/handshake-test.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index 6193e46ee6d9..6d37bab35c8f 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -209,6 +209,7 @@ static void handshake_req_submit_test4(struct kunit *test) { struct handshake_req *req, *result; struct socket *sock; + struct file *filp; int err; /* Arrange */ @@ -218,9 +219,10 @@ static void handshake_req_submit_test4(struct kunit *test) err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); KUNIT_ASSERT_NOT_NULL(test, sock->sk); + sock->file = filp; err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); @@ -241,6 +243,7 @@ static void handshake_req_submit_test5(struct kunit *test) struct handshake_req *req; struct handshake_net *hn; struct socket *sock; + struct file *filp; struct net *net; int saved, err; @@ -251,9 +254,10 @@ static void handshake_req_submit_test5(struct kunit *test) err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); KUNIT_ASSERT_NOT_NULL(test, sock->sk); + sock->file = filp; net = sock_net(sock->sk); hn = handshake_pernet(net); @@ -276,6 +280,7 @@ static void handshake_req_submit_test6(struct kunit *test) { struct handshake_req *req1, *req2; struct socket *sock; + struct file *filp; int err; /* Arrange */ @@ -287,9 +292,10 @@ static void handshake_req_submit_test6(struct kunit *test) err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); KUNIT_ASSERT_NOT_NULL(test, sock->sk); + sock->file = filp; /* Act */ err = handshake_req_submit(sock, req1, GFP_KERNEL); @@ -307,6 +313,7 @@ static void handshake_req_cancel_test1(struct kunit *test) { struct handshake_req *req; struct socket *sock; + struct file *filp; bool result; int err; @@ -318,8 +325,9 @@ static void handshake_req_cancel_test1(struct kunit *test) &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); + sock->file = filp; err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); @@ -340,6 +348,7 @@ static void handshake_req_cancel_test2(struct kunit *test) struct handshake_req *req, *next; struct handshake_net *hn; struct socket *sock; + struct file *filp; struct net *net; bool result; int err; @@ -352,8 +361,9 @@ static void handshake_req_cancel_test2(struct kunit *test) &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); + sock->file = filp; err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); @@ -380,6 +390,7 @@ static void handshake_req_cancel_test3(struct kunit *test) struct handshake_req *req, *next; struct handshake_net *hn; struct socket *sock; + struct file *filp; struct net *net; bool result; int err; @@ -392,8 +403,9 @@ static void handshake_req_cancel_test3(struct kunit *test) &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); + sock->file = filp; err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); @@ -436,6 +448,7 @@ static void handshake_req_destroy_test1(struct kunit *test) { struct handshake_req *req; struct socket *sock; + struct file *filp; int err; /* Arrange */ @@ -448,8 +461,9 @@ static void handshake_req_destroy_test1(struct kunit *test) &sock, 1); KUNIT_ASSERT_EQ(test, err, 0); - sock->file = sock_alloc_file(sock, O_NONBLOCK, NULL); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sock->file); + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp); + sock->file = filp; err = handshake_req_submit(sock, req, GFP_KERNEL); KUNIT_ASSERT_EQ(test, err, 0); -- cgit v1.2.3 From 3632679d9e4f879f49949bb5b050e0de553e4739 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 22 May 2023 14:08:20 +0200 Subject: ipv{4,6}/raw: fix output xfrm lookup wrt protocol With a raw socket bound to IPPROTO_RAW (ie with hdrincl enabled), the protocol field of the flow structure, build by raw_sendmsg() / rawv6_sendmsg()), is set to IPPROTO_RAW. This breaks the ipsec policy lookup when some policies are defined with a protocol in the selector. For ipv6, the sin6_port field from 'struct sockaddr_in6' could be used to specify the protocol. Just accept all values for IPPROTO_RAW socket. For ipv4, the sin_port field of 'struct sockaddr_in' could not be used without breaking backward compatibility (the value of this field was never checked). Let's add a new kind of control message, so that the userland could specify which protocol is used. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Signed-off-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20230522120820.1319391-1-nicolas.dichtel@6wind.com Signed-off-by: Paolo Abeni --- net/ipv4/ip_sockglue.c | 12 +++++++++++- net/ipv4/raw.c | 5 ++++- net/ipv6/raw.c | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index b511ff0adc0a..8e97d8d4cc9d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -317,7 +317,14 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, ipc->tos = val; ipc->priority = rt_tos2priority(ipc->tos); break; - + case IP_PROTOCOL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->protocol = val; + break; default: return -EINVAL; } @@ -1761,6 +1768,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_LOCAL_PORT_RANGE: val = inet->local_port_range.hi << 16 | inet->local_port_range.lo; break; + case IP_PROTOCOL: + val = inet_sk(sk)->inet_num; + break; default: sockopt_release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ff712bf2a98d..eadf1c9ef7e4 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -532,6 +532,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } ipcm_init_sk(&ipc, inet); + /* Keep backward compat */ + if (hdrincl) + ipc.protocol = IPPROTO_RAW; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, false); @@ -599,7 +602,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, RT_SCOPE_UNIVERSE, - hdrincl ? IPPROTO_RAW : sk->sk_protocol, + hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 7d0adb612bdd..44ee7a2e72ac 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -793,7 +793,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!proto) proto = inet->inet_num; - else if (proto != inet->inet_num) + else if (proto != inet->inet_num && + inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) -- cgit v1.2.3 From 78fa0d61d97a728d306b0c23d353c0e340756437 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:05 -0700 Subject: bpf, sockmap: Pass skb ownership through read_skb The read_skb hook calls consume_skb() now, but this means that if the recv_actor program wants to use the skb it needs to inc the ref cnt so that the consume_skb() doesn't kfree the sk_buff. This is problematic because in some error cases under memory pressure we may need to linearize the sk_buff from sk_psock_skb_ingress_enqueue(). Then we get this, skb_linearize() __pskb_pull_tail() pskb_expand_head() BUG_ON(skb_shared(skb)) Because we incremented users refcnt from sk_psock_verdict_recv() we hit the bug on with refcnt > 1 and trip it. To fix lets simply pass ownership of the sk_buff through the skb_read call. Then we can drop the consume from read_skb handlers and assume the verdict recv does any required kfree. Bug found while testing in our CI which runs in VMs that hit memory constraints rather regularly. William tested TCP read_skb handlers. [ 106.536188] ------------[ cut here ]------------ [ 106.536197] kernel BUG at net/core/skbuff.c:1693! [ 106.536479] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 106.536726] CPU: 3 PID: 1495 Comm: curl Not tainted 5.19.0-rc5 #1 [ 106.537023] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.16.0-1 04/01/2014 [ 106.537467] RIP: 0010:pskb_expand_head+0x269/0x330 [ 106.538585] RSP: 0018:ffffc90000138b68 EFLAGS: 00010202 [ 106.538839] RAX: 000000000000003f RBX: ffff8881048940e8 RCX: 0000000000000a20 [ 106.539186] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff8881048940e8 [ 106.539529] RBP: ffffc90000138be8 R08: 00000000e161fd1a R09: 0000000000000000 [ 106.539877] R10: 0000000000000018 R11: 0000000000000000 R12: ffff8881048940e8 [ 106.540222] R13: 0000000000000003 R14: 0000000000000000 R15: ffff8881048940e8 [ 106.540568] FS: 00007f277dde9f00(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000 [ 106.540954] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 106.541227] CR2: 00007f277eeede64 CR3: 000000000ad3e000 CR4: 00000000000006e0 [ 106.541569] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 106.541915] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 106.542255] Call Trace: [ 106.542383] [ 106.542487] __pskb_pull_tail+0x4b/0x3e0 [ 106.542681] skb_ensure_writable+0x85/0xa0 [ 106.542882] sk_skb_pull_data+0x18/0x20 [ 106.543084] bpf_prog_b517a65a242018b0_bpf_skskb_http_verdict+0x3a9/0x4aa9 [ 106.543536] ? migrate_disable+0x66/0x80 [ 106.543871] sk_psock_verdict_recv+0xe2/0x310 [ 106.544258] ? sk_psock_write_space+0x1f0/0x1f0 [ 106.544561] tcp_read_skb+0x7b/0x120 [ 106.544740] tcp_data_queue+0x904/0xee0 [ 106.544931] tcp_rcv_established+0x212/0x7c0 [ 106.545142] tcp_v4_do_rcv+0x174/0x2a0 [ 106.545326] tcp_v4_rcv+0xe70/0xf60 [ 106.545500] ip_protocol_deliver_rcu+0x48/0x290 [ 106.545744] ip_local_deliver_finish+0xa7/0x150 Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Reported-by: William Findlay Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-2-john.fastabend@gmail.com --- net/core/skmsg.c | 2 -- net/ipv4/tcp.c | 1 - net/ipv4/udp.c | 7 ++----- net/unix/af_unix.c | 7 ++----- net/vmw_vsock/virtio_transport_common.c | 5 +---- 5 files changed, 5 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f81883759d38..4a3dc8d27295 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1183,8 +1183,6 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) int ret = __SK_DROP; int len = skb->len; - skb_get(skb); - rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d6392c16b7a..e914e3446377 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1773,7 +1773,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); - consume_skb(skb); if (used < 0) { if (!copied) copied = used; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index aa32afd871ee..9482def1f310 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1818,7 +1818,7 @@ EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; - int err, copied; + int err; try_again: skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); @@ -1837,10 +1837,7 @@ try_again: } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); - copied = recv_actor(sk, skb); - kfree_skb(skb); - - return copied; + return recv_actor(sk, skb); } EXPORT_SYMBOL(udp_read_skb); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cc695c9f09ec..e7728b57a8c7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2553,7 +2553,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; - int err, copied; + int err; mutex_lock(&u->iolock); skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); @@ -2561,10 +2561,7 @@ static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (!skb) return err; - copied = recv_actor(sk, skb); - kfree_skb(skb); - - return copied; + return recv_actor(sk, skb); } /* diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index e4878551f140..b769fc258931 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1441,7 +1441,6 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto struct sock *sk = sk_vsock(vsk); struct sk_buff *skb; int off = 0; - int copied; int err; spin_lock_bh(&vvs->rx_lock); @@ -1454,9 +1453,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_acto if (!skb) return err; - copied = recv_actor(sk, skb); - kfree_skb(skb); - return copied; + return recv_actor(sk, skb); } EXPORT_SYMBOL_GPL(virtio_transport_read_skb); -- cgit v1.2.3 From 29173d07f79883ac94f5570294f98af3d4287382 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:06 -0700 Subject: bpf, sockmap: Convert schedule_work into delayed_work Sk_buffs are fed into sockmap verdict programs either from a strparser (when the user might want to decide how framing of skb is done by attaching another parser program) or directly through tcp_read_sock. The tcp_read_sock is the preferred method for performance when the BPF logic is a stream parser. The flow for Cilium's common use case with a stream parser is, tcp_read_sock() sk_psock_verdict_recv ret = bpf_prog_run_pin_on_cpu() sk_psock_verdict_apply(sock, skb, ret) // if system is under memory pressure or app is slow we may // need to queue skb. Do this queuing through ingress_skb and // then kick timer to wake up handler skb_queue_tail(ingress_skb, skb) schedule_work(work); The work queue is wired up to sk_psock_backlog(). This will then walk the ingress_skb skb list that holds our sk_buffs that could not be handled, but should be OK to run at some later point. However, its possible that the workqueue doing this work still hits an error when sending the skb. When this happens the skbuff is requeued on a temporary 'state' struct kept with the workqueue. This is necessary because its possible to partially send an skbuff before hitting an error and we need to know how and where to restart when the workqueue runs next. Now for the trouble, we don't rekick the workqueue. This can cause a stall where the skbuff we just cached on the state variable might never be sent. This happens when its the last packet in a flow and no further packets come along that would cause the system to kick the workqueue from that side. To fix we could do simple schedule_work(), but while under memory pressure it makes sense to back off some instead of continue to retry repeatedly. So instead to fix convert schedule_work to schedule_delayed_work and add backoff logic to reschedule from backlog queue on errors. Its not obvious though what a good backoff is so use '1'. To test we observed some flakes whil running NGINX compliance test with sockmap we attributed these failed test to this bug and subsequent issue. >From on list discussion. This commit bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock") was intended to address similar race, but had a couple cases it missed. Most obvious it only accounted for receiving traffic on the local socket so if redirecting into another socket we could still get an sk_buff stuck here. Next it missed the case where copied=0 in the recv() handler and then we wouldn't kick the scheduler. Also its sub-optimal to require userspace to kick the internal mechanisms of sockmap to wake it up and copy data to user. It results in an extra syscall and requires the app to actual handle the EAGAIN correctly. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com --- net/core/skmsg.c | 21 ++++++++++++++------- net/core/sock_map.c | 3 ++- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4a3dc8d27295..0a9ee2acac0b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -482,7 +482,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } out: if (psock->work_state.skb && copied > 0) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -640,7 +640,8 @@ static void sk_psock_skb_state(struct sk_psock *psock, static void sk_psock_backlog(struct work_struct *work) { - struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; bool ingress; @@ -680,6 +681,12 @@ start: if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, skb, len, off); + + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ @@ -734,7 +741,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); - INIT_WORK(&psock->work, sk_psock_backlog); + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); @@ -823,7 +830,7 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); @@ -938,7 +945,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) } skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); + schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } @@ -1018,7 +1025,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); @@ -1049,7 +1056,7 @@ static void sk_psock_write_space(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 7c189c2e2fbf..00afb66cd095 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout) rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } + /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ -- cgit v1.2.3 From bce22552f92ea7c577f49839b8e8f7d29afaf880 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:07 -0700 Subject: bpf, sockmap: Reschedule is now done through backlog Now that the backlog manages the reschedule() logic correctly we can drop the partial fix to reschedule from recvmsg hook. Rescheduling on recvmsg hook was added to address a corner case where we still had data in the backlog state but had nothing to kick it and reschedule the backlog worker to run and finish copying data out of the state. This had a couple limitations, first it required user space to kick it introducing an unnecessary EBUSY and retry. Second it only handled the ingress case and egress redirects would still be hung. With the correct fix, pushing the reschedule logic down to where the enomem error occurs we can drop this fix. Fixes: bec217197b412 ("skmsg: Schedule psock work if the cached skb exists on the psock") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-4-john.fastabend@gmail.com --- net/core/skmsg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 0a9ee2acac0b..76ff15f8bb06 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -481,8 +481,6 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, msg_rx = sk_psock_peek_msg(psock); } out: - if (psock->work_state.skb && copied > 0) - schedule_delayed_work(&psock->work, 0); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); -- cgit v1.2.3 From 405df89dd52cbcd69a3cd7d9a10d64de38f854b2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:08 -0700 Subject: bpf, sockmap: Improved check for empty queue We noticed some rare sk_buffs were stepping past the queue when system was under memory pressure. The general theory is to skip enqueueing sk_buffs when its not necessary which is the normal case with a system that is properly provisioned for the task, no memory pressure and enough cpu assigned. But, if we can't allocate memory due to an ENOMEM error when enqueueing the sk_buff into the sockmap receive queue we push it onto a delayed workqueue to retry later. When a new sk_buff is received we then check if that queue is empty. However, there is a problem with simply checking the queue length. When a sk_buff is being processed from the ingress queue but not yet on the sockmap msg receive queue its possible to also recv a sk_buff through normal path. It will check the ingress queue which is zero and then skip ahead of the pkt being processed. Previously we used sock lock from both contexts which made the problem harder to hit, but not impossible. To fix instead of popping the skb from the queue entirely we peek the skb from the queue and do the copy there. This ensures checks to the queue length are non-zero while skb is being processed. Then finally when the entire skb has been copied to user space queue or another socket we pop it off the queue. This way the queue length check allows bypassing the queue only after the list has been completely processed. To reproduce issue we run NGINX compliance test with sockmap running and observe some flakes in our testing that we attributed to this issue. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-5-john.fastabend@gmail.com --- net/core/skmsg.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 76ff15f8bb06..bcd45a99a3db 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -622,16 +622,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, - struct sk_buff *skb, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - state->skb = skb; state->len = len; state->off = off; - } else { - sock_drop(psock->sk, skb); } spin_unlock_bh(&psock->ingress_lock); } @@ -642,23 +638,17 @@ static void sk_psock_backlog(struct work_struct *work) struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; + u32 len = 0, off = 0; bool ingress; - u32 len, off; int ret; mutex_lock(&psock->work_mutex); - if (unlikely(state->skb)) { - spin_lock_bh(&psock->ingress_lock); - skb = state->skb; + if (unlikely(state->len)) { len = state->len; off = state->off; - state->skb = NULL; - spin_unlock_bh(&psock->ingress_lock); } - if (skb) - goto start; - while ((skb = skb_dequeue(&psock->ingress_skb))) { + while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { @@ -667,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } -start: ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -677,8 +666,7 @@ start: len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { - sk_psock_skb_state(psock, state, skb, - len, off); + sk_psock_skb_state(psock, state, len, off); /* Delay slightly to prioritize any * other work that might be here. @@ -690,15 +678,16 @@ start: /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - sock_drop(psock->sk, skb); goto end; } off += ret; len -= ret; } while (len); - if (!ingress) + skb = skb_dequeue(&psock->ingress_skb); + if (!ingress) { kfree_skb(skb); + } } end: mutex_unlock(&psock->work_mutex); @@ -791,11 +780,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } - kfree_skb(psock->work_state.skb); - /* We null the skb here to ensure that calls to sk_psock_backlog - * do not pick up the free'd skb. - */ - psock->work_state.skb = NULL; __sk_psock_purge_ingress_msg(psock); } @@ -814,7 +798,6 @@ void sk_psock_stop(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); - __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); } @@ -829,6 +812,7 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); cancel_delayed_work_sync(&psock->work); + __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); -- cgit v1.2.3 From 901546fd8f9ca4b5c481ce00928ab425ce9aacc0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:09 -0700 Subject: bpf, sockmap: Handle fin correctly The sockmap code is returning EAGAIN after a FIN packet is received and no more data is on the receive queue. Correct behavior is to return 0 to the user and the user can then close the socket. The EAGAIN causes many apps to retry which masks the problem. Eventually the socket is evicted from the sockmap because its released from sockmap sock free handling. The issue creates a delay and can cause some errors on application side. To fix this check on sk_msg_recvmsg side if length is zero and FIN flag is set then set return to zero. A selftest will be added to check this condition. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-6-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 2e9547467edb..73c13642d47f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -174,6 +174,24 @@ static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, return ret; } +static bool is_next_msg_fin(struct sk_psock *psock) +{ + struct scatterlist *sge; + struct sk_msg *msg_rx; + int i; + + msg_rx = sk_psock_peek_msg(psock); + i = msg_rx->sg.start; + sge = sk_msg_elem(msg_rx, i); + if (!sge->length) { + struct sk_buff *skb = msg_rx->skb; + + if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + return true; + } + return false; +} + static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, @@ -196,6 +214,19 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, lock_sock(sk); msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + /* The typical case for EFAULT is the socket was gracefully + * shutdown with a FIN pkt. So check here the other case is + * some error on copy_page_to_iter which would be unexpected. + * On fin return correct return code to zero. + */ + if (copied == -EFAULT) { + bool is_fin = is_next_msg_fin(psock); + + if (is_fin) { + copied = 0; + goto out; + } + } if (!copied) { long timeo; int data; -- cgit v1.2.3 From ea444185a6bf7da4dd0df1598ee953e4f7174858 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:10 -0700 Subject: bpf, sockmap: TCP data stall on recv before accept A common mechanism to put a TCP socket into the sockmap is to hook the BPF_SOCK_OPS_{ACTIVE_PASSIVE}_ESTABLISHED_CB event with a BPF program that can map the socket info to the correct BPF verdict parser. When the user adds the socket to the map the psock is created and the new ops are assigned to ensure the verdict program will 'see' the sk_buffs as they arrive. Part of this process hooks the sk_data_ready op with a BPF specific handler to wake up the BPF verdict program when data is ready to read. The logic is simple enough (posted here for easy reading) static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; sock->ops->read_skb(sk, sk_psock_verdict_recv); } The oversight here is sk->sk_socket is not assigned until the application accepts() the new socket. However, its entirely ok for the peer application to do a connect() followed immediately by sends. The socket on the receiver is sitting on the backlog queue of the listening socket until its accepted and the data is queued up. If the peer never accepts the socket or is slow it will eventually hit data limits and rate limit the session. But, important for BPF sockmap hooks when this data is received TCP stack does the sk_data_ready() call but the read_skb() for this data is never called because sk_socket is missing. The data sits on the sk_receive_queue. Then once the socket is accepted if we never receive more data from the peer there will be no further sk_data_ready calls and all the data is still on the sk_receive_queue(). Then user calls recvmsg after accept() and for TCP sockets in sockmap we use the tcp_bpf_recvmsg_parser() handler. The handler checks for data in the sk_msg ingress queue expecting that the BPF program has already run from the sk_data_ready hook and enqueued the data as needed. So we are stuck. To fix do an unlikely check in recvmsg handler for data on the sk_receive_queue and if it exists wake up data_ready. We have the sock locked in both read_skb and recvmsg so should avoid having multiple runners. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-7-john.fastabend@gmail.com --- net/ipv4/tcp_bpf.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 73c13642d47f..01dd76be1a58 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -212,6 +212,26 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, return tcp_recvmsg(sk, msg, len, flags, addr_len); lock_sock(sk); + + /* We may have received data on the sk_receive_queue pre-accept and + * then we can not use read_skb in this context because we haven't + * assigned a sk_socket yet so have no link to the ops. The work-around + * is to check the sk_receive_queue and in these cases read skbs off + * queue again. The read_skb hook is not running at this point because + * of lock_sock so we avoid having multiple runners in read_skb. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + tcp_data_ready(sk); + /* This handles the ENOMEM errors if we both receive data + * pre accept and are already under memory pressure. At least + * let user know to retry. + */ + if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { + copied = -EAGAIN; + goto out; + } + } + msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); /* The typical case for EFAULT is the socket was gracefully -- cgit v1.2.3 From 6df7f764cd3cf5a03a4a47b23be47e57e41fcd85 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:11 -0700 Subject: bpf, sockmap: Wake up polling after data copy When TCP stack has data ready to read sk_data_ready() is called. Sockmap overwrites this with its own handler to call into BPF verdict program. But, the original TCP socket had sock_def_readable that would additionally wake up any user space waiters with sk_wake_async(). Sockmap saved the callback when the socket was created so call the saved data ready callback and then we can wake up any epoll() logic waiting on the read. Note we call on 'copied >= 0' to account for returning 0 when a FIN is received because we need to wake up user for this as well so they can do the recvmsg() -> 0 and detect the shutdown. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-8-john.fastabend@gmail.com --- net/core/skmsg.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bcd45a99a3db..08be5f409fb8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1199,12 +1199,21 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; + int copied; trace_sk_data_ready(sk); if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; - sock->ops->read_skb(sk, sk_psock_verdict_recv); + copied = sock->ops->read_skb(sk, sk_psock_verdict_recv); + if (copied >= 0) { + struct sk_psock *psock; + + rcu_read_lock(); + psock = sk_psock(sk); + psock->saved_data_ready(sk); + rcu_read_unlock(); + } } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) -- cgit v1.2.3 From e5c6de5fa025882babf89cecbed80acf49b987fa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:12 -0700 Subject: bpf, sockmap: Incorrectly handling copied_seq The read_skb() logic is incrementing the tcp->copied_seq which is used for among other things calculating how many outstanding bytes can be read by the application. This results in application errors, if the application does an ioctl(FIONREAD) we return zero because this is calculated from the copied_seq value. To fix this we move tcp->copied_seq accounting into the recv handler so that we update these when the recvmsg() hook is called and data is in fact copied into user buffers. This gives an accurate FIONREAD value as expected and improves ACK handling. Before we were calling the tcp_rcv_space_adjust() which would update 'number of bytes copied to user in last RTT' which is wrong for programs returning SK_PASS. The bytes are only copied to the user when recvmsg is handled. Doing the fix for recvmsg is straightforward, but fixing redirect and SK_DROP pkts is a bit tricker. Build a tcp_psock_eat() helper and then call this from skmsg handlers. This fixes another issue where a broken socket with a BPF program doing a resubmit could hang the receiver. This happened because although read_skb() consumed the skb through sock_drop() it did not update the copied_seq. Now if a single reccv socket is redirecting to many sockets (for example for lb) the receiver sk will be hung even though we might expect it to continue. The hang comes from not updating the copied_seq numbers and memory pressure resulting from that. We have a slight layer problem of calling tcp_eat_skb even if its not a TCP socket. To fix we could refactor and create per type receiver handlers. I decided this is more work than we want in the fix and we already have some small tweaks depending on caller that use the helper skb_bpf_strparser(). So we extend that a bit and always set the strparser bit when it is in use and then we can gate the seq_copied updates on this. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-9-john.fastabend@gmail.com --- net/core/skmsg.c | 15 +++++++-------- net/ipv4/tcp.c | 10 +--------- net/ipv4/tcp_bpf.c | 28 +++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 08be5f409fb8..a9060e1f0e43 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -979,10 +979,8 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - skb_bpf_redirect_clear(skb); + !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) goto out_free; - } skb_bpf_set_ingress(skb); @@ -1011,18 +1009,19 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, err = 0; } spin_unlock_bh(&psock->ingress_lock); - if (err < 0) { - skb_bpf_redirect_clear(skb); + if (err < 0) goto out_free; - } } break; case __SK_REDIRECT: + tcp_eat_skb(psock->sk, skb); err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: out_free: + skb_bpf_redirect_clear(skb); + tcp_eat_skb(psock->sk, skb); sock_drop(psock->sk, skb); } @@ -1067,8 +1066,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); - if (ret == SK_PASS) - skb_bpf_set_strparser(skb); + skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -1176,6 +1174,7 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; + tcp_eat_skb(sk, skb); sock_drop(sk, skb); goto out; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e914e3446377..a60f6f4e7cd9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1571,7 +1571,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) * calculation of whether or not we must ACK for the sake of * a window update. */ -static void __tcp_cleanup_rbuf(struct sock *sk, int copied) +void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; @@ -1786,14 +1786,6 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) break; } } - WRITE_ONCE(tp->copied_seq, seq); - - tcp_rcv_space_adjust(sk); - - /* Clean up data we have read: This will do ACK frames. */ - if (copied > 0) - __tcp_cleanup_rbuf(sk, copied); - return copied; } EXPORT_SYMBOL(tcp_read_skb); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 01dd76be1a58..5f93918c063c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -11,6 +11,24 @@ #include #include +void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tcp; + int copied; + + if (!skb || !skb->len || !sk_is_tcp(sk)) + return; + + if (skb_bpf_strparser(skb)) + return; + + tcp = tcp_sk(sk); + copied = tcp->copied_seq + skb->len; + WRITE_ONCE(tcp->copied_seq, copied); + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, skb->len); +} + static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -198,8 +216,10 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk, int flags, int *addr_len) { + struct tcp_sock *tcp = tcp_sk(sk); + u32 seq = tcp->copied_seq; struct sk_psock *psock; - int copied; + int copied = 0; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -244,9 +264,11 @@ msg_bytes_ready: if (is_fin) { copied = 0; + seq++; goto out; } } + seq += copied; if (!copied) { long timeo; int data; @@ -284,6 +306,10 @@ msg_bytes_ready: copied = -EAGAIN; } out: + WRITE_ONCE(tcp->copied_seq, seq); + tcp_rcv_space_adjust(sk); + if (copied > 0) + __tcp_cleanup_rbuf(sk, copied); release_sock(sk); sk_psock_put(sk, psock); return copied; -- cgit v1.2.3 From 368d3cb406cdd074d1df2ad9ec06d1bfcb664882 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 22 May 2023 11:17:14 +0800 Subject: page_pool: fix inconsistency for page_pool_ring_[un]lock() page_pool_ring_[un]lock() use in_softirq() to decide which spin lock variant to use, and when they are called in the context with in_softirq() being false, spin_lock_bh() is called in page_pool_ring_lock() while spin_unlock() is called in page_pool_ring_unlock(), because spin_lock_bh() has disabled the softirq in page_pool_ring_lock(), which causes inconsistency for spin lock pair calling. This patch fixes it by returning in_softirq state from page_pool_producer_lock(), and use it to decide which spin lock variant to use in page_pool_producer_unlock(). As pool->ring has both producer and consumer lock, so rename it to page_pool_producer_[un]lock() to reflect the actual usage. Also move them to page_pool.c as they are only used there, and remove the 'inline' as the compiler may have better idea to do inlining or not. Fixes: 7886244736a4 ("net: page_pool: Add bulk support for ptr_ring") Signed-off-by: Yunsheng Lin Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Link: https://lore.kernel.org/r/20230522031714.5089-1-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- net/core/page_pool.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e212e9d7edcb..a3e12a61d456 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -134,6 +134,29 @@ EXPORT_SYMBOL(page_pool_ethtool_stats_get); #define recycle_stat_add(pool, __stat, val) #endif +static bool page_pool_producer_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + bool in_softirq = in_softirq(); + + if (in_softirq) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); + + return in_softirq; +} + +static void page_pool_producer_unlock(struct page_pool *pool, + bool in_softirq) + __releases(&pool->ring.producer_lock) +{ + if (in_softirq) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -617,6 +640,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, int count) { int i, bulk_len = 0; + bool in_softirq; for (i = 0; i < count; i++) { struct page *page = virt_to_head_page(data[i]); @@ -635,7 +659,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, return; /* Bulk producer into ptr_ring page_pool cache */ - page_pool_ring_lock(pool); + in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { if (__ptr_ring_produce(&pool->ring, data[i])) { /* ring full */ @@ -644,7 +668,7 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, } } recycle_stat_add(pool, ring, i); - page_pool_ring_unlock(pool); + page_pool_producer_unlock(pool, in_softirq); /* Hopefully all pages was return into ptr_ring */ if (likely(i == bulk_len)) -- cgit v1.2.3 From 8a02fb71d7192ff1a9a47c9d937624966c6e09af Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Mon, 22 May 2023 17:30:20 +0200 Subject: net: fix skb leak in __skb_tstamp_tx() Commit 50749f2dd685 ("tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp.") added a call to skb_orphan_frags_rx() to fix leaks with zerocopy skbs. But it ended up adding a leak of its own. When skb_orphan_frags_rx() fails, the function just returns, leaking the skb it just cloned. Free it before returning. This bug was discovered and resolved using Coverity Static Analysis Security Testing (SAST) by Synopsys, Inc. Fixes: 50749f2dd685 ("tcp/udp: Fix memleaks of sk and zerocopy skbs with TX timestamp.") Signed-off-by: Pratyush Yadav Reviewed-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/20230522153020.32422-1-ptyadav@amazon.de Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 515ec5cdc79c..cea28d30abb5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5224,8 +5224,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, } else { skb = skb_clone(orig_skb, GFP_ATOMIC); - if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) + if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { + kfree_skb(skb); return; + } } if (!skb) return; -- cgit v1.2.3 From 878ecb0897f4737a4c9401f3523fd49589025671 Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Tue, 23 May 2023 08:29:44 +0000 Subject: ipv6: Fix out-of-bounds access in ipv6_find_tlv() optlen is fetched without checking whether there is more than one byte to parse. It can lead to out-of-bounds access. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE. Fixes: c61a40432509 ("[IPV6]: Find option offset by type.") Signed-off-by: Gavrilov Ilia Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/exthdrs_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index da46c4284676..49e31e4ae7b7 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -143,6 +143,8 @@ int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) optlen = 1; break; default: + if (len < 2) + goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; -- cgit v1.2.3 From a095326e2c0f33743ce8e887d5b90edf3f36cced Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:47:09 -0400 Subject: net/handshake: Remove unneeded check from handshake_dup() handshake_req_submit() now verifies that the socket has a file. Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- net/handshake/netlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 35c9c445e0b8..7ec8a76c3c8a 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -99,9 +99,6 @@ static int handshake_dup(struct socket *sock) struct file *file; int newfd; - if (!sock->file) - return -EBADF; - file = get_file(sock->file); newfd = get_unused_fd_flags(O_CLOEXEC); if (newfd < 0) { -- cgit v1.2.3 From 7ea9c1ec66bc099b0bfba961a8a46dfe25d7d8e4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:47:40 -0400 Subject: net/handshake: Fix handshake_dup() ref counting If get_unused_fd_flags() fails, we ended up calling fput(sock->file) twice. Reported-by: Dan Carpenter Suggested-by: Paolo Abeni Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- net/handshake/netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 7ec8a76c3c8a..f5dc170689d9 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -139,15 +139,16 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) goto out_complete; } err = req->hr_proto->hp_accept(req, info, fd); - if (err) + if (err) { + fput(sock->file); goto out_complete; + } trace_handshake_cmd_accept(net, req, req->hr_sk, fd); return 0; out_complete: handshake_complete(req, -EIO, NULL); - fput(sock->file); out_status: trace_handshake_cmd_accept_err(net, req, NULL, err); return err; -- cgit v1.2.3 From 7afc6d0a107ffbd448c96eb2458b9e64a5af7860 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:48:13 -0400 Subject: net/handshake: Fix uninitialized local variable trace_handshake_cmd_done_err() simply records the pointer in @req, so initializing it to NULL is sufficient and safe. Reported-by: Dan Carpenter Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- net/handshake/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index f5dc170689d9..16a4bde648ba 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -157,8 +157,8 @@ out_status: int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); + struct handshake_req *req = NULL; struct socket *sock = NULL; - struct handshake_req *req; int fd, status, err; if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD)) -- cgit v1.2.3 From fc490880e39d86c65ab2bcbd357af1950fa55e48 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:48:45 -0400 Subject: net/handshake: handshake_genl_notify() shouldn't ignore @flags Reported-by: Dan Carpenter Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- net/handshake/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 16a4bde648ba..1086653e1fad 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -48,7 +48,7 @@ int handshake_genl_notify(struct net *net, const struct handshake_proto *proto, proto->hp_handler_class)) return -ESRCH; - msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, flags); if (!msg) return -ENOMEM; -- cgit v1.2.3 From 1ce77c998f0415d7d9d91cb9bd7665e25c8f75f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:49:17 -0400 Subject: net/handshake: Unpin sock->file if a handshake is cancelled If user space never calls DONE, sock->file's reference count remains elevated. Enable sock->file to be freed eventually in this case. Reported-by: Jakub Kacinski Fixes: 3b3009ea8abb ("net/handshake: Create a NETLINK service for handling handshake requests") Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- net/handshake/handshake.h | 1 + net/handshake/request.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h index 4dac965c99df..8aeaadca844f 100644 --- a/net/handshake/handshake.h +++ b/net/handshake/handshake.h @@ -31,6 +31,7 @@ struct handshake_req { struct list_head hr_list; struct rhash_head hr_rhash; unsigned long hr_flags; + struct file *hr_file; const struct handshake_proto *hr_proto; struct sock *hr_sk; void (*hr_odestruct)(struct sock *sk); diff --git a/net/handshake/request.c b/net/handshake/request.c index 94d5cef3e048..d78d41abb3d9 100644 --- a/net/handshake/request.c +++ b/net/handshake/request.c @@ -239,6 +239,7 @@ int handshake_req_submit(struct socket *sock, struct handshake_req *req, } req->hr_odestruct = req->hr_sk->sk_destruct; req->hr_sk->sk_destruct = handshake_sk_destruct; + req->hr_file = sock->file; ret = -EOPNOTSUPP; net = sock_net(req->hr_sk); @@ -334,6 +335,9 @@ bool handshake_req_cancel(struct sock *sk) return false; } + /* Request accepted and waiting for DONE */ + fput(req->hr_file); + out_true: trace_handshake_cancel(net, req, sk); -- cgit v1.2.3 From 26fb5480a27d34975cc2b680b77af189620dd740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 May 2023 11:49:50 -0400 Subject: net/handshake: Enable the SNI extension to work properly Enable the upper layer protocol to specify the SNI peername. This avoids the need for tlshd to use a DNS lookup, which can return a hostname that doesn't match the incoming certificate's SubjectName. Fixes: 2fd5532044a8 ("net/handshake: Add a kernel API for requesting a TLSv1.3 handshake") Reviewed-by: Simon Horman Signed-off-by: Chuck Lever Signed-off-by: Jakub Kicinski --- net/handshake/tlshd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index fcbeb63b4eb1..b735f5cced2f 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -31,6 +31,7 @@ struct tls_handshake_req { int th_type; unsigned int th_timeout_ms; int th_auth_mode; + const char *th_peername; key_serial_t th_keyring; key_serial_t th_certificate; key_serial_t th_privkey; @@ -48,6 +49,7 @@ tls_handshake_req_init(struct handshake_req *req, treq->th_timeout_ms = args->ta_timeout_ms; treq->th_consumer_done = args->ta_done; treq->th_consumer_data = args->ta_data; + treq->th_peername = args->ta_peername; treq->th_keyring = args->ta_keyring; treq->th_num_peerids = 0; treq->th_certificate = TLS_NO_CERT; @@ -214,6 +216,12 @@ static int tls_handshake_accept(struct handshake_req *req, ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type); if (ret < 0) goto out_cancel; + if (treq->th_peername) { + ret = nla_put_string(msg, HANDSHAKE_A_ACCEPT_PEERNAME, + treq->th_peername); + if (ret < 0) + goto out_cancel; + } if (treq->th_timeout_ms) { ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms); if (ret < 0) -- cgit v1.2.3 From ad42a35bdfc6d3c0fc4cb4027d7b2757ce665665 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 May 2023 09:33:05 -0700 Subject: udplite: Fix NULL pointer dereference in __sk_mem_raise_allocated(). syzbot reported [0] a null-ptr-deref in sk_get_rmem0() while using IPPROTO_UDPLITE (0x88): 14:25:52 executing program 1: r0 = socket$inet6(0xa, 0x80002, 0x88) We had a similar report [1] for probably sk_memory_allocated_add() in __sk_mem_raise_allocated(), and commit c915fe13cbaa ("udplite: fix NULL pointer dereference") fixed it by setting .memory_allocated for udplite_prot and udplitev6_prot. To fix the variant, we need to set either .sysctl_wmem_offset or .sysctl_rmem. Now UDP and UDPLITE share the same value for .memory_allocated, so we use the same .sysctl_wmem_offset for UDP and UDPLITE. [0]: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 0 PID: 6829 Comm: syz-executor.1 Not tainted 6.4.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/28/2023 RIP: 0010:sk_get_rmem0 include/net/sock.h:2907 [inline] RIP: 0010:__sk_mem_raise_allocated+0x806/0x17a0 net/core/sock.c:3006 Code: c1 ea 03 80 3c 02 00 0f 85 23 0f 00 00 48 8b 44 24 08 48 8b 98 38 01 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 da 48 c1 ea 03 <0f> b6 14 02 48 89 d8 83 e0 07 83 c0 03 38 d0 0f 8d 6f 0a 00 00 8b RSP: 0018:ffffc90005d7f450 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc90004d92000 RDX: 0000000000000000 RSI: ffffffff88066482 RDI: ffffffff8e2ccbb8 RBP: ffff8880173f7000 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000030000 R13: 0000000000000001 R14: 0000000000000340 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880b9800000(0063) knlGS:00000000f7f1cb40 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 000000002e82f000 CR3: 0000000034ff0000 CR4: 00000000003506f0 Call Trace: __sk_mem_schedule+0x6c/0xe0 net/core/sock.c:3077 udp_rmem_schedule net/ipv4/udp.c:1539 [inline] __udp_enqueue_schedule_skb+0x776/0xb30 net/ipv4/udp.c:1581 __udpv6_queue_rcv_skb net/ipv6/udp.c:666 [inline] udpv6_queue_rcv_one_skb+0xc39/0x16c0 net/ipv6/udp.c:775 udpv6_queue_rcv_skb+0x194/0xa10 net/ipv6/udp.c:793 __udp6_lib_mcast_deliver net/ipv6/udp.c:906 [inline] __udp6_lib_rcv+0x1bda/0x2bd0 net/ipv6/udp.c:1013 ip6_protocol_deliver_rcu+0x2e7/0x1250 net/ipv6/ip6_input.c:437 ip6_input_finish+0x150/0x2f0 net/ipv6/ip6_input.c:482 NF_HOOK include/linux/netfilter.h:303 [inline] NF_HOOK include/linux/netfilter.h:297 [inline] ip6_input+0xa0/0xd0 net/ipv6/ip6_input.c:491 ip6_mc_input+0x40b/0xf50 net/ipv6/ip6_input.c:585 dst_input include/net/dst.h:468 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] NF_HOOK include/linux/netfilter.h:297 [inline] ipv6_rcv+0x250/0x380 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x114/0x180 net/core/dev.c:5491 __netif_receive_skb+0x1f/0x1c0 net/core/dev.c:5605 netif_receive_skb_internal net/core/dev.c:5691 [inline] netif_receive_skb+0x133/0x7a0 net/core/dev.c:5750 tun_rx_batched+0x4b3/0x7a0 drivers/net/tun.c:1553 tun_get_user+0x2452/0x39c0 drivers/net/tun.c:1989 tun_chr_write_iter+0xdf/0x200 drivers/net/tun.c:2035 call_write_iter include/linux/fs.h:1868 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x945/0xd50 fs/read_write.c:584 ksys_write+0x12b/0x250 fs/read_write.c:637 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline] __do_fast_syscall_32+0x65/0xf0 arch/x86/entry/common.c:178 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203 entry_SYSENTER_compat_after_hwframe+0x70/0x82 RIP: 0023:0xf7f21579 Code: b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00 RSP: 002b:00000000f7f1c590 EFLAGS: 00000282 ORIG_RAX: 0000000000000004 RAX: ffffffffffffffda RBX: 00000000000000c8 RCX: 0000000020000040 RDX: 0000000000000083 RSI: 00000000f734e000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000296 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: Link: https://lore.kernel.org/netdev/CANaxB-yCk8hhP68L4Q2nFOJht8sqgXGGQO2AftpHs0u1xyGG5A@mail.gmail.com/ [1] Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema") Reported-by: syzbot+444ca0907e96f7c5e48b@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=444ca0907e96f7c5e48b Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230523163305.66466-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/udplite.c | 2 ++ net/ipv6/udplite.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index e0c9cc39b81e..56d94d23b9e0 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -64,6 +64,8 @@ struct proto udplite_prot = { .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = &udplite_table, }; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 67eaf3ca14ce..3bab0cc13697 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -60,6 +60,8 @@ struct proto udplitev6_prot = { .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), .h.udp_table = &udplite_table, }; -- cgit v1.2.3