From 9c9cb23e00ddf45679b21b4dacc11d1ae7961ebe Mon Sep 17 00:00:00 2001 From: Xin Xiong Date: Sun, 24 Jul 2022 17:55:58 +0800 Subject: xfrm: fix refcount leak in __xfrm_policy_check() The issue happens on an error path in __xfrm_policy_check(). When the fetching process of the object `pols[1]` fails, the function simply returns 0, forgetting to decrement the reference count of `pols[0]`, which is incremented earlier by either xfrm_sk_policy_lookup() or xfrm_policy_lookup(). This may result in memory leaks. Fix it by decreasing the reference count of `pols[0]` in that path. Fixes: 134b0fc544ba ("IPsec: propagate security module errors up from flow_cache_lookup") Signed-off-by: Xin Xiong Signed-off-by: Xin Tan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f1a0bab920a5..4f8bbb825abc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3599,6 +3599,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); + xfrm_pol_put(pols[0]); return 0; } pols[1]->curlft.use_time = ktime_get_real_seconds(); -- cgit v1.2.3 From 717ada9f10f2de8c4f4d72ad045f3b67a7ced715 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Wed, 27 Jul 2022 17:38:35 +0200 Subject: Revert "xfrm: update SA curlft.use_time" This reverts commit af734a26a1a95a9fda51f2abb0c22a7efcafd5ca. The abvoce commit is a regression according RFC 2367. A better fix would be use x->lastused. Which will be propsed later. according to RFC 2367 use_time == sadb_lifetime_usetime. "sadb_lifetime_usetime For CURRENT, the time, in seconds, when association was first used. For HARD and SOFT, the number of seconds after the first use of the association until it expires." Fixes: af734a26a1a9 ("xfrm: update SA curlft.use_time") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 1 - net/xfrm/xfrm_output.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 144238a50f3d..70a8c36f0ba6 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -669,7 +669,6 @@ resume: x->curlft.bytes += skb->len; x->curlft.packets++; - x->curlft.use_time = ktime_get_real_seconds(); spin_unlock(&x->lock); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 555ab35cd119..9a5e79a38c67 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -534,7 +534,6 @@ static int xfrm_output_one(struct sk_buff *skb, int err) x->curlft.bytes += skb->len; x->curlft.packets++; - x->curlft.use_time = ktime_get_real_seconds(); spin_unlock_bh(&x->lock); -- cgit v1.2.3 From 6aa811acdb76facca0b705f4e4c1d948ccb6af8b Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Wed, 27 Jul 2022 17:41:22 +0200 Subject: xfrm: clone missing x->lastused in xfrm_do_migrate x->lastused was not cloned in xfrm_do_migrate. Add it to clone during migrate. Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ccfb172eb5b8..11d89af9cb55 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1592,6 +1592,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->replay = orig->replay; x->preplay = orig->preplay; x->mapping_maxage = orig->mapping_maxage; + x->lastused = orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; -- cgit v1.2.3 From ba953a9d89a00c078b85f4b190bc1dde66fe16b5 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 4 Aug 2022 18:03:46 +0800 Subject: af_key: Do not call xfrm_probe_algs in parallel When namespace support was added to xfrm/afkey, it caused the previously single-threaded call to xfrm_probe_algs to become multi-threaded. This is buggy and needs to be fixed with a mutex. Reported-by: Abhishek Shah Fixes: 283bc9f35bbb ("xfrm: Namespacify xfrm state/policy locks") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- net/key/af_key.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index fb16d7c4e1b8..20e73643b9c8 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1697,9 +1697,12 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad pfk->registered |= (1<sadb_msg_satype); } + mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); + mutex_unlock(&pfkey_mutex); + if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<sadb_msg_satype); -- cgit v1.2.3 From 58ca14ed98c87cfe0d1408cc65a9745d9e9b7a56 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 12 Aug 2022 13:32:59 +0200 Subject: xsk: Fix corrupted packets for XDP_SHARED_UMEM Fix an issue in XDP_SHARED_UMEM mode together with aligned mode where packets are corrupted for the second and any further sockets bound to the same umem. In other words, this does not affect the first socket bound to the umem. The culprit for this bug is that the initialization of the DMA addresses for the pre-populated xsk buffer pool entries was not performed for any socket but the first one bound to the umem. Only the linear array of DMA addresses was populated. Fix this by populating the DMA addresses in the xsk buffer pool for every socket bound to the same umem. Fixes: 94033cd8e73b8 ("xsk: Optimize for aligned case") Reported-by: Alasdair McWilliam Reported-by: Intrusion Shield Team Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Tested-by: Alasdair McWilliam Acked-by: Maciej Fijalkowski Link: https://lore.kernel.org/xdp-newbies/6205E10C-292E-4995-9D10-409649354226@outlook.com/ Link: https://lore.kernel.org/bpf/20220812113259.531-1-magnus.karlsson@gmail.com --- net/xdp/xsk_buff_pool.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index f70112176b7c..a71a8c6edf55 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -379,6 +379,16 @@ static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map) static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map) { + if (!pool->unaligned) { + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + } + } + pool->dma_pages = kvcalloc(dma_map->dma_pages_cnt, sizeof(*pool->dma_pages), GFP_KERNEL); if (!pool->dma_pages) return -ENOMEM; @@ -428,12 +438,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (pool->unaligned) xp_check_dma_contiguity(dma_map); - else - for (i = 0; i < pool->heads_cnt; i++) { - struct xdp_buff_xsk *xskb = &pool->heads[i]; - - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); - } err = xp_init_dma_info(pool, dma_map); if (err) { -- cgit v1.2.3 From 17ecd4a4db4783392edd4944f5e8268205083f70 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 16 Aug 2022 18:30:50 +0300 Subject: xfrm: policy: fix metadata dst->dev xmit null pointer dereference When we try to transmit an skb with metadata_dst attached (i.e. dst->dev == NULL) through xfrm interface we can hit a null pointer dereference[1] in xfrmi_xmit2() -> xfrm_lookup_with_ifid() due to the check for a loopback skb device when there's no policy which dereferences dst->dev unconditionally. Not having dst->dev can be interepreted as it not being a loopback device, so just add a check for a null dst_orig->dev. With this fix xfrm interface's Tx error counters go up as usual. [1] net-next calltrace captured via netconsole: BUG: kernel NULL pointer dereference, address: 00000000000000c0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 1 PID: 7231 Comm: ping Kdump: loaded Not tainted 5.19.0+ #24 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-1.fc36 04/01/2014 RIP: 0010:xfrm_lookup_with_ifid+0x5eb/0xa60 Code: 8d 74 24 38 e8 26 a4 37 00 48 89 c1 e9 12 fc ff ff 49 63 ed 41 83 fd be 0f 85 be 01 00 00 41 be ff ff ff ff 45 31 ed 48 8b 03 80 c0 00 00 00 08 75 0f 41 80 bc 24 19 0d 00 00 01 0f 84 1e 02 RSP: 0018:ffffb0db82c679f0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffd0db7fcad430 RCX: ffffb0db82c67a10 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb0db82c67a80 RBP: ffffb0db82c67a80 R08: ffffb0db82c67a14 R09: 0000000000000000 R10: 0000000000000000 R11: ffff8fa449667dc8 R12: ffffffff966db880 R13: 0000000000000000 R14: 00000000ffffffff R15: 0000000000000000 FS: 00007ff35c83f000(0000) GS:ffff8fa478480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000c0 CR3: 000000001ebb7000 CR4: 0000000000350ee0 Call Trace: xfrmi_xmit+0xde/0x460 ? tcf_bpf_act+0x13d/0x2a0 dev_hard_start_xmit+0x72/0x1e0 __dev_queue_xmit+0x251/0xd30 ip_finish_output2+0x140/0x550 ip_push_pending_frames+0x56/0x80 raw_sendmsg+0x663/0x10a0 ? try_charge_memcg+0x3fd/0x7a0 ? __mod_memcg_lruvec_state+0x93/0x110 ? sock_sendmsg+0x30/0x40 sock_sendmsg+0x30/0x40 __sys_sendto+0xeb/0x130 ? handle_mm_fault+0xae/0x280 ? do_user_addr_fault+0x1e7/0x680 ? kvm_read_and_reset_apf_flags+0x3b/0x50 __x64_sys_sendto+0x20/0x30 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7ff35cac1366 Code: eb 0b 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 11 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 72 c3 90 55 48 83 ec 30 44 89 4c 24 2c 4c 89 RSP: 002b:00007fff738e4028 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007fff738e57b0 RCX: 00007ff35cac1366 RDX: 0000000000000040 RSI: 0000557164e4b450 RDI: 0000000000000003 RBP: 0000557164e4b450 R08: 00007fff738e7a2c R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 R13: 00007fff738e5770 R14: 00007fff738e4030 R15: 0000001d00000001 Modules linked in: netconsole veth br_netfilter bridge bonding virtio_net [last unloaded: netconsole] CR2: 00000000000000c0 CC: Steffen Klassert CC: Daniel Borkmann Fixes: 2d151d39073a ("xfrm: Add possibility to set the default to block if we have no policy") Signed-off-by: Nikolay Aleksandrov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4f8bbb825abc..cc6ab79609e2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3162,7 +3162,7 @@ ok: return dst; nopol: - if (!(dst_orig->dev->flags & IFF_LOOPBACK) && + if ((!dst_orig->dev || !(dst_orig->dev->flags & IFF_LOOPBACK)) && net->xfrm.policy_default[dir] == XFRM_USERPOLICY_BLOCK) { err = -EPERM; goto error; -- cgit v1.2.3 From 583585e48d965338e73e1eb383768d16e0922d73 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 9 Aug 2022 17:49:15 +0800 Subject: skmsg: Fix wrong last sg check in sk_msg_recvmsg() Fix one kernel NULL pointer dereference as below: [ 224.462334] Call Trace: [ 224.462394] __tcp_bpf_recvmsg+0xd3/0x380 [ 224.462441] ? sock_has_perm+0x78/0xa0 [ 224.462463] tcp_bpf_recvmsg+0x12e/0x220 [ 224.462494] inet_recvmsg+0x5b/0xd0 [ 224.462534] __sys_recvfrom+0xc8/0x130 [ 224.462574] ? syscall_trace_enter+0x1df/0x2e0 [ 224.462606] ? __do_page_fault+0x2de/0x500 [ 224.462635] __x64_sys_recvfrom+0x24/0x30 [ 224.462660] do_syscall_64+0x5d/0x1d0 [ 224.462709] entry_SYSCALL_64_after_hwframe+0x65/0xca In commit 9974d37ea75f ("skmsg: Fix invalid last sg check in sk_msg_recvmsg()"), we change last sg check to sg_is_last(), but in sockmap redirection case (without stream_parser/stream_verdict/ skb_verdict), we did not mark the end of the scatterlist. Check the sk_msg_alloc, sk_msg_page_add, and bpf_msg_push_data functions, they all do not mark the end of sg. They are expected to use sg.end for end judgment. So the judgment of '(i != msg_rx->sg.end)' is added back here. Fixes: 9974d37ea75f ("skmsg: Fix invalid last sg check in sk_msg_recvmsg()") Signed-off-by: Liu Jian Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20220809094915.150391-1-liujian56@huawei.com --- net/core/skmsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f47338d89d5d..5be4485ff07d 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -461,7 +461,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied == len) break; - } while (!sg_is_last(sge)); + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); @@ -471,7 +471,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx->sg.start = i; - if (!sge->length && sg_is_last(sge)) { + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } -- cgit v1.2.3 From 7ec9fce4b31604f8415136a4c07f7dc8ad431aec Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 18 Aug 2022 10:41:18 +0300 Subject: ip_tunnel: Respect tunnel key's "flow_flags" in IP tunnels Commit 451ef36bd229 ("ip_tunnels: Add new flow flags field to ip_tunnel_key") added a "flow_flags" member to struct ip_tunnel_key which was later used by the commit in the fixes tag to avoid dropping packets with sources that aren't locally configured when set in bpf_set_tunnel_key(). VXLAN and GENEVE were made to respect this flag, ip tunnels like IPIP and GRE were not. This commit fixes this omission by making ip_tunnel_init_flow() receive the flow flags from the tunnel key in the relevant collect_md paths. Fixes: b8fff748521c ("bpf: Set flow flag to allow any source IP in bpf_tunnel_key") Signed-off-by: Eyal Birger Signed-off-by: Daniel Borkmann Reviewed-by: Paul Chaignon Link: https://lore.kernel.org/bpf/20220818074118.726639-1-eyal.birger@gmail.com --- net/ipv4/ip_gre.c | 2 +- net/ipv4/ip_tunnel.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5c58e21f724e..f866d6282b2b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -609,7 +609,7 @@ static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), key->tos & ~INET_ECN_MASK, dev_net(dev), 0, - skb->mark, skb_get_hash(skb)); + skb->mark, skb_get_hash(skb), key->flow_flags); rt = ip_route_output_key(dev_net(dev), &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e65e948cab9f..019f3b0839c5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -295,7 +295,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, RT_TOS(iph->tos), dev_net(dev), - tunnel->parms.link, tunnel->fwmark, 0); + tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -570,7 +570,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), RT_TOS(tos), - dev_net(dev), 0, skb->mark, skb_get_hash(skb)); + dev_net(dev), 0, skb->mark, skb_get_hash(skb), + key->flow_flags); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; @@ -729,7 +730,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), dev_net(dev), tunnel->parms.link, - tunnel->fwmark, skb_get_hash(skb)); + tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; -- cgit v1.2.3 From ed06fce0b034b2e25bd93430f5c4cbb28036cc1a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Aug 2022 14:55:03 -0400 Subject: SUNRPC: RPC level errors should set task->tk_rpc_status Fix up a case in call_encode() where we're failing to set task->tk_rpc_status when an RPC level error occurred. Fixes: 9c5948c24869 ("SUNRPC: task should be exit if encode return EKEYEXPIRED more times") Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b098e707ad41..7d268a291486 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1902,7 +1902,7 @@ call_encode(struct rpc_task *task) break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; -- cgit v1.2.3 From b1cb8a71f1eaec4eb77051590f7f561f25b15e32 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Sat, 20 Aug 2022 12:25:16 +0900 Subject: batman-adv: Fix hang up with small MTU hard-interface The system hangs up when batman-adv soft-interface is created on hard-interface with small MTU. For example, the following commands create batman-adv soft-interface on dummy interface with zero MTU: # ip link add name dummy0 type dummy # ip link set mtu 0 dev dummy0 # ip link set up dev dummy0 # ip link add name bat0 type batadv # ip link set dev dummy0 master bat0 These commands cause the system hang up with the following messages: [ 90.578925][ T6689] batman_adv: bat0: Adding interface: dummy0 [ 90.580884][ T6689] batman_adv: bat0: The MTU of interface dummy0 is too small (0) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to 1560 would solve the problem. [ 90.586264][ T6689] batman_adv: bat0: Interface activated: dummy0 [ 90.590061][ T6689] batman_adv: bat0: Forced to purge local tt entries to fit new maximum fragment MTU (-320) [ 90.595517][ T6689] batman_adv: bat0: Forced to purge local tt entries to fit new maximum fragment MTU (-320) [ 90.598499][ T6689] batman_adv: bat0: Forced to purge local tt entries to fit new maximum fragment MTU (-320) This patch fixes this issue by returning error when enabling hard-interface with small MTU size. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Shigeru Yoshida Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/hard-interface.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index b8f8da7ee3de..41c1ad33d009 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -700,6 +701,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, int max_header_len = batadv_max_header_len(); int ret; + if (hard_iface->net_dev->mtu < ETH_MIN_MTU + max_header_len) + return -EINVAL; + if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; -- cgit v1.2.3 From 3c53cd65dece47dd1f9d3a809f32e59d1d87b2b8 Mon Sep 17 00:00:00 2001 From: Bernard Pidoux Date: Thu, 18 Aug 2022 02:02:13 +0200 Subject: rose: check NULL rose_loopback_neigh->loopback Commit 3b3fd068c56e3fbea30090859216a368398e39bf added NULL check for `rose_loopback_neigh->dev` in rose_loopback_timer() but omitted to check rose_loopback_neigh->loopback. It thus prevents *all* rose connect. The reason is that a special rose_neigh loopback has a NULL device. /proc/net/rose_neigh illustrates it via rose_neigh_show() function : [...] seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", rose_neigh->number, (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, /proc/net/rose_neigh displays special rose_loopback_neigh->loopback as callsign RSLOOP-0: addr callsign dev count use mode restart t0 tf digipeaters 00001 RSLOOP-0 ??? 1 2 DCE yes 0 0 By checking rose_loopback_neigh->loopback, rose_rx_call_request() is called even in case rose_loopback_neigh->dev is NULL. This repairs rose connections. Verification with rose client application FPAC: FPAC-Node v 4.1.3 (built Aug 5 2022) for LINUX (help = h) F6BVP-4 (Commands = ?) : u Users - AX.25 Level 2 sessions : Port Callsign Callsign AX.25 state ROSE state NetRom status axudp F6BVP-5 -> F6BVP-9 Connected Connected --------- Fixes: 3b3fd068c56e ("rose: Fix Null pointer dereference in rose_send_frame()") Signed-off-by: Bernard Pidoux Suggested-by: Francois Romieu Cc: Thomas DL9SAU Osterried Signed-off-by: David S. Miller --- net/rose/rose_loopback.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 11c45c8c6c16..036d92c0ad79 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -96,7 +96,8 @@ static void rose_loopback_timer(struct timer_list *unused) } if (frametype == ROSE_CALL_REQUEST) { - if (!rose_loopback_neigh->dev) { + if (!rose_loopback_neigh->dev && + !rose_loopback_neigh->loopback) { kfree_skb(skb); continue; } -- cgit v1.2.3 From 855a28f9c96c80e6cbd2d986a857235e34868064 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 19 Aug 2022 20:39:25 +0300 Subject: net: dsa: don't dereference NULL extack in dsa_slave_changeupper() When a driver returns -EOPNOTSUPP in dsa_port_bridge_join() but failed to provide a reason for it, DSA attempts to set the extack to say that software fallback will kick in. The problem is, when we use brctl and the legacy bridge ioctls, the extack will be NULL, and DSA dereferences it in the process of setting it. Sergei Antonov proves this using the following stack trace: Unable to handle kernel NULL pointer dereference at virtual address 00000000 PC is at dsa_slave_changeupper+0x5c/0x158 dsa_slave_changeupper from raw_notifier_call_chain+0x38/0x6c raw_notifier_call_chain from __netdev_upper_dev_link+0x198/0x3b4 __netdev_upper_dev_link from netdev_master_upper_dev_link+0x50/0x78 netdev_master_upper_dev_link from br_add_if+0x430/0x7f4 br_add_if from br_ioctl_stub+0x170/0x530 br_ioctl_stub from br_ioctl_call+0x54/0x7c br_ioctl_call from dev_ifsioc+0x4e0/0x6bc dev_ifsioc from dev_ioctl+0x2f8/0x758 dev_ioctl from sock_ioctl+0x5f0/0x674 sock_ioctl from sys_ioctl+0x518/0xe40 sys_ioctl from ret_fast_syscall+0x0/0x1c Fix the problem by only overriding the extack if non-NULL. Fixes: 1c6e8088d9a7 ("net: dsa: allow port_bridge_join() to override extack message") Link: https://lore.kernel.org/netdev/CABikg9wx7vB5eRDAYtvAm7fprJ09Ta27a4ZazC=NX5K4wn6pWA@mail.gmail.com/ Reported-by: Sergei Antonov Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Tested-by: Sergei Antonov Link: https://lore.kernel.org/r/20220819173925.3581871-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ad6a6663feeb..1291c2431d44 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2484,7 +2484,7 @@ static int dsa_slave_changeupper(struct net_device *dev, if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { - if (!extack->_msg) + if (extack && !extack->_msg) NL_SET_ERR_MSG_MOD(extack, "Offloading not supported"); err = 0; -- cgit v1.2.3 From 7997eff82828304b780dc0a39707e1946d6f1ebf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Aug 2022 17:38:37 +0200 Subject: netfilter: ebtables: reject blobs that don't provide all entry points Harshit Mogalapalli says: In ebt_do_table() function dereferencing 'private->hook_entry[hook]' can lead to NULL pointer dereference. [..] Kernel panic: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] [..] RIP: 0010:ebt_do_table+0x1dc/0x1ce0 Code: 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 5c 16 00 00 48 b8 00 00 00 00 00 fc ff df 49 8b 6c df 08 48 8d 7d 2c 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 88 [..] Call Trace: nf_hook_slow+0xb1/0x170 __br_forward+0x289/0x730 maybe_deliver+0x24b/0x380 br_flood+0xc6/0x390 br_dev_xmit+0xa2e/0x12c0 For some reason ebtables rejects blobs that provide entry points that are not supported by the table, but what it should instead reject is the opposite: blobs that DO NOT provide an entry point supported by the table. t->valid_hooks is the bitmask of hooks (input, forward ...) that will see packets. Providing an entry point that is not support is harmless (never called/used), but the inverse isn't: it results in a crash because the ebtables traverser doesn't expect a NULL blob for a location its receiving packets for. Instead of fixing all the individual checks, do what iptables is doing and reject all blobs that differ from the expected hooks. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Harshit Mogalapalli Reported-by: syzkaller Signed-off-by: Florian Westphal --- net/bridge/netfilter/ebtable_broute.c | 8 -------- net/bridge/netfilter/ebtable_filter.c | 8 -------- net/bridge/netfilter/ebtable_nat.c | 8 -------- net/bridge/netfilter/ebtables.c | 8 +------- 4 files changed, 1 insertion(+), 31 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 1a11064f9990..8f19253024b0 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -36,18 +36,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)&initial_chain, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~(1 << NF_BR_BROUTING)) - return -EINVAL; - return 0; -} - static const struct ebt_table broute_table = { .name = "broute", .table = &initial_table, .valid_hooks = 1 << NF_BR_BROUTING, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index cb949436bc0e..278f324e6752 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~FILTER_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_filter = { .name = "filter", .table = &initial_table, .valid_hooks = FILTER_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 5ee0531ae506..9066f7f376d5 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -43,18 +43,10 @@ static struct ebt_replace_kernel initial_table = { .entries = (char *)initial_chains, }; -static int check(const struct ebt_table_info *info, unsigned int valid_hooks) -{ - if (valid_hooks & ~NAT_VALID_HOOKS) - return -EINVAL; - return 0; -} - static const struct ebt_table frame_nat = { .name = "nat", .table = &initial_table, .valid_hooks = NAT_VALID_HOOKS, - .check = check, .me = THIS_MODULE, }; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f2dbefb61ce8..9a0ae59cdc50 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1040,8 +1040,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - /* the table doesn't like it */ - if (t->check && (ret = t->check(newinfo, repl->valid_hooks))) + if (repl->valid_hooks != t->valid_hooks) goto free_unlock; if (repl->num_counters && repl->num_counters != t->private->nentries) { @@ -1231,11 +1230,6 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, if (ret != 0) goto free_chainstack; - if (table->check && table->check(newinfo, table->valid_hooks)) { - ret = -EINVAL; - goto free_chainstack; - } - table->private = newinfo; rwlock_init(&table->lock); mutex_lock(&ebt_mutex); -- cgit v1.2.3 From cf97769c761abfeac8931b35fe0e1a8d5fabc9d8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 19 Aug 2022 00:42:31 +0200 Subject: netfilter: conntrack: work around exceeded receive window When a TCP sends more bytes than allowed by the receive window, all future packets can be marked as invalid. This can clog up the conntrack table because of 5-day default timeout. Sequence of packets: 01 initiator > responder: [S], seq 171, win 5840, options [mss 1330,sackOK,TS val 63 ecr 0,nop,wscale 1] 02 responder > initiator: [S.], seq 33211, ack 172, win 65535, options [mss 1460,sackOK,TS val 010 ecr 63,nop,wscale 8] 03 initiator > responder: [.], ack 33212, win 2920, options [nop,nop,TS val 068 ecr 010], length 0 04 initiator > responder: [P.], seq 172:240, ack 33212, win 2920, options [nop,nop,TS val 279 ecr 010], length 68 Window is 5840 starting from 33212 -> 39052. 05 responder > initiator: [.], ack 240, win 256, options [nop,nop,TS val 872 ecr 279], length 0 06 responder > initiator: [.], seq 33212:34530, ack 240, win 256, options [nop,nop,TS val 892 ecr 279], length 1318 This is fine, conntrack will flag the connection as having outstanding data (UNACKED), which lowers the conntrack timeout to 300s. 07 responder > initiator: [.], seq 34530:35848, ack 240, win 256, options [nop,nop,TS val 892 ecr 279], length 1318 08 responder > initiator: [.], seq 35848:37166, ack 240, win 256, options [nop,nop,TS val 892 ecr 279], length 1318 09 responder > initiator: [.], seq 37166:38484, ack 240, win 256, options [nop,nop,TS val 892 ecr 279], length 1318 10 responder > initiator: [.], seq 38484:39802, ack 240, win 256, options [nop,nop,TS val 892 ecr 279], length 1318 Packet 10 is already sending more than permitted, but conntrack doesn't validate this (only seq is tested vs. maxend, not 'seq+len'). 38484 is acceptable, but only up to 39052, so this packet should not have been sent (or only 568 bytes, not 1318). At this point, connection is still in '300s' mode. Next packet however will get flagged: 11 responder > initiator: [P.], seq 39802:40128, ack 240, win 256, options [nop,nop,TS val 892 ecr 279], length 326 nf_ct_proto_6: SEQ is over the upper bound (over the window of the receiver) .. LEN=378 .. SEQ=39802 ACK=240 ACK PSH .. Now, a couple of replies/acks comes in: 12 initiator > responder: [.], ack 34530, win 4368, [.. irrelevant acks removed ] 16 initiator > responder: [.], ack 39802, win 8712, options [nop,nop,TS val 296201291 ecr 2982371892], length 0 This ack is significant -- this acks the last packet send by the responder that conntrack considered valid. This means that ack == td_end. This will withdraw the 'unacked data' flag, the connection moves back to the 5-day timeout of established conntracks. 17 initiator > responder: ack 40128, win 10030, ... This packet is also flagged as invalid. Because conntrack only updates state based on packets that are considered valid, packet 11 'did not exist' and that gets us: nf_ct_proto_6: ACK is over upper bound 39803 (ACKed data not seen yet) .. SEQ=240 ACK=40128 WINDOW=10030 RES=0x00 ACK URG Because this received and processed by the endpoints, the conntrack entry remains in a bad state, no packets will ever be considered valid again: 30 responder > initiator: [F.], seq 40432, ack 2045, win 391, .. 31 initiator > responder: [.], ack 40433, win 11348, .. 32 initiator > responder: [F.], seq 2045, ack 40433, win 11348 .. ... all trigger 'ACK is over bound' test and we end up with non-early-evictable 5-day default timeout. NB: This patch triggers a bunch of checkpatch warnings because of silly indent. I will resend the cleanup series linked below to reduce the indent level once this change has propagated to net-next. I could route the cleanup via nf but that causes extra backport work for stable maintainers. Link: https://lore.kernel.org/netfilter-devel/20220720175228.17880-1-fw@strlen.de/T/#mb1d7147d36294573cc4f81d00f9f8dadfdd06cd8 Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_proto_tcp.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index a63b51dceaf2..a634c72b1ffc 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -655,6 +655,37 @@ static bool tcp_in_window(struct nf_conn *ct, tn->tcp_be_liberal) res = true; if (!res) { + bool seq_ok = before(seq, sender->td_maxend + 1); + + if (!seq_ok) { + u32 overshot = end - sender->td_maxend + 1; + bool ack_ok; + + ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); + + if (in_recv_win && + ack_ok && + overshot <= receiver->td_maxwin && + before(sack, receiver->td_end + 1)) { + /* Work around TCPs that send more bytes than allowed by + * the receive window. + * + * If the (marked as invalid) packet is allowed to pass by + * the ruleset and the peer acks this data, then its possible + * all future packets will trigger 'ACK is over upper bound' check. + * + * Thus if only the sequence check fails then do update td_end so + * possible ACK for this data can update internal state. + */ + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + nf_ct_l4proto_log_invalid(skb, ct, hook_state, + "%u bytes more than expected", overshot); + return res; + } + } + nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", before(seq, sender->td_maxend + 1) ? -- cgit v1.2.3 From 18bbc3213383a82b05383827f4b1b882e3f0a5a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 20 Aug 2022 17:54:06 +0200 Subject: netfilter: nft_tproxy: restrict to prerouting hook TPROXY is only allowed from prerouting, but nft_tproxy doesn't check this. This fixes a crash (null dereference) when using tproxy from e.g. output. Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Reported-by: Shell Chen Signed-off-by: Florian Westphal --- net/netfilter/nft_tproxy.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 68b2eed742df..62da25ad264b 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -312,6 +312,13 @@ static int nft_tproxy_dump(struct sk_buff *skb, return 0; } +static int nft_tproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); +} + static struct nft_expr_type nft_tproxy_type; static const struct nft_expr_ops nft_tproxy_ops = { .type = &nft_tproxy_type, @@ -321,6 +328,7 @@ static const struct nft_expr_ops nft_tproxy_ops = { .destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, .reduce = NFT_REDUCE_READONLY, + .validate = nft_tproxy_validate, }; static struct nft_expr_type nft_tproxy_type __read_mostly = { -- cgit v1.2.3 From 5dc52d83baac30decf5f3b371d5eb41dfa1d1412 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 10:28:25 +0200 Subject: netfilter: nf_tables: disallow updates of implicit chain Updates on existing implicit chain make no sense, disallow this. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 62cfb0e31c40..dff2b5851bbb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2574,6 +2574,9 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain != NULL) { + if (chain->flags & NFT_CHAIN_BINDING) + return -EINVAL; + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; -- cgit v1.2.3 From ab482c6b66a4a8c0a8c0b0f577a785cf9ff1c2e2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 10:52:48 +0200 Subject: netfilter: nf_tables: make table handle allocation per-netns friendly mutex is per-netns, move table_netns to the pernet area. *read-write* to 0xffffffff883a01e8 of 8 bytes by task 6542 on cpu 0: nf_tables_newtable+0x6dc/0xc00 net/netfilter/nf_tables_api.c:1221 nfnetlink_rcv_batch net/netfilter/nfnetlink.c:513 [inline] nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:634 [inline] nfnetlink_rcv+0xa6a/0x13a0 net/netfilter/nfnetlink.c:652 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x652/0x730 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x643/0x740 net/netlink/af_netlink.c:1921 Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") Reported-by: Abhishek Shah Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dff2b5851bbb..0951f31caabe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -32,7 +32,6 @@ static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); -static u64 table_handle; enum { NFT_VALIDATE_SKIP = 0, @@ -1235,7 +1234,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&table->flowtables); table->family = family; table->flags = flags; - table->handle = ++table_handle; + table->handle = ++nft_net->table_handle; if (table->flags & NFT_TABLE_F_OWNER) table->nlpid = NETLINK_CB(skb).portid; -- cgit v1.2.3 From 94254f990c07e9ddf1634e0b727fab821c3b5bf9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 11:47:04 +0200 Subject: netfilter: nft_payload: report ERANGE for too long offset and length Instead of offset and length are truncation to u8, report ERANGE. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 2e7ac007cb30..4fee67abfe2c 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -833,6 +833,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, { enum nft_payload_bases base; unsigned int offset, len; + int err; if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || @@ -859,8 +860,13 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + if (err < 0) + return ERR_PTR(err); + + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len); + if (err < 0) + return ERR_PTR(err); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER) -- cgit v1.2.3 From 7044ab281febae9e2fa9b0b247693d6026166293 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 11:55:19 +0200 Subject: netfilter: nft_payload: do not truncate csum_offset and csum_type Instead report ERANGE if csum_offset is too long, and EOPNOTSUPP if type is not support. Fixes: 7ec3f7b47b8d ("netfilter: nft_payload: add packet mangling support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 4fee67abfe2c..eb0e40c29712 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -740,17 +740,23 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_payload_set *priv = nft_expr_priv(expr); + u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; + int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); if (tb[NFTA_PAYLOAD_CSUM_TYPE]) - priv->csum_type = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); - if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) - priv->csum_offset = - ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET])); + csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); + if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX, + &csum_offset); + if (err < 0) + return err; + + priv->csum_offset = csum_offset; + } if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { u32 flags; @@ -761,7 +767,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->csum_flags = flags; } - switch (priv->csum_type) { + switch (csum_type) { case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; @@ -775,6 +781,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, default: return -EOPNOTSUPP; } + priv->csum_type = csum_type; return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg, priv->len); -- cgit v1.2.3 From 43eb8949cfdffa764b92bc6c54b87cbe5b0003fe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 12:41:33 +0200 Subject: netfilter: nf_tables: do not leave chain stats enabled on error Error might occur later in the nf_tables_addchain() codepath, enable static key only after transaction has been created. Fixes: 9f08ea848117 ("netfilter: nf_tables: keep chain counters away from hot path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0951f31caabe..72c066a33416 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2195,9 +2195,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; + struct nft_stats __percpu *stats = NULL; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; - struct nft_stats __percpu *stats; struct net *net = ctx->net; char name[NFT_NAME_MAXLEN]; struct nft_rule_blob *blob; @@ -2235,7 +2235,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return PTR_ERR(stats); } rcu_assign_pointer(basechain->stats, stats); - static_branch_inc(&nft_counters_enabled); } err = nft_basechain_init(basechain, family, &hook, flags); @@ -2318,6 +2317,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err_unregister_hook; } + if (stats) + static_branch_inc(&nft_counters_enabled); + table->use++; return 0; -- cgit v1.2.3 From 5f3b7aae14a706d0d7da9f9e39def52ff5fc3d39 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 16:25:07 +0200 Subject: netfilter: nft_osf: restrict osf to ipv4, ipv6 and inet families As it was originally intended, restrict extension to supported families. Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 0053a697c931..89342ccccdcc 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -115,9 +115,21 @@ static int nft_osf_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) { - return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_FORWARD)); + unsigned int hooks; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + hooks = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_FORWARD); + break; + default: + return -EOPNOTSUPP; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_osf_reduce(struct nft_regs_track *track, -- cgit v1.2.3 From 01e4092d53bc4fe122a6e4b6d664adbd57528ca3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2022 16:32:44 +0200 Subject: netfilter: nft_tunnel: restrict it to netdev family Only allow to use this expression from NFPROTO_NETDEV family. Fixes: af308b94a2a4 ("netfilter: nf_tables: add tunnel support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 5edaaded706d..983ade4be3b3 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -161,6 +161,7 @@ static const struct nft_expr_ops nft_tunnel_get_ops = { static struct nft_expr_type nft_tunnel_type __read_mostly = { .name = "tunnel", + .family = NFPROTO_NETDEV, .ops = &nft_tunnel_get_ops, .policy = nft_tunnel_policy, .maxattr = NFTA_TUNNEL_MAX, -- cgit v1.2.3 From e02f0d3970404bfea385b6edb86f2d936db0ea2b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Aug 2022 11:06:39 +0200 Subject: netfilter: nf_tables: disallow binding to already bound chain Update nft_data_init() to report EINVAL if chain is already bound. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Reported-by: Gwangun Jung Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 72c066a33416..2ee50e23c9b7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9711,6 +9711,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, return PTR_ERR(chain); if (nft_is_base_chain(chain)) return -EOPNOTSUPP; + if (nft_chain_is_bound(chain)) + return -EINVAL; if (desc->flags & NFT_DATA_DESC_SETELEM && chain->flags & NFT_CHAIN_BINDING) return -EINVAL; -- cgit v1.2.3 From 759eebbcfafcefa23b59e912396306543764bd3c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Aug 2022 23:13:00 +0200 Subject: netfilter: flowtable: add function to invoke garbage collection immediately Expose nf_flow_table_gc_run() to force a garbage collector run from the offload infrastructure. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 765ac779bfc8..60fc1e1b7182 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -437,12 +437,17 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, } } +void nf_flow_table_gc_run(struct nf_flowtable *flow_table) +{ + nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); +} + static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; flow_table = container_of(work, struct nf_flowtable, gc_work.work); - nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); + nf_flow_table_gc_run(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } @@ -601,10 +606,11 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); - nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); + nf_flow_table_gc_run(flow_table); nf_flow_table_offload_flush(flow_table); if (nf_flowtable_hw_offload(flow_table)) - nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL); + nf_flow_table_gc_run(flow_table); + rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); -- cgit v1.2.3 From 9afb4b27349a499483ae0134282cefd0c90f480f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Nov 2021 22:24:15 +0100 Subject: netfilter: flowtable: fix stuck flows on cleanup due to pending work To clear the flow table on flow table free, the following sequence normally happens in order: 1) gc_step work is stopped to disable any further stats/del requests. 2) All flow table entries are set to teardown state. 3) Run gc_step which will queue HW del work for each flow table entry. 4) Waiting for the above del work to finish (flush). 5) Run gc_step again, deleting all entries from the flow table. 6) Flow table is freed. But if a flow table entry already has pending HW stats or HW add work step 3 will not queue HW del work (it will be skipped), step 4 will wait for the pending add/stats to finish, and step 5 will queue HW del work which might execute after freeing of the flow table. To fix the above, this patch flushes the pending work, then it sets the teardown flag to all flows in the flowtable and it forces a garbage collector run to queue work to remove the flows from hardware, then it flushes this new pending work and (finally) it forces another garbage collector run to remove the entry from the software flowtable. Stack trace: [47773.882335] BUG: KASAN: use-after-free in down_read+0x99/0x460 [47773.883634] Write of size 8 at addr ffff888103b45aa8 by task kworker/u20:6/543704 [47773.885634] CPU: 3 PID: 543704 Comm: kworker/u20:6 Not tainted 5.12.0-rc7+ #2 [47773.886745] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009) [47773.888438] Workqueue: nf_ft_offload_del flow_offload_work_handler [nf_flow_table] [47773.889727] Call Trace: [47773.890214] dump_stack+0xbb/0x107 [47773.890818] print_address_description.constprop.0+0x18/0x140 [47773.892990] kasan_report.cold+0x7c/0xd8 [47773.894459] kasan_check_range+0x145/0x1a0 [47773.895174] down_read+0x99/0x460 [47773.899706] nf_flow_offload_tuple+0x24f/0x3c0 [nf_flow_table] [47773.907137] flow_offload_work_handler+0x72d/0xbe0 [nf_flow_table] [47773.913372] process_one_work+0x8ac/0x14e0 [47773.921325] [47773.921325] Allocated by task 592159: [47773.922031] kasan_save_stack+0x1b/0x40 [47773.922730] __kasan_kmalloc+0x7a/0x90 [47773.923411] tcf_ct_flow_table_get+0x3cb/0x1230 [act_ct] [47773.924363] tcf_ct_init+0x71c/0x1156 [act_ct] [47773.925207] tcf_action_init_1+0x45b/0x700 [47773.925987] tcf_action_init+0x453/0x6b0 [47773.926692] tcf_exts_validate+0x3d0/0x600 [47773.927419] fl_change+0x757/0x4a51 [cls_flower] [47773.928227] tc_new_tfilter+0x89a/0x2070 [47773.936652] [47773.936652] Freed by task 543704: [47773.937303] kasan_save_stack+0x1b/0x40 [47773.938039] kasan_set_track+0x1c/0x30 [47773.938731] kasan_set_free_info+0x20/0x30 [47773.939467] __kasan_slab_free+0xe7/0x120 [47773.940194] slab_free_freelist_hook+0x86/0x190 [47773.941038] kfree+0xce/0x3a0 [47773.941644] tcf_ct_flow_table_cleanup_work Original patch description and stack trace by Paul Blakey. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Reported-by: Paul Blakey Tested-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 7 +++---- net/netfilter/nf_flow_table_offload.c | 8 ++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 60fc1e1b7182..81c26a96c30b 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -605,12 +605,11 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) mutex_unlock(&flowtable_lock); cancel_delayed_work_sync(&flow_table->gc_work); + nf_flow_table_offload_flush(flow_table); + /* ... no more pending work after this stage ... */ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); nf_flow_table_gc_run(flow_table); - nf_flow_table_offload_flush(flow_table); - if (nf_flowtable_hw_offload(flow_table)) - nf_flow_table_gc_run(flow_table); - + nf_flow_table_offload_flush_cleanup(flow_table); rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 103b6cbf257f..b04645ced89b 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1074,6 +1074,14 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, flow_offload_queue_work(offload); } +void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable) +{ + if (nf_flowtable_hw_offload(flowtable)) { + flush_workqueue(nf_flow_offload_del_wq); + nf_flow_table_gc_run(flowtable); + } +} + void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { if (nf_flowtable_hw_offload(flowtable)) { -- cgit v1.2.3 From 00cd7bf9f9e06769ef84d5102774c8becd6a498a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Aug 2022 16:38:48 -0700 Subject: netfilter: nf_defrag_ipv6: allow nf_conntrack_frag6_high_thresh increases Currently, net.netfilter.nf_conntrack_frag6_high_thresh can only be lowered. I found this issue while investigating a probable kernel issue causing flakes in tools/testing/selftests/net/ip_defrag.sh In particular, these sysctl changes were ignored: ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_high_thresh=9000000 >/dev/null 2>&1 ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_low_thresh=7000000 >/dev/null 2>&1 This change is inline with commit 836196239298 ("net/ipfrag: let ip[6]frag_high_thresh in ns be higher than in init_net") Fixes: 8db3d41569bb ("netfilter: nf_defrag_ipv6: use net_generic infra") Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 7dd3629dd19e..38db0064d661 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -86,7 +86,6 @@ static int nf_ct_frag6_sysctl_register(struct net *net) table[1].extra2 = &nf_frag->fqdir->high_thresh; table[2].data = &nf_frag->fqdir->high_thresh; table[2].extra1 = &nf_frag->fqdir->low_thresh; - table[2].extra2 = &nf_frag->fqdir->high_thresh; hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) -- cgit v1.2.3 From d5485d9dd24e1d04e5509916515260186eb1455c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 22 Aug 2022 10:53:46 +0800 Subject: net: neigh: don't call kfree_skb() under spin_lock_irqsave() It is not allowed to call kfree_skb() from hardware interrupt context or with interrupts being disabled. So add all skb to a tmp list, then free them after spin_unlock_irqrestore() at once. Fixes: 66ba215cb513 ("neigh: fix possible DoS due to net iface start/stop loop") Suggested-by: Denis V. Lunev Signed-off-by: Yang Yingliang Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/neighbour.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5b669eb80270..78cc8fb68814 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -309,14 +309,17 @@ static int neigh_del_timer(struct neighbour *n) static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) { + struct sk_buff_head tmp; unsigned long flags; struct sk_buff *skb; + skb_queue_head_init(&tmp); spin_lock_irqsave(&list->lock, flags); skb = skb_peek(list); while (skb != NULL) { struct sk_buff *skb_next = skb_peek_next(skb, list); struct net_device *dev = skb->dev; + if (net == NULL || net_eq(dev_net(dev), net)) { struct in_device *in_dev; @@ -326,13 +329,16 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) in_dev->arp_parms->qlen--; rcu_read_unlock(); __skb_unlink(skb, list); - - dev_put(dev); - kfree_skb(skb); + __skb_queue_tail(&tmp, skb); } skb = skb_next; } spin_unlock_irqrestore(&list->lock, flags); + + while ((skb = __skb_dequeue(&tmp))) { + dev_put(skb->dev); + kfree_skb(skb); + } } static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, -- cgit v1.2.3 From c624c58e08b15105662b9ab9be23d14a6b945a49 Mon Sep 17 00:00:00 2001 From: lily Date: Mon, 22 Aug 2022 22:44:11 -0700 Subject: net/core/skbuff: Check the return value of skb_copy_bits() skb_copy_bits() could fail, which requires a check on the return value. Signed-off-by: Li Zhong Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 974bbbbe7138..5ea1d074a920 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4205,9 +4205,8 @@ normal: SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } else { - skb_copy_bits(head_skb, offset, - skb_put(nskb, len), - len); + if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) + goto err; } continue; } -- cgit v1.2.3 From 1227c1771dd2ad44318aa3ab9e3a293b3f34ff2a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:44 -0700 Subject: net: Fix data-races around sysctl_[rw]mem_(max|default). While reading sysctl_[rw]mem_(max|default), they can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- net/core/sock.c | 8 ++++---- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp_output.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index e8508aaafd27..c4f14ad82029 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5034,14 +5034,14 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, /* Only some socketops are supported */ switch (optname) { case SO_RCVBUF: - val = min_t(u32, val, sysctl_rmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_rmem_max)); val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); break; case SO_SNDBUF: - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); val = min_t(int, val, INT_MAX / 2); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; WRITE_ONCE(sk->sk_sndbuf, diff --git a/net/core/sock.c b/net/core/sock.c index 4cb957d934a2..303af52f3b79 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1101,7 +1101,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. @@ -1133,7 +1133,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: @@ -3309,8 +3309,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; - sk->sk_rcvbuf = sysctl_rmem_default; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d7bd1daf022b..04e2034f2f8e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1730,7 +1730,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 78b654ff421b..290019de766d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -239,7 +239,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, if (wscale_ok) { /* Set window scaling on max possible window */ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - space = max_t(u32, space, sysctl_rmem_max); + space = max_t(u32, space, READ_ONCE(sysctl_rmem_max)); space = min_t(u32, space, *window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, 0, TCP_MAX_WSCALE); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 9d43277b8b4f..a56fd0b5a430 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1280,12 +1280,12 @@ static void set_sock_size(struct sock *sk, int mode, int val) lock_sock(sk); if (mode) { val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2, - sysctl_wmem_max); + READ_ONCE(sysctl_wmem_max)); sk->sk_sndbuf = val * 2; sk->sk_userlocks |= SOCK_SNDBUF_LOCK; } else { val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2, - sysctl_rmem_max); + READ_ONCE(sysctl_rmem_max)); sk->sk_rcvbuf = val * 2; sk->sk_userlocks |= SOCK_RCVBUF_LOCK; } -- cgit v1.2.3 From bf955b5ab8f6f7b0632cdef8e36b14e4f6e77829 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:45 -0700 Subject: net: Fix data-races around weight_p and dev_weight_[rt]x_bias. While reading weight_p, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Also, dev_[rt]x_weight can be read/written at the same time. So, we need to use READ_ONCE() and WRITE_ONCE() for its access. Moreover, to use the same weight_p while changing dev_[rt]x_weight, we add a mutex in proc_do_dev_weight(). Fixes: 3d48b53fb2ae ("net: dev_weight: TX/RX orthogonality") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/core/sysctl_net_core.c | 15 +++++++++------ net/sched/sch_generic.c | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 716df64fcfa5..b5b92dcd5eea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5918,7 +5918,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = dev_rx_weight; + napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 71a13596ea2b..725891527814 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -234,14 +234,17 @@ static int set_default_qdisc(struct ctl_table *table, int write, static int proc_do_dev_weight(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret; + static DEFINE_MUTEX(dev_weight_mutex); + int ret, weight; + mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret != 0) - return ret; - - dev_rx_weight = weight_p * dev_weight_rx_bias; - dev_tx_weight = weight_p * dev_weight_tx_bias; + if (!ret && write) { + weight = READ_ONCE(weight_p); + WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); + WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); + } + mutex_unlock(&dev_weight_mutex); return ret; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d47b9689eba6..99b697ad2b98 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -409,7 +409,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = dev_tx_weight; + int quota = READ_ONCE(dev_tx_weight); int packets; while (qdisc_restart(q, &packets)) { -- cgit v1.2.3 From 5dcd08cd19912892586c6082d56718333e2d19db Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:46 -0700 Subject: net: Fix data-races around netdev_max_backlog. While reading netdev_max_backlog, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. While at it, we remove the unnecessary spaces in the doc. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/gro_cells.c | 2 +- net/xfrm/espintcp.c | 2 +- net/xfrm/xfrm_input.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b5b92dcd5eea..07da69c1ac0a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4624,7 +4624,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (netdev_max_backlog >> 1)) + if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4672,7 +4672,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 541c7a72a28a..21619c70a82b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -26,7 +26,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 82d14eea1b5a..974eb97b77d2 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -168,7 +168,7 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) { struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (skb_queue_len(&ctx->out_queue) >= netdev_max_backlog) + if (skb_queue_len(&ctx->out_queue) >= READ_ONCE(netdev_max_backlog)) return -ENOBUFS; __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 70a8c36f0ba6..b2f4ec9c537f 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -782,7 +782,7 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, trans = this_cpu_ptr(&xfrm_trans_tasklet); - if (skb_queue_len(&trans->queue) >= netdev_max_backlog) + if (skb_queue_len(&trans->queue) >= READ_ONCE(netdev_max_backlog)) return -ENOBUFS; BUILD_BUG_ON(sizeof(struct xfrm_trans_cb) > sizeof(skb->cb)); -- cgit v1.2.3 From 61adf447e38664447526698872e21c04623afb8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:47 -0700 Subject: net: Fix data-races around netdev_tstamp_prequeue. While reading netdev_tstamp_prequeue, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 3b098e2d7c69 ("net: Consistent skb timestamping") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 07da69c1ac0a..4705e6630efa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4928,7 +4928,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5281,7 +5281,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5664,7 +5664,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5694,7 +5694,7 @@ void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); -- cgit v1.2.3 From 7de6d09f51917c829af2b835aba8bb5040f8e86a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:49 -0700 Subject: net: Fix data-races around sysctl_optmem_max. While reading sysctl_optmem_max, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/bpf_sk_storage.c | 5 +++-- net/core/filter.c | 9 +++++---- net/core/sock.c | 8 +++++--- net/ipv4/ip_sockglue.c | 6 +++--- net/ipv6/ipv6_sockglue.c | 4 ++-- 5 files changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 1b7f385643b4..94374d529ea4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -310,11 +310,12 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { + int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + if (size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { atomic_add(size, &sk->sk_omem_alloc); return 0; } diff --git a/net/core/filter.c b/net/core/filter.c index c4f14ad82029..c191db80ce93 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1214,10 +1214,11 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); + int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ - if (filter_size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + if (filter_size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } @@ -1548,7 +1549,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > sysctl_optmem_max) + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1615,7 +1616,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { err = -ENOMEM; goto err_prog_put; } diff --git a/net/core/sock.c b/net/core/sock.c index 303af52f3b79..95abf4604d88 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2536,7 +2536,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - sysctl_optmem_max) + READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2554,8 +2554,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned int)size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + int optmem_max = READ_ONCE(sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a8a323ecbb54..e49a61a053a6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -772,7 +772,7 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > sysctl_optmem_max) + if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -808,7 +808,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > sysctl_optmem_max - 4) + if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); @@ -1233,7 +1233,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, if (optlen < IP_MSFILTER_SIZE(0)) goto e_inval; - if (optlen > sysctl_optmem_max) { + if (optlen > READ_ONCE(sysctl_optmem_max)) { err = -ENOBUFS; break; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 222f6bf220ba..e0dcc7a193df 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -210,7 +210,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; - if (optlen > sysctl_optmem_max) + if (optlen > READ_ONCE(sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); @@ -244,7 +244,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (optlen < size0) return -EINVAL; - if (optlen > sysctl_optmem_max - 4) + if (optlen > READ_ONCE(sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); -- cgit v1.2.3 From d2154b0afa73c0159b2856f875c6b4fe7cf6a95e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:50 -0700 Subject: net: Fix a data-race around sysctl_tstamp_allow_data. While reading sysctl_tstamp_allow_data, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: b245be1f4db1 ("net-timestamp: no-payload only sysctl") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5ea1d074a920..84bb5e188d0d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4797,7 +4797,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(sysctl_tstamp_allow_data || tsonly)) + if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) return true; read_lock_bh(&sk->sk_callback_lock); -- cgit v1.2.3 From e59ef36f0795696ab229569c153936bfd068d21c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:52 -0700 Subject: net: Fix a data-race around sysctl_net_busy_read. While reading sysctl_net_busy_read, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 2d48d67fa8cd ("net: poll/select low latency socket support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 95abf4604d88..788c1372663c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3367,7 +3367,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; -- cgit v1.2.3 From 2e0c42374ee32e72948559d2ae2f7ba3dc6b977c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:53 -0700 Subject: net: Fix a data-race around netdev_budget. While reading netdev_budget, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 51b0bdedb8e7 ("[NET]: Separate two usages of netdev_max_backlog.") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4705e6630efa..c83e23cfc57d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6666,7 +6666,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + usecs_to_jiffies(netdev_budget_usecs); - int budget = netdev_budget; + int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); -- cgit v1.2.3 From 657b991afb89d25fe6c4783b1b75a8ad4563670d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:54 -0700 Subject: net: Fix data-races around sysctl_max_skb_frags. While reading sysctl_max_skb_frags, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 5f74f82ea34c ("net:Add sysctl_max_skb_frags") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- net/mptcp/protocol.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bbe218753662..e5011c136fdb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1000,7 +1000,7 @@ new_segment: i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } @@ -1354,7 +1354,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= sysctl_max_skb_frags) { + if (i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index da4257504fad..d398f3810662 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1263,7 +1263,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); - if (!can_coalesce && i >= sysctl_max_skb_frags) { + if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } -- cgit v1.2.3 From fa45d484c52c73f79db2c23b0cdfc6c6455093ad Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:55 -0700 Subject: net: Fix a data-race around netdev_budget_usecs. While reading netdev_budget_usecs, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 7acf8a1e8a28 ("Replace 2 jiffies with sysctl netdev_budget_usecs to enable softirq tuning") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c83e23cfc57d..8221322d86db 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6665,7 +6665,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(netdev_budget_usecs); + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); -- cgit v1.2.3 From a5612ca10d1aa05624ebe72633e0c8c792970833 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:57 -0700 Subject: net: Fix data-races around sysctl_devconf_inherit_init_net. While reading sysctl_devconf_inherit_init_net, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 856c395cfa63 ("net: introduce a knob to control whether to inherit devconf config") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 16 ++++++++++------ net/ipv6/addrconf.c | 5 ++--- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 92b778e423df..e8b9a9202fec 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2682,23 +2682,27 @@ static __net_init int devinet_init_net(struct net *net) #endif if (!net_eq(net, &init_net)) { - if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 3) { + switch (net_inherit_devconf()) { + case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); - } else if (!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) { - /* inherit == 0 or 1: copy from init_net */ + break; + case 0: + case 1: + /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + break; + case 2: + /* use compiled values */ + break; } - /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b624e3d8c5f0..e15f64f22fa8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7162,9 +7162,8 @@ static int __net_init addrconf_init_net(struct net *net) if (!dflt) goto err_alloc_dflt; - if (IS_ENABLED(CONFIG_SYSCTL) && - !net_eq(net, &init_net)) { - switch (sysctl_devconf_inherit_init_net) { + if (!net_eq(net, &init_net)) { + switch (net_inherit_devconf()) { case 1: /* copy from init_net */ memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); -- cgit v1.2.3 From 05e49cfc89e4f325eebbc62d24dd122e55f94c23 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:46:59 -0700 Subject: net: Fix a data-race around netdev_unregister_timeout_secs. While reading netdev_unregister_timeout_secs, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 5aa3afe107d9 ("net: make unregister netdev warning timeout configurable") Signed-off-by: Kuniyuki Iwashima Acked-by: Dmitry Vyukov Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 8221322d86db..56c8b0921c9f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10284,7 +10284,7 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) return dev; if (time_after(jiffies, warning_time + - netdev_unregister_timeout_secs * HZ)) { + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); -- cgit v1.2.3 From 3c9ba81d72047f2e81bb535d42856517b613aba7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 23 Aug 2022 10:47:00 -0700 Subject: net: Fix a data-race around sysctl_somaxconn. While reading sysctl_somaxconn, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 9b27c5e4e5ba..7378375d3a5b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1801,7 +1801,7 @@ int __sys_listen(int fd, int backlog) sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock) { - somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn; + somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); if ((unsigned int)backlog > somaxconn) backlog = somaxconn; -- cgit v1.2.3 From 60deb9f10eec5c6a20252ed36238b55d8b614a2c Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Sat, 20 Aug 2022 01:33:40 +0530 Subject: wifi: mac80211: Fix UAF in ieee80211_scan_rx() ieee80211_scan_rx() tries to access scan_req->flags after a null check, but a UAF is observed when the scan is completed and __ieee80211_scan_completed() executes, which then calls cfg80211_scan_done() leading to the freeing of scan_req. Since scan_req is rcu_dereference()'d, prevent the racing in __ieee80211_scan_completed() by ensuring that from mac80211's POV it is no longer accessed from an RCU read critical section before we call cfg80211_scan_done(). Cc: stable@vger.kernel.org Link: https://syzkaller.appspot.com/bug?extid=f9acff9bf08a845f225d Reported-by: syzbot+f9acff9bf08a845f225d@syzkaller.appspotmail.com Suggested-by: Johannes Berg Signed-off-by: Siddh Raman Pant Link: https://lore.kernel.org/r/20220819200340.34826-1-code@siddh.me Signed-off-by: Johannes Berg --- net/mac80211/scan.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fa8ddf576bc1..c4f2aeb31da3 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -469,16 +469,19 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->mtx)); - if (scan_req != local->int_scan_req) { - local->scan_info.aborted = aborted; - cfg80211_scan_done(scan_req, &local->scan_info); - } RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; + synchronize_rcu(); + + if (scan_req != local->int_scan_req) { + local->scan_info.aborted = aborted; + cfg80211_scan_done(scan_req, &local->scan_info); + } + /* Set power back to normal operating levels. */ ieee80211_hw_config(local, 0); -- cgit v1.2.3 From 36fe8e4e5cb02131719612aea1e64379670d1846 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 23 Aug 2022 15:22:23 +0200 Subject: wifi: mac80211: always free sta in __sta_info_alloc in case of error Free sta pointer in __sta_info_alloc routine if sta_info_alloc_link() fails. Fixes: 246b39e4a1ba5 ("wifi: mac80211: refactor some sta_info link handling") Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/a3d079208684cddbc25289f7f7e0fed795b0cad4.1661260857.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cb23da9aff1e..330dab41f2fe 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -494,7 +494,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) - return NULL; + goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, -- cgit v1.2.3 From 62b03f45c6352410d13bf0710d24ef6290632eb1 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 18 Aug 2022 12:33:49 +0800 Subject: wifi: mac80211: fix possible leak in ieee80211_tx_control_port() Add missing dev_kfree_skb() in an error path in ieee80211_tx_control_port() to avoid a memory leak. Fixes: dd820ed6336a ("wifi: mac80211: return error from control port TX for drops") Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20220818043349.4168835-1-yangyingliang@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 45df9932d0ba..594bd70ee641 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5885,6 +5885,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, rcu_read_lock(); err = ieee80211_lookup_ra_sta(sdata, skb, &sta); if (err) { + dev_kfree_skb(skb); rcu_read_unlock(); return err; } -- cgit v1.2.3 From 15bc8966b6d3a5b9bfe4c9facfa02f2b69b1e5f0 Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Sun, 14 Aug 2022 20:45:12 +0530 Subject: wifi: mac80211: Don't finalize CSA in IBSS mode if state is disconnected When we are not connected to a channel, sending channel "switch" announcement doesn't make any sense. The BSS list is empty in that case. This causes the for loop in cfg80211_get_bss() to be bypassed, so the function returns NULL (check line 1424 of net/wireless/scan.c), causing the WARN_ON() in ieee80211_ibss_csa_beacon() to get triggered (check line 500 of net/mac80211/ibss.c), which was consequently reported on the syzkaller dashboard. Thus, check if we have an existing connection before generating the CSA beacon in ieee80211_ibss_finish_csa(). Cc: stable@vger.kernel.org Fixes: cd7760e62c2a ("mac80211: add support for CSA in IBSS mode") Link: https://syzkaller.appspot.com/bug?id=05603ef4ae8926761b678d2939a3b2ad28ab9ca6 Reported-by: syzbot+b6c9fe29aefe68e4ad34@syzkaller.appspotmail.com Signed-off-by: Siddh Raman Pant Tested-by: syzbot+b6c9fe29aefe68e4ad34@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20220814151512.9985-1-code@siddh.me Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index d56890e3fabb..9b283bbc7bb4 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -530,6 +530,10 @@ int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) sdata_assert_lock(sdata); + /* When not connected/joined, sending CSA doesn't make sense. */ + if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) + return -ENOLINK; + /* update cfg80211 bss information with the new channel */ if (!is_zero_ether_addr(ifibss->bssid)) { cbss = cfg80211_get_bss(sdata->local->hw.wiphy, -- cgit v1.2.3 From d776763f48084926b5d9e25507a3ddb7c9243d5e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Aug 2022 10:03:21 +0300 Subject: wifi: cfg80211: debugfs: fix return type in ht40allow_map_read() The return type is supposed to be ssize_t, which is signed long, but "r" was declared as unsigned int. This means that on 64 bit systems we return positive values instead of negative error codes. Fixes: 80a3511d70e8 ("cfg80211: add debugfs HT40 allow map") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YutvOQeJm0UjLhwU@kili Signed-off-by: Johannes Berg --- net/wireless/debugfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index aab43469a2f0..0878b162890a 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -65,9 +65,10 @@ static ssize_t ht40allow_map_read(struct file *file, { struct wiphy *wiphy = file->private_data; char *buf; - unsigned int offset = 0, buf_size = PAGE_SIZE, i, r; + unsigned int offset = 0, buf_size = PAGE_SIZE, i; enum nl80211_band band; struct ieee80211_supported_band *sband; + ssize_t r; buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) -- cgit v1.2.3 From 55f0a4894484e8d6ddf662f5aebbf3b4cb028541 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 28 Jul 2022 17:25:16 +0300 Subject: wifi: mac80211: potential NULL dereference in ieee80211_tx_control_port() The ieee80211_lookup_ra_sta() function will sometimes set "sta" to NULL so add this NULL check to prevent an Oops. Fixes: 9dd1953846c7 ("wifi: nl80211/mac80211: clarify link ID in control port TX") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YuKcTAyO94YOy0Bu@kili Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 594bd70ee641..bf7fe6cd9dfc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5900,7 +5900,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, * for MLO STA, the SA should be the AP MLD address, but * the link ID has been selected already */ - if (sta->sta.mlo) + if (sta && sta->sta.mlo) memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); } rcu_read_unlock(); -- cgit v1.2.3 From b0f571ecd7943423c25947439045f0d352ca3dbf Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Aug 2022 17:35:45 +0100 Subject: rxrpc: Fix locking in rxrpc's sendmsg Fix three bugs in the rxrpc's sendmsg implementation: (1) rxrpc_new_client_call() should release the socket lock when returning an error from rxrpc_get_call_slot(). (2) rxrpc_wait_for_tx_window_intr() will return without the call mutex held in the event that we're interrupted by a signal whilst waiting for tx space on the socket or relocking the call mutex afterwards. Fix this by: (a) moving the unlock/lock of the call mutex up to rxrpc_send_data() such that the lock is not held around all of rxrpc_wait_for_tx_window*() and (b) indicating to higher callers whether we're return with the lock dropped. Note that this means recvmsg() will not block on this call whilst we're waiting. (3) After dropping and regaining the call mutex, rxrpc_send_data() needs to go and recheck the state of the tx_pending buffer and the tx_total_len check in case we raced with another sendmsg() on the same call. Thinking on this some more, it might make sense to have different locks for sendmsg() and recvmsg(). There's probably no need to make recvmsg() wait for sendmsg(). It does mean that recvmsg() can return MSG_EOR indicating that a call is dead before a sendmsg() to that call returns - but that can currently happen anyway. Without fix (2), something like the following can be induced: WARNING: bad unlock balance detected! 5.16.0-rc6-syzkaller #0 Not tainted ------------------------------------- syz-executor011/3597 is trying to release lock (&call->user_mutex) at: [] rxrpc_do_sendmsg+0xc13/0x1350 net/rxrpc/sendmsg.c:748 but there are no more locks to release! other info that might help us debug this: no locks held by syz-executor011/3597. ... Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_unlock_imbalance_bug include/trace/events/lock.h:58 [inline] __lock_release kernel/locking/lockdep.c:5306 [inline] lock_release.cold+0x49/0x4e kernel/locking/lockdep.c:5657 __mutex_unlock_slowpath+0x99/0x5e0 kernel/locking/mutex.c:900 rxrpc_do_sendmsg+0xc13/0x1350 net/rxrpc/sendmsg.c:748 rxrpc_sendmsg+0x420/0x630 net/rxrpc/af_rxrpc.c:561 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2409 ___sys_sendmsg+0xf3/0x170 net/socket.c:2463 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2492 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae [Thanks to Hawkins Jiawei and Khalid Masum for their attempts to fix this] Fixes: bc5e3a546d55 ("rxrpc: Use MSG_WAITALL to tell sendmsg() to temporarily ignore signals") Reported-by: syzbot+7f0483225d0c94cb3441@syzkaller.appspotmail.com Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: syzbot+7f0483225d0c94cb3441@syzkaller.appspotmail.com cc: Hawkins Jiawei cc: Khalid Masum cc: Dan Carpenter cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/166135894583.600315.7170979436768124075.stgit@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- net/rxrpc/call_object.c | 4 ++- net/rxrpc/sendmsg.c | 92 +++++++++++++++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 84d0a4109645..6401cdf7a624 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -285,8 +285,10 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, p->user_call_ID); limiter = rxrpc_get_call_slot(p, gfp); - if (!limiter) + if (!limiter) { + release_sock(&rx->sk); return ERR_PTR(-ERESTARTSYS); + } call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id); if (IS_ERR(call)) { diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 1d38e279e2ef..3c3a626459de 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -51,10 +51,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, return sock_intr_errno(*timeo); trace_rxrpc_transmit(call, rxrpc_transmit_wait); - mutex_unlock(&call->user_mutex); *timeo = schedule_timeout(*timeo); - if (mutex_lock_interruptible(&call->user_mutex) < 0) - return sock_intr_errno(*timeo); } } @@ -290,37 +287,48 @@ out: static int rxrpc_send_data(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, size_t len, - rxrpc_notify_end_tx_t notify_end_tx) + rxrpc_notify_end_tx_t notify_end_tx, + bool *_dropped_lock) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; struct sock *sk = &rx->sk; + enum rxrpc_call_state state; long timeo; - bool more; - int ret, copied; + bool more = msg->msg_flags & MSG_MORE; + int ret, copied = 0; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); /* this should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); +reload: + ret = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) - return -EPIPE; - - more = msg->msg_flags & MSG_MORE; - + goto maybe_error; + state = READ_ONCE(call->state); + ret = -ESHUTDOWN; + if (state >= RXRPC_CALL_COMPLETE) + goto maybe_error; + ret = -EPROTO; + if (state != RXRPC_CALL_CLIENT_SEND_REQUEST && + state != RXRPC_CALL_SERVER_ACK_REQUEST && + state != RXRPC_CALL_SERVER_SEND_REPLY) + goto maybe_error; + + ret = -EMSGSIZE; if (call->tx_total_len != -1) { - if (len > call->tx_total_len) - return -EMSGSIZE; - if (!more && len != call->tx_total_len) - return -EMSGSIZE; + if (len - copied > call->tx_total_len) + goto maybe_error; + if (!more && len - copied != call->tx_total_len) + goto maybe_error; } skb = call->tx_pending; call->tx_pending = NULL; rxrpc_see_skb(skb, rxrpc_skb_seen); - copied = 0; do { /* Check to see if there's a ping ACK to reply to. */ if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) @@ -331,16 +339,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, _debug("alloc"); - if (!rxrpc_check_tx_space(call, NULL)) { - ret = -EAGAIN; - if (msg->msg_flags & MSG_DONTWAIT) - goto maybe_error; - ret = rxrpc_wait_for_tx_window(rx, call, - &timeo, - msg->msg_flags & MSG_WAITALL); - if (ret < 0) - goto maybe_error; - } + if (!rxrpc_check_tx_space(call, NULL)) + goto wait_for_space; /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded @@ -468,6 +468,27 @@ maybe_error: efault: ret = -EFAULT; goto out; + +wait_for_space: + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + mutex_unlock(&call->user_mutex); + *_dropped_lock = true; + ret = rxrpc_wait_for_tx_window(rx, call, &timeo, + msg->msg_flags & MSG_WAITALL); + if (ret < 0) + goto maybe_error; + if (call->interruptibility == RXRPC_INTERRUPTIBLE) { + if (mutex_lock_interruptible(&call->user_mutex) < 0) { + ret = sock_intr_errno(timeo); + goto maybe_error; + } + } else { + mutex_lock(&call->user_mutex); + } + *_dropped_lock = false; + goto reload; } /* @@ -629,6 +650,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) enum rxrpc_call_state state; struct rxrpc_call *call; unsigned long now, j; + bool dropped_lock = false; int ret; struct rxrpc_send_params p = { @@ -737,21 +759,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = rxrpc_send_abort_packet(call); } else if (p.command != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; - } else if (rxrpc_is_client_call(call) && - state != RXRPC_CALL_CLIENT_SEND_REQUEST) { - /* request phase complete for this client call */ - ret = -EPROTO; - } else if (rxrpc_is_service_call(call) && - state != RXRPC_CALL_SERVER_ACK_REQUEST && - state != RXRPC_CALL_SERVER_SEND_REPLY) { - /* Reply phase not begun or not complete for service call. */ - ret = -EPROTO; } else { - ret = rxrpc_send_data(rx, call, msg, len, NULL); + ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); } out_put_unlock: - mutex_unlock(&call->user_mutex); + if (!dropped_lock) + mutex_unlock(&call->user_mutex); error_put: rxrpc_put_call(call, rxrpc_call_put); _leave(" = %d", ret); @@ -779,6 +793,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, rxrpc_notify_end_tx_t notify_end_tx) { + bool dropped_lock = false; int ret; _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); @@ -796,7 +811,7 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, case RXRPC_CALL_SERVER_ACK_REQUEST: case RXRPC_CALL_SERVER_SEND_REPLY: ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len, - notify_end_tx); + notify_end_tx, &dropped_lock); break; case RXRPC_CALL_COMPLETE: read_lock_bh(&call->state_lock); @@ -810,7 +825,8 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, break; } - mutex_unlock(&call->user_mutex); + if (!dropped_lock) + mutex_unlock(&call->user_mutex); _leave(" = %d", ret); return ret; } -- cgit v1.2.3 From b82a26d8633cc89367fac75beb3ec33061bea44a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 7 Aug 2022 22:57:40 +0200 Subject: Bluetooth: hci_event: Fix vendor (unknown) opcode status handling Commit c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to handle Command Complete") was (presumably) meant to only refactor things without any functional changes. But it does have one undesirable side-effect, before *status would always be set to skb->data[0] and it might be overridden by some of the opcode specific handling. While now it always set by the opcode specific handlers. This means that if the opcode is not known *status does not get set any more at all! This behavior change has broken bluetooth support for BCM4343A0 HCIs, the hci_bcm.c code tries to configure UART attached HCIs at a higher baudraute using vendor specific opcodes. The BCM4343A0 does not support this and this used to simply fail: [ 25.646442] Bluetooth: hci0: BCM: failed to write clock (-56) [ 25.646481] Bluetooth: hci0: Failed to set baudrate After which things would continue with the initial baudraute. But now that hci_cmd_complete_evt() no longer sets status for unknown opcodes *status is left at 0. This causes the hci_bcm.c code to think the baudraute has been changed on the HCI side and to also adjust the UART baudrate, after which communication with the HCI is broken, leading to: [ 28.579042] Bluetooth: hci0: command 0x0c03 tx timeout [ 36.961601] Bluetooth: hci0: BCM: Reset failed (-110) And non working bluetooth. Fix this by restoring the previous default "*status = skb->data[0]" handling for unknown opcodes. Fixes: c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to handle Command Complete") Signed-off-by: Hans de Goede Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 485c814cf44a..73aa9ee9d21a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4179,6 +4179,17 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, } } + if (i == ARRAY_SIZE(hci_cc_table)) { + /* Unknown opcode, assume byte 0 contains the status, so + * that e.g. __hci_cmd_sync() properly returns errors + * for vendor specific commands send by HCI drivers. + * If a vendor doesn't actually follow this convention we may + * need to introduce a vendor CC table in order to properly set + * the status. + */ + *status = skb->data[0]; + } + handle_cmd_cnt_and_timer(hdev, ev->ncmd); hci_req_cmd_complete(hdev, *opcode, *status, req_complete, -- cgit v1.2.3 From 1fd02d56dae35b08e4cba8e6bd2c2e7ccff68ecc Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 11 Aug 2022 14:20:46 -0700 Subject: Bluetooth: hci_sync: Fix suspend performance regression This attempts to fix suspend performance when there is no connections by not updating the event mask. Fixes: ef61b6ea1544 ("Bluetooth: Always set event mask on suspend") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e6d804b82b67..5fe440cdc68d 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5288,17 +5288,21 @@ int hci_suspend_sync(struct hci_dev *hdev) /* Prevent disconnects from causing scanning to be re-enabled */ hci_pause_scan_sync(hdev); - /* Soft disconnect everything (power off) */ - err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); - if (err) { - /* Set state to BT_RUNNING so resume doesn't notify */ - hdev->suspend_state = BT_RUNNING; - hci_resume_sync(hdev); - return err; - } + if (hci_conn_count(hdev)) { + /* Soft disconnect everything (power off) */ + err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); + if (err) { + /* Set state to BT_RUNNING so resume doesn't notify */ + hdev->suspend_state = BT_RUNNING; + hci_resume_sync(hdev); + return err; + } - /* Update event mask so only the allowed event can wakeup the host */ - hci_set_event_mask_sync(hdev); + /* Update event mask so only the allowed event can wakeup the + * host. + */ + hci_set_event_mask_sync(hdev); + } /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. -- cgit v1.2.3 From b840304fb46cdf7012722f456bce06f151b3e81b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 12 Aug 2022 15:33:57 -0700 Subject: Bluetooth: L2CAP: Fix build errors in some archs This attempts to fix the follow errors: In function 'memcmp', inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9, inlined from 'l2cap_global_chan_by_psm' at net/bluetooth/l2cap_core.c:2003:15: ./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp' specified bound 6 exceeds source size 0 [-Werror=stringop-overread] 44 | #define __underlying_memcmp __builtin_memcmp | ^ ./include/linux/fortify-string.h:420:16: note: in expansion of macro '__underlying_memcmp' 420 | return __underlying_memcmp(p, q, size); | ^~~~~~~~~~~~~~~~~~~ In function 'memcmp', inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9, inlined from 'l2cap_global_chan_by_psm' at net/bluetooth/l2cap_core.c:2004:15: ./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp' specified bound 6 exceeds source size 0 [-Werror=stringop-overread] 44 | #define __underlying_memcmp __builtin_memcmp | ^ ./include/linux/fortify-string.h:420:16: note: in expansion of macro '__underlying_memcmp' 420 | return __underlying_memcmp(p, q, size); | ^~~~~~~~~~~~~~~~~~~ Fixes: 332f1795ca20 ("Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index cbe0cae73434..2c9de67daadc 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1992,11 +1992,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, src_match = !bacmp(&c->src, src); dst_match = !bacmp(&c->dst, dst); if (src_match && dst_match) { - c = l2cap_chan_hold_unless_zero(c); - if (c) { - read_unlock(&chan_list_lock); - return c; - } + if (!l2cap_chan_hold_unless_zero(c)) + continue; + + read_unlock(&chan_list_lock); + return c; } /* Closest match */ -- cgit v1.2.3 From 23b72814da1a094b4c065e0bb598249f310c5577 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 15 Aug 2022 16:14:32 -0700 Subject: Bluetooth: MGMT: Fix Get Device Flags Get Device Flags don't check if device does actually use an RPA in which case it shall only set HCI_CONN_FLAG_REMOTE_WAKEUP if LL Privacy is enabled. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 71 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6e31023b84f5..566b6291b38e 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4547,6 +4547,22 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_NOT_SUPPORTED); } +static u32 get_params_flags(struct hci_dev *hdev, + struct hci_conn_params *params) +{ + u32 flags = hdev->conn_flags; + + /* Devices using RPAs can only be programmed in the acceptlist if + * LL Privacy has been enable otherwise they cannot mark + * HCI_CONN_FLAG_REMOTE_WAKEUP. + */ + if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) && + hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) + flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP; + + return flags; +} + static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -4578,10 +4594,10 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, } else { params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); - if (!params) goto done; + supported_flags = get_params_flags(hdev, params); current_flags = params->flags; } @@ -4649,38 +4665,35 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)", &cp->addr.bdaddr, cp->addr.type); } - } else { - params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, - le_addr_type(cp->addr.type)); - if (params) { - /* Devices using RPAs can only be programmed in the - * acceptlist LL Privacy has been enable otherwise they - * cannot mark HCI_CONN_FLAG_REMOTE_WAKEUP. - */ - if ((current_flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && - !use_ll_privacy(hdev) && - hci_find_irk_by_addr(hdev, ¶ms->addr, - params->addr_type)) { - bt_dev_warn(hdev, - "Cannot set wakeable for RPA"); - goto unlock; - } - params->flags = current_flags; - status = MGMT_STATUS_SUCCESS; + goto unlock; + } - /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY - * has been set. - */ - if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) - hci_update_passive_scan(hdev); - } else { - bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", - &cp->addr.bdaddr, - le_addr_type(cp->addr.type)); - } + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + if (!params) { + bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", + &cp->addr.bdaddr, le_addr_type(cp->addr.type)); + goto unlock; + } + + supported_flags = get_params_flags(hdev, params); + + if ((supported_flags | current_flags) != supported_flags) { + bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", + current_flags, supported_flags); + goto unlock; } + params->flags = current_flags; + status = MGMT_STATUS_SUCCESS; + + /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY + * has been set. + */ + if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY) + hci_update_passive_scan(hdev); + unlock: hci_dev_unlock(hdev); -- cgit v1.2.3 From 3cfbc6ac22d62d0a06be9ce2996ba9fed75436cd Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 17 Aug 2022 20:14:36 +0900 Subject: Bluetooth: hci_sync: fix double mgmt_pending_free() in remove_adv_monitor() syzbot is reporting double kfree() at remove_adv_monitor() [1], for commit 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv Monitor") forgot to remove duplicated mgmt_pending_remove() when merging "if (err) {" path and "if (!pending) {" path. Link: https://syzkaller.appspot.com/bug?extid=915a8416bf15895b8e07 [1] Reported-by: syzbot Fixes: 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv Monitor") Signed-off-by: Tetsuo Handa Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 566b6291b38e..72e6595a71cc 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -5067,7 +5067,6 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev, else status = MGMT_STATUS_FAILED; - mgmt_pending_remove(cmd); goto unlock; } -- cgit v1.2.3 From c572909376673d8c126ae2ee53ba754ff9327d98 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 18 Aug 2022 14:31:42 -0700 Subject: Bluetooth: ISO: Fix not handling shutdown condition In order to properly handle shutdown syscall the code shall not assume that the how argument is always SHUT_RDWR resulting in SHUTDOWN_MASK as that would result in poll to immediately report EPOLLHUP instead of properly waiting for disconnect_cfm (Disconnect Complete) which is rather important for the likes of BAP as the CIG may need to be reprogrammed. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index ced8ad4fed4f..613039ba5dbf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1309,7 +1309,7 @@ static int iso_sock_shutdown(struct socket *sock, int how) struct sock *sk = sock->sk; int err = 0; - BT_DBG("sock %p, sk %p", sock, sk); + BT_DBG("sock %p, sk %p, how %d", sock, sk, how); if (!sk) return 0; @@ -1317,17 +1317,32 @@ static int iso_sock_shutdown(struct socket *sock, int how) sock_hold(sk); lock_sock(sk); - if (!sk->sk_shutdown) { - sk->sk_shutdown = SHUTDOWN_MASK; - iso_sock_clear_timer(sk); - __iso_sock_close(sk); - - if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && - !(current->flags & PF_EXITING)) - err = bt_sock_wait_state(sk, BT_CLOSED, - sk->sk_lingertime); + switch (how) { + case SHUT_RD: + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto unlock; + sk->sk_shutdown |= RCV_SHUTDOWN; + break; + case SHUT_WR: + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto unlock; + sk->sk_shutdown |= SEND_SHUTDOWN; + break; + case SHUT_RDWR: + if (sk->sk_shutdown & SHUTDOWN_MASK) + goto unlock; + sk->sk_shutdown |= SHUTDOWN_MASK; + break; } + iso_sock_clear_timer(sk); + __iso_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && + !(current->flags & PF_EXITING)) + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + +unlock: release_sock(sk); sock_put(sk); -- cgit v1.2.3 From f48735a9aaf8258f39918e13adf464ccd7dce33b Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Tue, 23 Aug 2022 12:39:22 +0800 Subject: Bluetooth: hci_event: Fix checking conn for le_conn_complete_evt To prevent multiple conn complete events, we shouldn't look up the conn with hci_lookup_le_connect, since it requires the state to be BT_CONNECT. By the time the duplicate event is processed, the state might have changed, so we end up processing the new event anyway. Change the lookup function to hci_conn_hash_lookup_ba. Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events") Signed-off-by: Archie Pusaka Reviewed-by: Sonny Sasaka Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 73aa9ee9d21a..6643c9c20fa4 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5801,7 +5801,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_lookup_le_connect(hdev); + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); if (!conn) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. -- cgit v1.2.3 From cb0d160f813b23df2f4fe7af3237a026c44ed1aa Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:02:07 +0200 Subject: Bluetooth: move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hidp/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 5940744a8cd8..cc20e706c639 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -83,14 +83,14 @@ static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo ci->product = session->input->id.product; ci->version = session->input->id.version; if (session->input->name) - strlcpy(ci->name, session->input->name, 128); + strscpy(ci->name, session->input->name, 128); else - strlcpy(ci->name, "HID Boot Device", 128); + strscpy(ci->name, "HID Boot Device", 128); } else if (session->hid) { ci->vendor = session->hid->vendor; ci->product = session->hid->product; ci->version = session->hid->version; - strlcpy(ci->name, session->hid->name, 128); + strscpy(ci->name, session->hid->name, 128); } } -- cgit v1.2.3 From 2da8eb834b775a9d1acea6214d3e4a78ac841e6e Mon Sep 17 00:00:00 2001 From: Zhengping Jiang Date: Tue, 23 Aug 2022 10:28:08 -0700 Subject: Bluetooth: hci_sync: hold hdev->lock when cleanup hci_conn When disconnecting all devices, hci_conn_failed is used to cleanup hci_conn object when the hci_conn object cannot be aborted. The function hci_conn_failed requires the caller holds hdev->lock. Fixes: 9b3628d79b46f ("Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted") Signed-off-by: Zhengping Jiang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 5fe440cdc68d..187786454d98 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4773,9 +4773,11 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) /* Cleanup hci_conn object if it cannot be cancelled as it * likelly means the controller and host stack are out of sync. */ - if (err) + if (err) { + hci_dev_lock(hdev); hci_conn_failed(conn, err); - + hci_dev_unlock(hdev); + } return err; case BT_CONNECT2: return hci_reject_conn_sync(hdev, conn, reason); -- cgit v1.2.3 From a87406f4adee9c53b311d8a1ba2849c69e29a6d0 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 25 Aug 2022 05:03:26 +0300 Subject: openvswitch: fix memory leak at failed datapath creation ovs_dp_cmd_new()->ovs_dp_change()->ovs_dp_set_upcall_portids() allocates array via kmalloc. If for some reason new_vport() fails during ovs_dp_cmd_new() dp->upcall_portids must be freed. Add missing kfree. Kmemleak example: unreferenced object 0xffff88800c382500 (size 64): comm "dump_state", pid 323, jiffies 4294955418 (age 104.347s) hex dump (first 32 bytes): 5e c2 79 e4 1f 7a 38 c7 09 21 38 0c 80 88 ff ff ^.y..z8..!8..... 03 00 00 00 0a 00 00 00 14 00 00 00 28 00 00 00 ............(... backtrace: [<0000000071bebc9f>] ovs_dp_set_upcall_portids+0x38/0xa0 [<000000000187d8bd>] ovs_dp_change+0x63/0xe0 [<000000002397e446>] ovs_dp_cmd_new+0x1f0/0x380 [<00000000aa06f36e>] genl_family_rcv_msg_doit+0xea/0x150 [<000000008f583bc4>] genl_rcv_msg+0xdc/0x1e0 [<00000000fa10e377>] netlink_rcv_skb+0x50/0x100 [<000000004959cece>] genl_rcv+0x24/0x40 [<000000004699ac7f>] netlink_unicast+0x23e/0x360 [<00000000c153573e>] netlink_sendmsg+0x24e/0x4b0 [<000000006f4aa380>] sock_sendmsg+0x62/0x70 [<00000000d0068654>] ____sys_sendmsg+0x230/0x270 [<0000000012dacf7d>] ___sys_sendmsg+0x88/0xd0 [<0000000011776020>] __sys_sendmsg+0x59/0xa0 [<000000002e8f2dc1>] do_syscall_64+0x3b/0x90 [<000000003243e7cb>] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: b83d23a2a38b ("openvswitch: Introduce per-cpu upcall dispatch") Acked-by: Aaron Conole Signed-off-by: Andrey Zhadchenko Link: https://lore.kernel.org/r/20220825020326.664073-1-andrey.zhadchenko@virtuozzo.com Signed-off-by: Jakub Kicinski --- net/openvswitch/datapath.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7e8a39a35627..6c9d153afbee 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1802,7 +1802,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - goto err_unlock_and_destroy_meters; + goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1817,6 +1817,8 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; +err_destroy_portids: + kfree(rcu_dereference_raw(dp->upcall_portids)); err_unlock_and_destroy_meters: ovs_unlock(); ovs_meters_exit(dp); -- cgit v1.2.3 From f0da47118c7e93cdbbc6fb403dd729a5f2c90ee3 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 26 Aug 2022 16:29:54 +0200 Subject: net: mac802154: Fix a condition in the receive path Upon reception, a packet must be categorized, either it's destination is the host, or it is another host. A packet with no destination addressing fields may be valid in two situations: - the packet has no source field: only ACKs are built like that, we consider the host as the destination. - the packet has a valid source field: it is directed to the PAN coordinator, as for know we don't have this information we consider we are not the PAN coordinator. There was likely a copy/paste error made during a previous cleanup because the if clause is now containing exactly the same condition as in the switch case, which can never be true. In the past the destination address was used in the switch and the source address was used in the if, which matches what the spec says. Cc: stable@vger.kernel.org Fixes: ae531b9475f6 ("ieee802154: use ieee802154_addr instead of *_sa variants") Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220826142954.254853-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b8ce84618a55..c439125ef2b9 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -44,7 +44,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: - if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE) + if (hdr->source.mode != IEEE802154_ADDR_NONE) /* FIXME: check if we are PAN coordinator */ skb->pkt_type = PACKET_OTHERHOST; else -- cgit v1.2.3 From 278d3ba61563ceed3cb248383ced19e14ec7bc1f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 13:36:45 +0200 Subject: net: Use u64_stats_fetch_begin_irq() for stats fetch. On 32bit-UP u64_stats_fetch_begin() disables only preemption. If the reader is in preemptible context and the writer side (u64_stats_update_begin*()) runs in an interrupt context (IRQ or softirq) then the writer can update the stats during the read operation. This update remains undetected. Use u64_stats_fetch_begin_irq() to ensure the stats fetch on 32bit-UP are not interrupted by a writer. 32bit-SMP remains unaffected by this change. Cc: "David S. Miller" Cc: Catherine Sullivan Cc: David Awogbemila Cc: Dimitris Michailidis Cc: Eric Dumazet Cc: Hans Ulli Kroll Cc: Jakub Kicinski Cc: Jeroen de Borst Cc: Johannes Berg Cc: Linus Walleij Cc: Paolo Abeni Cc: Simon Horman Cc: linux-arm-kernel@lists.infradead.org Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Cc: oss-drivers@corigine.com Cc: stable@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/mac80211/sta_info.c | 8 ++++---- net/mpls/af_mpls.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 330dab41f2fe..58998d821778 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2316,9 +2316,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, u64 value; do { - start = u64_stats_fetch_begin(&rxstats->syncp); + start = u64_stats_fetch_begin_irq(&rxstats->syncp); value = rxstats->msdu[tid]; - } while (u64_stats_fetch_retry(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); return value; } @@ -2384,9 +2384,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) u64 value; do { - start = u64_stats_fetch_begin(&rxstats->syncp); + start = u64_stats_fetch_begin_irq(&rxstats->syncp); value = rxstats->bytes; - } while (u64_stats_fetch_retry(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); return value; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 35b5f806fdda..b52afe316dc4 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, p = per_cpu_ptr(mdev->stats, i); do { - start = u64_stats_fetch_begin(&p->syncp); + start = u64_stats_fetch_begin_irq(&p->syncp); local = p->stats; - } while (u64_stats_fetch_retry(&p->syncp, start)); + } while (u64_stats_fetch_retry_irq(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; -- cgit v1.2.3 From b05972f01e7d30419987a1f221b5593668fd6448 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 26 Aug 2022 09:39:30 +0800 Subject: net: sched: tbf: don't call qdisc_put() while holding tree lock The issue is the same to commit c2999f7fb05b ("net: sched: multiq: don't call qdisc_put() while holding tree lock"). Qdiscs call qdisc_put() while holding sch tree spinlock, which results sleeping-while-atomic BUG. Fixes: c266f64dbfa2 ("net: sched: protect block state with mutex") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20220826013930.340121-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- net/sched/sch_tbf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 72102277449e..36079fdde2cb 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -356,6 +356,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_TBF_MAX + 1]; struct tc_tbf_qopt *qopt; struct Qdisc *child = NULL; + struct Qdisc *old = NULL; struct psched_ratecfg rate; struct psched_ratecfg peak; u64 max_size; @@ -447,7 +448,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); if (child) { qdisc_tree_flush_backlog(q->qdisc); - qdisc_put(q->qdisc); + old = q->qdisc; q->qdisc = child; } q->limit = qopt->limit; @@ -467,6 +468,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg)); sch_tree_unlock(sch); + qdisc_put(old); err = 0; tbf_offload_change(sch); -- cgit v1.2.3 From f612466ebecb12a00d9152344ddda6f6345f04dc Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Fri, 26 Aug 2022 17:00:55 +0800 Subject: net/sched: fix netdevice reference leaks in attach_default_qdiscs() In attach_default_qdiscs(), if a dev has multiple queues and queue 0 fails to attach qdisc because there is no memory in attach_one_default_qdisc(). Then dev->qdisc will be noop_qdisc by default. But the other queues may be able to successfully attach to default qdisc. In this case, the fallback to noqueue process will be triggered. If the original attached qdisc is not released and a new one is directly attached, this will cause netdevice reference leaks. The following is the bug log: veth0: default qdisc (fq_codel) fail, fallback to noqueue unregister_netdevice: waiting for veth0 to become free. Usage count = 32 leaked reference. qdisc_alloc+0x12e/0x210 qdisc_create_dflt+0x62/0x140 attach_one_default_qdisc.constprop.41+0x44/0x70 dev_activate+0x128/0x290 __dev_open+0x12a/0x190 __dev_change_flags+0x1a2/0x1f0 dev_change_flags+0x23/0x60 do_setlink+0x332/0x1150 __rtnl_newlink+0x52f/0x8e0 rtnl_newlink+0x43/0x70 rtnetlink_rcv_msg+0x140/0x3b0 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1bb/0x290 netlink_sendmsg+0x37c/0x4e0 sock_sendmsg+0x5f/0x70 ____sys_sendmsg+0x208/0x280 Fix this bug by clearing any non-noop qdiscs that may have been assigned before trying to re-attach. Fixes: bf6dba76d278 ("net: sched: fallback to qdisc noqueue if default qdisc setup fail") Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20220826090055.24424-1-wanghai38@huawei.com Signed-off-by: Paolo Abeni --- net/sched/sch_generic.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 99b697ad2b98..7a8ea03f673d 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1122,6 +1122,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, } EXPORT_SYMBOL(dev_graft_qdisc); +static void shutdown_scheduler_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_qdisc_default) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + struct Qdisc *qdisc_default = _qdisc_default; + + if (qdisc) { + rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + dev_queue->qdisc_sleeping = qdisc_default; + + qdisc_put(qdisc); + } +} + static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) @@ -1169,6 +1184,7 @@ static void attach_default_qdiscs(struct net_device *dev) if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); + netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); qdisc = txq->qdisc_sleeping; @@ -1447,21 +1463,6 @@ void dev_init_scheduler(struct net_device *dev) timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } -static void shutdown_scheduler_queue(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_qdisc_default) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - struct Qdisc *qdisc_default = _qdisc_default; - - if (qdisc) { - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - dev_queue->qdisc_sleeping = qdisc_default; - - qdisc_put(qdisc); - } -} - void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); -- cgit v1.2.3 From b118509076b39cc5e616c0680312b5caaca535fe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Aug 2022 08:49:16 +0200 Subject: netfilter: remove nf_conntrack_helper sysctl and modparam toggles __nf_ct_try_assign_helper() remains in place but it now requires a template to configure the helper. A toggle to disable automatic helper assignment was added by: a9006892643a ("netfilter: nf_ct_helper: allow to disable automatic helper assignment") in 2012 to address the issues described in "Secure use of iptables and connection tracking helpers". Automatic conntrack helper assignment was disabled by: 3bb398d925ec ("netfilter: nf_ct_helper: disable automatic helper assignment") back in 2016. This patch removes the sysctl and modparam toggles, users now have to rely on explicit conntrack helper configuration via ruleset. Update tools/testing/selftests/netfilter/nft_conntrack_helper.sh to check that auto-assignment does not happen anymore. Acked-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 7 +-- net/netfilter/nf_conntrack_helper.c | 80 +++++---------------------------- net/netfilter/nf_conntrack_netlink.c | 5 --- net/netfilter/nf_conntrack_standalone.c | 10 ----- net/netfilter/nft_ct.c | 3 -- 5 files changed, 11 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 71c2f4f95d36..1357a2729a4b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1782,7 +1782,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock_bh(&nf_conntrack_expect_lock); } - if (!exp) + if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); /* Other CPU might have obtained a pointer to this object before it was @@ -2068,10 +2068,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (ct->master || (help && !hlist_empty(&help->expectations))) return; - - rcu_read_lock(); - __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); @@ -2797,7 +2793,6 @@ int nf_conntrack_init_net(struct net *net) nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); - nf_conntrack_helper_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e96b32221444..ff737a76052e 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = false; -module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); -MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 0)"); - static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; @@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } -static struct nf_conntrack_helper * -__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) -{ - struct nf_conntrack_helper *helper; - struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - unsigned int h; - - if (!nf_ct_helper_count) - return NULL; - - h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) - return helper; - } - return NULL; -} - struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { @@ -209,33 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -static struct nf_conntrack_helper * -nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - if (!cnet->sysctl_auto_assign_helper) { - if (cnet->auto_assign_helper_warned) - return NULL; - if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) - return NULL; - pr_info("nf_conntrack: default automatic helper assignment " - "has been turned off for security reasons and CT-based " - "firewall rule not found. Use the iptables CT target " - "to attach helpers instead.\n"); - cnet->auto_assign_helper_warned = true; - return NULL; - } - - return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); -} - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - struct net *net = nf_ct_net(ct); /* We already got a helper explicitly attached. The function * nf_conntrack_alter_reply - in case NAT is in use - asks for looking @@ -246,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; - if (tmpl != NULL) { - help = nfct_help(tmpl); - if (help != NULL) { - helper = rcu_dereference(help->helper); - set_bit(IPS_HELPER_BIT, &ct->status); - } + if (WARN_ON_ONCE(!tmpl)) + return 0; + + help = nfct_help(tmpl); + if (help != NULL) { + helper = rcu_dereference(help->helper); + set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { - helper = nf_ct_lookup_helper(ct, net); - if (helper == NULL) { - if (help) - RCU_INIT_POINTER(help->helper, NULL); - return 0; - } + if (help) + RCU_INIT_POINTER(help->helper, NULL); + return 0; } if (help == NULL) { @@ -545,19 +498,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); -void nf_ct_set_auto_assign_helper_warned(struct net *net) -{ - nf_ct_pernet(net)->auto_assign_helper_warned = true; -} -EXPORT_SYMBOL_GPL(nf_ct_set_auto_assign_helper_warned); - -void nf_conntrack_helper_pernet_init(struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; -} - int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 04169b54f2a2..7562b215b932 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2298,11 +2298,6 @@ ctnetlink_create_conntrack(struct net *net, ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } - } else { - /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - if (err < 0) - goto err2; } err = ctnetlink_setup_nat(ct, cda); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 05895878610c..4ffe84c5a82c 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -561,7 +561,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, - NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif @@ -680,14 +679,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - [NF_SYSCTL_CT_HELPER] = { - .procname = "nf_conntrack_helper", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", @@ -1100,7 +1091,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b04995c3e17f..a3f01f209a53 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1089,9 +1089,6 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (err < 0) goto err_put_helper; - /* Avoid the bogus warning, helper will be assigned after CT init */ - nf_ct_set_auto_assign_helper_warned(ctx->net); - return 0; err_put_helper: -- cgit v1.2.3 From d047283a7034140ea5da759a494fd2274affdd46 Mon Sep 17 00:00:00 2001 From: Harsh Modi Date: Tue, 30 Aug 2022 22:36:03 -0700 Subject: netfilter: br_netfilter: Drop dst references before setting. The IPv6 path already drops dst in the daddr changed case, but the IPv4 path does not. This change makes the two code paths consistent. Further, it is possible that there is already a metadata_dst allocated from ingress that might already be attached to skbuff->dst while following the bridge path. If it is not released before setting a new metadata_dst, it will be leaked. This is similar to what is done in bpf_set_tunnel_key() or ip6_route_input(). It is important to note that the memory being leaked is not the dst being set in the bridge code, but rather memory allocated from some other code path that is not being freed correctly before the skb dst is overwritten. An example of the leakage fixed by this commit found using kmemleak: unreferenced object 0xffff888010112b00 (size 256): comm "softirq", pid 0, jiffies 4294762496 (age 32.012s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 80 16 f1 83 ff ff ff ff ................ e1 4e f6 82 ff ff ff ff 00 00 00 00 00 00 00 00 .N.............. backtrace: [<00000000d79567ea>] metadata_dst_alloc+0x1b/0xe0 [<00000000be113e13>] udp_tun_rx_dst+0x174/0x1f0 [<00000000a36848f4>] geneve_udp_encap_recv+0x350/0x7b0 [<00000000d4afb476>] udp_queue_rcv_one_skb+0x380/0x560 [<00000000ac064aea>] udp_unicast_rcv_skb+0x75/0x90 [<000000009a8ee8c5>] ip_protocol_deliver_rcu+0xd8/0x230 [<00000000ef4980bb>] ip_local_deliver_finish+0x7a/0xa0 [<00000000d7533c8c>] __netif_receive_skb_one_core+0x89/0xa0 [<00000000a879497d>] process_backlog+0x93/0x190 [<00000000e41ade9f>] __napi_poll+0x28/0x170 [<00000000b4c0906b>] net_rx_action+0x14f/0x2a0 [<00000000b20dd5d4>] __do_softirq+0xf4/0x305 [<000000003a7d7e15>] __irq_exit_rcu+0xc3/0x140 [<00000000968d39a2>] sysvec_apic_timer_interrupt+0x9e/0xc0 [<000000009e920794>] asm_sysvec_apic_timer_interrupt+0x16/0x20 [<000000008942add0>] native_safe_halt+0x13/0x20 Florian Westphal says: "Original code was likely fine because nothing ever did set a skb->dst entry earlier than bridge in those days." Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Harsh Modi Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 2 ++ net/bridge/br_netfilter_ipv6.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index ff4779036649..f20f4373ff40 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -384,6 +384,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (rt->dst.dev == dev) { + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); goto bridged_dnat; } @@ -413,6 +414,7 @@ bridged_dnat: kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index e4e0c836c3f5..6b07f30675bb 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -197,6 +197,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } -- cgit v1.2.3 From 77972a36ecc4db7fc7c68f0e80714263c5f03f65 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 31 Aug 2022 13:11:47 +0200 Subject: netfilter: nf_tables: clean up hook list when offload flags check fails splice back the hook list so nft_chain_release_hook() has a chance to release the hooks. BUG: memory leak unreferenced object 0xffff88810180b100 (size 96): comm "syz-executor133", pid 3619, jiffies 4294945714 (age 12.690s) hex dump (first 32 bytes): 28 64 23 02 81 88 ff ff 28 64 23 02 81 88 ff ff (d#.....(d#..... 90 a8 aa 83 ff ff ff ff 00 00 b5 0f 81 88 ff ff ................ backtrace: [] kmalloc include/linux/slab.h:600 [inline] [] nft_netdev_hook_alloc+0x3b/0xc0 net/netfilter/nf_tables_api.c:1901 [] nft_chain_parse_netdev net/netfilter/nf_tables_api.c:1998 [inline] [] nft_chain_parse_hook+0x33a/0x530 net/netfilter/nf_tables_api.c:2073 [] nf_tables_addchain.constprop.0+0x10b/0x950 net/netfilter/nf_tables_api.c:2218 [] nf_tables_newchain+0xa8b/0xc60 net/netfilter/nf_tables_api.c:2593 [] nfnetlink_rcv_batch+0xa46/0xd20 net/netfilter/nfnetlink.c:517 [] nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:638 [inline] [] nfnetlink_rcv+0x1f9/0x220 net/netfilter/nfnetlink.c:656 [] netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] [] netlink_unicast+0x397/0x4c0 net/netlink/af_netlink.c:1345 [] netlink_sendmsg+0x396/0x710 net/netlink/af_netlink.c:1921 [] sock_sendmsg_nosec net/socket.c:714 [inline] [] sock_sendmsg+0x56/0x80 net/socket.c:734 [] ____sys_sendmsg+0x36c/0x390 net/socket.c:2482 [] ___sys_sendmsg+0xa8/0x110 net/socket.c:2536 [] __sys_sendmsg+0x88/0x100 net/socket.c:2565 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: d54725cd11a5 ("netfilter: nf_tables: support for multiple devices per netdev hook") Reported-by: syzbot+5fcdbfab6d6744c57418@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2ee50e23c9b7..816052089b33 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2166,8 +2166,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - !nft_chain_offload_support(basechain)) + !nft_chain_offload_support(basechain)) { + list_splice_init(&basechain->hook_list, &hook->list); return -EOPNOTSUPP; + } flow_block_init(&basechain->flow_block); -- cgit v1.2.3 From 8fc29ff3910f3af08a7c40a75d436b5720efe2bf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 27 Aug 2022 11:13:14 -0700 Subject: kcm: fix strp_init() order and cleanup strp_init() is called just a few lines above this csk->sk_user_data check, it also initializes strp->work etc., therefore, it is unnecessary to call strp_done() to cancel the freshly initialized work. And if sk_user_data is already used by KCM, psock->strp should not be touched, particularly strp->work state, so we need to move strp_init() after the csk->sk_user_data check. This also makes a lockdep warning reported by syzbot go away. Reported-and-tested-by: syzbot+9fc084a4348493ef65d2@syzkaller.appspotmail.com Reported-by: syzbot+e696806ef96cdd2d87cd@syzkaller.appspotmail.com Fixes: e5571240236c ("kcm: Check if sk_user_data already set in kcm_attach") Fixes: dff8baa26117 ("kcm: Call strp_stop before strp_done in kcm_attach") Cc: Tom Herbert Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20220827181314.193710-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/kcm/kcmsock.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 71899e5a5a11..1215c863e1c4 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1412,12 +1412,6 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - err = strp_init(&psock->strp, csk, &cb); - if (err) { - kmem_cache_free(kcm_psockp, psock); - goto out; - } - write_lock_bh(&csk->sk_callback_lock); /* Check if sk_user_data is already by KCM or someone else. @@ -1425,13 +1419,18 @@ static int kcm_attach(struct socket *sock, struct socket *csock, */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); - strp_stop(&psock->strp); - strp_done(&psock->strp); kmem_cache_free(kcm_psockp, psock); err = -EALREADY; goto out; } + err = strp_init(&psock->strp, csk, &cb); + if (err) { + write_unlock_bh(&csk->sk_callback_lock); + kmem_cache_free(kcm_psockp, psock); + goto out; + } + psock->save_data_ready = csk->sk_data_ready; psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; -- cgit v1.2.3 From 90fabae8a2c225c4e4936723c38857887edde5cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 31 Aug 2022 11:21:03 +0200 Subject: sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the GSO splitting feature of sch_cake is enabled, GSO superpackets will be broken up and the resulting segments enqueued in place of the original skb. In this case, CAKE calls consume_skb() on the original skb, but still returns NET_XMIT_SUCCESS. This can confuse parent qdiscs into assuming the original skb still exists, when it really has been freed. Fix this by adding the __NET_XMIT_STOLEN flag to the return value in this case. Fixes: 0c850344d388 ("sch_cake: Conditionally split GSO segments") Signed-off-by: Toke Høiland-Jørgensen Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231 Link: https://lore.kernel.org/r/20220831092103.442868-1-toke@toke.dk Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a43a58a73d09..a04928082e4a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1713,6 +1713,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } idx--; flow = &b->flows[idx]; + ret = NET_XMIT_SUCCESS; /* ensure shaper state isn't stale */ if (!b->tin_backlog) { @@ -1771,6 +1772,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); + ret |= __NET_XMIT_STOLEN; } else { /* not splitting */ cobalt_set_enqueue_time(skb, now); @@ -1904,7 +1906,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } b->drop_overlimit += dropped; } - return NET_XMIT_SUCCESS; + return ret; } static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) -- cgit v1.2.3 From 0efe125cfb99e6773a7434f3463f7c2fa28f3a43 Mon Sep 17 00:00:00 2001 From: David Leadbeater Date: Fri, 26 Aug 2022 14:56:58 +1000 Subject: netfilter: nf_conntrack_irc: Fix forged IP logic Ensure the match happens in the right direction, previously the destination used was the server, not the NAT host, as the comment shows the code intended. Additionally nf_nat_irc uses port 0 as a signal and there's no valid way it can appear in a DCC message, so consider port 0 also forged. Fixes: 869f37d8e48f ("[NETFILTER]: nf_conntrack/nf_nat: add IRC helper port") Signed-off-by: David Leadbeater Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_irc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1796c456ac98..992decbcaa5c 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -194,8 +194,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, /* dcc_ip can be the internal OR external (NAT'ed) IP */ tuple = &ct->tuplehash[dir].tuple; - if (tuple->src.u3.ip != dcc_ip && - tuple->dst.u3.ip != dcc_ip) { + if ((tuple->src.u3.ip != dcc_ip && + ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) || + dcc_port == 0) { net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", &tuple->src.u3.ip, &dcc_ip, dcc_port); -- cgit v1.2.3 From eb55dc09b5dd040232d5de32812cc83001a23da6 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 29 Aug 2022 12:01:21 +0200 Subject: ip: fix triggering of 'icmp redirect' __mkroute_input() uses fib_validate_source() to trigger an icmp redirect. My understanding is that fib_validate_source() is used to know if the src address and the gateway address are on the same link. For that, fib_validate_source() returns 1 (same link) or 0 (not the same network). __mkroute_input() is the only user of these positive values, all other callers only look if the returned value is negative. Since the below patch, fib_validate_source() didn't return anymore 1 when both addresses are on the same network, because the route lookup returns RT_SCOPE_LINK instead of RT_SCOPE_HOST. But this is, in fact, right. Let's adapat the test to return 1 again when both addresses are on the same link. CC: stable@vger.kernel.org Fixes: 747c14307214 ("ip: fix dflt addr selection for connected nexthop") Reported-by: kernel test robot Reported-by: Heng Qi Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220829100121.3821-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f361d3d56be2..943edf4ad4db 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; return ret; } if (no_addr) @@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; } return ret; -- cgit v1.2.3 From 52267ce25f60f37ae40ccbca0b21328ebae5ae75 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 30 Aug 2022 18:34:48 +0200 Subject: net: dsa: hellcreek: Print warning only once In case the source port cannot be decoded, print the warning only once. This still brings attention to the user and does not spam the logs at the same time. Signed-off-by: Kurt Kanzenbach Reviewed-by: Andrew Lunn Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220830163448.8921-1-kurt@linutronix.de Signed-off-by: Jakub Kicinski --- net/dsa/tag_hellcreek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index eb204ad36eee..846588c0070a 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -45,7 +45,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, skb->dev = dsa_master_find_slave(dev, 0, port); if (!skb->dev) { - netdev_warn(dev, "Failed to get source port: %d\n", port); + netdev_warn_once(dev, "Failed to get source port: %d\n", port); return NULL; } -- cgit v1.2.3 From 8c70521238b7863c2af607e20bcba20f974c969b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2022 11:56:55 -0700 Subject: tcp: annotate data-race around challenge_timestamp challenge_timestamp can be read an written by concurrent threads. This was expected, but we need to annotate the race to avoid potential issues. Following patch moves challenge_timestamp and challenge_count to per-netns storage to provide better isolation. Fixes: 354e4aa391ed ("tcp: RFC 5961 5.2 Blind Data Injection Attack Mitigation") Reported-by: syzbot Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ab5f0ea166f1..c184e15397a2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3629,11 +3629,11 @@ static void tcp_send_challenge_ack(struct sock *sk) /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; - if (now != challenge_timestamp) { + if (now != READ_ONCE(challenge_timestamp)) { u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); u32 half = (ack_limit + 1) >> 1; - challenge_timestamp = now; + WRITE_ONCE(challenge_timestamp, now); WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); } count = READ_ONCE(challenge_count); -- cgit v1.2.3 From 79e3602caa6f9d59c4f66a268407080496dae408 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Aug 2022 11:56:56 -0700 Subject: tcp: make global challenge ack rate limitation per net-ns and default disabled Because per host rate limiting has been proven problematic (side channel attacks can be based on it), per host rate limiting of challenge acks ideally should be per netns and turned off by default. This is a long due followup of following commits: 083ae308280d ("tcp: enable per-socket rate limiting of all 'challenge acks'") f2b2c582e824 ("tcp: mitigate ACK loops for connections as tcp_sock") 75ff39ccc1bd ("tcp: make challenge acks less predictable") Signed-off-by: Eric Dumazet Cc: Jason Baron Acked-by: Neal Cardwell Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 21 +++++++++++---------- net/ipv4/tcp_ipv4.c | 6 ++++-- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c184e15397a2..b85a9f755da4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3614,12 +3614,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk) { - /* unprotected vars, we dont care of overwrites */ - static u32 challenge_timestamp; - static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - u32 count, now; + u32 count, now, ack_limit; /* First check our per-socket dupack rate limit. */ if (__tcp_oow_rate_limited(net, @@ -3627,18 +3624,22 @@ static void tcp_send_challenge_ack(struct sock *sk) &tp->last_oow_ack_time)) return; + ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); + if (ack_limit == INT_MAX) + goto send_ack; + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; - if (now != READ_ONCE(challenge_timestamp)) { - u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit); + if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) { u32 half = (ack_limit + 1) >> 1; - WRITE_ONCE(challenge_timestamp, now); - WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); + WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now); + WRITE_ONCE(net->ipv4.tcp_challenge_count, half + prandom_u32_max(ack_limit)); } - count = READ_ONCE(challenge_count); + count = READ_ONCE(net->ipv4.tcp_challenge_count); if (count > 0) { - WRITE_ONCE(challenge_count, count - 1); + WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); +send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0c83780dc9bf..5b019ba2b9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3139,8 +3139,10 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tso_win_divisor = 3; /* Default TSQ limit of 16 TSO segments */ net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; - /* rfc5961 challenge ack rate limiting */ - net->ipv4.sysctl_tcp_challenge_ack_limit = 1000; + + /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */ + net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; + net->ipv4.sysctl_tcp_min_tso_segs = 2; net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */ net->ipv4.sysctl_tcp_min_rtt_wlen = 300; -- cgit v1.2.3 From 0b4f688d53fdc2a731b9d9cdf0c96255bc024ea6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2022 20:01:32 -0700 Subject: Revert "sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb" This reverts commit 90fabae8a2c225c4e4936723c38857887edde5cc. Patch was applied hastily, revert and let the v2 be reviewed. Fixes: 90fabae8a2c2 ("sch_cake: Return __NET_XMIT_STOLEN when consuming enqueued skb") Link: https://lore.kernel.org/all/87wnao2ha3.fsf@toke.dk/ Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index a04928082e4a..a43a58a73d09 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1713,7 +1713,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } idx--; flow = &b->flows[idx]; - ret = NET_XMIT_SUCCESS; /* ensure shaper state isn't stale */ if (!b->tin_backlog) { @@ -1772,7 +1771,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); - ret |= __NET_XMIT_STOLEN; } else { /* not splitting */ cobalt_set_enqueue_time(skb, now); @@ -1906,7 +1904,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } b->drop_overlimit += dropped; } - return ret; + return NET_XMIT_SUCCESS; } static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) -- cgit v1.2.3 From a8424a9b4522a3ab9f32175ad6d848739079071f Mon Sep 17 00:00:00 2001 From: Yacan Liu Date: Tue, 30 Aug 2022 23:23:14 +0800 Subject: net/smc: Remove redundant refcount increase For passive connections, the refcount increment has been done in smc_clcsock_accept()-->smc_sock_alloc(). Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Signed-off-by: Yacan Liu Reviewed-by: Tony Lu Link: https://lore.kernel.org/r/20220830152314.838736-1-liuyacan@corp.netease.com Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 79c1318af1fe..0939cc3b915a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1855,7 +1855,6 @@ static void smc_listen_out_connected(struct smc_sock *new_smc) { struct sock *newsmcsk = &new_smc->sk; - sk_refcnt_debug_inc(newsmcsk); if (newsmcsk->sk_state == SMC_INIT) newsmcsk->sk_state = SMC_ACTIVE; -- cgit v1.2.3 From ac56a0b48da86fd1b4389632fb7c4c8a5d86eefa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Aug 2022 15:39:28 +0100 Subject: rxrpc: Fix ICMP/ICMP6 error handling Because rxrpc pretends to be a tunnel on top of a UDP/UDP6 socket, allowing it to siphon off UDP packets early in the handling of received UDP packets thereby avoiding the packet going through the UDP receive queue, it doesn't get ICMP packets through the UDP ->sk_error_report() callback. In fact, it doesn't appear that there's any usable option for getting hold of ICMP packets. Fix this by adding a new UDP encap hook to distribute error messages for UDP tunnels. If the hook is set, then the tunnel driver will be able to see ICMP packets. The hook provides the offset into the packet of the UDP header of the original packet that caused the notification. An alternative would be to call the ->error_handler() hook - but that requires that the skbuff be cloned (as ip_icmp_error() or ipv6_cmp_error() do, though isn't really necessary or desirable in rxrpc's case is we want to parse them there and then, not queue them). Changes ======= ver #3) - Fixed an uninitialised variable. ver #2) - Fixed some missing CONFIG_AF_RXRPC_IPV6 conditionals. Fixes: 5271953cad31 ("rxrpc: Use the UDP encap_rcv hook") Signed-off-by: David Howells --- net/ipv4/udp.c | 2 + net/ipv4/udp_tunnel_core.c | 1 + net/ipv6/udp.c | 5 +- net/rxrpc/ar-internal.h | 1 + net/rxrpc/local_object.c | 1 + net/rxrpc/peer_event.c | 293 +++++++++++++++++++++++++++++++++++++++------ 6 files changed, 265 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 34eda973bbf1..cd72158e953a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -783,6 +783,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) */ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); goto out; } if (!inet->recverr) { diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 8efaf8c3fe2a..8242c8947340 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 16c176e7c69a..3366d6a77ff2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -616,8 +616,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } /* Tunnels don't have an application socket: don't pass errors back */ - if (tunnel) + if (tunnel) { + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, offset); goto out; + } if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 571436064cd6..62c70709d798 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -982,6 +982,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 96ecb7356c0f..79bb02eb67b2 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -137,6 +137,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_input_packet; + tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index be032850ae8c..32561e9567fe 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -16,22 +16,105 @@ #include #include #include +#include #include "ar-internal.h" +static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); static void rxrpc_distribute_error(struct rxrpc_peer *, int, enum rxrpc_call_completion); /* - * Find the peer associated with an ICMP packet. + * Find the peer associated with an ICMPv4 packet. */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, - const struct sk_buff *skb, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, struct sockaddr_rxrpc *srx) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct iphdr *ip, *ip0 = ip_hdr(skb); + struct icmphdr *icmp = icmp_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); - _enter(""); + _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); + + switch (icmp->type) { + case ICMP_DEST_UNREACH: + *info = ntohs(icmp->un.frag.mtu); + fallthrough; + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + ip = (struct iphdr *)((void *)icmp + 8); + break; + default: + return NULL; + } + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ + switch (srx->transport.family) { + case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; +#endif + + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} + +#ifdef CONFIG_AF_RXRPC_IPV6 +/* + * Find the peer associated with an ICMPv6 packet. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, + struct sockaddr_rxrpc *srx) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); + + _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); + + switch (icmp->icmp6_type) { + case ICMPV6_DEST_UNREACH: + *info = ntohl(icmp->icmp6_mtu); + fallthrough; + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + ip = (struct ipv6hdr *)((void *)icmp + 8); + break; + default: + return NULL; + } memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; @@ -41,6 +124,165 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice * versa? */ + switch (srx->transport.family) { + case AF_INET: + _net("Rx ICMP6 on v4 sock"); + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, + &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); + break; + case AF_INET6: + _net("Rx ICMP6"); + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, + sizeof(struct in6_addr)); + break; + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} +#endif /* CONFIG_AF_RXRPC_IPV6 */ + +/* + * Handle an error received on the local endpoint as a tunnel. + */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, + unsigned int udp_offset) +{ + struct sock_extended_err ee; + struct sockaddr_rxrpc srx; + struct rxrpc_local *local; + struct rxrpc_peer *peer; + unsigned int info = 0; + int err; + u8 version = ip_hdr(skb)->version; + u8 type = icmp_hdr(skb)->type; + u8 code = icmp_hdr(skb)->code; + + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } + + rxrpc_new_skb(skb, rxrpc_skb_received); + + switch (ip_hdr(skb)->version) { + case IPVERSION: + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, + &info, &srx); + break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, + &info, &srx); + break; +#endif + default: + rcu_read_unlock(); + return; + } + + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); + return; + } + + memset(&ee, 0, sizeof(ee)); + + switch (version) { + case IPVERSION: + switch (type) { + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_FRAG_NEEDED: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + default: + break; + } + + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + /* Might want to do something different with + * non-fatal errors + */ + //harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + err = EPROTO; + break; + } + + ee.ee_origin = SO_EE_ORIGIN_ICMP; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + switch (type) { + case ICMPV6_PKT_TOOBIG: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + } + + icmpv6_err_convert(type, code, &err); + + if (err == EACCES) + err = EHOSTUNREACH; + + ee.ee_origin = SO_EE_ORIGIN_ICMP6; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; +#endif + } + + trace_rxrpc_rx_icmp(peer, &ee, &srx); + + rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); + rcu_read_unlock(); + rxrpc_put_peer(peer); +} + +/* + * Find the peer associated with a local error. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, + const struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + + _enter(""); + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + switch (srx->transport.family) { case AF_INET: srx->transport_len = sizeof(srx->transport.sin); @@ -104,10 +346,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* * Handle an MTU/fragmentation problem. */ -static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) +static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { - u32 mtu = serr->ee.ee_info; - _net("Rx ICMP Fragmentation Needed (%d)", mtu); /* wind down the local interface MTU */ @@ -148,7 +388,7 @@ void rxrpc_error_report(struct sock *sk) struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; struct rxrpc_local *local; - struct rxrpc_peer *peer; + struct rxrpc_peer *peer = NULL; struct sk_buff *skb; rcu_read_lock(); @@ -172,41 +412,20 @@ void rxrpc_error_report(struct sock *sk) } rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); - if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { - _leave("UDP empty message"); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - return; - } - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - _leave(" [no peer]"); - return; - } - - trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); - - if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && - serr->ee.ee_type == ICMP_DEST_UNREACH && - serr->ee.ee_code == ICMP_FRAG_NEEDED)) { - rxrpc_adjust_mtu(peer, serr); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - rxrpc_put_peer(peer); - _leave(" [MTU update]"); - return; + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { + peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (peer) { + trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); + rxrpc_store_error(peer, serr); + } } - rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); - _leave(""); } -- cgit v1.2.3 From 0d40f728e28393a8817d1fcae923dfa3409e488c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 24 Aug 2022 22:39:28 +0100 Subject: rxrpc: Fix an insufficiently large sglist in rxkad_verify_packet_2() rxkad_verify_packet_2() has a small stack-allocated sglist of 4 elements, but if that isn't sufficient for the number of fragments in the socket buffer, we try to allocate an sglist large enough to hold all the fragments. However, for large packets with a lot of fragments, this isn't sufficient and we need at least one additional fragment. The problem manifests as skb_to_sgvec() returning -EMSGSIZE and this then getting returned by userspace. Most of the time, this isn't a problem as rxrpc sets a limit of 5692, big enough for 4 jumbo subpackets to be glued together; occasionally, however, the server will ignore the reported limit and give a packet that's a lot bigger - say 19852 bytes with ->nr_frags being 7. skb_to_sgvec() then tries to return a "zeroth" fragment that seems to occur before the fragments counted by ->nr_frags and we hit the end of the sglist too early. Note that __skb_to_sgvec() also has an skb_walk_frags() loop that is recursive up to 24 deep. I'm not sure if I need to take account of that too - or if there's an easy way of counting those frags too. Fix this by counting an extra frag and allocating a larger sglist based on that. Fixes: d0d5c0cd1e71 ("rxrpc: Use skb_unshare() rather than skb_cow_data()") Reported-by: Marc Dionne Signed-off-by: David Howells cc: linux-afs@lists.infradead.org --- net/rxrpc/rxkad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 258917a714c8..78fa0524156f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -540,7 +540,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, * directly into the target buffer. */ sg = _sg; - nsg = skb_shinfo(skb)->nr_frags; + nsg = skb_shinfo(skb)->nr_frags + 1; if (nsg <= 4) { nsg = 4; } else { -- cgit v1.2.3 From d3d863036d688313f8d566b87acd7d99daf82749 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 May 2022 23:55:21 +0100 Subject: rxrpc: Fix local destruction being repeated If the local processor work item for the rxrpc local endpoint gets requeued by an event (such as an incoming packet) between it getting scheduled for destruction and the UDP socket being closed, the rxrpc_local_destroyer() function can get run twice. The second time it can hang because it can end up waiting for cleanup events that will never happen. Signed-off-by: David Howells --- net/rxrpc/local_object.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 79bb02eb67b2..38ea98ff426b 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -406,6 +406,9 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; + if (local->dead) + return; + trace_rxrpc_local(local->debug_id, rxrpc_local_processing, refcount_read(&local->ref), NULL); -- cgit v1.2.3 From 214a9dc7d852216e83acac7b75bc18f01ce184c2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Apr 2022 13:34:09 +0100 Subject: rxrpc: Fix calc of resend age Fix the calculation of the resend age to add a microsecond value as microseconds, not nanoseconds. Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f8ecad2b730e..2a93e7b5fbd0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -166,7 +166,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); now = ktime_get_real(); - max_age = ktime_sub(now, jiffies_to_usecs(call->peer->rto_j)); + max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); spin_lock_bh(&call->lock); -- cgit v1.2.3 From 21457f4a91cb522f1a3ad9741ff1d25fadfaa3c5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 31 Aug 2022 13:24:44 +0100 Subject: rxrpc: Remove rxrpc_get_reply_time() which is no longer used Remove rxrpc_get_reply_time() as that is no longer used now that the call issue time is used instead of the reply time. Signed-off-by: David Howells --- net/rxrpc/recvmsg.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 250f23bc1c07..7e39c262fd79 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -771,46 +771,3 @@ call_complete: goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data); - -/** - * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet - * @sock: The socket that the call exists on - * @call: The call to query - * @_ts: Where to put the timestamp - * - * Retrieve the timestamp from the first DATA packet of the reply if it is - * in the ring. Returns true if successful, false if not. - */ -bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call, - ktime_t *_ts) -{ - struct sk_buff *skb; - rxrpc_seq_t hard_ack, top, seq; - bool success = false; - - mutex_lock(&call->user_mutex); - - if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY) - goto out; - - hard_ack = call->rx_hard_ack; - if (hard_ack != 0) - goto out; - - seq = hard_ack + 1; - top = smp_load_acquire(&call->rx_top); - if (after(seq, top)) - goto out; - - skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK]; - if (!skb) - goto out; - - *_ts = skb_get_ktime(skb); - success = true; - -out: - mutex_unlock(&call->user_mutex); - return success; -} -EXPORT_SYMBOL(rxrpc_kernel_get_reply_time); -- cgit v1.2.3 From 17814819ac9829a437e06fbb5c7056a1f4f893da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 31 Aug 2022 17:28:13 -0400 Subject: SUNRPC: Fix call completion races with call_decode() We need to make sure that the req->rq_private_buf is completely up to date before we make req->rq_reply_bytes_recvd visible to the call_decode() routine in order to avoid triggering the WARN_ON(). Reported-by: Benjamin Coddington Fixes: 72691a269f0b ("SUNRPC: Don't reuse bvec on retransmission of the request") Tested-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d71eec494826..f8fae7815649 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1179,11 +1179,8 @@ xprt_request_dequeue_receive_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) xprt_request_rb_remove(req->rq_xprt, req); - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_private_buf.bvec = NULL; - } } /** @@ -1221,6 +1218,8 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; + xdr_free_bvec(&req->rq_rcv_buf); + req->rq_private_buf.bvec = NULL; req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ @@ -1453,6 +1452,7 @@ xprt_request_dequeue_xprt(struct rpc_task *task) xprt_request_dequeue_transmit_locked(task); xprt_request_dequeue_receive_locked(task); spin_unlock(&xprt->queue_lock); + xdr_free_bvec(&req->rq_rcv_buf); } } -- cgit v1.2.3 From 9efd23297cca530bb35e1848665805d3fcdd7889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 31 Aug 2022 23:52:18 +0200 Subject: sch_sfb: Don't assume the skb is still around after enqueueing to child MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sch_sfb enqueue() routine assumes the skb is still alive after it has been enqueued into a child qdisc, using the data in the skb cb field in the increment_qlen() routine after enqueue. However, the skb may in fact have been freed, causing a use-after-free in this case. In particular, this happens if sch_cake is used as a child of sfb, and the GSO splitting mode of CAKE is enabled (in which case the skb will be split into segments and the original skb freed). Fix this by copying the sfb cb data to the stack before enqueueing the skb, and using this stack copy in increment_qlen() instead of the skb pointer itself. Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-18231 Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_sfb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 3d061a13d7ed..0d761f454ae8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) } } -static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; - sfbhash = sfb_hash(skb, 0); + sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); - sfbhash = sfb_hash(skb, 1); + sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } @@ -283,6 +283,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sfb_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; + struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -399,11 +400,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } enqueue: + memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; - increment_qlen(skb, q); + increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); -- cgit v1.2.3 From e2b224abd9bf45dcb55750479fc35970725a430b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 31 Aug 2022 17:47:56 +0300 Subject: tipc: fix shift wrapping bug in map_get() There is a shift wrapping bug in this code so anything thing above 31 will return false. Fixes: 35c55c9877f8 ("tipc: add neighbor monitoring framework") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/tipc/monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 2f4d23238a7e..9618e4429f0f 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -160,7 +160,7 @@ static void map_set(u64 *up_map, int i, unsigned int v) static int map_get(u64 up_map, int i) { - return (up_map & (1 << i)) >> i; + return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) -- cgit v1.2.3 From 3261400639463a853ba2b3be8bd009c2a8089775 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 23:38:09 +0000 Subject: tcp: TX zerocopy should not sense pfmemalloc status We got a recent syzbot report [1] showing a possible misuse of pfmemalloc page status in TCP zerocopy paths. Indeed, for pages coming from user space or other layers, using page_is_pfmemalloc() is moot, and possibly could give false positives. There has been attempts to make page_is_pfmemalloc() more robust, but not using it in the first place in this context is probably better, removing cpu cycles. Note to stable teams : You need to backport 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") as a prereq. Race is more probable after commit c07aea3ef4d4 ("mm: add a signature in struct page") because page_is_pfmemalloc() is now using low order bit from page->lru.next, which can change more often than page->index. Low order bit should never be set for lru.next (when used as an anchor in LRU list), so KCSAN report is mostly a false positive. Backporting to older kernel versions seems not necessary. [1] BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0: __list_add include/linux/list.h:73 [inline] list_add include/linux/list.h:88 [inline] lruvec_add_folio include/linux/mm_inline.h:105 [inline] lru_add_fn+0x440/0x520 mm/swap.c:228 folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246 folio_batch_add_and_move mm/swap.c:263 [inline] folio_add_lru+0xf1/0x140 mm/swap.c:490 filemap_add_folio+0xf8/0x150 mm/filemap.c:948 __filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981 pagecache_get_page+0x26/0x190 mm/folio-compat.c:104 grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116 ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988 generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738 ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270 ext4_file_write_iter+0x2e3/0x1210 call_write_iter include/linux/fs.h:2187 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x468/0x760 fs/read_write.c:578 ksys_write+0xe8/0x1a0 fs/read_write.c:631 __do_sys_write fs/read_write.c:643 [inline] __se_sys_write fs/read_write.c:640 [inline] __x64_sys_write+0x3e/0x50 fs/read_write.c:640 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1: page_is_pfmemalloc include/linux/mm.h:1740 [inline] __skb_fill_page_desc include/linux/skbuff.h:2422 [inline] skb_fill_page_desc include/linux/skbuff.h:2443 [inline] tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018 do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075 tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline] tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833 kernel_sendpage+0x184/0x300 net/socket.c:3561 sock_sendpage+0x5a/0x70 net/socket.c:1054 pipe_to_sendpage+0x128/0x160 fs/splice.c:361 splice_from_pipe_feed fs/splice.c:415 [inline] __splice_from_pipe+0x222/0x4d0 fs/splice.c:559 splice_from_pipe fs/splice.c:594 [inline] generic_splice_sendpage+0x89/0xc0 fs/splice.c:743 do_splice_from fs/splice.c:764 [inline] direct_splice_actor+0x80/0xa0 fs/splice.c:931 splice_direct_to_actor+0x305/0x620 fs/splice.c:886 do_splice_direct+0xfb/0x180 fs/splice.c:974 do_sendfile+0x3bf/0x910 fs/read_write.c:1249 __do_sys_sendfile64 fs/read_write.c:1317 [inline] __se_sys_sendfile64 fs/read_write.c:1303 [inline] __x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0xffffea0004a1d288 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022 Fixes: c07aea3ef4d4 ("mm: add a signature in struct page") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Shakeel Butt Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- net/ipv4/tcp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index 7255531f63ae..e4ff2db40c98 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, page_ref_sub(last_head, refs); refs = 0; } - skb_fill_page_desc(skb, frag++, head, start, size); + skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e5011c136fdb..6cdfce6f2867 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1015,7 +1015,7 @@ new_segment: skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); + skb_fill_page_desc_noacc(skb, i, page, offset, copy); } if (!(flags & MSG_NO_SHARED_FRAGS)) -- cgit v1.2.3 From be318363daa2939453b4d80981de3e9c28b66135 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 1 Sep 2022 17:24:13 -0700 Subject: Bluetooth: hci_sync: Fix hci_read_buffer_size_sync hci_read_buffer_size_sync shall not use HCI_OP_LE_READ_BUFFER_SIZE_V2 sinze that is LE specific, instead it is hci_le_read_buffer_size_sync version that shall use it. Link: https://bugzilla.kernel.org/show_bug.cgi?id=216382 Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 187786454d98..fbd5613eebfc 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3018,12 +3018,6 @@ static const struct hci_init_stage amp_init2[] = { /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { - /* Use Read LE Buffer Size V2 if supported */ - if (hdev->commands[41] & 0x20) - return __hci_cmd_sync_status(hdev, - HCI_OP_LE_READ_BUFFER_SIZE_V2, - 0, NULL, HCI_CMD_TIMEOUT); - return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } @@ -3237,6 +3231,12 @@ static const struct hci_init_stage hci_init2[] = { /* Read LE Buffer Size */ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { + /* Use Read LE Buffer Size V2 if supported */ + if (hdev->commands[41] & 0x20) + return __hci_cmd_sync_status(hdev, + HCI_OP_LE_READ_BUFFER_SIZE_V2, + 0, NULL, HCI_CMD_TIMEOUT); + return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } -- cgit v1.2.3 From 4a86c5462616e0d690ad3c94dc84c3b5f1ea5631 Mon Sep 17 00:00:00 2001 From: Mukesh Sisodiya Date: Fri, 2 Sep 2022 16:11:31 +0200 Subject: wifi: mac80211: fix link warning in RX agg timer expiry The rx data link pointer isn't set from the RX aggregation timer, resulting in a later warning. Fix that by setting it to the first valid link for now, with a FIXME to worry about statistics later, it's not very important since it's just the timeout case. Reported-by: Hans de Goede Link: https://lore.kernel.org/r/498d714c-76be-9d04-26db-a1206878de5e@redhat.com Fixes: 56057da4569b ("wifi: mac80211: rx: track link in RX data") Signed-off-by: Mukesh Sisodiya Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 57df21e2170a..45d7e71661e3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4074,6 +4074,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) .link_id = -1, }; struct tid_ampdu_rx *tid_agg_rx; + u8 link_id; tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) @@ -4093,6 +4094,9 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) }; drv_event_callback(rx.local, rx.sdata, &event); } + /* FIXME: statistics won't be right with this */ + link_id = sta->sta.valid_links ? ffs(sta->sta.valid_links) - 1 : 0; + rx.link = rcu_dereference(sta->sdata->link[link_id]); ieee80211_rx_handlers(&rx, &frames); } -- cgit v1.2.3 From 7a2c6d1616be5d49c0dae2c876af3fe20e71a111 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:11:15 +0200 Subject: wifi: mac80211: mlme: release deflink channel in error case In the prep_channel error case we didn't release the deflink channel leaving it to be left around. Fix that. Change-Id: If0dfd748125ec46a31fc6045a480dc28e03723d2 Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3d4ab711f0d1..4c40f0427e88 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6509,6 +6509,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return 0; out_err: + ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0); return err; } -- cgit v1.2.3 From 69371801f929ff4b3c846a45b9d49db631897cd9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:11:14 +0200 Subject: wifi: mac80211: fix locking in auth/assoc timeout If we hit an authentication or association timeout, we only release the chanctx for the deflink, and the other link(s) are released later by ieee80211_vif_set_links(), but we're not locking this correctly. Fix the locking here while releasing the channels and links. Change-Id: I9e08c1a5434592bdc75253c1abfa6c788f9f39b1 Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4c40f0427e88..5265d2b6db12 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3420,11 +3420,11 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; + mutex_lock(&sdata->local->mtx); ieee80211_link_release_channel(&sdata->deflink); - mutex_unlock(&sdata->local->mtx); - ieee80211_vif_set_links(sdata, 0); + mutex_unlock(&sdata->local->mtx); } cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); @@ -3462,10 +3462,6 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.flags = 0; sdata->vif.bss_conf.mu_mimo_owner = false; - mutex_lock(&sdata->local->mtx); - ieee80211_link_release_channel(&sdata->deflink); - mutex_unlock(&sdata->local->mtx); - if (status != ASSOC_REJECTED) { struct cfg80211_assoc_failure data = { .timeout = status == ASSOC_TIMEOUT, @@ -3484,7 +3480,10 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, cfg80211_assoc_failure(sdata->dev, &data); } + mutex_lock(&sdata->local->mtx); + ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0); + mutex_unlock(&sdata->local->mtx); } kfree(assoc_data); -- cgit v1.2.3 From 2aec909912da55a6e469fd6ee8412080a5433ed2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 29 Aug 2022 11:46:38 +0200 Subject: wifi: use struct_group to copy addresses We sometimes copy all the addresses from the 802.11 header for the AAD, which may cause complaints from fortify checks. Use struct_group() to avoid the compiler warnings/errors. Change-Id: Ic3ea389105e7813b22095b295079eecdabde5045 Signed-off-by: Johannes Berg --- net/mac80211/wpa.c | 4 ++-- net/wireless/lib80211_crypt_ccmp.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 93ec2f349748..20f742b5503b 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -351,7 +351,7 @@ static u8 ccmp_gcmp_aad(struct sk_buff *skb, u8 *aad) * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(len_a, &aad[0]); put_unaligned(mask_fc, (__le16 *)&aad[2]); - memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN); + memcpy(&aad[4], &hdr->addrs, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; @@ -792,7 +792,7 @@ static void bip_aad(struct sk_buff *skb, u8 *aad) IEEE80211_FCTL_MOREDATA); put_unaligned(mask_fc, (__le16 *) &aad[0]); /* A1 || A2 || A3 */ - memcpy(aad + 2, &hdr->addr1, 3 * ETH_ALEN); + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); } diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c index 6a5f08f7491e..cca5e1cf089e 100644 --- a/net/wireless/lib80211_crypt_ccmp.c +++ b/net/wireless/lib80211_crypt_ccmp.c @@ -136,7 +136,7 @@ static int ccmp_init_iv_and_aad(const struct ieee80211_hdr *hdr, pos = (u8 *) hdr; aad[0] = pos[0] & 0x8f; aad[1] = pos[1] & 0xc7; - memcpy(aad + 2, hdr->addr1, 3 * ETH_ALEN); + memcpy(aad + 2, &hdr->addrs, 3 * ETH_ALEN); pos = (u8 *) & hdr->seq_ctrl; aad[20] = pos[0] & 0x0f; aad[21] = 0; /* all bits masked */ -- cgit v1.2.3 From fd16eb948ea8b28afb03e11a5b11841e6ac2aa2b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 30 Aug 2022 17:37:21 +0800 Subject: bonding: add all node mcast address when slave up When a link is enslave to bond, it need to set the interface down first. This makes the slave remove mac multicast address 33:33:00:00:00:01(The IPv6 multicast address ff02::1 is kept even when the interface down). When bond set the slave up, ipv6_mc_up() was not called due to commit c2edacf80e15 ("bonding / ipv6: no addrconf for slaves separately from master"). This is not an issue before we adding the lladdr target feature for bonding, as the mac multicast address will be added back when bond interface up and join group ff02::1. But after adding lladdr target feature for bonding. When user set a lladdr target, the unsolicited NA message with all-nodes multicast dest will be dropped as the slave interface never add 33:33:00:00:00:01 back. Fix this by calling ipv6_mc_up() to add 33:33:00:00:00:01 back when the slave interface up. Reported-by: LiLiang Fixes: 5e1eeef69c0f ("bonding: NS target should accept link local address") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e15f64f22fa8..10ce86bf228e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3557,11 +3557,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, fallthrough; case NETDEV_UP: case NETDEV_CHANGE: - if (dev->flags & IFF_SLAVE) + if (idev && idev->cnf.disable_ipv6) break; - if (idev && idev->cnf.disable_ipv6) + if (dev->flags & IFF_SLAVE) { + if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && + dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) + ipv6_mc_up(idev); break; + } if (event == NETDEV_UP) { /* restore routes for permanent addresses */ -- cgit v1.2.3 From 84a53580c5d2138c7361c7c3eea5b31827e63b35 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Fri, 2 Sep 2022 10:45:06 +0100 Subject: ipv6: sr: fix out-of-bounds read when setting HMAC data. The SRv6 layer allows defining HMAC data that can later be used to sign IPv6 Segment Routing Headers. This configuration is realised via netlink through four attributes: SEG6_ATTR_HMACKEYID, SEG6_ATTR_SECRET, SEG6_ATTR_SECRETLEN and SEG6_ATTR_ALGID. Because the SECRETLEN attribute is decoupled from the actual length of the SECRET attribute, it is possible to provide invalid combinations (e.g., secret = "", secretlen = 64). This case is not checked in the code and with an appropriately crafted netlink message, an out-of-bounds read of up to 64 bytes (max secret length) can occur past the skb end pointer and into skb_shared_info: Breakpoint 1, seg6_genl_sethmac (skb=, info=) at net/ipv6/seg6.c:208 208 memcpy(hinfo->secret, secret, slen); (gdb) bt #0 seg6_genl_sethmac (skb=, info=) at net/ipv6/seg6.c:208 #1 0xffffffff81e012e9 in genl_family_rcv_msg_doit (skb=skb@entry=0xffff88800b1f9f00, nlh=nlh@entry=0xffff88800b1b7600, extack=extack@entry=0xffffc90000ba7af0, ops=ops@entry=0xffffc90000ba7a80, hdrlen=4, net=0xffffffff84237580 , family=, family=) at net/netlink/genetlink.c:731 #2 0xffffffff81e01435 in genl_family_rcv_msg (extack=0xffffc90000ba7af0, nlh=0xffff88800b1b7600, skb=0xffff88800b1f9f00, family=0xffffffff82fef6c0 ) at net/netlink/genetlink.c:775 #3 genl_rcv_msg (skb=0xffff88800b1f9f00, nlh=0xffff88800b1b7600, extack=0xffffc90000ba7af0) at net/netlink/genetlink.c:792 #4 0xffffffff81dfffc3 in netlink_rcv_skb (skb=skb@entry=0xffff88800b1f9f00, cb=cb@entry=0xffffffff81e01350 ) at net/netlink/af_netlink.c:2501 #5 0xffffffff81e00919 in genl_rcv (skb=0xffff88800b1f9f00) at net/netlink/genetlink.c:803 #6 0xffffffff81dff6ae in netlink_unicast_kernel (ssk=0xffff888010eec800, skb=0xffff88800b1f9f00, sk=0xffff888004aed000) at net/netlink/af_netlink.c:1319 #7 netlink_unicast (ssk=ssk@entry=0xffff888010eec800, skb=skb@entry=0xffff88800b1f9f00, portid=portid@entry=0, nonblock=) at net/netlink/af_netlink.c:1345 #8 0xffffffff81dff9a4 in netlink_sendmsg (sock=, msg=0xffffc90000ba7e48, len=) at net/netlink/af_netlink.c:1921 ... (gdb) p/x ((struct sk_buff *)0xffff88800b1f9f00)->head + ((struct sk_buff *)0xffff88800b1f9f00)->end $1 = 0xffff88800b1b76c0 (gdb) p/x secret $2 = 0xffff88800b1b76c0 (gdb) p slen $3 = 64 '@' The OOB data can then be read back from userspace by dumping HMAC state. This commit fixes this by ensuring SECRETLEN cannot exceed the actual length of SECRET. Reported-by: Lucas Leong Tested: verified that EINVAL is correctly returned when secretlen > len(secret) Fixes: 4f4853dc1c9c1 ("ipv6: sr: implement API to control SR HMAC structure") Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 73aaabf0e966..0b0e34ddc64e 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -191,6 +191,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } + if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { + err = -EINVAL; + goto out_unlock; + } + if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) -- cgit v1.2.3 From 686dc2db2a0fdc1d34b424ec2c0a735becd8d62b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 3 Sep 2022 08:10:23 -0400 Subject: tcp: fix early ETIMEDOUT after spurious non-SACK RTO Fix a bug reported and analyzed by Nagaraj Arankal, where the handling of a spurious non-SACK RTO could cause a connection to fail to clear retrans_stamp, causing a later RTO to very prematurely time out the connection with ETIMEDOUT. Here is the buggy scenario, expanding upon Nagaraj Arankal's excellent report: (*1) Send one data packet on a non-SACK connection (*2) Because no ACK packet is received, the packet is retransmitted and we enter CA_Loss; but this retransmission is spurious. (*3) The ACK for the original data is received. The transmitted packet is acknowledged. The TCP timestamp is before the retrans_stamp, so tcp_may_undo() returns true, and tcp_try_undo_loss() returns true without changing state to Open (because tcp_is_sack() is false), and tcp_process_loss() returns without calling tcp_try_undo_recovery(). Normally after undoing a CA_Loss episode, tcp_fastretrans_alert() would see that the connection has returned to CA_Open and fall through and call tcp_try_to_open(), which would set retrans_stamp to 0. However, for non-SACK connections we hold the connection in CA_Loss, so do not fall through to call tcp_try_to_open() and do not set retrans_stamp to 0. So retrans_stamp is (erroneously) still non-zero. At this point the first "retransmission event" has passed and been recovered from. Any future retransmission is a completely new "event". However, retrans_stamp is erroneously still set. (And we are still in CA_Loss, which is correct.) (*4) After 16 minutes (to correspond with tcp_retries2=15), a new data packet is sent. Note: No data is transmitted between (*3) and (*4) and we disabled keep alives. The socket's timeout SHOULD be calculated from this point in time, but instead it's calculated from the prior "event" 16 minutes ago (step (*2)). (*5) Because no ACK packet is received, the packet is retransmitted. (*6) At the time of the 2nd retransmission, the socket returns ETIMEDOUT, prematurely, because retrans_stamp is (erroneously) too far in the past (set at the time of (*2)). This commit fixes this bug by ensuring that we reuse in tcp_try_undo_loss() the same careful logic for non-SACK connections that we have in tcp_try_undo_recovery(). To avoid duplicating logic, we factor out that logic into a new tcp_is_non_sack_preventing_reopen() helper and call that helper from both undo functions. Fixes: da34ac7626b5 ("tcp: only undo on partial ACKs in CA_Loss") Reported-by: Nagaraj Arankal Link: https://lore.kernel.org/all/SJ0PR84MB1847BE6C24D274C46A1B9B0EB27A9@SJ0PR84MB1847.NAMPRD84.PROD.OUTLOOK.COM/ Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220903121023.866900-1-ncardwell.kernel@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp_input.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b85a9f755da4..bc2ea12221f9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2513,6 +2513,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp) return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + return true; + } + return false; +} + /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { @@ -2535,14 +2550,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { - /* Hold old state until something *above* high_seq - * is ACKed. For Reno it is MUST to prevent false - * fast retransmits (RFC2582). SACK TCP is safe. */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) return true; - } tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; @@ -2578,6 +2587,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) + return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; -- cgit v1.2.3 From 39aebedeaaa95757f5c1f2ddb5f43fdddbf478ca Mon Sep 17 00:00:00 2001 From: Igor Ryzhov Date: Wed, 5 Jun 2019 12:32:40 +0300 Subject: netfilter: nf_conntrack_sip: fix ct_sip_walk_headers ct_sip_next_header and ct_sip_get_header return an absolute value of matchoff, not a shift from current dataoff. So dataoff should be assigned matchoff, not incremented by it. This issue can be seen in the scenario when there are multiple Contact headers and the first one is using a hostname and other headers use IP addresses. In this case, ct_sip_walk_headers will work as follows: The first ct_sip_get_header call to will find the first Contact header but will return -1 as the header uses a hostname. But matchoff will be changed to the offset of this header. After that, dataoff should be set to matchoff, so that the next ct_sip_get_header call find the next Contact header. But instead of assigning dataoff to matchoff, it is incremented by it, which is not correct, as matchoff is an absolute value of the offset. So on the next call to the ct_sip_get_header, dataoff will be incorrect, and the next Contact header may not be found at all. Fixes: 05e3ced297fe ("[NETFILTER]: nf_conntrack_sip: introduce SIP-URI parsing helper") Signed-off-by: Igor Ryzhov Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_sip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index daf06f71d31c..77f5e82d8e3f 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, return ret; if (ret == 0) break; - dataoff += *matchoff; + dataoff = *matchoff; } *in_header = 0; } @@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, break; if (ret == 0) return ret; - dataoff += *matchoff; + dataoff = *matchoff; } if (in_header) -- cgit v1.2.3 From e8d5dfd1d8747b56077d02664a8838c71ced948e Mon Sep 17 00:00:00 2001 From: David Leadbeater Date: Fri, 26 Aug 2022 14:56:57 +1000 Subject: netfilter: nf_conntrack_irc: Tighten matching on DCC message CTCP messages should only be at the start of an IRC message, not anywhere within it. While the helper only decodes packes in the ORIGINAL direction, its possible to make a client send a CTCP message back by empedding one into a PING request. As-is, thats enough to make the helper believe that it saw a CTCP message. Fixes: 869f37d8e48f ("[NETFILTER]: nf_conntrack/nf_nat: add IRC helper port") Signed-off-by: David Leadbeater Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_irc.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 992decbcaa5c..5703846bea3b 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -157,15 +157,37 @@ static int help(struct sk_buff *skb, unsigned int protoff, data = ib_ptr; data_limit = ib_ptr + datalen; - /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 - * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ - while (data < data_limit - (19 + MINMATCHLEN)) { - if (memcmp(data, "\1DCC ", 5)) { + /* Skip any whitespace */ + while (data < data_limit - 10) { + if (*data == ' ' || *data == '\r' || *data == '\n') + data++; + else + break; + } + + /* strlen("PRIVMSG x ")=10 */ + if (data < data_limit - 10) { + if (strncasecmp("PRIVMSG ", data, 8)) + goto out; + data += 8; + } + + /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26 + * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26 + */ + while (data < data_limit - (21 + MINMATCHLEN)) { + /* Find first " :", the start of message */ + if (memcmp(data, " :", 2)) { data++; continue; } + data += 2; + + /* then check that place only for the DCC command */ + if (memcmp(data, "\1DCC ", 5)) + goto out; data += 5; - /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */ iph = ip_hdr(skb); pr_debug("DCC found in master %pI4:%u %pI4:%u\n", @@ -181,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, pr_debug("DCC %s detected\n", dccprotos[i]); /* we have at least - * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid * data left (== 14/13 bytes) */ if (parse_dcc(data, data_limit, &dcc_ip, &dcc_port, &addr_beg_p, &addr_end_p)) { -- cgit v1.2.3 From 559c36c5a8d730c49ef805a72b213d3bba155cc8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 7 Sep 2022 10:26:18 +0200 Subject: netfilter: nfnetlink_osf: fix possible bogus match in nf_osf_find() nf_osf_find() incorrectly returns true on mismatch, this leads to copying uninitialized memory area in nft_osf which can be used to leak stale kernel stack data to userspace. Fixes: 22c7652cdaa8 ("netfilter: nft_osf: Add version option support") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink_osf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 0fa2e2030427..ee6840bd5933 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; + bool found = false; memset(&ctx, 0, sizeof(ctx)); @@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb, data->genre = f->genre; data->version = f->version; + found = true; break; } - return true; + return found; } EXPORT_SYMBOL_GPL(nf_osf_find); -- cgit v1.2.3 From 9cb252c4c1c53ae58bc565bab76e98133288f23a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 5 Sep 2022 11:50:15 +0800 Subject: net: skb: export skb drop reaons to user by TRACE_DEFINE_ENUM As Eric reported, the 'reason' field is not presented when trace the kfree_skb event by perf: $ perf record -e skb:kfree_skb -a sleep 10 $ perf script ip_defrag 14605 [021] 221.614303: skb:kfree_skb: skbaddr=0xffff9d2851242700 protocol=34525 location=0xffffffffa39346b1 reason: The cause seems to be passing kernel address directly to TP_printk(), which is not right. As the enum 'skb_drop_reason' is not exported to user space through TRACE_DEFINE_ENUM(), perf can't get the drop reason string from the 'reason' field, which is a number. Therefore, we introduce the macro DEFINE_DROP_REASON(), which is used to define the trace enum by TRACE_DEFINE_ENUM(). With the help of DEFINE_DROP_REASON(), now we can remove the auto-generate that we introduced in the commit ec43908dd556 ("net: skb: use auto-generation to convert skb drop reason to string"), and define the string array 'drop_reasons'. Hmmmm...now we come back to the situation that have to maintain drop reasons in both enum skb_drop_reason and DEFINE_DROP_REASON. But they are both in dropreason.h, which makes it easier. After this commit, now the format of kfree_skb is like this: $ cat /tracing/events/skb/kfree_skb/format name: kfree_skb ID: 1524 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:void * skbaddr; offset:8; size:8; signed:0; field:void * location; offset:16; size:8; signed:0; field:unsigned short protocol; offset:24; size:2; signed:0; field:enum skb_drop_reason reason; offset:28; size:4; signed:0; print fmt: "skbaddr=%p protocol=%u location=%p reason: %s", REC->skbaddr, REC->protocol, REC->location, __print_symbolic(REC->reason, { 1, "NOT_SPECIFIED" }, { 2, "NO_SOCKET" } ...... Fixes: ec43908dd556 ("net: skb: use auto-generation to convert skb drop reason to string") Link: https://lore.kernel.org/netdev/CANn89i+bx0ybvE55iMYf5GJM48WwV1HNpdm9Q6t-HaEstqpCSA@mail.gmail.com/ Reported-by: Eric Dumazet Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- net/core/.gitignore | 1 - net/core/Makefile | 22 +--------------------- net/core/skbuff.c | 6 +++++- 3 files changed, 6 insertions(+), 23 deletions(-) delete mode 100644 net/core/.gitignore (limited to 'net') diff --git a/net/core/.gitignore b/net/core/.gitignore deleted file mode 100644 index df1e74372cce..000000000000 --- a/net/core/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dropreason_str.c diff --git a/net/core/Makefile b/net/core/Makefile index e8ce3bd283a6..5857cec87b83 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -5,7 +5,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ - flow_dissector.o dropreason_str.o + flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o @@ -40,23 +40,3 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o - -clean-files := dropreason_str.c - -quiet_cmd_dropreason_str = GEN $@ -cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include \n"; \ - print "const char * const drop_reasons[] = {" }\ - /^enum skb_drop/ { dr=1; }\ - /^\};/ { dr=0; }\ - /^\tSKB_DROP_REASON_/ {\ - if (dr) {\ - sub(/\tSKB_DROP_REASON_/, "", $$1);\ - printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\ - }\ - }\ - END{ print "};" }' $< > $@ - -$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h - $(call cmd,dropreason_str) - -$(obj)/dropreason_str.o: $(obj)/dropreason_str.c diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 84bb5e188d0d..417463da4fac 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -91,7 +91,11 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); -/* The array 'drop_reasons' is auto-generated in dropreason_str.c */ +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +const char * const drop_reasons[] = { + DEFINE_DROP_REASON(FN, FN) +}; EXPORT_SYMBOL(drop_reasons); /** -- cgit v1.2.3 From e9b1a4f867ae9c1dbd1d71cd09cbdb3239fb4968 Mon Sep 17 00:00:00 2001 From: Yacan Liu Date: Tue, 6 Sep 2022 21:01:39 +0800 Subject: net/smc: Fix possible access to freed memory in link clear After modifying the QP to the Error state, all RX WR would be completed with WC in IB_WC_WR_FLUSH_ERR status. Current implementation does not wait for it is done, but destroy the QP and free the link group directly. So there is a risk that accessing the freed memory in tasklet context. Here is a crash example: BUG: unable to handle page fault for address: ffffffff8f220860 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD f7300e067 P4D f7300e067 PUD f7300f063 PMD 8c4e45063 PTE 800ffff08c9df060 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 0 Comm: swapper/1 Kdump: loaded Tainted: G S OE 5.10.0-0607+ #23 Hardware name: Inspur NF5280M4/YZMB-00689-101, BIOS 4.1.20 07/09/2018 RIP: 0010:native_queued_spin_lock_slowpath+0x176/0x1b0 Code: f3 90 48 8b 32 48 85 f6 74 f6 eb d5 c1 ee 12 83 e0 03 83 ee 01 48 c1 e0 05 48 63 f6 48 05 00 c8 02 00 48 03 04 f5 00 09 98 8e <48> 89 10 8b 42 08 85 c0 75 09 f3 90 8b 42 08 85 c0 74 f7 48 8b 32 RSP: 0018:ffffb3b6c001ebd8 EFLAGS: 00010086 RAX: ffffffff8f220860 RBX: 0000000000000246 RCX: 0000000000080000 RDX: ffff91db1f86c800 RSI: 000000000000173c RDI: ffff91db62bace00 RBP: ffff91db62bacc00 R08: 0000000000000000 R09: c00000010000028b R10: 0000000000055198 R11: ffffb3b6c001ea58 R12: ffff91db80e05010 R13: 000000000000000a R14: 0000000000000006 R15: 0000000000000040 FS: 0000000000000000(0000) GS:ffff91db1f840000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff8f220860 CR3: 00000001f9580004 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: _raw_spin_lock_irqsave+0x30/0x40 mlx5_ib_poll_cq+0x4c/0xc50 [mlx5_ib] smc_wr_rx_tasklet_fn+0x56/0xa0 [smc] tasklet_action_common.isra.21+0x66/0x100 __do_softirq+0xd5/0x29c asm_call_irq_on_stack+0x12/0x20 do_softirq_own_stack+0x37/0x40 irq_exit_rcu+0x9d/0xa0 sysvec_call_function_single+0x34/0x80 asm_sysvec_call_function_single+0x12/0x20 Fixes: bd4ad57718cc ("smc: initialize IB transport incl. PD, MR, QP, CQ, event, WR") Signed-off-by: Yacan Liu Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 1 + net/smc/smc_core.h | 2 ++ net/smc/smc_wr.c | 5 +++++ net/smc/smc_wr.h | 5 +++++ 4 files changed, 13 insertions(+) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ff49a11f57b8..ebf56cdf17db 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; + lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fe8b524ad846..285f9bd8e232 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -115,8 +115,10 @@ struct smc_link { dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ + u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ + wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 26f8f240d9e8..b0678a417e09 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) for (i = 0; i < num; i++) { link = wc[i].qp->qp_context; + link->wr_rx_id_compl = wc[i].wr_id; if (wc[i].status == IB_WC_SUCCESS) { link->wr_rx_tstamp = jiffies; smc_wr_rx_demultiplex(&wc[i]); @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: smcr_link_down_cond_sched(link); + if (link->wr_rx_id_compl == link->wr_rx_id) + wake_up(&link->wr_rx_empty_wait); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -639,6 +642,7 @@ void smc_wr_free_link(struct smc_link *lnk) return; ibdev = lnk->smcibdev->ibdev; + smc_wr_drain_cq(lnk); smc_wr_wakeup_reg_wait(lnk); smc_wr_wakeup_tx_wait(lnk); @@ -889,6 +893,7 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..45e9b894d3f8 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -73,6 +73,11 @@ static inline void smc_wr_tx_link_put(struct smc_link *link) wake_up_all(&link->wr_tx_wait); } +static inline void smc_wr_drain_cq(struct smc_link *lnk) +{ + wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); -- cgit v1.2.3 From 2f09707d0c972120bf794cfe0f0c67e2c2ddb252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 5 Sep 2022 21:21:36 +0200 Subject: sch_sfb: Also store skb len before calling child enqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cong Wang noticed that the previous fix for sch_sfb accessing the queued skb after enqueueing it to a child qdisc was incomplete: the SFB enqueue function was also calling qdisc_qstats_backlog_inc() after enqueue, which reads the pkt len from the skb cb field. Fix this by also storing the skb len, and using the stored value to increment the backlog after enqueueing. Fixes: 9efd23297cca ("sch_sfb: Don't assume the skb is still around after enqueueing to child") Signed-off-by: Toke Høiland-Jørgensen Acked-by: Cong Wang Link: https://lore.kernel.org/r/20220905192137.965549-1-toke@toke.dk Signed-off-by: Paolo Abeni --- net/sched/sch_sfb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0d761f454ae8..2829455211f8 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -281,6 +281,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct sfb_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; struct sfb_skb_cb cb; @@ -403,7 +404,7 @@ enqueue: memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { -- cgit v1.2.3 From 13bd9014180425f5a35eaf3735971d582c299292 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Thu, 8 Sep 2022 17:08:51 +0300 Subject: Revert "SUNRPC: Remove unreachable error condition" This reverts commit efe57fd58e1cb77f9186152ee12a8aa4ae3348e0. The assumption that it is impossible to return an ERR pointer from rpc_run_task() no longer holds due to commit 25cf32ad5dba ("SUNRPC: Handle allocation failure in rpc_new_task()"). Fixes: 25cf32ad5dba ('SUNRPC: Handle allocation failure in rpc_new_task()') Fixes: efe57fd58e1c ('SUNRPC: Remove unreachable error condition') Signed-off-by: Dan Aloni Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7d268a291486..c284efa3d1ef 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2873,6 +2873,9 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); + if (IS_ERR(task)) + return PTR_ERR(task); + data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: -- cgit v1.2.3 From 64ae13ed478428135cddc2f1113dff162d8112d4 Mon Sep 17 00:00:00 2001 From: Ludovic Cintrat Date: Wed, 7 Sep 2022 12:08:13 +0200 Subject: net: core: fix flow symmetric hash __flow_hash_consistentify() wrongly swaps ipv4 addresses in few cases. This function is indirectly used by __skb_get_hash_symmetric(), which is used to fanout packets in AF_PACKET. Intrusion detection systems may be impacted by this issue. __flow_hash_consistentify() computes the addresses difference then swaps them if the difference is negative. In few cases src - dst and dst - src are both negative. The following snippet mimics __flow_hash_consistentify(): ``` #include #include int main(int argc, char** argv) { int diffs_d, diffd_s; uint32_t dst = 0xb225a8c0; /* 178.37.168.192 --> 192.168.37.178 */ uint32_t src = 0x3225a8c0; /* 50.37.168.192 --> 192.168.37.50 */ uint32_t dst2 = 0x3325a8c0; /* 51.37.168.192 --> 192.168.37.51 */ diffs_d = src - dst; diffd_s = dst - src; printf("src:%08x dst:%08x, diff(s-d)=%d(0x%x) diff(d-s)=%d(0x%x)\n", src, dst, diffs_d, diffs_d, diffd_s, diffd_s); diffs_d = src - dst2; diffd_s = dst2 - src; printf("src:%08x dst:%08x, diff(s-d)=%d(0x%x) diff(d-s)=%d(0x%x)\n", src, dst2, diffs_d, diffs_d, diffd_s, diffd_s); return 0; } ``` Results: src:3225a8c0 dst:b225a8c0, \ diff(s-d)=-2147483648(0x80000000) \ diff(d-s)=-2147483648(0x80000000) src:3225a8c0 dst:3325a8c0, \ diff(s-d)=-16777216(0xff000000) \ diff(d-s)=16777216(0x1000000) In the first case the addresses differences are always < 0, therefore __flow_hash_consistentify() always swaps, thus dst->src and src->dst packets have differents hashes. Fixes: c3f8324188fa8 ("net: Add full IPv6 addresses to flow_keys") Signed-off-by: Ludovic Cintrat Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 764c4cb3fe8f..5dc3860e9fc7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1611,9 +1611,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: - addr_diff = (__force u32)keys->addrs.v4addrs.dst - - (__force u32)keys->addrs.v4addrs.src; - if (addr_diff < 0) + if ((__force u32)keys->addrs.v4addrs.dst < + (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < -- cgit v1.2.3 From 7288ff6ec795804b1b4632e535403d52bd2b3ce1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 6 Sep 2022 20:04:01 +0200 Subject: mptcp: fix fwd memory accounting on coalesce The intel bot reported a memory accounting related splat: [ 240.473094] ------------[ cut here ]------------ [ 240.478507] page_counter underflow: -4294828518 nr_pages=4294967290 [ 240.485500] WARNING: CPU: 2 PID: 14986 at mm/page_counter.c:56 page_counter_cancel+0x96/0xc0 [ 240.570849] CPU: 2 PID: 14986 Comm: mptcp_connect Tainted: G S 5.19.0-rc4-00739-gd24141fe7b48 #1 [ 240.581637] Hardware name: HP HP Z240 SFF Workstation/802E, BIOS N51 Ver. 01.63 10/05/2017 [ 240.590600] RIP: 0010:page_counter_cancel+0x96/0xc0 [ 240.596179] Code: 00 00 00 45 31 c0 48 89 ef 5d 4c 89 c6 41 5c e9 40 fd ff ff 4c 89 e2 48 c7 c7 20 73 39 84 c6 05 d5 b1 52 04 01 e8 e7 95 f3 01 <0f> 0b eb a9 48 89 ef e8 1e 25 fc ff eb c3 66 66 2e 0f 1f 84 00 00 [ 240.615639] RSP: 0018:ffffc9000496f7c8 EFLAGS: 00010082 [ 240.621569] RAX: 0000000000000000 RBX: ffff88819c9c0120 RCX: 0000000000000000 [ 240.629404] RDX: 0000000000000027 RSI: 0000000000000004 RDI: fffff5200092deeb [ 240.637239] RBP: ffff88819c9c0120 R08: 0000000000000001 R09: ffff888366527a2b [ 240.645069] R10: ffffed106cca4f45 R11: 0000000000000001 R12: 00000000fffffffa [ 240.652903] R13: ffff888366536118 R14: 00000000fffffffa R15: ffff88819c9c0000 [ 240.660738] FS: 00007f3786e72540(0000) GS:ffff888366500000(0000) knlGS:0000000000000000 [ 240.669529] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 240.675974] CR2: 00007f966b346000 CR3: 0000000168cea002 CR4: 00000000003706e0 [ 240.683807] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 240.691641] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 240.699468] Call Trace: [ 240.702613] [ 240.705413] page_counter_uncharge+0x29/0x80 [ 240.710389] drain_stock+0xd0/0x180 [ 240.714585] refill_stock+0x278/0x580 [ 240.718951] __sk_mem_reduce_allocated+0x222/0x5c0 [ 240.729248] __mptcp_update_rmem+0x235/0x2c0 [ 240.734228] __mptcp_move_skbs+0x194/0x6c0 [ 240.749764] mptcp_recvmsg+0xdfa/0x1340 [ 240.763153] inet_recvmsg+0x37f/0x500 [ 240.782109] sock_read_iter+0x24a/0x380 [ 240.805353] new_sync_read+0x420/0x540 [ 240.838552] vfs_read+0x37f/0x4c0 [ 240.842582] ksys_read+0x170/0x200 [ 240.864039] do_syscall_64+0x5c/0x80 [ 240.872770] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 240.878526] RIP: 0033:0x7f3786d9ae8e [ 240.882805] Code: c0 e9 b6 fe ff ff 50 48 8d 3d 6e 18 0a 00 e8 89 e8 01 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28 [ 240.902259] RSP: 002b:00007fff7be81e08 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 240.910533] RAX: ffffffffffffffda RBX: 0000000000002000 RCX: 00007f3786d9ae8e [ 240.918368] RDX: 0000000000002000 RSI: 00007fff7be87ec0 RDI: 0000000000000005 [ 240.926206] RBP: 0000000000000005 R08: 00007f3786e6a230 R09: 00007f3786e6a240 [ 240.934046] R10: fffffffffffff288 R11: 0000000000000246 R12: 0000000000002000 [ 240.941884] R13: 00007fff7be87ec0 R14: 00007fff7be87ec0 R15: 0000000000002000 [ 240.949741] [ 240.952632] irq event stamp: 27367 [ 240.956735] hardirqs last enabled at (27366): [] mem_cgroup_uncharge_skmem+0x6a/0x80 [ 240.966848] hardirqs last disabled at (27367): [] refill_stock+0x282/0x580 [ 240.976017] softirqs last enabled at (27360): [] mptcp_recvmsg+0xaf/0x1340 [ 240.985273] softirqs last disabled at (27364): [] __mptcp_move_skbs+0x18c/0x6c0 [ 240.994872] ---[ end trace 0000000000000000 ]--- After commit d24141fe7b48 ("mptcp: drop SK_RECLAIM_* macros"), if rmem_fwd_alloc become negative, mptcp_rmem_uncharge() can try to reclaim a negative amount of pages, since the expression: reclaimable >= PAGE_SIZE will evaluate to true for any negative value of the int 'reclaimable': 'PAGE_SIZE' is an unsigned long and the negative integer will be promoted to a (very large) unsigned long value. Still after the mentioned commit, kfree_skb_partial() in mptcp_try_coalesce() will reclaim most of just released fwd memory, so that following charging of the skb delta size will lead to negative fwd memory values. At that point a racing recvmsg() can trigger the splat. Address the issue switching the order of the memory accounting operations. The fwd memory can still transiently reach negative values, but that will happen in an atomic scope and no code path could touch/use such value. Reported-by: kernel test robot Fixes: d24141fe7b48 ("mptcp: drop SK_RECLAIM_* macros") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20220906180404.1255873-1-matthieu.baerts@tessares.net Signed-off-by: Paolo Abeni --- net/mptcp/protocol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d398f3810662..969b33a9dd64 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -150,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; - kfree_skb_partial(from, fragstolen); + + /* note the fwd memory can reach a negative value after accounting + * for the delta, but the later skb free will restore a non + * negative one + */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); + kfree_skb_partial(from, fragstolen); + return true; } -- cgit v1.2.3 From 94160108a70c8af17fa1484a37e05181c0e094af Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Thu, 8 Sep 2022 20:19:27 +0800 Subject: net/ieee802154: fix uninit value bug in dgram_sendmsg There is uninit value bug in dgram_sendmsg function in net/ieee802154/socket.c when the length of valid data pointed by the msg->msg_name isn't verified. We introducing a helper function ieee802154_sockaddr_check_size to check namelen. First we check there is addr_type in ieee802154_addr_sa. Then, we check namelen according to addr_type. Also fixed in raw_bind, dgram_bind, dgram_connect. Signed-off-by: Haimin Zhang Signed-off-by: David S. Miller --- net/ieee802154/socket.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 718fb77bb372..7889e1ef7fad 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -200,8 +200,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) int err = 0; struct net_device *dev = NULL; - if (len < sizeof(*uaddr)) - return -EINVAL; + err = ieee802154_sockaddr_check_size(uaddr, len); + if (err < 0) + return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) @@ -493,7 +494,8 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) ro->bound = 0; - if (len < sizeof(*addr)) + err = ieee802154_sockaddr_check_size(addr, len); + if (err < 0) goto out; if (addr->family != AF_IEEE802154) @@ -564,8 +566,9 @@ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, struct dgram_sock *ro = dgram_sk(sk); int err = 0; - if (len < sizeof(*addr)) - return -EINVAL; + err = ieee802154_sockaddr_check_size(addr, len); + if (err < 0) + return err; if (addr->family != AF_IEEE802154) return -EINVAL; @@ -604,6 +607,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; + DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; @@ -612,10 +616,20 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) return -EOPNOTSUPP; } - if (!ro->connected && !msg->msg_name) - return -EDESTADDRREQ; - else if (ro->connected && msg->msg_name) - return -EISCONN; + if (msg->msg_name) { + if (ro->connected) + return -EISCONN; + if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) + return -EINVAL; + err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); + if (err < 0) + return err; + ieee802154_addr_from_sa(&dst_addr, &daddr->addr); + } else { + if (!ro->connected) + return -EDESTADDRREQ; + dst_addr = ro->dst_addr; + } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); @@ -651,16 +665,6 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; - - if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_ieee802154*, - daddr, msg->msg_name); - - ieee802154_addr_from_sa(&dst_addr, &daddr->addr); - } else { - dst_addr = ro->dst_addr; - } - cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; -- cgit v1.2.3 From 96628951869c0dedf0377adca01c8675172d8639 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 8 Sep 2022 16:15:23 -0700 Subject: tcp: Use WARN_ON_ONCE() in tcp_read_skb() Prevent tcp_read_skb() from flooding the syslog. Suggested-by: Jakub Sitnicki Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6cdfce6f2867..3488388eea5d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1766,7 +1766,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) return 0; __skb_unlink(skb, &sk->sk_receive_queue); - WARN_ON(!skb_set_owner_sk_safe(skb, sk)); + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); copied = recv_actor(sk, skb); if (copied >= 0) { seq += copied; -- cgit v1.2.3 From db4192a754ebd52300a28abe1a50dd18eae0eb12 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Sep 2022 10:35:53 -0700 Subject: tcp: read multiple skbs in tcp_read_skb() Before we switched to ->read_skb(), ->read_sock() was passed with desc.count=1, which technically indicates we only read one skb per ->sk_data_ready() call. However, for TCP, this is not true. TCP at least has sk_rcvlowat which intentionally holds skb's in receive queue until this watermark is reached. This means when ->sk_data_ready() is invoked there could be multiple skb's in the queue, therefore we have to read multiple skbs in tcp_read_skb() instead of one. Fixes: 965b57b469a5 ("net: Introduce a new proto_ops ->read_skb()") Reported-by: Peilin Ye Cc: John Fastabend Cc: Jakub Sitnicki Cc: Eric Dumazet Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20220912173553.235838-1-xiyou.wangcong@gmail.com Signed-off-by: Paolo Abeni --- net/ipv4/tcp.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3488388eea5d..e373dde1f46f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1761,19 +1761,28 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; - skb = tcp_recv_skb(sk, seq, &offset); - if (!skb) - return 0; + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + u8 tcp_flags; + int used; - __skb_unlink(skb, &sk->sk_receive_queue); - WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); - copied = recv_actor(sk, skb); - if (copied >= 0) { - seq += copied; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + __skb_unlink(skb, &sk->sk_receive_queue); + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); + tcp_flags = TCP_SKB_CB(skb)->tcp_flags; + used = recv_actor(sk, skb); + consume_skb(skb); + if (used < 0) { + if (!copied) + copied = used; + break; + } + seq += used; + copied += used; + + if (tcp_flags & TCPHDR_FIN) { ++seq; + break; + } } - consume_skb(skb); WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); -- cgit v1.2.3 From b07a9b26e2b1aa3711fd6935eccb08a463b1fb11 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 14 Sep 2022 10:53:38 +0300 Subject: ipmr: Always call ip{,6}_mr_forward() from RCU read-side critical section These functions expect to be called from RCU read-side critical section, but this only happens when invoked from the data path via ip{,6}_mr_input(). They can also be invoked from process context in response to user space adding a multicast route which resolves a cache entry with queued packets [1][2]. Fix by adding missing rcu_read_lock() / rcu_read_unlock() in these call paths. [1] WARNING: suspicious RCU usage 6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387 Not tainted ----------------------------- net/ipv4/ipmr.c:84 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by smcrouted/246: #0: ffffffff862389b0 (rtnl_mutex){+.+.}-{3:3}, at: ip_mroute_setsockopt+0x11c/0x1420 stack backtrace: CPU: 0 PID: 246 Comm: smcrouted Not tainted 6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 Call Trace: dump_stack_lvl+0x91/0xb9 vif_dev_read+0xbf/0xd0 ipmr_queue_xmit+0x135/0x1ab0 ip_mr_forward+0xe7b/0x13d0 ipmr_mfc_add+0x1a06/0x2ad0 ip_mroute_setsockopt+0x5c1/0x1420 do_ip_setsockopt+0x23d/0x37f0 ip_setsockopt+0x56/0x80 raw_setsockopt+0x219/0x290 __sys_setsockopt+0x236/0x4d0 __x64_sys_setsockopt+0xbe/0x160 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [2] WARNING: suspicious RCU usage 6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387 Not tainted ----------------------------- net/ipv6/ip6mr.c:69 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by smcrouted/246: #0: ffffffff862389b0 (rtnl_mutex){+.+.}-{3:3}, at: ip6_mroute_setsockopt+0x6b9/0x2630 stack backtrace: CPU: 1 PID: 246 Comm: smcrouted Not tainted 6.0.0-rc3-custom-15969-g049d233c8bcc-dirty #1387 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 Call Trace: dump_stack_lvl+0x91/0xb9 vif_dev_read+0xbf/0xd0 ip6mr_forward2.isra.0+0xc9/0x1160 ip6_mr_forward+0xef0/0x13f0 ip6mr_mfc_add+0x1ff2/0x31f0 ip6_mroute_setsockopt+0x1825/0x2630 do_ipv6_setsockopt+0x462/0x4440 ipv6_setsockopt+0x105/0x140 rawv6_setsockopt+0xd8/0x690 __sys_setsockopt+0x236/0x4d0 __x64_sys_setsockopt+0xbe/0x160 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: ebc3197963fc ("ipmr: add rcu protection over (struct vif_device)->dev") Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 2 ++ net/ipv6/ip6mr.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 73651d17e51f..e11d6b0b62b7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1004,7 +1004,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { + rcu_read_lock(); ip_mr_forward(net, mrt, skb->dev, skb, c, 0); + rcu_read_unlock(); } } } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a9ba41648e36..858fd8a28b5b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1028,8 +1028,11 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); - } else + } else { + rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); + rcu_read_unlock(); + } } } -- cgit v1.2.3 From d547c1b717fc5a1e062307e1efdf81590ba9f6c1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 14 Sep 2022 18:51:54 +0900 Subject: net: clear msg_get_inq in __get_compat_msghdr() syzbot is still complaining uninit-value in tcp_recvmsg(), for commit 1228b34c8d0ecf6d ("net: clear msg_get_inq in __sys_recvfrom() and __copy_msghdr_from_user()") missed that __get_compat_msghdr() is called instead of copy_msghdr_from_user() when MSG_CMSG_COMPAT is specified. Signed-off-by: Tetsuo Handa Fixes: 1228b34c8d0ecf6d ("net: clear msg_get_inq in __sys_recvfrom() and __copy_msghdr_from_user()") Reviewed-by: Jens Axboe Link: https://lore.kernel.org/r/d06d0f7f-696c-83b4-b2d5-70b5f2730a37@I-love.SAKURA.ne.jp Signed-off-by: Jakub Kicinski --- net/compat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index fe9be3c56ef7..385f04a6be2f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -52,6 +52,7 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control_is_user = true; + kmsg->msg_get_inq = 0; kmsg->msg_control_user = compat_ptr(msg->msg_control); kmsg->msg_controllen = msg->msg_controllen; -- cgit v1.2.3 From 76dd07281338da6951fdab3432ced843fa87839c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 16 Sep 2022 11:48:21 +0300 Subject: ipv6: Fix crash when IPv6 is administratively disabled The global 'raw_v6_hashinfo' variable can be accessed even when IPv6 is administratively disabled via the 'ipv6.disable=1' kernel command line option, leading to a crash [1]. Fix by restoring the original behavior and always initializing the variable, regardless of IPv6 support being administratively disabled or not. [1] BUG: unable to handle page fault for address: ffffffffffffffc8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 173e18067 P4D 173e18067 PUD 173e1a067 PMD 0 Oops: 0000 [#1] PREEMPT SMP KASAN CPU: 3 PID: 271 Comm: ss Not tainted 6.0.0-rc4-custom-00136-g0727a9a5fbc1 #1396 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.fc36 04/01/2014 RIP: 0010:raw_diag_dump+0x310/0x7f0 [...] Call Trace: __inet_diag_dump+0x10f/0x2e0 netlink_dump+0x575/0xfd0 __netlink_dump_start+0x67b/0x940 inet_diag_handler_cmd+0x273/0x2d0 sock_diag_rcv_msg+0x317/0x440 netlink_rcv_skb+0x15e/0x430 sock_diag_rcv+0x2b/0x40 netlink_unicast+0x53b/0x800 netlink_sendmsg+0x945/0xe60 ____sys_sendmsg+0x747/0x960 ___sys_sendmsg+0x13a/0x1e0 __sys_sendmsg+0x118/0x1e0 do_syscall_64+0x34/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU") Reported-by: Roberto Ricci Tested-by: Roberto Ricci Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220916084821.229287-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/af_inet6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2ce0c44d0081..dbb1430d6cc2 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1070,13 +1070,13 @@ static int __init inet6_init(void) for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); + raw_hashinfo_init(&raw_v6_hashinfo); + if (disable_ipv6_mod) { pr_info("Loaded, but administratively disabled, reboot required to enable\n"); goto out; } - raw_hashinfo_init(&raw_v6_hashinfo); - err = proto_register(&tcpv6_prot, 1); if (err) goto out; -- cgit v1.2.3 From db46e3a88a09c5cf7e505664d01da7238cd56c92 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:08:01 +0300 Subject: net/sched: taprio: avoid disabling offload when it was never enabled In an incredibly strange API design decision, qdisc->destroy() gets called even if qdisc->init() never succeeded, not exclusively since commit 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation"), but apparently also earlier (in the case of qdisc_create_dflt()). The taprio qdisc does not fully acknowledge this when it attempts full offload, because it starts off with q->flags = TAPRIO_FLAGS_INVALID in taprio_init(), then it replaces q->flags with TCA_TAPRIO_ATTR_FLAGS parsed from netlink (in taprio_change(), tail called from taprio_init()). But in taprio_destroy(), we call taprio_disable_offload(), and this determines what to do based on FULL_OFFLOAD_IS_ENABLED(q->flags). But looking at the implementation of FULL_OFFLOAD_IS_ENABLED() (a bitwise check of bit 1 in q->flags), it is invalid to call this macro on q->flags when it contains TAPRIO_FLAGS_INVALID, because that is set to U32_MAX, and therefore FULL_OFFLOAD_IS_ENABLED() will return true on an invalid set of flags. As a result, it is possible to crash the kernel if user space forces an error between setting q->flags = TAPRIO_FLAGS_INVALID, and the calling of taprio_enable_offload(). This is because drivers do not expect the offload to be disabled when it was never enabled. The error that we force here is to attach taprio as a non-root qdisc, but instead as child of an mqprio root qdisc: $ tc qdisc add dev swp0 root handle 1: \ mqprio num_tc 8 map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 hw 0 $ tc qdisc replace dev swp0 parent 1:1 \ taprio num_tc 8 map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 base-time 0 \ sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \ flags 0x0 clockid CLOCK_TAI Unable to handle kernel paging request at virtual address fffffffffffffff8 [fffffffffffffff8] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 96000004 [#1] PREEMPT SMP Call trace: taprio_dump+0x27c/0x310 vsc9959_port_setup_tc+0x1f4/0x460 felix_port_setup_tc+0x24/0x3c dsa_slave_setup_tc+0x54/0x27c taprio_disable_offload.isra.0+0x58/0xe0 taprio_destroy+0x80/0x104 qdisc_create+0x240/0x470 tc_modify_qdisc+0x1fc/0x6b0 rtnetlink_rcv_msg+0x12c/0x390 netlink_rcv_skb+0x5c/0x130 rtnetlink_rcv+0x1c/0x2c Fix this by keeping track of the operations we made, and undo the offload only if we actually did it. I've added "bool offloaded" inside a 4 byte hole between "int clockid" and "atomic64_t picos_per_byte". Now the first cache line looks like below: $ pahole -C taprio_sched net/sched/sch_taprio.o struct taprio_sched { struct Qdisc * * qdiscs; /* 0 8 */ struct Qdisc * root; /* 8 8 */ u32 flags; /* 16 4 */ enum tk_offsets tk_offset; /* 20 4 */ int clockid; /* 24 4 */ bool offloaded; /* 28 1 */ /* XXX 3 bytes hole, try to pack */ atomic64_t picos_per_byte; /* 32 0 */ /* XXX 8 bytes hole, try to pack */ spinlock_t current_entry_lock; /* 40 0 */ /* XXX 8 bytes hole, try to pack */ struct sched_entry * current_entry; /* 48 8 */ struct sched_gate_list * oper_sched; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ Fixes: 9c66d1564676 ("taprio: Add support for hardware offloading") Signed-off-by: Vladimir Oltean Reviewed-by: Vinicius Costa Gomes Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 0b941dd63d26..9bec73019f94 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -67,6 +67,7 @@ struct taprio_sched { u32 flags; enum tk_offsets tk_offset; int clockid; + bool offloaded; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ @@ -1279,6 +1280,8 @@ static int taprio_enable_offload(struct net_device *dev, goto done; } + q->offloaded = true; + done: taprio_offload_free(offload); @@ -1293,12 +1296,9 @@ static int taprio_disable_offload(struct net_device *dev, struct tc_taprio_qopt_offload *offload; int err; - if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) + if (!q->offloaded) return 0; - if (!ops->ndo_setup_tc) - return -EOPNOTSUPP; - offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, @@ -1314,6 +1314,8 @@ static int taprio_disable_offload(struct net_device *dev, goto out; } + q->offloaded = false; + out: taprio_offload_free(offload); -- cgit v1.2.3 From 1461d212ab277d8bba1a753d33e9afe03d81f9d4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 15 Sep 2022 13:08:02 +0300 Subject: net/sched: taprio: make qdisc_leaf() see the per-netdev-queue pfifo child qdiscs taprio can only operate as root qdisc, and to that end, there exists the following check in taprio_init(), just as in mqprio: if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; And indeed, when we try to attach taprio to an mqprio child, it fails as expected: $ tc qdisc add dev swp0 root handle 1: mqprio num_tc 8 \ map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 hw 0 $ tc qdisc replace dev swp0 parent 1:2 taprio num_tc 8 \ map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \ flags 0x0 clockid CLOCK_TAI Error: sch_taprio: Can only be attached as root qdisc. (extack message added by me) But when we try to attach a taprio child to a taprio root qdisc, surprisingly it doesn't fail: $ tc qdisc replace dev swp0 root handle 1: taprio num_tc 8 \ map 0 1 2 3 4 5 6 7 queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \ flags 0x0 clockid CLOCK_TAI $ tc qdisc replace dev swp0 parent 1:2 taprio num_tc 8 \ map 0 1 2 3 4 5 6 7 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ base-time 0 sched-entry S 0x7f 990000 sched-entry S 0x80 100000 \ flags 0x0 clockid CLOCK_TAI This is because tc_modify_qdisc() behaves differently when mqprio is root, vs when taprio is root. In the mqprio case, it finds the parent qdisc through p = qdisc_lookup(dev, TC_H_MAJ(clid)), and then the child qdisc through q = qdisc_leaf(p, clid). This leaf qdisc q has handle 0, so it is ignored according to the comment right below ("It may be default qdisc, ignore it"). As a result, tc_modify_qdisc() goes through the qdisc_create() code path, and this gives taprio_init() a chance to check for sch_parent != TC_H_ROOT and error out. Whereas in the taprio case, the returned q = qdisc_leaf(p, clid) is different. It is not the default qdisc created for each netdev queue (both taprio and mqprio call qdisc_create_dflt() and keep them in a private q->qdiscs[], or priv->qdiscs[], respectively). Instead, taprio makes qdisc_leaf() return the _root_ qdisc, aka itself. When taprio does that, tc_modify_qdisc() goes through the qdisc_change() code path, because the qdisc layer never finds out about the child qdisc of the root. And through the ->change() ops, taprio has no reason to check whether its parent is root or not, just through ->init(), which is not called. The problem is the taprio_leaf() implementation. Even though code wise, it does the exact same thing as mqprio_leaf() which it is copied from, it works with different input data. This is because mqprio does not attach itself (the root) to each device TX queue, but one of the default qdiscs from its private array. In fact, since commit 13511704f8d7 ("net: taprio offload: enforce qdisc to netdev queue mapping"), taprio does this too, but just for the full offload case. So if we tried to attach a taprio child to a fully offloaded taprio root qdisc, it would properly fail too; just not to a software root taprio. To fix the problem, stop looking at the Qdisc that's attached to the TX queue, and instead, always return the default qdiscs that we've allocated (and to which we privately enqueue and dequeue, in software scheduling mode). Since Qdisc_class_ops :: leaf is only called from tc_modify_qdisc(), the risk of unforeseen side effects introduced by this change is minimal. Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Signed-off-by: Vladimir Oltean Reviewed-by: Vinicius Costa Gomes Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9bec73019f94..86675a79da1e 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1951,12 +1951,14 @@ start_error: static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { - struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = cl - 1; - if (!dev_queue) + if (ntx >= dev->num_tx_queues) return NULL; - return dev_queue->qdisc_sleeping; + return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) -- cgit v1.2.3 From 921ebde3c0d22c8cba74ce8eb3cc4626abff1ccd Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Sep 2022 21:41:00 +0900 Subject: netfilter: nf_tables: fix nft_counters_enabled underflow at nf_tables_addchain() syzbot is reporting underflow of nft_counters_enabled counter at nf_tables_addchain() [1], for commit 43eb8949cfdffa76 ("netfilter: nf_tables: do not leave chain stats enabled on error") missed that nf_tables_chain_destroy() after nft_basechain_init() in the error path of nf_tables_addchain() decrements the counter because nft_basechain_init() makes nft_is_base_chain() return true by setting NFT_CHAIN_BASE flag. Increment the counter immediately after returning from nft_basechain_init(). Link: https://syzkaller.appspot.com/bug?extid=b5d82a651b71cd8a75ab [1] Reported-by: syzbot Signed-off-by: Tetsuo Handa Tested-by: syzbot Fixes: 43eb8949cfdffa76 ("netfilter: nf_tables: do not leave chain stats enabled on error") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 816052089b33..e062754dc6cc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2197,7 +2197,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; - struct nft_stats __percpu *stats = NULL; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; struct net *net = ctx->net; @@ -2212,6 +2211,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return -EOVERFLOW; if (nla[NFTA_CHAIN_HOOK]) { + struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook; if (flags & NFT_CHAIN_BINDING) @@ -2245,6 +2245,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, kfree(basechain); return err; } + if (stats) + static_branch_inc(&nft_counters_enabled); } else { if (flags & NFT_CHAIN_BASE) return -EINVAL; @@ -2319,9 +2321,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err_unregister_hook; } - if (stats) - static_branch_inc(&nft_counters_enabled); - table->use++; return 0; -- cgit v1.2.3 From 9a4d6dd554b86e65581ef6b6638a39ae079b17ac Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Sep 2022 22:58:51 +0900 Subject: netfilter: nf_tables: fix percpu memory leak at nf_tables_addchain() It seems to me that percpu memory for chain stats started leaking since commit 3bc158f8d0330f0a ("netfilter: nf_tables: map basechain priority to hardware priority") when nft_chain_offload_priority() returned an error. Signed-off-by: Tetsuo Handa Fixes: 3bc158f8d0330f0a ("netfilter: nf_tables: map basechain priority to hardware priority") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e062754dc6cc..63c70141b3e5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2243,6 +2243,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) { nft_chain_release_hook(&hook); kfree(basechain); + free_percpu(stats); return err; } if (stats) -- cgit v1.2.3 From 62ce44c4fff947eebdf10bb582267e686e6835c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 Sep 2022 14:20:17 +0200 Subject: netfilter: ebtables: fix memory leak when blob is malformed The bug fix was incomplete, it "replaced" crash with a memory leak. The old code had an assignment to "ret" embedded into the conditional, restore this. Fixes: 7997eff82828 ("netfilter: ebtables: reject blobs that don't provide all entry points") Reported-and-tested-by: syzbot+a24c5252f3e3ab733464@syzkaller.appspotmail.com Signed-off-by: Florian Westphal --- net/bridge/netfilter/ebtables.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9a0ae59cdc50..4f385d52a1c4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1040,8 +1040,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - if (repl->valid_hooks != t->valid_hooks) + if (repl->valid_hooks != t->valid_hooks) { + ret = -EINVAL; goto free_unlock; + } if (repl->num_counters && repl->num_counters != t->private->nentries) { ret = -EINVAL; -- cgit v1.2.3 From d25088932227680988a6b794221e031a7232f137 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 Sep 2022 18:31:30 +0200 Subject: netfilter: nf_ct_ftp: fix deadlock when nat rewrite is needed We can't use ct->lock, this is already used by the seqadj internals. When using ftp helper + nat, seqadj will attempt to acquire ct->lock again. Revert back to a global lock for now. Fixes: c783a29c7e59 ("netfilter: nf_ct_ftp: prefer skb_linearize") Reported-by: Bruno de Paula Larini Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_ftp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 0d9332e9cf71..617f744a2e3a 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -33,6 +33,7 @@ MODULE_AUTHOR("Rusty Russell "); MODULE_DESCRIPTION("ftp connection tracking helper"); MODULE_ALIAS("ip_conntrack_ftp"); MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); +static DEFINE_SPINLOCK(nf_ftp_lock); #define MAX_PORTS 8 static u_int16_t ports[MAX_PORTS]; @@ -409,7 +410,8 @@ static int help(struct sk_buff *skb, } datalen = skb->len - dataoff; - spin_lock_bh(&ct->lock); + /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */ + spin_lock_bh(&nf_ftp_lock); fb_ptr = skb->data + dataoff; ends_in_nl = (fb_ptr[datalen - 1] == '\n'); @@ -538,7 +540,7 @@ out_update_nl: if (ends_in_nl) update_nl_seq(ct, seq, ct_ftp_info, dir, skb); out: - spin_unlock_bh(&ct->lock); + spin_unlock_bh(&nf_ftp_lock); return ret; } -- cgit v1.2.3 From e738455b2c6dcdab03e45d97de36476f93f557d2 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 20 Sep 2022 14:43:09 +0800 Subject: net/smc: Stop the CLC flow if no link to map buffers on There might be a potential race between SMC-R buffer map and link group termination. smc_smcr_terminate_all() | smc_connect_rdma() -------------------------------------------------------------- | smc_conn_create() for links in smcibdev | schedule links down | | smc_buf_create() | \- smcr_buf_map_usable_links() | \- no usable links found, | (rmb->mr = NULL) | | smc_clc_send_confirm() | \- access conn->rmb_desc->mr[]->rkey | (panic) During reboot and IB device module remove, all links will be set down and no usable links remain in link groups. In such situation smcr_buf_map_usable_links() should return an error and stop the CLC flow accessing to uninitialized mr. Fixes: b9247544c1bc ("net/smc: convert static link ID instances to support multiple links") Signed-off-by: Wen Gu Link: https://lore.kernel.org/r/1663656189-32090-1-git-send-email-guwen@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/smc/smc_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ebf56cdf17db..df89c2e08cbf 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -2239,7 +2239,7 @@ out: static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { - int i, rc = 0; + int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ mutex_lock(&lgr->llc_conf_mutex); @@ -2252,9 +2252,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, rc = -ENOMEM; goto out; } + cnt++; } out: mutex_unlock(&lgr->llc_conf_mutex); + if (!rc && !cnt) + rc = -EINVAL; return rc; } -- cgit v1.2.3 From db39dfdc1c3bd9da273902da12f01973792ff911 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 20 Sep 2022 17:59:15 -0700 Subject: udp: Use WARN_ON_ONCE() in udp_read_skb() Prevent udp_read_skb() from flooding the syslog. Suggested-by: Jakub Sitnicki Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20220921005915.2697-1-yepeilin.cs@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd72158e953a..560d9eadeaa5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1821,7 +1821,7 @@ int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) continue; } - WARN_ON(!skb_set_owner_sk_safe(skb, sk)); + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); used = recv_actor(sk, skb); if (used <= 0) { if (!copied) -- cgit v1.2.3 From c2e1cfefcac35e0eea229e148c8284088ce437b5 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Wed, 21 Sep 2022 17:27:34 +0800 Subject: net: sched: fix possible refcount leak in tc_new_tfilter() tfilter_put need to be called to put the refount got by tp->ops->get to avoid possible refcount leak when chain->tmplt_ops != NULL and chain->tmplt_ops != tp->ops. Fixes: 7d5509fa0d3d ("net: sched: extend proto ops with 'put' callback") Signed-off-by: Hangyu Hua Reviewed-by: Vlad Buslov Link: https://lore.kernel.org/r/20220921092734.31700-1-hbh25y@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 790d6809be81..51d175f3fbcb 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2137,6 +2137,7 @@ replay: } if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); err = -EINVAL; goto errout; -- cgit v1.2.3