From b515d2637276a3810d6595e10ab02c13bfd0b63a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 16 Apr 2021 11:27:59 +0200 Subject: xfrm: xfrm_state_mtu should return at least 1280 for ipv6 Jianwen reported that IPv6 Interoperability tests are failing in an IPsec case where one of the links between the IPsec peers has an MTU of 1280. The peer generates a packet larger than this MTU, the router replies with a "Packet too big" message indicating an MTU of 1280. When the peer tries to send another large packet, xfrm_state_mtu returns 1280 - ipsec_overhead, which causes ip6_setup_cork to fail with EINVAL. We can fix this by forcing xfrm_state_mtu to return IPV6_MIN_MTU when IPv6 is used. After going through IPsec, the packet will then be fragmented to obey the actual network's PMTU, just before leaving the host. Currently, TFC padding is capped to PMTU - overhead to avoid fragementation: after padding and encapsulation, we still fit within the PMTU. That behavior is preserved in this patch. Fixes: 91657eafb64b ("xfrm: take net hdr len into account for esp payload size calculation") Reported-by: Jianwen Ji Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++++++++++++-- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c58a6d4eb610..6232a5f048bd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1546,6 +1546,7 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); +u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4b834bbf95e0..ed9857b2875d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -673,7 +673,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 727d791ed5e6..9d1327b36bd3 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -708,7 +708,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 4496f7efa220..c25586156c6a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2518,7 +2518,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2549,7 +2549,17 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(xfrm_state_mtu); +EXPORT_SYMBOL_GPL(__xfrm_state_mtu); + +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) +{ + mtu = __xfrm_state_mtu(x, mtu); + + if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) + return IPV6_MIN_MTU; + + return mtu; +} int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { -- cgit v1.2.3 From d7b0408934c749f546b01f2b33d07421a49b6f3e Mon Sep 17 00:00:00 2001 From: Varad Gautam Date: Fri, 28 May 2021 18:04:06 +0200 Subject: xfrm: policy: Read seqcount outside of rcu-read side in xfrm_policy_lookup_bytype xfrm_policy_lookup_bytype loops on seqcount mutex xfrm_policy_hash_generation within an RCU read side critical section. Although ill advised, this is fine if the loop is bounded. xfrm_policy_hash_generation wraps mutex hash_resize_mutex, which is used to serialize writers (xfrm_hash_resize, xfrm_hash_rebuild). This is fine too. On PREEMPT_RT=y, the read_seqcount_begin call within xfrm_policy_lookup_bytype emits a mutex lock/unlock for hash_resize_mutex. Mutex locking is fine, since RCU read side critical sections are allowed to sleep with PREEMPT_RT. xfrm_hash_resize can, however, block on synchronize_rcu while holding hash_resize_mutex. This leads to the following situation on PREEMPT_RT, where the writer is blocked on RCU grace period expiry, while the reader is blocked on a lock held by the writer: Thead 1 (xfrm_hash_resize) Thread 2 (xfrm_policy_lookup_bytype) rcu_read_lock(); mutex_lock(&hash_resize_mutex); read_seqcount_begin(&xfrm_policy_hash_generation); mutex_lock(&hash_resize_mutex); // block xfrm_bydst_resize(); synchronize_rcu(); // block Move the read_seqcount_begin call outside of the RCU read side critical section, and do an rcu_read_unlock/retry if we got stale data within the critical section. On non-PREEMPT_RT, this shortens the time spent within RCU read side critical section in case the seqcount needs a retry, and avoids unbounded looping. Fixes: 77cc278f7b20 ("xfrm: policy: Use sequence counters with associated lock") Signed-off-by: Varad Gautam Cc: linux-rt-users Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org # v4.9 Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Florian Westphal Cc: "Ahmed S. Darwish" Signed-off-by: Steffen Klassert Acked-by: Ahmed S. Darwish --- net/xfrm/xfrm_policy.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b74f28cabe24..8c56e3e59c3c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2092,12 +2092,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, if (unlikely(!daddr || !saddr)) return NULL; - rcu_read_lock(); retry: - do { - sequence = read_seqcount_begin(&xfrm_policy_hash_generation); - chain = policy_hash_direct(net, daddr, saddr, family, dir); - } while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)); + sequence = read_seqcount_begin(&xfrm_policy_hash_generation); + rcu_read_lock(); + + chain = policy_hash_direct(net, daddr, saddr, family, dir); + if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) { + rcu_read_unlock(); + goto retry; + } ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { @@ -2128,11 +2131,15 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, } skip_inexact: - if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) + if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) { + rcu_read_unlock(); goto retry; + } - if (ret && !xfrm_pol_hold_rcu(ret)) + if (ret && !xfrm_pol_hold_rcu(ret)) { + rcu_read_unlock(); goto retry; + } fail: rcu_read_unlock(); -- cgit v1.2.3 From 6e1e89418a5ccdfb325aed538307c2f9dba6ef51 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Sat, 29 May 2021 15:52:02 +0800 Subject: xfrm: Remove the repeated declaration Function 'xfrm_parse_spi' is declared twice, so remove the repeated declaration. Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Shaokun Zhang Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6232a5f048bd..b30623678430 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1571,7 +1571,6 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm4_transport_finish(struct sk_buff *skb, int async); int xfrm4_rcv(struct sk_buff *skb); -int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, __be32 *spi, __be32 *seq); static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) { -- cgit v1.2.3 From eebd49a4ffb420a991c606e54aa3c9f02857a334 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 29 May 2021 16:23:18 -0400 Subject: xfrm: remove the fragment check for ipv6 beet mode In commit 68dc022d04eb ("xfrm: BEET mode doesn't support fragments for inner packets"), it tried to fix the issue that in TX side the packet is fragmented before the ESP encapping while in the RX side the fragments always get reassembled before decapping with ESP. This is not true for IPv6. IPv6 is different, and it's using exthdr to save fragment info, as well as the ESP info. Exthdrs are added in TX and processed in RX both in order. So in the above case, the ESP decapping will be done earlier than the fragment reassembling in TX side. Here just remove the fragment check for the IPv6 inner packets to recover the fragments support for BEET mode. Fixes: 68dc022d04eb ("xfrm: BEET mode doesn't support fragments for inner packets") Reported-by: Xiumei Mu Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e4cb0ff4dcf4..ac907b9d32d1 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -711,15 +711,8 @@ out: static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) - unsigned int ptr = 0; int err; - if (x->outer_mode.encap == XFRM_MODE_BEET && - ipv6_find_hdr(skb, &ptr, NEXTHDR_FRAGMENT, NULL, NULL) >= 0) { - net_warn_ratelimited("BEET mode doesn't support inner IPv6 fragments\n"); - return -EAFNOSUPPORT; - } - err = xfrm6_tunnel_check_size(skb); if (err) return err; -- cgit v1.2.3 From 1f9482aa8d412b4ba06ce6ab8e333fb8ca29a06e Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Fri, 14 May 2021 19:42:27 -0700 Subject: mwifiex: bring down link before deleting interface We can deadlock when rmmod'ing the driver or going through firmware reset, because the cfg80211_unregister_wdev() has to bring down the link for us, ... which then grab the same wiphy lock. nl80211_del_interface() already handles a very similar case, with a nice description: /* * We hold RTNL, so this is safe, without RTNL opencount cannot * reach 0, and thus the rdev cannot be deleted. * * We need to do it for the dev_close(), since that will call * the netdev notifiers, and we need to acquire the mutex there * but don't know if we get there from here or from some other * place (e.g. "ip link set ... down"). */ mutex_unlock(&rdev->wiphy.mtx); ... Do similarly for mwifiex teardown, by ensuring we bring the link down first. Sample deadlock trace: [ 247.103516] INFO: task rmmod:2119 blocked for more than 123 seconds. [ 247.110630] Not tainted 5.12.4 #5 [ 247.115796] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 247.124557] task:rmmod state:D stack: 0 pid: 2119 ppid: 2114 flags:0x00400208 [ 247.133905] Call trace: [ 247.136644] __switch_to+0x130/0x170 [ 247.140643] __schedule+0x714/0xa0c [ 247.144548] schedule_preempt_disabled+0x88/0xf4 [ 247.149714] __mutex_lock_common+0x43c/0x750 [ 247.154496] mutex_lock_nested+0x5c/0x68 [ 247.158884] cfg80211_netdev_notifier_call+0x280/0x4e0 [cfg80211] [ 247.165769] raw_notifier_call_chain+0x4c/0x78 [ 247.170742] call_netdevice_notifiers_info+0x68/0xa4 [ 247.176305] __dev_close_many+0x7c/0x138 [ 247.180693] dev_close_many+0x7c/0x10c [ 247.184893] unregister_netdevice_many+0xfc/0x654 [ 247.190158] unregister_netdevice_queue+0xb4/0xe0 [ 247.195424] _cfg80211_unregister_wdev+0xa4/0x204 [cfg80211] [ 247.201816] cfg80211_unregister_wdev+0x20/0x2c [cfg80211] [ 247.208016] mwifiex_del_virtual_intf+0xc8/0x188 [mwifiex] [ 247.214174] mwifiex_uninit_sw+0x158/0x1b0 [mwifiex] [ 247.219747] mwifiex_remove_card+0x38/0xa0 [mwifiex] [ 247.225316] mwifiex_pcie_remove+0xd0/0xe0 [mwifiex_pcie] [ 247.231451] pci_device_remove+0x50/0xe0 [ 247.235849] device_release_driver_internal+0x110/0x1b0 [ 247.241701] driver_detach+0x5c/0x9c [ 247.245704] bus_remove_driver+0x84/0xb8 [ 247.250095] driver_unregister+0x3c/0x60 [ 247.254486] pci_unregister_driver+0x2c/0x90 [ 247.259267] cleanup_module+0x18/0xcdc [mwifiex_pcie] Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-wireless/98392296-40ee-6300-369c-32e16cff3725@gmail.com/ Link: https://lore.kernel.org/linux-wireless/ab4d00ce52f32bd8e45ad0448a44737e@bewaar.me/ Reported-by: Maximilian Luz Reported-by: dave@bewaar.me Cc: Johannes Berg Signed-off-by: Brian Norris Tested-by: Maximilian Luz Tested-by: Dave Olsthoorn Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210515024227.2159311-1-briannorris@chromium.org --- drivers/net/wireless/marvell/mwifiex/main.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/marvell/mwifiex/main.c b/drivers/net/wireless/marvell/mwifiex/main.c index 529dfd8b7ae8..17399d4aa129 100644 --- a/drivers/net/wireless/marvell/mwifiex/main.c +++ b/drivers/net/wireless/marvell/mwifiex/main.c @@ -1445,11 +1445,18 @@ static void mwifiex_uninit_sw(struct mwifiex_adapter *adapter) if (!priv) continue; rtnl_lock(); - wiphy_lock(adapter->wiphy); if (priv->netdev && - priv->wdev.iftype != NL80211_IFTYPE_UNSPECIFIED) + priv->wdev.iftype != NL80211_IFTYPE_UNSPECIFIED) { + /* + * Close the netdev now, because if we do it later, the + * netdev notifiers will need to acquire the wiphy lock + * again --> deadlock. + */ + dev_close(priv->wdev.netdev); + wiphy_lock(adapter->wiphy); mwifiex_del_virtual_intf(adapter->wiphy, &priv->wdev); - wiphy_unlock(adapter->wiphy); + wiphy_unlock(adapter->wiphy); + } rtnl_unlock(); } -- cgit v1.2.3 From 6fd06963fa74197103cdbb4b494763127b3f2f34 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Jun 2021 15:21:49 +0200 Subject: xfrm: Fix error reporting in xfrm_state_construct. When memory allocation for XFRMA_ENCAP or XFRMA_COADDR fails, the error will not be reported because the -ENOMEM assignment to the err variable is overwritten before. Fix this by moving these two in front of the function so that memory allocation failures will be reported. Reported-by: Tobias Brunner Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5a0ef4361e43..817e714dedea 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -580,6 +580,20 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, copy_from_user_state(x, p); + if (attrs[XFRMA_ENCAP]) { + x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), + sizeof(*x->encap), GFP_KERNEL); + if (x->encap == NULL) + goto error; + } + + if (attrs[XFRMA_COADDR]) { + x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), + sizeof(*x->coaddr), GFP_KERNEL); + if (x->coaddr == NULL) + goto error; + } + if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); @@ -600,23 +614,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, attrs[XFRMA_ALG_COMP]))) goto error; - if (attrs[XFRMA_ENCAP]) { - x->encap = kmemdup(nla_data(attrs[XFRMA_ENCAP]), - sizeof(*x->encap), GFP_KERNEL); - if (x->encap == NULL) - goto error; - } - if (attrs[XFRMA_TFCPAD]) x->tfcpad = nla_get_u32(attrs[XFRMA_TFCPAD]); - if (attrs[XFRMA_COADDR]) { - x->coaddr = kmemdup(nla_data(attrs[XFRMA_COADDR]), - sizeof(*x->coaddr), GFP_KERNEL); - if (x->coaddr == NULL) - goto error; - } - xfrm_mark_get(attrs, &x->mark); xfrm_smark_init(attrs, &x->props.smark); -- cgit v1.2.3 From ab372c2293f5d0b279f31c8d768566ea37602dc9 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Fri, 11 Jun 2021 09:58:12 +0800 Subject: ieee802154: hwsim: Fix possible memory leak in hwsim_subscribe_all_others In hwsim_subscribe_all_others, the error handling code performs incorrectly if the second hwsim_alloc_edge fails. When this issue occurs, it goes to sub_fail, without cleaning the edges allocated before. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: Dongliang Mu Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20210611015812.1626999-1-mudongliangabcd@gmail.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index da9135231c07..366eaae3550a 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -715,6 +715,8 @@ static int hwsim_subscribe_all_others(struct hwsim_phy *phy) return 0; +sub_fail: + hwsim_edge_unsubscribe_me(phy); me_fail: rcu_read_lock(); list_for_each_entry_rcu(e, &phy->edges, list) { @@ -722,8 +724,6 @@ me_fail: hwsim_free_edge(e); } rcu_read_unlock(); -sub_fail: - hwsim_edge_unsubscribe_me(phy); return -ENOMEM; } -- cgit v1.2.3 From 8744365e258459775bd9b49b705a82d66a21c2b4 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 28 May 2021 10:48:49 +0200 Subject: MAINTAINERS: netfilter: add irc channel The community #netfilter IRC channel is now live on the libera.chat network (https://libera.chat/). CC: Arturo Borrero Gonzalez Link: https://marc.info/?l=netfilter&m=162210948632717 Signed-off-by: Nicolas Dichtel Signed-off-by: Pablo Neira Ayuso --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index bfb3d0931cba..f3d44262d16e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12657,6 +12657,7 @@ W: http://www.netfilter.org/ W: http://www.iptables.org/ W: http://www.nftables.org/ Q: http://patchwork.ozlabs.org/project/netfilter-devel/list/ +C: irc://irc.libera.chat/netfilter T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git F: include/linux/netfilter* -- cgit v1.2.3 From cdd73cc545c0fb9b1a1f7b209f4f536e7990cff4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 10 Jun 2021 20:20:30 +0200 Subject: netfilter: nft_exthdr: check for IPv6 packet before further processing ipv6_find_hdr() does not validate that this is an IPv6 packet. Add a sanity check for calling ipv6_find_hdr() to make sure an IPv6 packet is passed for parsing. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_exthdr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index f64f0017e9a5..670dd146fb2b 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -42,6 +42,9 @@ static void nft_exthdr_ipv6_eval(const struct nft_expr *expr, unsigned int offset = 0; int err; + if (pkt->skb->protocol != htons(ETH_P_IPV6)) + goto err; + err = ipv6_find_hdr(pkt->skb, &offset, priv->type, NULL, NULL); if (priv->flags & NFT_EXTHDR_F_PRESENT) { nft_reg_store8(dest, err >= 0); -- cgit v1.2.3 From 8f518d43f89ae00b9cf5460e10b91694944ca1a8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 10 Jun 2021 20:20:31 +0200 Subject: netfilter: nft_osf: check for TCP packet before further processing The osf expression only supports for TCP packets, add a upfront sanity check to skip packet parsing if this is not a TCP packet. Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Signed-off-by: Pablo Neira Ayuso Reported-by: kernel test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_osf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index ac61f708b82d..d82677e83400 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -28,6 +28,11 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nf_osf_data data; struct tcphdr _tcph; + if (pkt->tprot != IPPROTO_TCP) { + regs->verdict.code = NFT_BREAK; + return; + } + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) { -- cgit v1.2.3 From 52f0f4e178c757b3d356087376aad8bd77271828 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 11 Jun 2021 19:26:56 +0200 Subject: netfilter: nft_tproxy: restrict support to TCP and UDP transport protocols Add unfront check for TCP and UDP packets before performing further processing. Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_tproxy.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index accef672088c..5cb4d575d47f 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -30,6 +30,12 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { + regs->verdict.code = NFT_BREAK; + return; + } + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (!hp) { regs->verdict.code = NFT_BREAK; @@ -91,7 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, memset(&taddr, 0, sizeof(taddr)); - if (!pkt->tprot_set) { + if (pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } -- cgit v1.2.3 From 2f99619820c2269534eb2c0cde44870313c6d353 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Thu, 17 Jun 2021 11:22:55 +0200 Subject: xsk: Fix missing validation for skb and unaligned mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a missing validation of a Tx descriptor when executing in skb mode and the umem is in unaligned mode. A descriptor could point to a buffer straddling the end of the umem, thus effectively tricking the kernel to read outside the allowed umem region. This could lead to a kernel crash if that part of memory is not mapped. In zero-copy mode, the descriptor validation code rejects such descriptors by checking a bit in the DMA address that tells us if the next page is physically contiguous or not. For the last page in the umem, this bit is not set, therefore any descriptor pointing to a packet straddling this last page boundary will be rejected. However, the skb path does not use this bit since it copies out data and can do so to two different pages. (It also does not have the array of DMA address, so it cannot even store this bit.) The code just returned that the packet is always physically contiguous. But this is unfortunately also returned for the last page in the umem, which means that packets that cross the end of the umem are being allowed, which they should not be. Fix this by introducing a check for this in the SKB path only, not penalizing the zero-copy path. Fixes: 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210617092255.3487-1-magnus.karlsson@gmail.com --- include/net/xsk_buff_pool.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index eaa8386dbc63..7a9a23e7a604 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -147,11 +147,16 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, { bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; - if (pool->dma_pages_cnt && cross_pg) { + if (likely(!cross_pg)) + return false; + + if (pool->dma_pages_cnt) { return !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } - return false; + + /* skb path */ + return addr + len > pool->addrs_cnt; } static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) -- cgit v1.2.3 From f654fae47e83e56b454fbbfd0af0a4f232e356d6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 18 Jun 2021 09:58:05 +0200 Subject: xsk: Fix broken Tx ring validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix broken Tx ring validation for AF_XDP. The commit under the Fixes tag, fixed an off-by-one error in the validation but introduced another error. Descriptors are now let through even if they straddle a chunk boundary which they are not allowed to do in aligned mode. Worse is that they are let through even if they straddle the end of the umem itself, tricking the kernel to read data outside the allowed umem region which might or might not be mapped at all. Fix this by reintroducing the old code, but subtract the length by one to fix the off-by-one error that the original patch was addressing. The test chunk != chunk_end makes sure packets do not straddle chunk boundraries. Note that packets of zero length are allowed in the interface, therefore the test if the length is non-zero. Fixes: ac31565c2193 ("xsk: Fix for xp_aligned_validate_desc() when len == chunk_size") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Reviewed-by: Xuan Zhuo Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210618075805.14412-1-magnus.karlsson@gmail.com --- net/xdp/xsk_queue.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9d2a89d793c0..9ae13cccfb28 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -128,12 +128,15 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 chunk; - - if (desc->len > pool->chunk_size) - return false; + u64 chunk, chunk_end; chunk = xp_aligned_extract_addr(pool, desc->addr); + if (likely(desc->len)) { + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len - 1); + if (chunk != chunk_end) + return false; + } + if (chunk >= pool->addrs_cnt) return false; -- cgit v1.2.3 From 61e8aeda9398925f8c6fc290585bdd9727d154c4 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Thu, 17 Jun 2021 23:14:04 -0700 Subject: bpf: Fix libelf endian handling in resolv_btfids The vmlinux ".BTF_ids" ELF section is declared in btf_ids.h to hold a list of zero-filled BTF IDs, which is then patched at link-time with correct values by resolv_btfids. The section is flagged as "allocable" to preclude compression, but notably the section contents (BTF IDs) are untyped. When patching the BTF IDs, resolve_btfids writes in host-native endianness and relies on libelf for any required translation on reading and updating vmlinux. However, since the type of the .BTF_ids section content defaults to ELF_T_BYTE (i.e. unsigned char), no translation occurs. This results in incorrect patched values when cross-compiling to non-native endianness, and can manifest as kernel Oops and test failures which are difficult to troubleshoot [1]. Explicitly set the type of patched data to ELF_T_WORD, the architecture- neutral ELF type corresponding to the u32 BTF IDs. This enables libelf to transparently perform any needed endian conversions. Fixes: fbbb68de80a4 ("bpf: Add resolve_btfids tool to resolve BTF IDs in ELF object") Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Cc: Frank Eigler Cc: Mark Wielaard Cc: Jiri Olsa Cc: Yonghong Song Link: https://lore.kernel.org/bpf/CAPGftE_eY-Zdi3wBcgDfkz_iOr1KF10n=9mJHm1_a_PykcsoeA@mail.gmail.com [1] Link: https://lore.kernel.org/bpf/20210618061404.818569-1-Tony.Ambardar@gmail.com --- tools/bpf/resolve_btfids/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 7550fd9c3188..3ad9301b0f00 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -655,6 +655,9 @@ static int symbols_patch(struct object *obj) if (sets_patch(obj)) return -1; + /* Set type to ensure endian translation occurs. */ + obj->efile.idlist->d_type = ELF_T_WORD; + elf_flagdata(obj->efile.idlist, ELF_C_SET, ELF_F_DIRTY); err = elf_update(obj->efile.elf, ELF_C_WRITE); -- cgit v1.2.3 From e3a5de6d81d8b2199935c7eb3f7d17a50a7075b7 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 18 Jun 2021 17:57:31 +0300 Subject: net: ethernet: aeroflex: fix UAF in greth_of_remove static int greth_of_remove(struct platform_device *of_dev) { ... struct greth_private *greth = netdev_priv(ndev); ... unregister_netdev(ndev); free_netdev(ndev); of_iounmap(&of_dev->resource[0], greth->regs, resource_size(&of_dev->resource[0])); ... } greth is netdev private data, but it is used after free_netdev(). It can cause use-after-free when accessing greth pointer. So, fix it by moving free_netdev() after of_iounmap() call. Fixes: d4c41139df6e ("net: Add Aeroflex Gaisler 10/100/1G Ethernet MAC driver") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/ethernet/aeroflex/greth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c index d77fafbc1530..c560ad06f0be 100644 --- a/drivers/net/ethernet/aeroflex/greth.c +++ b/drivers/net/ethernet/aeroflex/greth.c @@ -1539,10 +1539,11 @@ static int greth_of_remove(struct platform_device *of_dev) mdiobus_unregister(greth->mdio); unregister_netdev(ndev); - free_netdev(ndev); of_iounmap(&of_dev->resource[0], greth->regs, resource_size(&of_dev->resource[0])); + free_netdev(ndev); + return 0; } -- cgit v1.2.3 From e4b8700e07a86e8eab6916aa5c5ba99042c34089 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 18 Jun 2021 19:14:31 +0300 Subject: net: ethernet: ezchip: fix UAF in nps_enet_remove priv is netdev private data, but it is used after free_netdev(). It can cause use-after-free when accessing priv pointer. So, fix it by moving free_netdev() after netif_napi_del() call. Fixes: 0dd077093636 ("NET: Add ezchip ethernet driver") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/ethernet/ezchip/nps_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index e3954d8835e7..20d2c2bb26e4 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -642,8 +642,8 @@ static s32 nps_enet_remove(struct platform_device *pdev) struct nps_enet_priv *priv = netdev_priv(ndev); unregister_netdev(ndev); - free_netdev(ndev); netif_napi_del(&priv->napi); + free_netdev(ndev); return 0; } -- cgit v1.2.3 From 4ae85b23e1f052379f0316e42494e2f84f2a3e6f Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 18 Jun 2021 19:14:37 +0300 Subject: net: ethernet: ezchip: remove redundant check err varibale will be set everytime, when code gets into this path. This check will just slowdown the execution and that's all. Fixes: 0dd077093636 ("NET: Add ezchip ethernet driver") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/ethernet/ezchip/nps_enet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index 20d2c2bb26e4..c562a1e83913 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -630,8 +630,7 @@ static s32 nps_enet_probe(struct platform_device *pdev) out_netif_api: netif_napi_del(&priv->napi); out_netdev: - if (err) - free_netdev(ndev); + free_netdev(ndev); return err; } -- cgit v1.2.3 From 0de449d599594f5472e00267d651615c7f2c6c1d Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 18 Jun 2021 19:14:47 +0300 Subject: net: ethernet: ezchip: fix error handling As documented at drivers/base/platform.c for platform_get_irq: * Gets an IRQ for a platform device and prints an error message if finding the * IRQ fails. Device drivers should check the return value for errors so as to * not pass a negative integer value to the request_irq() APIs. So, the driver should check that platform_get_irq() return value is _negative_, not that it's equal to zero, because -ENXIO (return value from request_irq() if irq was not found) will pass this check and it leads to passing negative irq to request_irq() Fixes: 0dd077093636 ("NET: Add ezchip ethernet driver") Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/ethernet/ezchip/nps_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index c562a1e83913..f9a288a6ec8c 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -607,7 +607,7 @@ static s32 nps_enet_probe(struct platform_device *pdev) /* Get IRQ number */ priv->irq = platform_get_irq(pdev, 0); - if (!priv->irq) { + if (priv->irq < 0) { dev_err(dev, "failed to retrieve value from device tree\n"); err = -ENODEV; goto out_netdev; -- cgit v1.2.3 From d5f9023fa61ee8b94f37a93f08e94b136cf1e463 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Sat, 19 Jun 2021 13:18:13 -0300 Subject: can: bcm: delay release of struct bcm_op after synchronize_rcu() can_rx_register() callbacks may be called concurrently to the call to can_rx_unregister(). The callbacks and callback data, though, are protected by RCU and the struct sock reference count. So the callback data is really attached to the life of sk, meaning that it should be released on sk_destruct. However, bcm_remove_op() calls tasklet_kill(), and RCU callbacks may be called under RCU softirq, so that cannot be used on kernels before the introduction of HRTIMER_MODE_SOFT. However, bcm_rx_handler() is called under RCU protection, so after calling can_rx_unregister(), we may call synchronize_rcu() in order to wait for any RCU read-side critical sections to finish. That is, bcm_rx_handler() won't be called anymore for those ops. So, we only free them, after we do that synchronize_rcu(). Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Link: https://lore.kernel.org/r/20210619161813.2098382-1-cascardo@canonical.com Cc: linux-stable Reported-by: syzbot+0f7e7e5e2f4f40fa89c0@syzkaller.appspotmail.com Reported-by: Norbert Slusarek Signed-off-by: Thadeu Lima de Souza Cascardo Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index f3e4d9528fa3..0928a39c4423 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -785,6 +785,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, bcm_rx_handler, op); list_del(&op->list); + synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } @@ -1533,9 +1534,13 @@ static int bcm_release(struct socket *sock) REGMASK(op->can_id), bcm_rx_handler, op); - bcm_remove_op(op); } + synchronize_rcu(); + + list_for_each_entry_safe(op, next, &bo->rx_ops, list) + bcm_remove_op(op); + #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) -- cgit v1.2.3 From fb8696ab14adadb2e3f6c17c18ed26b3ecd96691 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 18 Jun 2021 19:36:45 +0200 Subject: can: gw: synchronize rcu operations before removing gw job entry can_can_gw_rcv() is called under RCU protection, so after calling can_rx_unregister(), we have to call synchronize_rcu in order to wait for any RCU read-side critical sections to finish before removing the kmem_cache entry with the referenced gw job entry. Link: https://lore.kernel.org/r/20210618173645.2238-1-socketcan@hartkopp.net Fixes: c1aabdf379bc ("can-gw: add netlink based CAN routing") Cc: linux-stable Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/gw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/can/gw.c b/net/can/gw.c index ba4124805602..d8861e862f15 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -596,6 +596,7 @@ static int cgw_notifier(struct notifier_block *nb, if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1154,6 +1155,7 @@ static void cgw_remove_all_jobs(struct net *net) hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } @@ -1222,6 +1224,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); + synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); err = 0; break; -- cgit v1.2.3 From 14a4696bc3118ba49da28f79280e1d55603aa737 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 18 Jun 2021 19:37:13 +0200 Subject: can: isotp: isotp_release(): omit unintended hrtimer restart on socket release When closing the isotp socket, the potentially running hrtimers are canceled before removing the subscription for CAN identifiers via can_rx_unregister(). This may lead to an unintended (re)start of a hrtimer in isotp_rcv_cf() and isotp_rcv_fc() in the case that a CAN frame is received by isotp_rcv() while the subscription removal is processed. However, isotp_rcv() is called under RCU protection, so after calling can_rx_unregister, we may call synchronize_rcu in order to wait for any RCU read-side critical sections to finish. This prevents the reception of CAN frames after hrtimer_cancel() and therefore the unintended (re)start of the hrtimers. Link: https://lore.kernel.org/r/20210618173713.2296-1-socketcan@hartkopp.net Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Cc: linux-stable Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/can/isotp.c b/net/can/isotp.c index be6183f8ca11..234cc4ad179a 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1028,9 +1028,6 @@ static int isotp_release(struct socket *sock) lock_sock(sk); - hrtimer_cancel(&so->txtimer); - hrtimer_cancel(&so->rxtimer); - /* remove current filters & unregister */ if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { if (so->ifindex) { @@ -1042,10 +1039,14 @@ static int isotp_release(struct socket *sock) SINGLE_MASK(so->rxid), isotp_rcv, sk); dev_put(dev); + synchronize_rcu(); } } } + hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->rxtimer); + so->ifindex = 0; so->bound = 0; -- cgit v1.2.3 From 22c696fed25c63c7f67508309820358b94a96b6d Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 17 Jun 2021 15:06:23 +0200 Subject: can: j1939: j1939_sk_init(): set SOCK_RCU_FREE to call sk_destruct() after RCU is done Set SOCK_RCU_FREE to let RCU to call sk_destruct() on completion. Without this patch, we will run in to j1939_can_recv() after priv was freed by j1939_sk_release()->j1939_sk_sock_destruct() Fixes: 25fe97cb7620 ("can: j1939: move j1939_priv_put() into sk_destruct callback") Link: https://lore.kernel.org/r/20210617130623.12705-1-o.rempel@pengutronix.de Cc: linux-stable Reported-by: Thadeu Lima de Souza Cascardo Reported-by: syzbot+bdf710cfc41c186fdff3@syzkaller.appspotmail.com Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/main.c | 4 ++++ net/can/j1939/socket.c | 3 +++ 2 files changed, 7 insertions(+) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index da3a7a7bcff2..08c8606cfd9c 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -193,6 +193,10 @@ static void j1939_can_rx_unregister(struct j1939_priv *priv) can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK, j1939_can_recv, priv); + /* The last reference of priv is dropped by the RCU deferred + * j1939_sk_sock_destruct() of the last socket, so we can + * safely drop this reference here. + */ j1939_priv_put(priv); } diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 56aa66147d5a..fce8bc8afeb7 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -398,6 +398,9 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + + /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ + sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; -- cgit v1.2.3 From ab4a0b8fcb9a95c02909b62049811bd2e586aaa4 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Thu, 17 Jun 2021 21:51:30 +0300 Subject: net: can: ems_usb: fix use-after-free in ems_usb_disconnect() In ems_usb_disconnect() dev pointer, which is netdev private data, is used after free_candev() call: | if (dev) { | unregister_netdev(dev->netdev); | free_candev(dev->netdev); | | unlink_all_urbs(dev); | | usb_free_urb(dev->intr_urb); | | kfree(dev->intr_in_buffer); | kfree(dev->tx_msg_buffer); | } Fix it by simply moving free_candev() at the end of the block. Fail log: | BUG: KASAN: use-after-free in ems_usb_disconnect | Read of size 8 at addr ffff88804e041008 by task kworker/1:2/2895 | | CPU: 1 PID: 2895 Comm: kworker/1:2 Not tainted 5.13.0-rc5+ #164 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.4 | Workqueue: usb_hub_wq hub_event | Call Trace: | dump_stack (lib/dump_stack.c:122) | print_address_description.constprop.0.cold (mm/kasan/report.c:234) | kasan_report.cold (mm/kasan/report.c:420 mm/kasan/report.c:436) | ems_usb_disconnect (drivers/net/can/usb/ems_usb.c:683 drivers/net/can/usb/ems_usb.c:1058) Fixes: 702171adeed3 ("ems_usb: Added support for EMS CPC-USB/ARM7 CAN/USB interface") Link: https://lore.kernel.org/r/20210617185130.5834-1-paskripkin@gmail.com Cc: linux-stable Signed-off-by: Pavel Skripkin Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/ems_usb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 5af69787d9d5..0a37af4a3fa4 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -1053,7 +1053,6 @@ static void ems_usb_disconnect(struct usb_interface *intf) if (dev) { unregister_netdev(dev->netdev); - free_candev(dev->netdev); unlink_all_urbs(dev); @@ -1061,6 +1060,8 @@ static void ems_usb_disconnect(struct usb_interface *intf) kfree(dev->intr_in_buffer); kfree(dev->tx_msg_buffer); + + free_candev(dev->netdev); } } -- cgit v1.2.3 From 9f2470fbc4cb4583c080bb729a998933ba61aca4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:35 -0700 Subject: skmsg: Improve udp_bpf_recvmsg() accuracy I tried to reuse sk_msg_wait_data() for different protocols, but it turns out it can not be simply reused. For example, UDP actually uses two queues to receive skb: udp_sk(sk)->reader_queue and sk->sk_receive_queue. So we have to check both of them to know whether we have received any packet. Also, UDP does not lock the sock during BH Rx path, it makes no sense for its ->recvmsg() to lock the sock. It is always possible for ->recvmsg() to be called before packets actually arrive in the receive queue, we just use best effort to make it accurate here. Fixes: 1f5be6b3b063 ("udp: Implement udp_bpf_recvmsg() for sockmap") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-2-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 -- net/core/skmsg.c | 23 ----------------------- net/ipv4/tcp_bpf.c | 24 +++++++++++++++++++++++- net/ipv4/udp_bpf.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 65 insertions(+), 31 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index aba0f0f429be..e3d080c299f6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -126,8 +126,6 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err); int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 43ce17a6a585..f9a81b314e4c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,29 +399,6 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, - long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} -EXPORT_SYMBOL_GPL(sk_msg_wait_data); - /* Receive sk_msg from psock->ingress_msg to @msg. */ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ad9d17923fc5..bb49b52d7be8 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -163,6 +163,28 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } +static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -188,7 +210,7 @@ msg_bytes_ready: long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = tcp_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 954c4591a6fd..565a70040c57 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -21,6 +21,45 @@ static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); } +static bool udp_sk_has_data(struct sock *sk) +{ + return !skb_queue_empty(&udp_sk(sk)->reader_queue) || + !skb_queue_empty(&sk->sk_receive_queue); +} + +static bool psock_has_data(struct sk_psock *psock) +{ + return !skb_queue_empty(&psock->ingress_skb) || + !sk_psock_queue_empty(psock); +} + +#define udp_msg_has_data(__sk, __psock) \ + ({ udp_sk_has_data(__sk) || psock_has_data(__psock); }) + +static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = udp_msg_has_data(sk, psock); + if (!ret) { + wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + ret = udp_msg_has_data(sk, psock); + } + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -34,8 +73,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(!psock)) return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - lock_sock(sk); - if (sk_psock_queue_empty(psock)) { + if (!psock_has_data(psock)) { ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; } @@ -47,9 +85,9 @@ msg_bytes_ready: long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + data = udp_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { - if (!sk_psock_queue_empty(psock)) + if (psock_has_data(psock)) goto msg_bytes_ready; ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); goto out; @@ -62,7 +100,6 @@ msg_bytes_ready: } ret = copied; out: - release_sock(sk); sk_psock_put(sk, psock); return ret; } -- cgit v1.2.3 From a7e65fe7d8201527129206754db1a2db6a6b2fde Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:36 -0700 Subject: selftests/bpf: Retry for EAGAIN in udp_redir_to_connected() We use non-blocking sockets for testing sockmap redirections, and got some random EAGAIN errors from UDP tests. There is no guarantee the packet would be immediately available to receive as soon as it is sent out, even on the local host. For UDP, this is especially true because it does not lock the sock during BH (unlike the TCP path). This is probably why we only saw this error in UDP cases. No matter how hard we try to make the queue empty check accurate, it is always possible for recvmsg() to beat ->sk_data_ready(). Therefore, we should just retry in case of EAGAIN. Fixes: d6378af615275 ("selftests/bpf: Add a test case for udp sockmap") Reported-by: Jiang Wang Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-3-xiyou.wangcong@gmail.com --- tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index 648d9ae898d2..01ab11259809 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1610,6 +1610,7 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, struct sockaddr_storage addr; int c0, c1, p0, p1; unsigned int pass; + int retries = 100; socklen_t len; int err, n; u64 value; @@ -1686,9 +1687,13 @@ static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, if (pass != 1) FAIL("%s: want pass count 1, have %d", log_prefix, pass); +again: n = read(mode == REDIR_INGRESS ? p0 : c0, &b, 1); - if (n < 0) + if (n < 0) { + if (errno == EAGAIN && retries--) + goto again; FAIL_ERRNO("%s: read", log_prefix); + } if (n == 0) FAIL("%s: incomplete read", log_prefix); -- cgit v1.2.3 From e00a5c331bf57f41fcfdc5da4f5caeafe5e54c1d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:37 -0700 Subject: udp: Fix a memory leak in udp_read_sock() sk_psock_verdict_recv() clones the skb and uses the clone afterward, so udp_read_sock() should free the skb after using it, regardless of error or not. This fixes a real kmemleak. Fixes: d7f571188ecf ("udp: Implement ->read_sock() for sockmap") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-4-xiyou.wangcong@gmail.com --- net/ipv4/udp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307ad0d3b9e..8091276cb85b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1798,11 +1798,13 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, if (used <= 0) { if (!copied) copied = used; + kfree_skb(skb); break; } else if (used <= skb->len) { copied += used; } + kfree_skb(skb); if (!desc->count) break; } -- cgit v1.2.3 From 30b9c54a707db4155735cf71f4600241c1b7b6ff Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:38 -0700 Subject: skmsg: Clear skb redirect pointer before dropping it When we drop skb inside sk_psock_skb_redirect(), we have to clear its skb->_sk_redir pointer too, otherwise kfree_skb() would misinterpret it as a valid skb->_skb_refdst and dst_release() would eventually complain. Fixes: e3526bb92a20 ("skmsg: Move sk_redir from TCP_SKB_CB to skb") Reported-by: Jiang Wang Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-5-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f9a81b314e4c..4334720e2a04 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -843,12 +843,14 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * a socket that is in this state so we drop the skb. */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + skb_bpf_redirect_clear(skb); kfree_skb(skb); return; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); + skb_bpf_redirect_clear(skb); kfree_skb(skb); return; } -- cgit v1.2.3 From 0cf6672b23c8aa9d9274798dd63cbf6ede77ef90 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:39 -0700 Subject: skmsg: Fix a memory leak in sk_psock_verdict_apply() If the dest psock does not set SK_PSOCK_TX_ENABLED, the skb can't be queued anywhere so must be dropped. This one is found during code review. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-6-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4334720e2a04..5464477e2d3d 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -924,8 +924,13 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); schedule_work(&psock->work); + err = 0; } spin_unlock_bh(&psock->ingress_lock); + if (err < 0) { + skb_bpf_redirect_clear(skb); + goto out_free; + } } break; case __SK_REDIRECT: -- cgit v1.2.3 From 1581a6c1c3291a8320b080f4411345f60229976d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:40 -0700 Subject: skmsg: Teach sk_psock_verdict_apply() to return errors Currently sk_psock_verdict_apply() is void, but it handles some error conditions too. Its caller is impossible to learn whether it succeeds or fails, especially sk_psock_verdict_recv(). Make it return int to indicate error cases and propagate errors to callers properly. Fixes: ef5659280eb1 ("bpf, sockmap: Allow skipping sk_skb parser program") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-7-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 5464477e2d3d..e3d210811db4 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -824,7 +824,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static void sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -835,7 +835,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) */ if (unlikely(!sk_other)) { kfree_skb(skb); - return; + return -EIO; } psock_other = sk_psock(sk_other); /* This error indicates the socket is being torn down or had another @@ -845,19 +845,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); kfree_skb(skb); - return; + return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); kfree_skb(skb); - return; + return -EIO; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); spin_unlock_bh(&psock_other->ingress_lock); + return 0; } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -894,14 +895,15 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, + int verdict) { struct sock *sk_other; - int err = -EIO; + int err = 0; switch (verdict) { case __SK_PASS: + err = -EIO; sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { @@ -934,13 +936,15 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } break; case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(skb); break; case __SK_DROP: default: out_free: kfree_skb(skb); } + + return err; } static void sk_psock_write_space(struct sock *sk) @@ -1107,7 +1111,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_verdict_apply(psock, skb, ret); + if (sk_psock_verdict_apply(psock, skb, ret) < 0) + len = 0; out: rcu_read_unlock(); return len; -- cgit v1.2.3 From 42830571f1fd9751b3fbf38084bbb253320e185f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:41 -0700 Subject: skmsg: Pass source psock to sk_psock_skb_redirect() sk_psock_skb_redirect() only takes skb as a parameter, we will need to know where this skb is from, so just pass the source psock to this function as a new parameter. This patch prepares for the next one. Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-8-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e3d210811db4..3aa9065811ad 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -824,7 +824,7 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_skb_redirect(struct sk_buff *skb) +static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; @@ -861,11 +861,12 @@ static int sk_psock_skb_redirect(struct sk_buff *skb) return 0; } -static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) +static void sk_psock_tls_verdict_apply(struct sk_buff *skb, + struct sk_psock *from, int verdict) { switch (verdict) { case __SK_REDIRECT: - sk_psock_skb_redirect(skb); + sk_psock_skb_redirect(from, skb); break; case __SK_PASS: case __SK_DROP: @@ -889,7 +890,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } - sk_psock_tls_verdict_apply(skb, psock->sk, ret); + sk_psock_tls_verdict_apply(skb, psock, ret); rcu_read_unlock(); return ret; } @@ -936,7 +937,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, } break; case __SK_REDIRECT: - err = sk_psock_skb_redirect(skb); + err = sk_psock_skb_redirect(psock, skb); break; case __SK_DROP: default: -- cgit v1.2.3 From 781dd0431eb549f9cb1fdddf91a50d985febe884 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 14 Jun 2021 19:13:42 -0700 Subject: skmsg: Increase sk->sk_drops when dropping packets It is hard to observe packet drops without increasing relevant drop counters, here we should increase sk->sk_drops which is a protocol-independent counter. Fortunately psock is always associated with a struct sock, we can just use psock->sk. Suggested-by: John Fastabend Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210615021342.7416-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 3aa9065811ad..9b6160a191f8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -578,6 +578,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return sk_psock_skb_ingress(psock, skb); } +static void sock_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + kfree_skb(skb); +} + static void sk_psock_backlog(struct work_struct *work) { struct sk_psock *psock = container_of(work, struct sk_psock, work); @@ -617,7 +623,7 @@ start: /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - kfree_skb(skb); + sock_drop(psock->sk, skb); goto end; } off += ret; @@ -708,7 +714,7 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(psock->sk, skb); } __sk_psock_purge_ingress_msg(psock); } @@ -834,7 +840,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } psock_other = sk_psock(sk_other); @@ -844,14 +850,14 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) */ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } spin_lock_bh(&psock_other->ingress_lock); if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { spin_unlock_bh(&psock_other->ingress_lock); skb_bpf_redirect_clear(skb); - kfree_skb(skb); + sock_drop(from->sk, skb); return -EIO; } @@ -942,7 +948,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, case __SK_DROP: default: out_free: - kfree_skb(skb); + sock_drop(psock->sk, skb); } return err; @@ -977,7 +983,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) sk = strp->sk; psock = sk_psock(sk); if (unlikely(!psock)) { - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); @@ -1098,7 +1104,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, psock = sk_psock(sk); if (unlikely(!psock)) { len = 0; - kfree_skb(skb); + sock_drop(sk, skb); goto out; } prog = READ_ONCE(psock->progs.stream_verdict); -- cgit v1.2.3 From 603113c514e95c3350598bc3cccbd03af7ea4ab2 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 18 Jun 2021 17:15:53 +0200 Subject: vrf: do not push non-ND strict packets with a source LLA through packet taps again Non-ND strict packets with a source LLA go through the packet taps again, while non-ND strict packets with other source addresses do not, and we can see a clone of those packets on the vrf interface (we should not). This is due to a series of changes: Commit 6f12fa775530[1] made non-ND strict packets not being pushed again in the packet taps. This changed with commit 205704c618af[2] for those packets having a source LLA, as they need a lookup with the orig_iif. The issue now is those packets do not skip the 'vrf_ip6_rcv' function to the end (as the ones without a source LLA) and go through the check to call packet taps again. This check was changed by commit 6f12fa775530[1] and do not exclude non-strict packets anymore. Packets matching 'need_strict && !is_ndisc && is_ll_src' are now being sent through the packet taps again. This can be seen by dumping packets on the vrf interface. Fix this by having the same code path for all non-ND strict packets and selectively lookup with the orig_iif for those with a source LLA. This has the effect to revert to the pre-205704c618af[2] condition, which should also be easier to maintain. [1] 6f12fa775530 ("vrf: mark skb for multicast or link-local as enslaved to VRF") [2] 205704c618af ("vrf: packets with lladdr src needs dst at input with orig_iif when needs strict") Fixes: 205704c618af ("vrf: packets with lladdr src needs dst at input with orig_iif when needs strict") Cc: Stephen Suryaputra Reported-by: Paolo Abeni Signed-off-by: Antoine Tenart Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 28a6c4cfe9b8..414afcb0a23f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1366,22 +1366,22 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); - bool is_ll_src; /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. - * for packets with lladdr src, however, skip so that the dst can be - * determine at input using original ifindex in the case that daddr - * needs strict + * For strict packets with a source LLA, determine the dst using the + * original ifindex. */ - is_ll_src = ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL; - if (skb->pkt_type == PACKET_LOOPBACK || - (need_strict && !is_ndisc && !is_ll_src)) { + if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; + if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; + else if (ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) + vrf_ip6_input_dst(skb, vrf_dev, orig_iif); + goto out; } -- cgit v1.2.3 From 89837eb4b2463c556a123437f242d6c2bc62ce81 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Thu, 17 Jun 2021 09:04:14 +0800 Subject: net: sched: add barrier to ensure correct ordering for lockless qdisc The spin_trylock() was assumed to contain the implicit barrier needed to ensure the correct ordering between STATE_MISSED setting/clearing and STATE_MISSED checking in commit a90c57f2cedd ("net: sched: fix packet stuck problem for lockless qdisc"). But it turns out that spin_trylock() only has load-acquire semantic, for strongly-ordered system(like x86), the compiler barrier implicitly contained in spin_trylock() seems enough to ensure the correct ordering. But for weakly-orderly system (like arm64), the store-release semantic is needed to ensure the correct ordering as clear_bit() and test_bit() is store operation, see queued_spin_lock(). So add the explicit barrier to ensure the correct ordering for the above case. Fixes: a90c57f2cedd ("net: sched: fix packet stuck problem for lockless qdisc") Signed-off-by: Yunsheng Lin Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/sch_generic.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1e625519ae96..57710303908c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -163,6 +163,12 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) if (spin_trylock(&qdisc->seqlock)) goto nolock_empty; + /* Paired with smp_mb__after_atomic() to make sure + * STATE_MISSED checking is synchronized with clearing + * in pfifo_fast_dequeue(). + */ + smp_mb__before_atomic(); + /* If the MISSED flag is set, it means other thread has * set the MISSED flag before second spin_trylock(), so * we can return false here to avoid multi cpus doing @@ -180,6 +186,12 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) */ set_bit(__QDISC_STATE_MISSED, &qdisc->state); + /* spin_trylock() only has load-acquire semantic, so use + * smp_mb__after_atomic() to ensure STATE_MISSED is set + * before doing the second spin_trylock(). + */ + smp_mb__after_atomic(); + /* Retry again in case other CPU may not see the new flag * after it releases the lock at the end of qdisc_run_end(). */ -- cgit v1.2.3 From 2b4cd14fd995e0a863b2ced4cba0bcd804d89ebc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 17 Jun 2021 09:38:17 +0200 Subject: net/netif_receive_skb_core: Use migrate_disable() The preempt disable around do_xdp_generic() has been introduced in commit bbbe211c295ff ("net: rcu lock and preempt disable missing around generic xdp") For BPF it is enough to use migrate_disable() and the code was updated as it can be seen in commit 3c58482a382ba ("bpf: Provide bpf_prog_run_pin_on_cpu() helper") This is a leftover which was not converted. Use migrate_disable() before invoking do_xdp_generic(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ef8cf7619baf..439faadab0c2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5277,9 +5277,9 @@ another_round: if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret2; - preempt_disable(); + migrate_disable(); ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - preempt_enable(); + migrate_enable(); if (ret2 != XDP_PASS) { ret = NET_RX_DROP; -- cgit v1.2.3 From 536ba2e06d1aaaed8a11c30e1609281cd955082e Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 18 Jun 2021 12:35:39 -0700 Subject: hv_netvsc: Set needed_headroom according to VF Set needed_headroom according to VF if VF needs a bigger headroom. Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index f682a5572d84..382bebc2420d 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2384,6 +2384,9 @@ static int netvsc_register_vf(struct net_device *vf_netdev) dev_hold(vf_netdev); rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev); + if (ndev->needed_headroom < vf_netdev->needed_headroom) + ndev->needed_headroom = vf_netdev->needed_headroom; + vf_netdev->wanted_features = ndev->features; netdev_update_features(vf_netdev); @@ -2462,6 +2465,8 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL); dev_put(vf_netdev); + ndev->needed_headroom = RNDIS_AND_PPI_SIZE; + return NOTIFY_OK; } -- cgit v1.2.3 From d452d48b9f8b1a7f8152d33ef52cfd7fe1735b0a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Jun 2021 13:34:06 -0700 Subject: tls: prevent oversized sendfile() hangs by ignoring MSG_MORE We got multiple reports that multi_chunk_sendfile test case from tls selftest fails. This was sort of expected, as the original fix was never applied (see it in the first Link:). The test in question uses sendfile() with count larger than the size of the underlying file. This will make splice set MSG_MORE on all sendpage calls, meaning TLS will never close and flush the last partial record. Eric seem to have addressed a similar problem in commit 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push() once") by introducing MSG_SENDPAGE_NOTLAST. Unlike MSG_MORE MSG_SENDPAGE_NOTLAST is not set on the last call of a "pipefull" of data (PIPE_DEF_BUFFERS == 16, so every 16 pages or whenever we run out of data). Having a break every 16 pages should be fine, TLS can pack exactly 4 pages into a record, so for aligned reads there should be no difference, unaligned may see one extra record per sendpage(). Sticking to TCP semantics seems preferable to modifying splice, but we can revisit it if real life scenarios show a regression. Reported-by: Vadim Fedorenko Reported-by: Seth Forshee Link: https://lore.kernel.org/netdev/1591392508-14592-1-git-send-email-pooja.trivedi@stackpath.com/ Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: Jakub Kicinski Tested-by: Seth Forshee Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 694de024d0ee..74e5701034aa 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1153,7 +1153,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, int ret = 0; bool eor; - eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); + eor = !(flags & MSG_SENDPAGE_NOTLAST); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Call the sk_stream functions to manage the sndbuf mem. */ -- cgit v1.2.3 From 3c5e44622011b9ea21bd425875dcccfc9a158f5f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 19 Jun 2021 00:55:20 +0200 Subject: netfilter: nf_tables: memleak in hw offload abort path Release flow from the abort path, this is easy to reproduce since b72920f6e4a9 ("netfilter: nftables: counter hardware offload support"). If the preparation phase fails, then the abort path is exercised without releasing the flow rule object. unreferenced object 0xffff8881f0fa7700 (size 128): comm "nft", pid 1335, jiffies 4294931120 (age 4163.740s) hex dump (first 32 bytes): 08 e4 de 13 82 88 ff ff 98 e4 de 13 82 88 ff ff ................ 48 e4 de 13 82 88 ff ff 01 00 00 00 00 00 00 00 H............... backtrace: [<00000000634547e7>] flow_rule_alloc+0x26/0x80 [<00000000c8426156>] nft_flow_rule_create+0xc9/0x3f0 [nf_tables] [<0000000075ff8e46>] nf_tables_newrule+0xc79/0x10a0 [nf_tables] [<00000000ba65e40e>] nfnetlink_rcv_batch+0xaac/0xf90 [nfnetlink] [<00000000505c614a>] nfnetlink_rcv+0x1bb/0x1f0 [nfnetlink] [<00000000eb78e1fe>] netlink_unicast+0x34b/0x480 [<00000000a8f72c94>] netlink_sendmsg+0x3af/0x690 [<000000009cb1ddf4>] sock_sendmsg+0x96/0xa0 [<0000000039d06e44>] ____sys_sendmsg+0x3fe/0x440 [<00000000137e82ca>] ___sys_sendmsg+0xd8/0x140 [<000000000c6bf6a6>] __sys_sendmsg+0xb3/0x130 [<0000000043bd6268>] do_syscall_64+0x40/0xb0 [<00000000afdebc2d>] entry_SYSCALL_64_after_hwframe+0x44/0xae Remove flow rule release from the offload commit path, otherwise error from the offload commit phase might trigger a double-free due to the execution of the abort_offload -> abort. After this patch, the abort path takes care of releasing the flow rule. This fix also needs to move the nft_flow_rule_create() call before the transaction object is added otherwise the abort path might find a NULL pointer to the flow rule object for the NFT_CHAIN_HW_OFFLOAD case. While at it, rename BASIC-like goto tags to slightly more meaningful names rather than adding a new "err3" tag. Fixes: 63b48c73ff56 ("netfilter: nf_tables_offload: undo updates if transaction fails") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 51 ++++++++++++++++++++++++--------------- net/netfilter/nf_tables_offload.c | 17 ------------- 2 files changed, 31 insertions(+), 37 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bf4d6ec9fc55..ca9ec8721e6c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3243,9 +3243,9 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, u8 genmask = nft_genmask_next(info->net); struct nft_rule *rule, *old_rule = NULL; struct nft_expr_info *expr_info = NULL; + struct nft_flow_rule *flow = NULL; int family = nfmsg->nfgen_family; struct net *net = info->net; - struct nft_flow_rule *flow; struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; @@ -3340,13 +3340,13 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { err = -EINVAL; if (nla_type(tmp) != NFTA_LIST_ELEM) - goto err1; + goto err_release_expr; if (n == NFT_RULE_MAXEXPRS) - goto err1; + goto err_release_expr; err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) { NL_SET_BAD_ATTR(extack, tmp); - goto err1; + goto err_release_expr; } size += expr_info[n].ops->size; n++; @@ -3355,7 +3355,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, /* Check for overflow of dlen field */ err = -EFBIG; if (size >= 1 << 12) - goto err1; + goto err_release_expr; if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); @@ -3366,7 +3366,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err = -ENOMEM; rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); if (rule == NULL) - goto err1; + goto err_release_expr; nft_activate_next(net, rule); @@ -3385,7 +3385,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { NL_SET_BAD_ATTR(extack, expr_info[i].attr); - goto err2; + goto err_release_rule; } if (expr_info[i].ops->validate) @@ -3395,16 +3395,24 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, expr = nft_expr_next(expr); } + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { + flow = nft_flow_rule_create(net, rule); + if (IS_ERR(flow)) { + err = PTR_ERR(flow); + goto err_release_rule; + } + } + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } err = nft_delrule(&ctx, old_rule); if (err < 0) { nft_trans_destroy(trans); - goto err2; + goto err_destroy_flow_rule; } list_add_tail_rcu(&rule->list, &old_rule->list); @@ -3412,7 +3420,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (!trans) { err = -ENOMEM; - goto err2; + goto err_destroy_flow_rule; } if (info->nlh->nlmsg_flags & NLM_F_APPEND) { @@ -3430,21 +3438,19 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, kvfree(expr_info); chain->use++; + if (flow) + nft_trans_flow_rule(trans) = flow; + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { - flow = nft_flow_rule_create(net, rule); - if (IS_ERR(flow)) - return PTR_ERR(flow); - - nft_trans_flow_rule(trans) = flow; - } - return 0; -err2: + +err_destroy_flow_rule: + nft_flow_rule_destroy(flow); +err_release_rule: nf_tables_rule_release(&ctx, rule); -err1: +err_release_expr: for (i = 0; i < n; i++) { if (expr_info[i].ops) { module_put(expr_info[i].ops->type->owner); @@ -8839,11 +8845,16 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_rule_expr_deactivate(&trans->ctx, nft_trans_rule(trans), NFT_TRANS_ABORT); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_DELRULE: trans->ctx.chain->use++; nft_clear(trans->ctx.net, nft_trans_rule(trans)); nft_rule_expr_activate(&trans->ctx, nft_trans_rule(trans)); + if (trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_destroy(nft_trans_flow_rule(trans)); + nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index a48c5fd53a80..ec701b84844f 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -594,23 +594,6 @@ int nft_flow_rule_offload_commit(struct net *net) } } - list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->ctx.family != NFPROTO_NETDEV) - continue; - - switch (trans->msg_type) { - case NFT_MSG_NEWRULE: - case NFT_MSG_DELRULE: - if (!(trans->ctx.chain->flags & NFT_CHAIN_HW_OFFLOAD)) - continue; - - nft_flow_rule_destroy(nft_trans_flow_rule(trans)); - break; - default: - break; - } - } - return err; } -- cgit v1.2.3 From ea45fdf82cc90430bb7c280e5e53821e833782c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 19 Jun 2021 01:25:14 +0200 Subject: netfilter: nf_tables_offload: check FLOW_DISSECTOR_KEY_BASIC in VLAN transfer logic The VLAN transfer logic should actually check for FLOW_DISSECTOR_KEY_BASIC, not FLOW_DISSECTOR_KEY_CONTROL. Moreover, do not fallback to case 2) .n_proto is set to 802.1q or 802.1ad, if FLOW_DISSECTOR_KEY_BASIC is unset. Fixes: 783003f3bb8a ("netfilter: nftables_offload: special ethertype handling for VLAN") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_offload.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index ec701b84844f..b58d73a96523 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -54,15 +54,10 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; - struct nft_offload_ethertype ethertype; - - if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL) && - match->key.basic.n_proto != htons(ETH_P_8021Q) && - match->key.basic.n_proto != htons(ETH_P_8021AD)) - return; - - ethertype.value = match->key.basic.n_proto; - ethertype.mask = match->mask.basic.n_proto; + struct nft_offload_ethertype ethertype = { + .value = match->key.basic.n_proto, + .mask = match->mask.basic.n_proto, + }; if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || @@ -76,7 +71,9 @@ static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); - } else { + } else if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_BASIC) && + (match->key.basic.n_proto == htons(ETH_P_8021Q) || + match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; -- cgit v1.2.3 From 1502328f17ab0684ca5ed6764433aa0a83bdaf95 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Jun 2021 15:02:20 -0700 Subject: mptcp: fix bad handling of 32 bit ack wrap-around When receiving 32 bits DSS ack from the peer, the MPTCP need to expand them to 64 bits value. The current code is buggy WRT detecting 32 bits ack wrap-around: when the wrap-around happens the current unsigned 32 bit ack value is lower than the previous one. Additionally check for possible reverse wrap and make the helper visible, so that we could re-use it for the next patch. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/204 Fixes: cc9d25669866 ("mptcp: update per unacked sequence on pkt reception") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/options.c | 29 +++++++++++++++-------------- net/mptcp/protocol.h | 8 ++++++++ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9b263f27ce9b..b87e46f515fb 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -896,19 +896,20 @@ reset: return false; } -static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { - u32 old_ack32, cur_ack32; - - if (use_64bit) - return cur_ack; - - old_ack32 = (u32)old_ack; - cur_ack32 = (u32)cur_ack; - cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; - if (unlikely(before(cur_ack32, old_ack32))) - return cur_ack + (1LL << 32); - return cur_ack; + u32 old_seq32, cur_seq32; + + old_seq32 = (u32)old_seq; + cur_seq32 = (u32)cur_seq; + cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; + if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) + return cur_seq + (1LL << 32); + + /* reverse wrap could happen, too */ + if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) + return cur_seq - (1LL << 32); + return cur_seq; } static void ack_update_msk(struct mptcp_sock *msk, @@ -926,7 +927,7 @@ static void ack_update_msk(struct mptcp_sock *msk, * more dangerous than missing an ack */ old_snd_una = msk->snd_una; - new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore. */ if (after64(new_snd_una, snd_nxt)) @@ -963,7 +964,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us return false; WRITE_ONCE(msk->rcv_data_fin_seq, - expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); + mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 385796f0ef19..5d7c44028e47 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -593,6 +593,14 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option); +u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq); +static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit) +{ + if (use_64bit) + return cur_seq; + + return __mptcp_expand_seq(old_seq, cur_seq); +} void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); -- cgit v1.2.3 From 5957a8901db44c03540505ccedd95031c21ef2f2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Jun 2021 15:02:21 -0700 Subject: mptcp: fix 32 bit DSN expansion The current implementation of 32 bit DSN expansion is buggy. After the previous patch, we can simply reuse the newly introduced helper to do the expansion safely. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/120 Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index be1de4084196..037fba41e170 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -775,15 +775,6 @@ enum mapping_status { MAPPING_DUMMY }; -static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq) -{ - if ((u32)seq == (u32)old_seq) - return old_seq; - - /* Assume map covers data not mapped yet. */ - return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32)); -} - static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", @@ -907,13 +898,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len--; } - if (!mpext->dsn64) { - map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, - mpext->data_seq); - pr_debug("expanded seq=%llu", subflow->map_seq); - } else { - map_seq = mpext->data_seq; - } + map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64); WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { -- cgit v1.2.3 From 6a1e5a4af17e440dd82a58a2c5f40ff17a82b722 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sun, 20 Jun 2021 15:24:14 +0000 Subject: atm: nicstar: use 'dma_free_coherent' instead of 'kfree' When 'nicstar_init_one' fails, 'ns_init_card_error' will be executed for error handling, but the correct memory free function should be used, otherwise it will cause an error. Since 'card->rsq.org' and 'card->tsq.org' are allocated using 'dma_alloc_coherent' function, they should be freed using 'dma_free_coherent'. Fix this by using 'dma_free_coherent' instead of 'kfree' This log reveals it: [ 3.440294] kernel BUG at mm/slub.c:4206! [ 3.441059] invalid opcode: 0000 [#1] PREEMPT SMP PTI [ 3.441430] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.12.4-g70e7f0549188-dirty #141 [ 3.441986] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 3.442780] RIP: 0010:kfree+0x26a/0x300 [ 3.443065] Code: e8 3a c3 b9 ff e9 d6 fd ff ff 49 8b 45 00 31 db a9 00 00 01 00 75 4d 49 8b 45 00 a9 00 00 01 00 75 0a 49 8b 45 08 a8 01 75 02 <0f> 0b 89 d9 b8 00 10 00 00 be 06 00 00 00 48 d3 e0 f7 d8 48 63 d0 [ 3.443396] RSP: 0000:ffffc90000017b70 EFLAGS: 00010246 [ 3.443396] RAX: dead000000000100 RBX: 0000000000000000 RCX: 0000000000000000 [ 3.443396] RDX: 0000000000000000 RSI: ffffffff85d3df94 RDI: ffffffff85df38e6 [ 3.443396] RBP: ffffc90000017b90 R08: 0000000000000001 R09: 0000000000000001 [ 3.443396] R10: 0000000000000000 R11: 0000000000000001 R12: ffff888107dc0000 [ 3.443396] R13: ffffea00001f0100 R14: ffff888101a8bf00 R15: ffff888107dc0160 [ 3.443396] FS: 0000000000000000(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 3.443396] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3.443396] CR2: 0000000000000000 CR3: 000000000642e000 CR4: 00000000000006e0 [ 3.443396] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3.443396] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3.443396] Call Trace: [ 3.443396] ns_init_card_error+0x12c/0x220 [ 3.443396] nicstar_init_one+0x10d2/0x1130 [ 3.443396] local_pci_probe+0x4a/0xb0 [ 3.443396] pci_device_probe+0x126/0x1d0 [ 3.443396] ? pci_device_remove+0x100/0x100 [ 3.443396] really_probe+0x27e/0x650 [ 3.443396] driver_probe_device+0x84/0x1d0 [ 3.443396] ? mutex_lock_nested+0x16/0x20 [ 3.443396] device_driver_attach+0x63/0x70 [ 3.443396] __driver_attach+0x117/0x1a0 [ 3.443396] ? device_driver_attach+0x70/0x70 [ 3.443396] bus_for_each_dev+0xb6/0x110 [ 3.443396] ? rdinit_setup+0x40/0x40 [ 3.443396] driver_attach+0x22/0x30 [ 3.443396] bus_add_driver+0x1e6/0x2a0 [ 3.443396] driver_register+0xa4/0x180 [ 3.443396] __pci_register_driver+0x77/0x80 [ 3.443396] ? uPD98402_module_init+0xd/0xd [ 3.443396] nicstar_init+0x1f/0x75 [ 3.443396] do_one_initcall+0x7a/0x3d0 [ 3.443396] ? rdinit_setup+0x40/0x40 [ 3.443396] ? rcu_read_lock_sched_held+0x4a/0x70 [ 3.443396] kernel_init_freeable+0x2a7/0x2f9 [ 3.443396] ? rest_init+0x2c0/0x2c0 [ 3.443396] kernel_init+0x13/0x180 [ 3.443396] ? rest_init+0x2c0/0x2c0 [ 3.443396] ? rest_init+0x2c0/0x2c0 [ 3.443396] ret_from_fork+0x1f/0x30 [ 3.443396] Modules linked in: [ 3.443396] Dumping ftrace buffer: [ 3.443396] (ftrace buffer empty) [ 3.458593] ---[ end trace 3c6f8f0d8ef59bcd ]--- [ 3.458922] RIP: 0010:kfree+0x26a/0x300 [ 3.459198] Code: e8 3a c3 b9 ff e9 d6 fd ff ff 49 8b 45 00 31 db a9 00 00 01 00 75 4d 49 8b 45 00 a9 00 00 01 00 75 0a 49 8b 45 08 a8 01 75 02 <0f> 0b 89 d9 b8 00 10 00 00 be 06 00 00 00 48 d3 e0 f7 d8 48 63 d0 [ 3.460499] RSP: 0000:ffffc90000017b70 EFLAGS: 00010246 [ 3.460870] RAX: dead000000000100 RBX: 0000000000000000 RCX: 0000000000000000 [ 3.461371] RDX: 0000000000000000 RSI: ffffffff85d3df94 RDI: ffffffff85df38e6 [ 3.461873] RBP: ffffc90000017b90 R08: 0000000000000001 R09: 0000000000000001 [ 3.462372] R10: 0000000000000000 R11: 0000000000000001 R12: ffff888107dc0000 [ 3.462871] R13: ffffea00001f0100 R14: ffff888101a8bf00 R15: ffff888107dc0160 [ 3.463368] FS: 0000000000000000(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 3.463949] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3.464356] CR2: 0000000000000000 CR3: 000000000642e000 CR4: 00000000000006e0 [ 3.464856] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3.465356] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3.465860] Kernel panic - not syncing: Fatal exception [ 3.466370] Dumping ftrace buffer: [ 3.466616] (ftrace buffer empty) [ 3.466871] Kernel Offset: disabled [ 3.467122] Rebooting in 1 seconds.. Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller --- drivers/atm/nicstar.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index 5c7e4df159b9..e031f6d74e7e 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -839,10 +839,12 @@ static void ns_init_card_error(ns_dev *card, int error) dev_kfree_skb_any(hb); } if (error >= 12) { - kfree(card->rsq.org); + dma_free_coherent(&card->pcidev->dev, NS_RSQSIZE + NS_RSQ_ALIGNMENT, + card->rsq.org, card->rsq.dma); } if (error >= 11) { - kfree(card->tsq.org); + dma_free_coherent(&card->pcidev->dev, NS_TSQSIZE + NS_TSQ_ALIGNMENT, + card->tsq.org, card->tsq.dma); } if (error >= 10) { free_irq(card->pcidev->irq, card); -- cgit v1.2.3 From 70b639dc41ad499384e41e106fce72e36805c9f2 Mon Sep 17 00:00:00 2001 From: Zheyu Ma Date: Sun, 20 Jun 2021 15:24:15 +0000 Subject: atm: nicstar: register the interrupt handler in the right place Because the error handling is sequential, the application of resources should be carried out in the order of error handling, so the operation of registering the interrupt handler should be put in front, so as not to free the unregistered interrupt handler during error handling. This log reveals it: [ 3.438724] Trying to free already-free IRQ 23 [ 3.439060] WARNING: CPU: 5 PID: 1 at kernel/irq/manage.c:1825 free_irq+0xfb/0x480 [ 3.440039] Modules linked in: [ 3.440257] CPU: 5 PID: 1 Comm: swapper/0 Not tainted 5.12.4-g70e7f0549188-dirty #142 [ 3.440793] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 3.441561] RIP: 0010:free_irq+0xfb/0x480 [ 3.441845] Code: 6e 08 74 6f 4d 89 f4 e8 c3 78 09 00 4d 8b 74 24 18 4d 85 f6 75 e3 e8 b4 78 09 00 8b 75 c8 48 c7 c7 a0 ac d5 85 e8 95 d7 f5 ff <0f> 0b 48 8b 75 c0 4c 89 ff e8 87 c5 90 03 48 8b 43 40 4c 8b a0 80 [ 3.443121] RSP: 0000:ffffc90000017b50 EFLAGS: 00010086 [ 3.443483] RAX: 0000000000000000 RBX: ffff888107c6f000 RCX: 0000000000000000 [ 3.443972] RDX: 0000000000000000 RSI: ffffffff8123f301 RDI: 00000000ffffffff [ 3.444462] RBP: ffffc90000017b90 R08: 0000000000000001 R09: 0000000000000003 [ 3.444950] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 [ 3.444994] R13: ffff888107dc0000 R14: ffff888104f6bf00 R15: ffff888107c6f0a8 [ 3.444994] FS: 0000000000000000(0000) GS:ffff88817bd40000(0000) knlGS:0000000000000000 [ 3.444994] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3.444994] CR2: 0000000000000000 CR3: 000000000642e000 CR4: 00000000000006e0 [ 3.444994] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3.444994] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3.444994] Call Trace: [ 3.444994] ns_init_card_error+0x18e/0x250 [ 3.444994] nicstar_init_one+0x10d2/0x1130 [ 3.444994] local_pci_probe+0x4a/0xb0 [ 3.444994] pci_device_probe+0x126/0x1d0 [ 3.444994] ? pci_device_remove+0x100/0x100 [ 3.444994] really_probe+0x27e/0x650 [ 3.444994] driver_probe_device+0x84/0x1d0 [ 3.444994] ? mutex_lock_nested+0x16/0x20 [ 3.444994] device_driver_attach+0x63/0x70 [ 3.444994] __driver_attach+0x117/0x1a0 [ 3.444994] ? device_driver_attach+0x70/0x70 [ 3.444994] bus_for_each_dev+0xb6/0x110 [ 3.444994] ? rdinit_setup+0x40/0x40 [ 3.444994] driver_attach+0x22/0x30 [ 3.444994] bus_add_driver+0x1e6/0x2a0 [ 3.444994] driver_register+0xa4/0x180 [ 3.444994] __pci_register_driver+0x77/0x80 [ 3.444994] ? uPD98402_module_init+0xd/0xd [ 3.444994] nicstar_init+0x1f/0x75 [ 3.444994] do_one_initcall+0x7a/0x3d0 [ 3.444994] ? rdinit_setup+0x40/0x40 [ 3.444994] ? rcu_read_lock_sched_held+0x4a/0x70 [ 3.444994] kernel_init_freeable+0x2a7/0x2f9 [ 3.444994] ? rest_init+0x2c0/0x2c0 [ 3.444994] kernel_init+0x13/0x180 [ 3.444994] ? rest_init+0x2c0/0x2c0 [ 3.444994] ? rest_init+0x2c0/0x2c0 [ 3.444994] ret_from_fork+0x1f/0x30 [ 3.444994] Kernel panic - not syncing: panic_on_warn set ... [ 3.444994] CPU: 5 PID: 1 Comm: swapper/0 Not tainted 5.12.4-g70e7f0549188-dirty #142 [ 3.444994] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 3.444994] Call Trace: [ 3.444994] dump_stack+0xba/0xf5 [ 3.444994] ? free_irq+0xfb/0x480 [ 3.444994] panic+0x155/0x3ed [ 3.444994] ? __warn+0xed/0x150 [ 3.444994] ? free_irq+0xfb/0x480 [ 3.444994] __warn+0x103/0x150 [ 3.444994] ? free_irq+0xfb/0x480 [ 3.444994] report_bug+0x119/0x1c0 [ 3.444994] handle_bug+0x3b/0x80 [ 3.444994] exc_invalid_op+0x18/0x70 [ 3.444994] asm_exc_invalid_op+0x12/0x20 [ 3.444994] RIP: 0010:free_irq+0xfb/0x480 [ 3.444994] Code: 6e 08 74 6f 4d 89 f4 e8 c3 78 09 00 4d 8b 74 24 18 4d 85 f6 75 e3 e8 b4 78 09 00 8b 75 c8 48 c7 c7 a0 ac d5 85 e8 95 d7 f5 ff <0f> 0b 48 8b 75 c0 4c 89 ff e8 87 c5 90 03 48 8b 43 40 4c 8b a0 80 [ 3.444994] RSP: 0000:ffffc90000017b50 EFLAGS: 00010086 [ 3.444994] RAX: 0000000000000000 RBX: ffff888107c6f000 RCX: 0000000000000000 [ 3.444994] RDX: 0000000000000000 RSI: ffffffff8123f301 RDI: 00000000ffffffff [ 3.444994] RBP: ffffc90000017b90 R08: 0000000000000001 R09: 0000000000000003 [ 3.444994] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 [ 3.444994] R13: ffff888107dc0000 R14: ffff888104f6bf00 R15: ffff888107c6f0a8 [ 3.444994] ? vprintk_func+0x71/0x110 [ 3.444994] ns_init_card_error+0x18e/0x250 [ 3.444994] nicstar_init_one+0x10d2/0x1130 [ 3.444994] local_pci_probe+0x4a/0xb0 [ 3.444994] pci_device_probe+0x126/0x1d0 [ 3.444994] ? pci_device_remove+0x100/0x100 [ 3.444994] really_probe+0x27e/0x650 [ 3.444994] driver_probe_device+0x84/0x1d0 [ 3.444994] ? mutex_lock_nested+0x16/0x20 [ 3.444994] device_driver_attach+0x63/0x70 [ 3.444994] __driver_attach+0x117/0x1a0 [ 3.444994] ? device_driver_attach+0x70/0x70 [ 3.444994] bus_for_each_dev+0xb6/0x110 [ 3.444994] ? rdinit_setup+0x40/0x40 [ 3.444994] driver_attach+0x22/0x30 [ 3.444994] bus_add_driver+0x1e6/0x2a0 [ 3.444994] driver_register+0xa4/0x180 [ 3.444994] __pci_register_driver+0x77/0x80 [ 3.444994] ? uPD98402_module_init+0xd/0xd [ 3.444994] nicstar_init+0x1f/0x75 [ 3.444994] do_one_initcall+0x7a/0x3d0 [ 3.444994] ? rdinit_setup+0x40/0x40 [ 3.444994] ? rcu_read_lock_sched_held+0x4a/0x70 [ 3.444994] kernel_init_freeable+0x2a7/0x2f9 [ 3.444994] ? rest_init+0x2c0/0x2c0 [ 3.444994] kernel_init+0x13/0x180 [ 3.444994] ? rest_init+0x2c0/0x2c0 [ 3.444994] ? rest_init+0x2c0/0x2c0 [ 3.444994] ret_from_fork+0x1f/0x30 [ 3.444994] Dumping ftrace buffer: [ 3.444994] (ftrace buffer empty) [ 3.444994] Kernel Offset: disabled [ 3.444994] Rebooting in 1 seconds.. Signed-off-by: Zheyu Ma Signed-off-by: David S. Miller --- drivers/atm/nicstar.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index e031f6d74e7e..e5117144347f 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -527,6 +527,15 @@ static int ns_init_card(int i, struct pci_dev *pcidev) /* Set the VPI/VCI MSb mask to zero so we can receive OAM cells */ writel(0x00000000, card->membase + VPM); + card->intcnt = 0; + if (request_irq + (pcidev->irq, &ns_irq_handler, IRQF_SHARED, "nicstar", card) != 0) { + pr_err("nicstar%d: can't allocate IRQ %d.\n", i, pcidev->irq); + error = 9; + ns_init_card_error(card, error); + return error; + } + /* Initialize TSQ */ card->tsq.org = dma_alloc_coherent(&card->pcidev->dev, NS_TSQSIZE + NS_TSQ_ALIGNMENT, @@ -753,15 +762,6 @@ static int ns_init_card(int i, struct pci_dev *pcidev) card->efbie = 1; - card->intcnt = 0; - if (request_irq - (pcidev->irq, &ns_irq_handler, IRQF_SHARED, "nicstar", card) != 0) { - printk("nicstar%d: can't allocate IRQ %d.\n", i, pcidev->irq); - error = 9; - ns_init_card_error(card, error); - return error; - } - /* Register device */ card->atmdev = atm_dev_register("nicstar", &card->pcidev->dev, &atm_ops, -1, NULL); -- cgit v1.2.3 From b90788459cd6d140171b046f0b37fad341ade0a3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 20 Jun 2021 15:43:28 +0200 Subject: net: mana: Fix a memory leak in an error handling path in 'mana_create_txq()' If this test fails we must free some resources as in all the other error handling paths of this function. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Signed-off-by: Christophe JAILLET Reviewed-by: Dexuan Cui Signed-off-by: David S. Miller --- drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 04d067243457..1ed25e48f616 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1230,8 +1230,10 @@ static int mana_create_txq(struct mana_port_context *apc, cq->gdma_id = cq->gdma_cq->id; - if (WARN_ON(cq->gdma_id >= gc->max_num_cqs)) - return -EINVAL; + if (WARN_ON(cq->gdma_id >= gc->max_num_cqs)) { + err = -EINVAL; + goto out; + } gc->cq_table[cq->gdma_id] = cq->gdma_cq; -- cgit v1.2.3 From c7ff9cff70601ea19245d997bb977344663434c7 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Mon, 21 Jun 2021 14:26:01 +0800 Subject: vsock: notify server to shutdown when client has pending signal The client's sk_state will be set to TCP_ESTABLISHED if the server replay the client's connect request. However, if the client has pending signal, its sk_state will be set to TCP_CLOSE without notify the server, so the server will hold the corrupt connection. client server 1. sk_state=TCP_SYN_SENT | 2. call ->connect() | 3. wait reply | | 4. sk_state=TCP_ESTABLISHED | 5. insert to connected list | 6. reply to the client 7. sk_state=TCP_ESTABLISHED | 8. insert to connected list | 9. *signal pending* <--------------------- the user kill client 10. sk_state=TCP_CLOSE | client is exiting... | 11. call ->release() | virtio_transport_close if (!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_CLOSING)) return true; *return at here, the server cannot notice the connection is corrupt* So the client should notify the peer in this case. Cc: David S. Miller Cc: Jakub Kicinski Cc: Jorgen Hansen Cc: Norbert Slusarek Cc: Andra Paraschiv Cc: Colin Ian King Cc: David Brazdil Cc: Alexander Popov Suggested-by: Stefano Garzarella Link: https://lkml.org/lkml/2021/5/17/418 Signed-off-by: lixianming Signed-off-by: Longpeng(Mike) Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 92a72f0e0d94..ae11311807fd 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1369,7 +1369,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = TCP_CLOSE; + sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; -- cgit v1.2.3 From b8b79c414eca4e9bcab645e02cb92c48db974ce9 Mon Sep 17 00:00:00 2001 From: Eldar Gasanov Date: Mon, 21 Jun 2021 11:54:38 +0300 Subject: net: dsa: mv88e6xxx: Fix adding vlan 0 8021q module adds vlan 0 to all interfaces when it starts. When 8021q module is loaded it isn't possible to create bond with mv88e6xxx interfaces, bonding module dipslay error "Couldn't add bond vlan ids", because it tries to add vlan 0 to slave interfaces. There is unexpected behavior in the switch. When a PVID is assigned to a port the switch changes VID to PVID in ingress frames with VID 0 on the port. Expected that the switch doesn't assign PVID to tagged frames with VID 0. But there isn't a way to change this behavior in the switch. Fixes: 57e661aae6a8 ("net: dsa: mv88e6xxx: Link aggregation support") Signed-off-by: Eldar Gasanov Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index eca285aaf72f..961fa6b75cad 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1618,9 +1618,6 @@ static int mv88e6xxx_port_check_hw_vlan(struct dsa_switch *ds, int port, struct mv88e6xxx_vtu_entry vlan; int i, err; - if (!vid) - return -EOPNOTSUPP; - /* DSA and CPU ports have to be members of multiple vlans */ if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) return 0; @@ -2109,6 +2106,9 @@ static int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int port, u8 member; int err; + if (!vlan->vid) + return 0; + err = mv88e6xxx_port_vlan_prepare(ds, port, vlan); if (err) return err; -- cgit v1.2.3 From 0cd58e5c53babb9237b741dbef711f0a9eb6d3fd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Jun 2021 10:54:49 -0700 Subject: pkt_sched: sch_qfq: fix qfq_change_class() error path If qfq_change_class() is unable to allocate memory for qfq_aggregate, it frees the class that has been inserted in the class hash table, but does not unhash it. Defer the insertion after the problematic allocation. BUG: KASAN: use-after-free in hlist_add_head include/linux/list.h:884 [inline] BUG: KASAN: use-after-free in qdisc_class_hash_insert+0x200/0x210 net/sched/sch_api.c:731 Write of size 8 at addr ffff88814a534f10 by task syz-executor.4/31478 CPU: 0 PID: 31478 Comm: syz-executor.4 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:233 __kasan_report mm/kasan/report.c:419 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:436 hlist_add_head include/linux/list.h:884 [inline] qdisc_class_hash_insert+0x200/0x210 net/sched/sch_api.c:731 qfq_change_class+0x96c/0x1990 net/sched/sch_qfq.c:489 tc_ctl_tclass+0x514/0xe50 net/sched/sch_api.c:2113 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x4665d9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fdc7b5f0188 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000000056bf80 RCX: 00000000004665d9 RDX: 0000000000000000 RSI: 00000000200001c0 RDI: 0000000000000003 RBP: 00007fdc7b5f01d0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 R13: 00007ffcf7310b3f R14: 00007fdc7b5f0300 R15: 0000000000022000 Allocated by task 31445: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:428 [inline] ____kasan_kmalloc mm/kasan/common.c:507 [inline] ____kasan_kmalloc mm/kasan/common.c:466 [inline] __kasan_kmalloc+0x9b/0xd0 mm/kasan/common.c:516 kmalloc include/linux/slab.h:556 [inline] kzalloc include/linux/slab.h:686 [inline] qfq_change_class+0x705/0x1990 net/sched/sch_qfq.c:464 tc_ctl_tclass+0x514/0xe50 net/sched/sch_api.c:2113 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae Freed by task 31445: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:38 kasan_set_track+0x1c/0x30 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:357 ____kasan_slab_free mm/kasan/common.c:360 [inline] ____kasan_slab_free mm/kasan/common.c:325 [inline] __kasan_slab_free+0xfb/0x130 mm/kasan/common.c:368 kasan_slab_free include/linux/kasan.h:212 [inline] slab_free_hook mm/slub.c:1583 [inline] slab_free_freelist_hook+0xdf/0x240 mm/slub.c:1608 slab_free mm/slub.c:3168 [inline] kfree+0xe5/0x7f0 mm/slub.c:4212 qfq_change_class+0x10fb/0x1990 net/sched/sch_qfq.c:518 tc_ctl_tclass+0x514/0xe50 net/sched/sch_api.c:2113 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2504 netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1340 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1929 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88814a534f00 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 16 bytes inside of 128-byte region [ffff88814a534f00, ffff88814a534f80) The buggy address belongs to the page: page:ffffea0005294d00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14a534 flags: 0x57ff00000000200(slab|node=1|zone=2|lastcpupid=0x7ff) raw: 057ff00000000200 ffffea00004fee00 0000000600000006 ffff8880110418c0 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x12cc0(GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY), pid 29797, ts 604817765317, free_ts 604810151744 prep_new_page mm/page_alloc.c:2358 [inline] get_page_from_freelist+0x1033/0x2b60 mm/page_alloc.c:3994 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5200 alloc_pages+0x18c/0x2a0 mm/mempolicy.c:2272 alloc_slab_page mm/slub.c:1646 [inline] allocate_slab+0x2c5/0x4c0 mm/slub.c:1786 new_slab mm/slub.c:1849 [inline] new_slab_objects mm/slub.c:2595 [inline] ___slab_alloc+0x4a1/0x810 mm/slub.c:2758 __slab_alloc.constprop.0+0xa7/0xf0 mm/slub.c:2798 slab_alloc_node mm/slub.c:2880 [inline] slab_alloc mm/slub.c:2922 [inline] __kmalloc+0x315/0x330 mm/slub.c:4050 kmalloc include/linux/slab.h:561 [inline] kzalloc include/linux/slab.h:686 [inline] __register_sysctl_table+0x112/0x1090 fs/proc/proc_sysctl.c:1318 mpls_dev_sysctl_register+0x1b7/0x2d0 net/mpls/af_mpls.c:1421 mpls_add_dev net/mpls/af_mpls.c:1472 [inline] mpls_dev_notify+0x214/0x8b0 net/mpls/af_mpls.c:1588 notifier_call_chain+0xb5/0x200 kernel/notifier.c:83 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:2121 call_netdevice_notifiers_extack net/core/dev.c:2133 [inline] call_netdevice_notifiers net/core/dev.c:2147 [inline] register_netdevice+0x106b/0x1500 net/core/dev.c:10312 veth_newlink+0x585/0xac0 drivers/net/veth.c:1547 __rtnl_newlink+0x1062/0x1710 net/core/rtnetlink.c:3452 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3500 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1298 [inline] free_pcp_prepare+0x223/0x300 mm/page_alloc.c:1342 free_unref_page_prepare mm/page_alloc.c:3250 [inline] free_unref_page+0x12/0x1d0 mm/page_alloc.c:3298 __vunmap+0x783/0xb60 mm/vmalloc.c:2566 free_work+0x58/0x70 mm/vmalloc.c:80 process_one_work+0x98d/0x1600 kernel/workqueue.c:2276 worker_thread+0x64c/0x1120 kernel/workqueue.c:2422 kthread+0x3b1/0x4a0 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 Memory state around the buggy address: ffff88814a534e00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88814a534e80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88814a534f00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88814a534f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88814a535000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Fixes: 462dbc9101acd ("pkt_sched: QFQ Plus: fair-queueing service at DRR cost") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 1db9d4a2ef5e..b692a0de1ad5 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -485,11 +485,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl->qdisc != &noop_qdisc) qdisc_hash_add(cl->qdisc, true); - sch_tree_lock(sch); - qdisc_class_hash_insert(&q->clhash, &cl->common); - sch_tree_unlock(sch); - - qdisc_class_hash_grow(sch, &q->clhash); set_change_agg: sch_tree_lock(sch); @@ -507,8 +502,11 @@ set_change_agg: } if (existing) qfq_deact_rm_from_agg(q, cl); + else + qdisc_class_hash_insert(&q->clhash, &cl->common); qfq_add_to_agg(q, new_agg, cl); sch_tree_unlock(sch); + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; -- cgit v1.2.3 From dd72fadf2186fc8a6018f97fe72f4d5ca05df440 Mon Sep 17 00:00:00 2001 From: Ayush Sawal Date: Tue, 22 Jun 2021 09:25:31 +0530 Subject: xfrm: Fix xfrm offload fallback fail case In case of xfrm offload, if xdo_dev_state_add() of driver returns -EOPNOTSUPP, xfrm offload fallback is failed. In xfrm state_add() both xso->dev and xso->real_dev are initialized to dev and when err(-EOPNOTSUPP) is returned only xso->dev is set to null. So in this scenario the condition in func validate_xmit_xfrm(), if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) return skb; returns true, due to which skb is returned without calling esp_xmit() below which has fallback code. Hence the CRYPTO_FALLBACK is failing. So fixing this with by keeping x->xso.real_dev as NULL when err is returned in func xfrm_dev_state_add(). Fixes: bdfd2d1fa79a ("bonding/xfrm: use real_dev instead of slave_dev") Signed-off-by: Ayush Sawal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 6d6917b68856..e843b0d9e2a6 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -268,6 +268,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->num_exthdrs = 0; xso->flags = 0; xso->dev = NULL; + xso->real_dev = NULL; dev_put(dev); if (err != -EOPNOTSUPP) -- cgit v1.2.3 From 534799097a777e82910f77a4f9d289c815a9a64e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 22 Jun 2021 11:45:11 +0200 Subject: netfilter: nf_tables: skip netlink portID validation if zero nft_table_lookup() allows us to obtain the table object by the name and the family. The netlink portID validation needs to be skipped for the dump path, since the ownership only applies to commands to update the given table. Skip validation if the specified netlink PortID is zero when calling nft_table_lookup(). Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ca9ec8721e6c..1d62b1a83299 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -571,7 +571,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, table->family == family && nft_active_genmask(table, genmask)) { if (nft_table_has_owner(table) && - table->nlpid != nlpid) + nlpid && table->nlpid != nlpid) return ERR_PTR(-EPERM); return table; -- cgit v1.2.3 From e31f072ffab0397a328b31a9589dcf9733dc9c72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 22 Jun 2021 12:10:49 +0200 Subject: netfilter: nf_tables: do not allow to delete table with owner by handle nft_table_lookup_byhandle() also needs to validate the netlink PortID owner when deleting a table by handle. Fixes: 6001a930ce03 ("netfilter: nftables: introduce table ownership") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1d62b1a83299..fcb15b8904e8 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -583,7 +583,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, - u8 genmask) + u8 genmask, u32 nlpid) { struct nftables_pernet *nft_net; struct nft_table *table; @@ -591,8 +591,13 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && - nft_active_genmask(table, genmask)) + nft_active_genmask(table, genmask)) { + if (nft_table_has_owner(table) && + nlpid && table->nlpid != nlpid) + return ERR_PTR(-EPERM); + return table; + } } return ERR_PTR(-ENOENT); @@ -1279,7 +1284,8 @@ static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_TABLE_HANDLE]) { attr = nla[NFTA_TABLE_HANDLE]; - table = nft_table_lookup_byhandle(net, attr, genmask); + table = nft_table_lookup_byhandle(net, attr, genmask, + NETLINK_CB(skb).portid); } else { attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, -- cgit v1.2.3 From 5dec6d96d12d33900ec315972c8e47a73bcc378d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Fri, 18 Jun 2021 03:55:26 -0700 Subject: bpf: Fix regression on BPF_OBJ_GET with non-O_RDWR flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit d37300ed1821 ("bpf: program: Refuse non-O_RDWR flags in BPF_OBJ_GET"). It breaks Android userspace which expects to be able to fetch programs with just read permissions. See: https://cs.android.com/android/platform/superproject/+/master:frameworks/libs/net/common/native/bpf_syscall_wrappers/include/BpfSyscallWrappers.h;drc=7005c764be23d31fa1d69e826b4a2f6689a8c81e;l=124 Side-note: another option to fix it would be to extend bpf_prog_new_fd() and to pass in used file mode flags in the same way as we do for maps via bpf_map_new_fd(). Meaning, they'd end up in anon_inode_getfd() and thus would be retained for prog fd operations with bpf() syscall. Right now these flags are not checked with progs since they are immutable for their lifetime (as opposed to maps which can be updated from user space). In future this could potentially change with new features, but at that point it's still fine to do the bpf_prog_new_fd() extension when needed. For a simple stable fix, a revert is less churn. Fixes: d37300ed1821 ("bpf: program: Refuse non-O_RDWR flags in BPF_OBJ_GET") Signed-off-by: Maciej Żenczykowski [ Daniel: added side-note to commit message ] Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20210618105526.265003-1-zenczykowski@gmail.com --- kernel/bpf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index b4ebd60a6c16..80da1db47c68 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -543,7 +543,7 @@ int bpf_obj_get_user(const char __user *pathname, int flags) return PTR_ERR(raw); if (type == BPF_TYPE_PROG) - ret = (f_flags != O_RDWR) ? -EINVAL : bpf_prog_new_fd(raw); + ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else if (type == BPF_TYPE_LINK) -- cgit v1.2.3 From 85e8b032d6ebb0f698a34dd22c2f13443d905888 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Jun 2021 07:44:17 -0700 Subject: vxlan: add missing rcu_read_lock() in neigh_reduce() syzbot complained in neigh_reduce(), because rcu_read_lock_bh() is treated differently than rcu_read_lock() WARNING: suspicious RCU usage 5.13.0-rc6-syzkaller #0 Not tainted ----------------------------- include/net/addrconf.h:313 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by kworker/0:0/5: #0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: atomic64_set include/asm-generic/atomic-instrumented.h:856 [inline] #0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: atomic_long_set include/asm-generic/atomic-long.h:41 [inline] #0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:617 [inline] #0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:644 [inline] #0: ffff888011064d38 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x871/0x1600 kernel/workqueue.c:2247 #1: ffffc90000ca7da8 ((work_completion)(&port->wq)){+.+.}-{0:0}, at: process_one_work+0x8a5/0x1600 kernel/workqueue.c:2251 #2: ffffffff8bf795c0 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x1da/0x3130 net/core/dev.c:4180 stack backtrace: CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events ipvlan_process_multicast Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 __in6_dev_get include/net/addrconf.h:313 [inline] __in6_dev_get include/net/addrconf.h:311 [inline] neigh_reduce drivers/net/vxlan.c:2167 [inline] vxlan_xmit+0x34d5/0x4c30 drivers/net/vxlan.c:2919 __netdev_start_xmit include/linux/netdevice.h:4944 [inline] netdev_start_xmit include/linux/netdevice.h:4958 [inline] xmit_one net/core/dev.c:3654 [inline] dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3670 __dev_queue_xmit+0x2133/0x3130 net/core/dev.c:4246 ipvlan_process_multicast+0xa99/0xd70 drivers/net/ipvlan/ipvlan_core.c:287 process_one_work+0x98d/0x1600 kernel/workqueue.c:2276 worker_thread+0x64c/0x1120 kernel/workqueue.c:2422 kthread+0x3b1/0x4a0 kernel/kthread.c:313 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 Fixes: f564f45c4518 ("vxlan: add ipv6 proxy support") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 02a14f1b938a..5a8df5a195cb 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2164,6 +2164,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) struct neighbour *n; struct nd_msg *msg; + rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) goto out; @@ -2215,6 +2216,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) } out: + rcu_read_unlock(); consume_skb(skb); return NETDEV_TX_OK; } -- cgit v1.2.3 From 45423cff1db66cf0993e8a9bd0ac93e740149e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Mon, 21 Jun 2021 17:32:35 +0200 Subject: sfc: avoid double pci_remove of VFs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If pci_remove was called for a PF with VFs, the removal of the VFs was called twice from efx_ef10_sriov_fini: one directly with pci_driver->remove and another implicit by calling pci_disable_sriov, which also perform the VFs remove. This was leading to crashing the kernel on the second attempt. Given that pci_disable_sriov already calls to pci remove function, get rid of the direct call to pci_driver->remove from the driver. 2 different ways to trigger the bug: - Create one or more VFs, then attach the PF to a virtual machine (at least with qemu/KVM) - Create one or more VFs, then remove the PF with: echo 1 > /sys/bus/pci/devices/PF_PCI_ID/remove Removing sfc module does not trigger the error, at least for me, because it removes the VF first, and then the PF. Example of a log with the error: list_del corruption, ffff967fd20a8ad0->next is LIST_POISON1 (dead000000000100) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:47! [...trimmed...] RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x4c [...trimmed...] Call Trace: efx_dissociate+0x1f/0x140 [sfc] efx_pci_remove+0x27/0x150 [sfc] pci_device_remove+0x3b/0xc0 device_release_driver_internal+0x103/0x1f0 pci_stop_bus_device+0x69/0x90 pci_stop_and_remove_bus_device+0xe/0x20 pci_iov_remove_virtfn+0xba/0x120 sriov_disable+0x2f/0xe0 efx_ef10_pci_sriov_disable+0x52/0x80 [sfc] ? pcie_aer_is_native+0x12/0x40 efx_ef10_sriov_fini+0x72/0x110 [sfc] efx_pci_remove+0x62/0x150 [sfc] pci_device_remove+0x3b/0xc0 device_release_driver_internal+0x103/0x1f0 unbind_store+0xf6/0x130 kernfs_fop_write+0x116/0x190 vfs_write+0xa5/0x1a0 ksys_write+0x4f/0xb0 do_syscall_64+0x5b/0x1a0 entry_SYSCALL_64_after_hwframe+0x65/0xca Signed-off-by: Íñigo Huguet Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10_sriov.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10_sriov.c b/drivers/net/ethernet/sfc/ef10_sriov.c index 21fa6c0e8873..a5d28b0f75ba 100644 --- a/drivers/net/ethernet/sfc/ef10_sriov.c +++ b/drivers/net/ethernet/sfc/ef10_sriov.c @@ -439,7 +439,6 @@ int efx_ef10_sriov_init(struct efx_nic *efx) void efx_ef10_sriov_fini(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx->nic_data; - unsigned int i; int rc; if (!nic_data->vf) { @@ -449,14 +448,7 @@ void efx_ef10_sriov_fini(struct efx_nic *efx) return; } - /* Remove any VFs in the host */ - for (i = 0; i < efx->vf_count; ++i) { - struct efx_nic *vf_efx = nic_data->vf[i].efx; - - if (vf_efx) - vf_efx->pci_dev->driver->remove(vf_efx->pci_dev); - } - + /* Disable SRIOV and remove any VFs in the host */ rc = efx_ef10_pci_sriov_disable(efx, true); if (rc) netif_dbg(efx, drv, efx->net_dev, -- cgit v1.2.3 From 1ebe4feb8b442884f5a28d2437040096723dd1ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Mon, 21 Jun 2021 17:32:36 +0200 Subject: sfc: error code if SRIOV cannot be disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If SRIOV cannot be disabled during device removal or module unloading, return error code so it can be logged properly in the calling function. Note that this can only happen if any VF is currently attached to a guest using Xen, but not with vfio/KVM. Despite that in that case the VFs won't work properly with PF removed and/or the module unloaded, I have let it as is because I don't know what side effects may have changing it, and also it seems to be the same that other drivers are doing in this situation. In the case of being called during SRIOV reconfiguration, the behavior hasn't changed because the function is called with force=false. Signed-off-by: Íñigo Huguet Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10_sriov.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10_sriov.c b/drivers/net/ethernet/sfc/ef10_sriov.c index a5d28b0f75ba..84041cd587d7 100644 --- a/drivers/net/ethernet/sfc/ef10_sriov.c +++ b/drivers/net/ethernet/sfc/ef10_sriov.c @@ -402,12 +402,17 @@ fail1: return rc; } +/* Disable SRIOV and remove VFs + * If some VFs are attached to a guest (using Xen, only) nothing is + * done if force=false, and vports are freed if force=true (for the non + * attachedc ones, only) but SRIOV is not disabled and VFs are not + * removed in either case. + */ static int efx_ef10_pci_sriov_disable(struct efx_nic *efx, bool force) { struct pci_dev *dev = efx->pci_dev; - unsigned int vfs_assigned = 0; - - vfs_assigned = pci_vfs_assigned(dev); + unsigned int vfs_assigned = pci_vfs_assigned(dev); + int rc = 0; if (vfs_assigned && !force) { netif_info(efx, drv, efx->net_dev, "VFs are assigned to guests; " @@ -417,10 +422,12 @@ static int efx_ef10_pci_sriov_disable(struct efx_nic *efx, bool force) if (!vfs_assigned) pci_disable_sriov(dev); + else + rc = -EBUSY; efx_ef10_sriov_free_vf_vswitching(efx); efx->vf_count = 0; - return 0; + return rc; } int efx_ef10_sriov_configure(struct efx_nic *efx, int num_vfs) -- cgit v1.2.3 From 9a022e76500e8ce86dbce83c7856cf59820dce24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Mon, 21 Jun 2021 17:32:37 +0200 Subject: sfc: explain that "attached" VFs only refer to Xen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During SRIOV disabling it is checked wether any VF is currently attached to a guest, using pci_vfs_assigned function. However, this check only works with VFs attached with Xen, not with vfio/KVM. Added comments clarifying this point. Also, replaced manual check of PCI_DEV_FLAGS_ASSIGNED flag and used the helper function pci_is_dev_assigned instead. Signed-off-by: Íñigo Huguet Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10.c | 3 ++- drivers/net/ethernet/sfc/ef10_sriov.c | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index c3f35da1b82a..bea961013f7c 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -1070,7 +1070,8 @@ static int efx_ef10_probe_vf(struct efx_nic *efx) /* If the parent PF has no VF data structure, it doesn't know about this * VF so fail probe. The VF needs to be re-created. This can happen - * if the PF driver is unloaded while the VF is assigned to a guest. + * if the PF driver was unloaded while any VF was assigned to a guest + * (using Xen, only). */ pci_dev_pf = efx->pci_dev->physfn; if (pci_dev_pf) { diff --git a/drivers/net/ethernet/sfc/ef10_sriov.c b/drivers/net/ethernet/sfc/ef10_sriov.c index 84041cd587d7..f8f8fbe51ef8 100644 --- a/drivers/net/ethernet/sfc/ef10_sriov.c +++ b/drivers/net/ethernet/sfc/ef10_sriov.c @@ -122,8 +122,7 @@ static void efx_ef10_sriov_free_vf_vports(struct efx_nic *efx) struct ef10_vf *vf = nic_data->vf + i; /* If VF is assigned, do not free the vport */ - if (vf->pci_dev && - vf->pci_dev->dev_flags & PCI_DEV_FLAGS_ASSIGNED) + if (vf->pci_dev && pci_is_dev_assigned(vf->pci_dev)) continue; if (vf->vport_assigned) { @@ -449,7 +448,9 @@ void efx_ef10_sriov_fini(struct efx_nic *efx) int rc; if (!nic_data->vf) { - /* Remove any un-assigned orphaned VFs */ + /* Remove any un-assigned orphaned VFs. This can happen if the PF driver + * was unloaded while any VF was assigned to a guest (using Xen, only). + */ if (pci_num_vf(efx->pci_dev) && !pci_vfs_assigned(efx->pci_dev)) pci_disable_sriov(efx->pci_dev); return; -- cgit v1.2.3 From 3ddd6e2f71092766b6040b9c33cf9906577b4025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Mon, 21 Jun 2021 17:32:38 +0200 Subject: sfc: avoid duplicated code in ef10_sriov MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fail path of efx_ef10_sriov_alloc_vf_vswitching is identical to the full content of efx_ef10_sriov_free_vf_vswitching, so replace it for a single call to efx_ef10_sriov_free_vf_vswitching. Signed-off-by: Íñigo Huguet Signed-off-by: David S. Miller --- drivers/net/ethernet/sfc/ef10_sriov.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10_sriov.c b/drivers/net/ethernet/sfc/ef10_sriov.c index f8f8fbe51ef8..752d6406f07e 100644 --- a/drivers/net/ethernet/sfc/ef10_sriov.c +++ b/drivers/net/ethernet/sfc/ef10_sriov.c @@ -206,9 +206,7 @@ static int efx_ef10_sriov_alloc_vf_vswitching(struct efx_nic *efx) return 0; fail: - efx_ef10_sriov_free_vf_vports(efx); - kfree(nic_data->vf); - nic_data->vf = NULL; + efx_ef10_sriov_free_vf_vswitching(efx); return rc; } -- cgit v1.2.3 From 7dd5d437c258bbf4cc15b35229e5208b87b8b4e0 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 13 Jun 2021 21:34:39 +0700 Subject: bpf: Fix integer overflow in argument calculation for bpf_map_area_alloc In 32-bit architecture, the result of sizeof() is a 32-bit integer so the expression becomes the multiplication between 2 32-bit integer which can potentially leads to integer overflow. As a result, bpf_map_area_alloc() allocates less memory than needed. Fix this by casting 1 operand to u64. Fixes: 0d2c4f964050 ("bpf: Eliminate rlimit-based memory accounting for sockmap and sockhash maps") Fixes: 99c51064fb06 ("devmap: Use bpf_map_area_alloc() for allocating hash buckets") Fixes: 546ac1ffb70d ("bpf: add devmap, a map for storing net device references") Signed-off-by: Bui Quang Minh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210613143440.71975-1-minhquangbui99@gmail.com --- kernel/bpf/devmap.c | 4 ++-- net/core/sock_map.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index aa516472ce46..3b45c23286c0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -92,7 +92,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries, int i; struct hlist_head *hash; - hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); + hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); @@ -143,7 +143,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) spin_lock_init(&dtab->index_lock); } else { - dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 6f1b82b8ad49..60decd6420ca 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -48,7 +48,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&stab->map, attr); raw_spin_lock_init(&stab->lock); - stab->sks = bpf_map_area_alloc(stab->map.max_entries * + stab->sks = bpf_map_area_alloc((u64) stab->map.max_entries * sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { -- cgit v1.2.3 From 490274b47468793e3e157c2df6b2da0e646cc4a9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 17:33:08 -0700 Subject: mptcp: avoid race on msk state changes The msk socket state is currently updated in a few spots without owning the msk socket lock itself. Some of such operations are safe, as they happens before exposing the msk socket to user-space and can't race with other changes. A couple of them, at connect time, can actually race with close() or shutdown(), leaving breaking the socket state machine. This change addresses the issue moving such update under the msk socket lock with the usual: scheme. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/56 Fixes: 8fd738049ac3 ("mptcp: fallback in case of simultaneous connect") Fixes: c3c123d16c0e ("net: mptcp: don't hang in mptcp_sendmsg() after TCP fallback") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 5 +++++ net/mptcp/protocol.h | 2 ++ net/mptcp/subflow.c | 30 ++++++++++++++++++++++-------- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 632350018fb6..8ead550df8b1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2946,6 +2946,11 @@ static void mptcp_release_cb(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); } + /* be sure to set the current sk state before tacking actions + * depending on sk_state + */ + if (test_and_clear_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags)) + __mptcp_set_connected(sk); if (test_and_clear_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->flags)) __mptcp_clean_una_wakeup(sk); if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5d7c44028e47..7b634568f49c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -109,6 +109,7 @@ #define MPTCP_ERROR_REPORT 8 #define MPTCP_RETRANSMIT 9 #define MPTCP_WORK_SYNC_SETSOCKOPT 10 +#define MPTCP_CONNECTED 11 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -579,6 +580,7 @@ void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt); void mptcp_finish_connect(struct sock *sk); +void __mptcp_set_connected(struct sock *sk); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 037fba41e170..9f934603bfe8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -371,6 +371,24 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport; } +void __mptcp_set_connected(struct sock *sk) +{ + if (sk->sk_state == TCP_SYN_SENT) { + inet_sk_state_store(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + } +} + +static void mptcp_set_connected(struct sock *sk) +{ + mptcp_data_lock(sk); + if (!sock_owned_by_user(sk)) + __mptcp_set_connected(sk); + else + set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->flags); + mptcp_data_unlock(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -379,10 +397,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) @@ -411,6 +425,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); + mptcp_set_connected(parent); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -451,6 +466,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (mptcp_check_fallback(sk)) { fallback: mptcp_rcv_space_init(mptcp_sk(parent), sk); + mptcp_set_connected(parent); } return; @@ -558,6 +574,7 @@ static void mptcp_sock_destruct(struct sock *sk) static void mptcp_force_close(struct sock *sk) { + /* the msk is not yet exposed to user-space */ inet_sk_state_store(sk, TCP_CLOSE); sk_common_release(sk); } @@ -1474,10 +1491,7 @@ static void subflow_state_change(struct sock *sk) mptcp_rcv_space_init(mptcp_sk(parent), sk); pr_fallback(mptcp_sk(parent)); subflow->conn_finished = 1; - if (inet_sk_state_load(parent) == TCP_SYN_SENT) { - inet_sk_state_store(parent, TCP_ESTABLISHED); - parent->sk_state_change(parent); - } + mptcp_set_connected(parent); } /* as recvmsg() does not acquire the subflow socket for ssk selection -- cgit v1.2.3 From 597dbae77ee5a2347b1b800c25c89a9181dd8a57 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 21 Jun 2021 17:33:09 -0700 Subject: mptcp: drop duplicate mptcp_setsockopt() declaration commit 7896248983ef ("mptcp: add skeleton to sync msk socket options to subflows") introduced a duplicate declaration of mptcp_setsockopt(), just drop it. Reported-by: Florian Westphal Fixes: 7896248983ef ("mptcp: add skeleton to sync msk socket options to subflows") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7b634568f49c..78ac28902f55 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -762,9 +762,6 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); - void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_all(struct mptcp_sock *msk); -- cgit v1.2.3 From a6e3f2985a80ef6a45a17d2d9d9151f17ea3ce07 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 21 Jun 2021 18:52:54 -0700 Subject: ip6_tunnel: fix GRE6 segmentation Commit 6c11fbf97e69 ("ip6_tunnel: add MPLS transmit support") moved assiging inner_ipproto down from ipxip6_tnl_xmit() to its callee ip6_tnl_xmit(). The latter is also used by GRE. Since commit 38720352412a ("gre: Use inner_proto to obtain inner header protocol") GRE had been depending on skb->inner_protocol during segmentation. It sets it in gre_build_header() and reads it in gre_gso_segment(). Changes to ip6_tnl_xmit() overwrite the protocol, resulting in GSO skbs getting dropped. Note that inner_protocol is a union with inner_ipproto, GRE uses the former while the change switched it to the latter (always setting it to just IPPROTO_GRE). Restore the original location of skb_set_inner_ipproto(), it is unclear why it was moved in the first place. Fixes: 6c11fbf97e69 ("ip6_tunnel: add MPLS transmit support") Signed-off-by: Jakub Kicinski Tested-by: Vadim Fedorenko Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 288bafded998..28ca70af014a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1239,8 +1239,6 @@ route_lookup: if (max_headroom > dev->needed_headroom) dev->needed_headroom = max_headroom; - skb_set_inner_ipproto(skb, proto); - err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; @@ -1377,6 +1375,8 @@ ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; + skb_set_inner_ipproto(skb, protocol); + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { -- cgit v1.2.3 From 3c9ef511b9fa128a4c62e3aa0aac4c6b190f0d55 Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Tue, 22 Jun 2021 11:09:29 +0800 Subject: bonding: avoid adding slave device with IFF_MASTER flag The following steps will definitely cause the kernel to crash: ip link add vrf1 type vrf table 1 modprobe bonding.ko max_bonds=1 echo "+vrf1" >/sys/class/net/bond0/bonding/slaves rmmod bonding The root cause is that: When the VRF is added to the slave device, it will fail, and some cleaning work will be done. because VRF device has IFF_MASTER flag, cleanup process will not clear the IFF_BONDING flag. Then, when we unload the bonding module, unregister_netdevice_notifier() will treat the VRF device as a bond master device and treat netdev_priv() as struct bonding{} which actually is struct net_vrf{}. By analyzing the processing logic of bond_enslave(), it seems that it is not allowed to add the slave device with the IFF_MASTER flag, so we need to add a code check for this situation. Signed-off-by: Di Zhu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c5a646d06102..16840c9bc00d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1601,6 +1601,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, int link_reporting; int res = 0, i; + if (slave_dev->flags & IFF_MASTER) { + netdev_err(bond_dev, + "Error: Device with IFF_MASTER cannot be enslaved\n"); + return -EPERM; + } + if (!bond->params.use_carrier && slave_dev->ethtool_ops->get_link == NULL && slave_ops->ndo_do_ioctl == NULL) { -- cgit v1.2.3 From c69f114d09891adfa3e301a35d9e872b8b7b5a50 Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Tue, 22 Jun 2021 12:24:50 +0800 Subject: net/ipv4: swap flow ports when validating source When doing source address validation, the flowi4 struct used for fib_lookup should be in the reverse direction to the given skb. fl4_dport and fl4_sport returned by fib4_rules_early_flow_dissect should thus be swapped. Fixes: 5a847a6e1477 ("net/ipv4: Initialize proto and ports in flow struct") Signed-off-by: Miao Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 84bb707bd88d..647bceab56c2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -371,6 +371,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; + } else { + swap(fl4.fl4_sport, fl4.fl4_dport); } if (fib_lookup(net, &fl4, &res, 0)) -- cgit v1.2.3 From ddeacc4f6494e07cbb6f033627926623f3e7a9d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 22 Jun 2021 07:24:15 +0200 Subject: net: broadcom: bcm4908_enet: reset DMA rings sw indexes properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resetting software indexes in bcm4908_dma_alloc_buf_descs() is not enough as it's called during device probe only. Driver resets DMA on every .ndo_open callback and it's required to reset indexes then. This fixes inconsistent rings state and stalled traffic after interface down & up sequence. Fixes: 4feffeadbcb2 ("net: broadcom: bcm4908enet: add BCM4908 controller driver") Signed-off-by: Rafał Miłecki Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcm4908_enet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index 60d908507f51..02a569500234 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -174,9 +174,6 @@ static int bcm4908_dma_alloc_buf_descs(struct bcm4908_enet *enet, if (!ring->slots) goto err_free_buf_descs; - ring->read_idx = 0; - ring->write_idx = 0; - return 0; err_free_buf_descs: @@ -304,6 +301,9 @@ static void bcm4908_enet_dma_ring_init(struct bcm4908_enet *enet, enet_write(enet, ring->st_ram_block + ENET_DMA_CH_STATE_RAM_BASE_DESC_PTR, (uint32_t)ring->dma_addr); + + ring->read_idx = 0; + ring->write_idx = 0; } static void bcm4908_enet_dma_uninit(struct bcm4908_enet *enet) -- cgit v1.2.3 From ce8eb4c728ef40b554b4f3d8963f11ed44502e00 Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Tue, 22 Jun 2021 20:08:57 +0530 Subject: net: ti: am65-cpsw-nuss: Fix crash when changing number of TX queues When changing number of TX queues using ethtool: # ethtool -L eth0 tx 1 [ 135.301047] Unable to handle kernel paging request at virtual address 00000000af5d0000 [...] [ 135.525128] Call trace: [ 135.525142] dma_release_from_dev_coherent+0x2c/0xb0 [ 135.525148] dma_free_attrs+0x54/0xe0 [ 135.525156] k3_cppi_desc_pool_destroy+0x50/0xa0 [ 135.525164] am65_cpsw_nuss_remove_tx_chns+0x88/0xdc [ 135.525171] am65_cpsw_set_channels+0x3c/0x70 [...] This is because k3_cppi_desc_pool_destroy() which is called after k3_udma_glue_release_tx_chn() in am65_cpsw_nuss_remove_tx_chns() references struct device that is unregistered at the end of k3_udma_glue_release_tx_chn() Therefore the right order is to call k3_cppi_desc_pool_destroy() and destroy desc pool before calling k3_udma_glue_release_tx_chn(). Fix this throughout the driver. Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver") Signed-off-by: Vignesh Raghavendra Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 6a67b026df0b..718539cdd2f2 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1506,12 +1506,12 @@ static void am65_cpsw_nuss_free_tx_chns(void *data) for (i = 0; i < common->tx_ch_num; i++) { struct am65_cpsw_tx_chn *tx_chn = &common->tx_chns[i]; - if (!IS_ERR_OR_NULL(tx_chn->tx_chn)) - k3_udma_glue_release_tx_chn(tx_chn->tx_chn); - if (!IS_ERR_OR_NULL(tx_chn->desc_pool)) k3_cppi_desc_pool_destroy(tx_chn->desc_pool); + if (!IS_ERR_OR_NULL(tx_chn->tx_chn)) + k3_udma_glue_release_tx_chn(tx_chn->tx_chn); + memset(tx_chn, 0, sizeof(*tx_chn)); } } @@ -1531,12 +1531,12 @@ void am65_cpsw_nuss_remove_tx_chns(struct am65_cpsw_common *common) netif_napi_del(&tx_chn->napi_tx); - if (!IS_ERR_OR_NULL(tx_chn->tx_chn)) - k3_udma_glue_release_tx_chn(tx_chn->tx_chn); - if (!IS_ERR_OR_NULL(tx_chn->desc_pool)) k3_cppi_desc_pool_destroy(tx_chn->desc_pool); + if (!IS_ERR_OR_NULL(tx_chn->tx_chn)) + k3_udma_glue_release_tx_chn(tx_chn->tx_chn); + memset(tx_chn, 0, sizeof(*tx_chn)); } } @@ -1624,11 +1624,11 @@ static void am65_cpsw_nuss_free_rx_chns(void *data) rx_chn = &common->rx_chns; - if (!IS_ERR_OR_NULL(rx_chn->rx_chn)) - k3_udma_glue_release_rx_chn(rx_chn->rx_chn); - if (!IS_ERR_OR_NULL(rx_chn->desc_pool)) k3_cppi_desc_pool_destroy(rx_chn->desc_pool); + + if (!IS_ERR_OR_NULL(rx_chn->rx_chn)) + k3_udma_glue_release_rx_chn(rx_chn->rx_chn); } static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) -- cgit v1.2.3 From 28a5501c3383f0e6643012c187b7c2027ef42aea Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Wed, 16 Jun 2021 10:09:01 +0800 Subject: ieee802154: hwsim: Fix memory leak in hwsim_add_one No matter from hwsim_remove or hwsim_del_radio_nl, hwsim_del fails to remove the entry in the edges list. Take the example below, phy0, phy1 and e0 will be deleted, resulting in e1 not freed and accessed in the future. hwsim_phys | ------------------------------ | | phy0 (edges) phy1 (edges) ----> e1 (idx = 1) ----> e0 (idx = 0) Fix this by deleting and freeing all the entries in the edges list between hwsim_edge_unsubscribe_me and list_del(&phy->list). Reported-by: syzbot+b80c9959009a9325cdff@syzkaller.appspotmail.com Fixes: 1c9f4a3fce77 ("ieee802154: hwsim: fix rcu handling") Signed-off-by: Dongliang Mu Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20210616020901.2759466-1-mudongliangabcd@gmail.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 366eaae3550a..baa7e21b7f4f 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -824,12 +824,17 @@ err_pib: static void hwsim_del(struct hwsim_phy *phy) { struct hwsim_pib *pib; + struct hwsim_edge *e; hwsim_edge_unsubscribe_me(phy); list_del(&phy->list); rcu_read_lock(); + list_for_each_entry_rcu(e, &phy->edges, list) { + list_del_rcu(&e->list); + hwsim_free_edge(e); + } pib = rcu_dereference(phy->pib); rcu_read_unlock(); -- cgit v1.2.3 From 0303b30375dff5351a79cc2c3c87dfa4fda29bed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Jun 2021 11:02:44 -0700 Subject: ieee802154: hwsim: avoid possible crash in hwsim_del_edge_nl() Both MAC802154_HWSIM_ATTR_RADIO_ID and MAC802154_HWSIM_ATTR_RADIO_EDGE must be present to avoid a crash. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: Eric Dumazet Cc: Alexander Aring Cc: Stefan Schmidt Reported-by: syzbot Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20210621180244.882076-1-eric.dumazet@gmail.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index baa7e21b7f4f..ebc976b7fcc2 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -480,7 +480,7 @@ static int hwsim_del_edge_nl(struct sk_buff *msg, struct genl_info *info) struct hwsim_edge *e; u32 v0, v1; - if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] && + if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; -- cgit v1.2.3 From 7506d211b932870155bcb39e3dd9e39fab45a7c7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 16 Jun 2021 15:55:00 -0700 Subject: bpf: Fix null ptr deref with mixed tail calls and subprogs The sub-programs prog->aux->poke_tab[] is populated in jit_subprogs() and then used when emitting 'BPF_JMP|BPF_TAIL_CALL' insn->code from the individual JITs. The poke_tab[] to use is stored in the insn->imm by the code adding it to that array slot. The JIT then uses imm to find the right entry for an individual instruction. In the x86 bpf_jit_comp.c this is done by calling emit_bpf_tail_call_direct with the poke_tab[] of the imm value. However, we observed the below null-ptr-deref when mixing tail call programs with subprog programs. For this to happen we just need to mix bpf-2-bpf calls and tailcalls with some extra calls or instructions that would be patched later by one of the fixup routines. So whats happening? Before the fixup_call_args() -- where the jit op is done -- various code patching is done by do_misc_fixups(). This may increase the insn count, for example when we patch map_lookup_up using map_gen_lookup hook. This does two things. First, it means the instruction index, insn_idx field, of a tail call instruction will move by a 'delta'. In verifier code, struct bpf_jit_poke_descriptor desc = { .reason = BPF_POKE_REASON_TAIL_CALL, .tail_call.map = BPF_MAP_PTR(aux->map_ptr_state), .tail_call.key = bpf_map_key_immediate(aux), .insn_idx = i + delta, }; Then subprog start values subprog_info[i].start will be updated with the delta and any poke descriptor index will also be updated with the delta in adjust_poke_desc(). If we look at the adjust subprog starts though we see its only adjusted when the delta occurs before the new instructions, /* NOTE: fake 'exit' subprog should be updated as well. */ for (i = 0; i <= env->subprog_cnt; i++) { if (env->subprog_info[i].start <= off) continue; Earlier subprograms are not changed because their start values are not moved. But, adjust_poke_desc() does the offset + delta indiscriminately. The result is poke descriptors are potentially corrupted. Then in jit_subprogs() we only populate the poke_tab[] when the above insn_idx is less than the next subprogram start. From above we corrupted our insn_idx so we might incorrectly assume a poke descriptor is not used in a subprogram omitting it from the subprogram. And finally when the jit runs it does the deref of poke_tab when emitting the instruction and crashes with below. Because earlier step omitted the poke descriptor. The fix is straight forward with above context. Simply move same logic from adjust_subprog_starts() into adjust_poke_descs() and only adjust insn_idx when needed. [ 82.396354] bpf_testmod: version magic '5.12.0-rc2alu+ SMP preempt mod_unload ' should be '5.12.0+ SMP preempt mod_unload ' [ 82.623001] loop10: detected capacity change from 0 to 8 [ 88.487424] ================================================================== [ 88.487438] BUG: KASAN: null-ptr-deref in do_jit+0x184a/0x3290 [ 88.487455] Write of size 8 at addr 0000000000000008 by task test_progs/5295 [ 88.487471] CPU: 7 PID: 5295 Comm: test_progs Tainted: G I 5.12.0+ #386 [ 88.487483] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019 [ 88.487490] Call Trace: [ 88.487498] dump_stack+0x93/0xc2 [ 88.487515] kasan_report.cold+0x5f/0xd8 [ 88.487530] ? do_jit+0x184a/0x3290 [ 88.487542] do_jit+0x184a/0x3290 ... [ 88.487709] bpf_int_jit_compile+0x248/0x810 ... [ 88.487765] bpf_check+0x3718/0x5140 ... [ 88.487920] bpf_prog_load+0xa22/0xf10 Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") Reported-by: Jussi Maki Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Reviewed-by: Daniel Borkmann --- kernel/bpf/verifier.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c6a27574242d..6e2ebcb0d66f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11459,7 +11459,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len } } -static void adjust_poke_descs(struct bpf_prog *prog, u32 len) +static void adjust_poke_descs(struct bpf_prog *prog, u32 off, u32 len) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; int i, sz = prog->aux->size_poke_tab; @@ -11467,6 +11467,8 @@ static void adjust_poke_descs(struct bpf_prog *prog, u32 len) for (i = 0; i < sz; i++) { desc = &tab[i]; + if (desc->insn_idx <= off) + continue; desc->insn_idx += len - 1; } } @@ -11487,7 +11489,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); - adjust_poke_descs(new_prog, len); + adjust_poke_descs(new_prog, off, len); return new_prog; } -- cgit v1.2.3 From 4d293fe1c69c157c15ac06918a805e5fef036682 Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 23 Jun 2021 11:21:08 +0800 Subject: bonding: allow nesting of bonding device The commit 3c9ef511b9fa ("bonding: avoid adding slave device with IFF_MASTER flag") fix a crash when add slave device with IFF_MASTER, but it rejects the scenario of nested bonding device. As Eric Dumazet described: since there indeed is a usage scenario about nesting bonding, we should not break it. So we add a new judgment condition to allow nesting of bonding device. Fixes: 3c9ef511b9fa ("bonding: avoid adding slave device with IFF_MASTER flag") Suggested-by: Jay Vosburgh Signed-off-by: Di Zhu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 16840c9bc00d..03b1a93d7fea 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1601,7 +1601,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, int link_reporting; int res = 0, i; - if (slave_dev->flags & IFF_MASTER) { + if (slave_dev->flags & IFF_MASTER && + !netif_is_bond_master(slave_dev)) { + NL_SET_ERR_MSG(extack, "Device with IFF_MASTER cannot be enslaved"); netdev_err(bond_dev, "Error: Device with IFF_MASTER cannot be enslaved\n"); return -EPERM; -- cgit v1.2.3 From b2ac9800cfe0f8da16abc4e74e003440361c112e Mon Sep 17 00:00:00 2001 From: Jian-Hong Pan Date: Wed, 23 Jun 2021 11:28:03 +0800 Subject: net: bcmgenet: Fix attaching to PYH failed on RPi 4B The Broadcom UniMAC MDIO bus from mdio-bcm-unimac module comes too late. So, GENET cannot find the ethernet PHY on UniMAC MDIO bus. This leads GENET fail to attach the PHY as following log: bcmgenet fd580000.ethernet: GENET 5.0 EPHY: 0x0000 ... could not attach to PHY bcmgenet fd580000.ethernet eth0: failed to connect to PHY uart-pl011 fe201000.serial: no DMA platform data libphy: bcmgenet MII bus: probed ... unimac-mdio unimac-mdio.-19: Broadcom UniMAC MDIO bus This patch adds the soft dependency to load mdio-bcm-unimac module before genet module to avoid the issue. Fixes: 9a4e79697009 ("net: bcmgenet: utilize generic Broadcom UniMAC MDIO controller driver") Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=213485 Signed-off-by: Jian-Hong Pan Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index fcca023f22e5..41f7f078cd27 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -4296,3 +4296,4 @@ MODULE_AUTHOR("Broadcom Corporation"); MODULE_DESCRIPTION("Broadcom GENET Ethernet controller driver"); MODULE_ALIAS("platform:bcmgenet"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: mdio-bcm-unimac"); -- cgit v1.2.3 From bcc3f2a829b9edbe3da5fb117ee5a63686d31834 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Jun 2021 08:27:00 -0700 Subject: ipv6: exthdrs: do not blindly use init_net I see no reason why max_dst_opts_cnt and max_hbh_opts_cnt are fetched from the initial net namespace. The other sysctls (max_dst_opts_len & max_hbh_opts_len) are in fact already using the current ns. Note: it is not clear why ipv6_destopt_rcv() use two ways to get to the netns : 1) dev_net(dst->dev) Originally used to increment IPSTATS_MIB_INHDRERRORS 2) dev_net(skb->dev) Tom used this variant in his patch. Maybe this calls to use ipv6_skb_net() instead ? Fixes: 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and Destination options") Signed-off-by: Eric Dumazet Cc: Tom Herbert Cc: Coco Li Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 56e479d158b7..6f7da8f3e2e5 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -306,7 +306,7 @@ fail_and_free: #endif if (ip6_parse_tlv(tlvprocdestopt_lst, skb, - init_net.ipv6.sysctl.max_dst_opts_cnt)) { + net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -1037,7 +1037,7 @@ fail_and_free: opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(tlvprochopopt_lst, skb, - init_net.ipv6.sysctl.max_hbh_opts_cnt)) { + net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); -- cgit v1.2.3 From aaf473d0100f64abc88560e2bea905805bcf2a8e Mon Sep 17 00:00:00 2001 From: Norbert Slusarek Date: Sun, 20 Jun 2021 14:38:42 +0200 Subject: can: j1939: j1939_sk_setsockopt(): prevent allocation of j1939 filter for optlen == 0 If optval != NULL and optlen == 0 are specified for SO_J1939_FILTER in j1939_sk_setsockopt(), memdup_sockptr() will return ZERO_PTR for 0 size allocation. The new filter will be mistakenly assigned ZERO_PTR. This patch checks for optlen != 0 and filter will be assigned NULL in case of optlen == 0. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Link: https://lore.kernel.org/r/20210620123842.117975-1-nslusarek@gmx.net Signed-off-by: Norbert Slusarek Acked-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- net/can/j1939/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index fce8bc8afeb7..e1a399821238 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -676,7 +676,7 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, switch (optname) { case SO_J1939_FILTER: - if (!sockptr_is_null(optval)) { + if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; -- cgit v1.2.3 From b17233d385d0b6b43ecf81d43008cb1bbb008166 Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Wed, 23 Jun 2021 16:26:00 +0200 Subject: can: peak_pciefd: pucan_handle_status(): fix a potential starvation issue in TX path Rather than just indicating that transmission can start, this patch requires the explicit flushing of the network TX queue when the driver is informed by the device that it can transmit, next to its configuration. In this way, if frames have already been written by the application, they will actually be transmitted. Fixes: ffd137f7043c ("can: peak/pcie_fd: remove useless code when interface starts") Link: https://lore.kernel.org/r/20210623142600.149904-1-s.grosjean@peak-system.com Cc: linux-stable Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde --- drivers/net/can/peak_canfd/peak_canfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 00847cbaf7b6..d08718e98e11 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -351,8 +351,8 @@ static int pucan_handle_status(struct peak_canfd_priv *priv, return err; } - /* start network queue (echo_skb array is empty) */ - netif_start_queue(ndev); + /* wake network queue up (echo_skb array is empty) */ + netif_wake_queue(ndev); return 0; } -- cgit v1.2.3 From 9c04cfcd4aad232e36306cdc5c74cd9fc9148a7e Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sun, 28 Feb 2021 19:50:58 +0800 Subject: i40e: Fix error handling in i40e_vsi_open When vsi->type == I40E_VSI_FDIR, we have caught the return value of i40e_vsi_request_irq() but without further handling. Check and execute memory clean on failure just like the other i40e_vsi_request_irq(). Fixes: 8a9eb7d3cbcab ("i40e: rework fdir setup and teardown") Signed-off-by: Dinghao Liu Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 704e474879c5..526fa0a791ea 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8703,6 +8703,8 @@ int i40e_vsi_open(struct i40e_vsi *vsi) dev_driver_string(&pf->pdev->dev), dev_name(&pf->pdev->dev)); err = i40e_vsi_request_irq(vsi, int_name); + if (err) + goto err_setup_rx; } else { err = -EINVAL; -- cgit v1.2.3 From 9262793e59f0423437166a879a73d056b1fe6f9a Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Wed, 10 Mar 2021 11:12:54 +0000 Subject: i40e: Fix autoneg disabling for non-10GBaseT links Disabling autonegotiation was allowed only for 10GBaseT PHY. The condition was changed to check if link media type is BaseT. Fixes: 3ce12ee9d8f9 ("i40e: Fix order of checks when enabling/disabling autoneg in ethtool") Reviewed-by: Aleksandr Loktionov Reviewed-by: Karen Sornek Signed-off-by: Dawid Lukwinski Signed-off-by: Mateusz Palczewski Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index ccd5b9486ea9..3e822bad4851 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1262,8 +1262,7 @@ static int i40e_set_link_ksettings(struct net_device *netdev, if (ethtool_link_ksettings_test_link_mode(&safe_ks, supported, Autoneg) && - hw->phy.link_info.phy_type != - I40E_PHY_TYPE_10GBASE_T) { + hw->phy.media_type != I40E_MEDIA_TYPE_BASET) { netdev_info(netdev, "Autoneg cannot be disabled on this phy\n"); err = -EINVAL; goto done; -- cgit v1.2.3 From 26b0ce8dd3dd704393dbace4dc416adfeffe531f Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Fri, 7 May 2021 11:56:25 -0700 Subject: i40e: fix PTP on 5Gb links As reported by Alex Sergeev, the i40e driver is incrementing the PTP clock at 40Gb speeds when linked at 5Gb. Fix this bug by making sure that the right multiplier is selected when linked at 5Gb. Fixes: 3dbdd6c2f70a ("i40e: Add support for 5Gbps cards") Cc: stable@vger.kernel.org Reported-by: Alex Sergeev Suggested-by: Alex Sergeev Signed-off-by: Jesse Brandeburg Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_ptp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c index f1f6fc3744e9..7b971b205d36 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c @@ -11,13 +11,14 @@ * operate with the nanosecond field directly without fear of overflow. * * Much like the 82599, the update period is dependent upon the link speed: - * At 40Gb link or no link, the period is 1.6ns. - * At 10Gb link, the period is multiplied by 2. (3.2ns) + * At 40Gb, 25Gb, or no link, the period is 1.6ns. + * At 10Gb or 5Gb link, the period is multiplied by 2. (3.2ns) * At 1Gb link, the period is multiplied by 20. (32ns) * 1588 functionality is not supported at 100Mbps. */ #define I40E_PTP_40GB_INCVAL 0x0199999999ULL #define I40E_PTP_10GB_INCVAL_MULT 2 +#define I40E_PTP_5GB_INCVAL_MULT 2 #define I40E_PTP_1GB_INCVAL_MULT 20 #define I40E_PRTTSYN_CTL1_TSYNTYPE_V1 BIT(I40E_PRTTSYN_CTL1_TSYNTYPE_SHIFT) @@ -465,6 +466,9 @@ void i40e_ptp_set_increment(struct i40e_pf *pf) case I40E_LINK_SPEED_10GB: mult = I40E_PTP_10GB_INCVAL_MULT; break; + case I40E_LINK_SPEED_5GB: + mult = I40E_PTP_5GB_INCVAL_MULT; + break; case I40E_LINK_SPEED_1GB: mult = I40E_PTP_1GB_INCVAL_MULT; break; -- cgit v1.2.3 From 956e759d5f8e0859e86b951a8779c60af633aafd Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Fri, 11 Jun 2021 12:01:41 +0200 Subject: i40e: Fix missing rtnl locking when setting up pf switch A recent change that made i40e use new udp_tunnel infrastructure uses a method that expects to be called under rtnl lock. However, not all codepaths do the lock prior to calling i40e_setup_pf_switch. Fix that by adding additional rtnl locking and unlocking. Fixes: 40a98cb6f01f ("i40e: convert to new udp_tunnel infrastructure") Signed-off-by: Jan Sokolowski Signed-off-by: Mateusz Palczewski Tested-by: Tony Brelinski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_main.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 526fa0a791ea..f9fe500d4ec4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -32,7 +32,7 @@ static void i40e_vsi_reinit_locked(struct i40e_vsi *vsi); static void i40e_handle_reset_warning(struct i40e_pf *pf, bool lock_acquired); static int i40e_add_vsi(struct i40e_vsi *vsi); static int i40e_add_veb(struct i40e_veb *veb, struct i40e_vsi *vsi); -static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit); +static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit, bool lock_acquired); static int i40e_setup_misc_vector(struct i40e_pf *pf); static void i40e_determine_queue_usage(struct i40e_pf *pf); static int i40e_setup_pf_filter_control(struct i40e_pf *pf); @@ -10571,7 +10571,7 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) #endif /* CONFIG_I40E_DCB */ if (!lock_acquired) rtnl_lock(); - ret = i40e_setup_pf_switch(pf, reinit); + ret = i40e_setup_pf_switch(pf, reinit, true); if (ret) goto end_unlock; @@ -14629,10 +14629,11 @@ int i40e_fetch_switch_configuration(struct i40e_pf *pf, bool printconfig) * i40e_setup_pf_switch - Setup the HW switch on startup or after reset * @pf: board private structure * @reinit: if the Main VSI needs to re-initialized. + * @lock_acquired: indicates whether or not the lock has been acquired * * Returns 0 on success, negative value on failure **/ -static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit) +static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit, bool lock_acquired) { u16 flags = 0; int ret; @@ -14734,9 +14735,15 @@ static int i40e_setup_pf_switch(struct i40e_pf *pf, bool reinit) i40e_ptp_init(pf); + if (!lock_acquired) + rtnl_lock(); + /* repopulate tunnel port filters */ udp_tunnel_nic_reset_ntf(pf->vsi[pf->lan_vsi]->netdev); + if (!lock_acquired) + rtnl_unlock(); + return ret; } @@ -15530,7 +15537,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pf->flags |= I40E_FLAG_VEB_MODE_ENABLED; } #endif - err = i40e_setup_pf_switch(pf, false); + err = i40e_setup_pf_switch(pf, false, false); if (err) { dev_info(&pdev->dev, "setup_pf_switch failed: %d\n", err); goto err_vsis; -- cgit v1.2.3 From 0ec13aff058a82426c8d44b688c804cc4a5a0a3d Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 23 Jun 2021 21:13:10 -0700 Subject: Revert "ibmvnic: simplify reset_long_term_buff function" This reverts commit 1c7d45e7b2c29080bf6c8cd0e213cc3cbb62a054. We tried to optimize the number of hcalls we send and skipped sending the REQUEST_MAP calls for some maps. However during resets, we need to resend all the maps to the VIOS since the VIOS does not remember the old values. In fact we may have failed over to a new VIOS which will not have any of the mappings. When we send packets with map ids the VIOS does not know about, it triggers a FATAL reset. While the client does recover from the FATAL error reset, we are seeing a large number of such resets. Handling FATAL resets is lot more unnecessary work than issuing a few more hcalls so revert the commit and resend the maps to the VIOS. Fixes: 1c7d45e7b2c ("ibmvnic: simplify reset_long_term_buff function") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 46 +++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5788bb956d73..4b4eccc496a8 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -257,12 +257,40 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); } -static int reset_long_term_buff(struct ibmvnic_long_term_buff *ltb) +static int reset_long_term_buff(struct ibmvnic_adapter *adapter, + struct ibmvnic_long_term_buff *ltb) { - if (!ltb->buff) - return -EINVAL; + struct device *dev = &adapter->vdev->dev; + int rc; memset(ltb->buff, 0, ltb->size); + + mutex_lock(&adapter->fw_lock); + adapter->fw_done_rc = 0; + + reinit_completion(&adapter->fw_done); + rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); + if (rc) { + mutex_unlock(&adapter->fw_lock); + return rc; + } + + rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); + if (rc) { + dev_info(dev, + "Reset failed, long term map request timed out or aborted\n"); + mutex_unlock(&adapter->fw_lock); + return rc; + } + + if (adapter->fw_done_rc) { + dev_info(dev, + "Reset failed, attempting to free and reallocate buffer\n"); + free_long_term_buff(adapter, ltb); + mutex_unlock(&adapter->fw_lock); + return alloc_long_term_buff(adapter, ltb, ltb->size); + } + mutex_unlock(&adapter->fw_lock); return 0; } @@ -484,7 +512,8 @@ static int reset_rx_pools(struct ibmvnic_adapter *adapter) rx_pool->size * rx_pool->buff_size); } else { - rc = reset_long_term_buff(&rx_pool->long_term_buff); + rc = reset_long_term_buff(adapter, + &rx_pool->long_term_buff); } if (rc) @@ -607,11 +636,12 @@ static int init_rx_pools(struct net_device *netdev) return 0; } -static int reset_one_tx_pool(struct ibmvnic_tx_pool *tx_pool) +static int reset_one_tx_pool(struct ibmvnic_adapter *adapter, + struct ibmvnic_tx_pool *tx_pool) { int rc, i; - rc = reset_long_term_buff(&tx_pool->long_term_buff); + rc = reset_long_term_buff(adapter, &tx_pool->long_term_buff); if (rc) return rc; @@ -638,10 +668,10 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter) tx_scrqs = adapter->num_active_tx_pools; for (i = 0; i < tx_scrqs; i++) { - rc = reset_one_tx_pool(&adapter->tso_pool[i]); + rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]); if (rc) return rc; - rc = reset_one_tx_pool(&adapter->tx_pool[i]); + rc = reset_one_tx_pool(adapter, &adapter->tx_pool[i]); if (rc) return rc; } -- cgit v1.2.3 From 2ca220f92878470c6ba03f9946e412323093cc94 Mon Sep 17 00:00:00 2001 From: Dany Madden Date: Wed, 23 Jun 2021 21:13:11 -0700 Subject: Revert "ibmvnic: remove duplicate napi_schedule call in open function" This reverts commit 7c451f3ef676c805a4b77a743a01a5c21a250a73. When a vnic interface is taken down and then up, connectivity is not restored. We bisected it to this commit. Reverting this commit until we can fully investigate the issue/benefit of the change. Fixes: 7c451f3ef676 ("ibmvnic: remove duplicate napi_schedule call in open function") Reported-by: Cristobal Forno Reported-by: Abdul Haleem Signed-off-by: Dany Madden Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 4b4eccc496a8..8b2f6eb8eb21 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1210,6 +1210,11 @@ static int __ibmvnic_open(struct net_device *netdev) netif_tx_start_all_queues(netdev); + if (prev_state == VNIC_CLOSED) { + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + } + adapter->state = VNIC_OPEN; return rc; } -- cgit v1.2.3 From 65d6470d139a6c1655fccb5cbacbeaba8e8ad2f8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 23 Jun 2021 21:13:12 -0700 Subject: ibmvnic: clean pending indirect buffs during reset We batch subordinate command response queue (scrq) descriptors that we need to send to the VIOS using an "indirect" buffer. If after we queue one or more scrqs in the indirect buffer encounter an error (say fail to allocate an skb), we leave the queued scrq descriptors in the indirect buffer until the next call to ibmvnic_xmit(). On the next call to ibmvnic_xmit(), it is possible that the adapter is going through a reset and it is possible that the long term buffers have been unmapped on the VIOS side. If we proceed to flush (send) the packets that are in the indirect buffer, we will end up using the old map ids and this can cause the VIOS to trigger an unnecessary FATAL error reset. Instead of flushing packets remaining on the indirect_buff, discard (clean) them instead. Fixes: 0d973388185d4 ("ibmvnic: Introduce xmit_more support using batched subCRQ hcalls") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 8b2f6eb8eb21..2d15b446ceb3 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -106,6 +106,8 @@ static void release_crq_queue(struct ibmvnic_adapter *); static int __ibmvnic_set_mac(struct net_device *, u8 *); static int init_crq_queue(struct ibmvnic_adapter *adapter); static int send_query_phys_parms(struct ibmvnic_adapter *adapter); +static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, + struct ibmvnic_sub_crq_queue *tx_scrq); struct ibmvnic_stat { char name[ETH_GSTRING_LEN]; @@ -668,6 +670,7 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter) tx_scrqs = adapter->num_active_tx_pools; for (i = 0; i < tx_scrqs; i++) { + ibmvnic_tx_scrq_clean_buffer(adapter, adapter->tx_scrq[i]); rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]); if (rc) return rc; @@ -1618,7 +1621,8 @@ static void ibmvnic_tx_scrq_clean_buffer(struct ibmvnic_adapter *adapter, ind_bufp->index = 0; if (atomic_sub_return(entries, &tx_scrq->used) <= (adapter->req_tx_entries_per_subcrq / 2) && - __netif_subqueue_stopped(adapter->netdev, queue_num)) { + __netif_subqueue_stopped(adapter->netdev, queue_num) && + !test_bit(0, &adapter->resetting)) { netif_wake_subqueue(adapter->netdev, queue_num); netdev_dbg(adapter->netdev, "Started queue %d\n", queue_num); @@ -1711,7 +1715,6 @@ static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) tx_send_failed++; tx_dropped++; ret = NETDEV_TX_OK; - ibmvnic_tx_scrq_flush(adapter, tx_scrq); goto out; } @@ -3175,6 +3178,7 @@ static void release_sub_crqs(struct ibmvnic_adapter *adapter, bool do_h_free) netdev_dbg(adapter->netdev, "Releasing tx_scrq[%d]\n", i); + ibmvnic_tx_scrq_clean_buffer(adapter, adapter->tx_scrq[i]); if (adapter->tx_scrq[i]->irq) { free_irq(adapter->tx_scrq[i]->irq, adapter->tx_scrq[i]); -- cgit v1.2.3 From 72368f8b2b9e4106072a2728bed3367d54641c22 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 23 Jun 2021 21:13:13 -0700 Subject: ibmvnic: account for bufs already saved in indir_buf This fixes a crash in replenish_rx_pool() when called from ibmvnic_poll() after a previous call to replenish_rx_pool() encountered an error when allocating a socket buffer. Thanks to Rick Lindsley and Dany Madden for helping debug the crash. Fixes: 4f0b6812e9b9 ("ibmvnic: Introduce batched RX buffer descriptor transmission") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 2d15b446ceb3..779de81a54a6 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -328,7 +328,14 @@ static void replenish_rx_pool(struct ibmvnic_adapter *adapter, rx_scrq = adapter->rx_scrq[pool->index]; ind_bufp = &rx_scrq->ind_buf; - for (i = 0; i < count; ++i) { + + /* netdev_skb_alloc() could have failed after we saved a few skbs + * in the indir_buf and we would not have sent them to VIOS yet. + * To account for them, start the loop at ind_bufp->index rather + * than 0. If we pushed all the skbs to VIOS, ind_bufp->index will + * be 0. + */ + for (i = ind_bufp->index; i < count; ++i) { skb = netdev_alloc_skb(adapter->netdev, pool->buff_size); if (!skb) { dev_err(dev, "Couldn't replenish rx buff\n"); -- cgit v1.2.3 From 552a33729f1a7cc5115d0752064fe9abd6e3e336 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 23 Jun 2021 21:13:14 -0700 Subject: ibmvnic: set ltb->buff to NULL after freeing free_long_term_buff() checks ltb->buff to decide whether we have a long term buffer to free. So set ltb->buff to NULL afer freeing. While here, also clear ->map_id, fix up some coding style and log an error. Fixes: 9c4eaabd1bb39 ("Check CRQ command return codes") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 779de81a54a6..b8bdab0b2701 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -211,12 +211,11 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, mutex_lock(&adapter->fw_lock); adapter->fw_done_rc = 0; reinit_completion(&adapter->fw_done); - rc = send_request_map(adapter, ltb->addr, - ltb->size, ltb->map_id); + + rc = send_request_map(adapter, ltb->addr, ltb->size, ltb->map_id); if (rc) { - dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); - mutex_unlock(&adapter->fw_lock); - return rc; + dev_err(dev, "send_request_map failed, rc = %d\n", rc); + goto out; } rc = ibmvnic_wait_for_completion(adapter, &adapter->fw_done, 10000); @@ -224,20 +223,23 @@ static int alloc_long_term_buff(struct ibmvnic_adapter *adapter, dev_err(dev, "Long term map request aborted or timed out,rc = %d\n", rc); - dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); - mutex_unlock(&adapter->fw_lock); - return rc; + goto out; } if (adapter->fw_done_rc) { dev_err(dev, "Couldn't map long term buffer,rc = %d\n", adapter->fw_done_rc); + rc = -1; + goto out; + } + rc = 0; +out: + if (rc) { dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); - mutex_unlock(&adapter->fw_lock); - return -1; + ltb->buff = NULL; } mutex_unlock(&adapter->fw_lock); - return 0; + return rc; } static void free_long_term_buff(struct ibmvnic_adapter *adapter, @@ -257,6 +259,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, adapter->reset_reason != VNIC_RESET_TIMEOUT) send_request_unmap(adapter, ltb->map_id); dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); + ltb->buff = NULL; + ltb->map_id = 0; } static int reset_long_term_buff(struct ibmvnic_adapter *adapter, -- cgit v1.2.3 From f6ebca8efa52e4ae770f0325d618e7bcf08ada0c Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 23 Jun 2021 21:13:15 -0700 Subject: ibmvnic: free tx_pool if tso_pool alloc fails Free tx_pool and clear it, if allocation of tso_pool fails. release_tx_pools() assumes we have both tx and tso_pools if ->tx_pool is non-NULL. If allocation of tso_pool fails in init_tx_pools(), the assumption will not be true and we would end up dereferencing ->tx_buff, ->free_map fields from a NULL pointer. Fixes: 3205306c6b8d ("ibmvnic: Update TX pool initialization routine") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index b8bdab0b2701..ede65b32f821 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -778,8 +778,11 @@ static int init_tx_pools(struct net_device *netdev) adapter->tso_pool = kcalloc(tx_subcrqs, sizeof(struct ibmvnic_tx_pool), GFP_KERNEL); - if (!adapter->tso_pool) + if (!adapter->tso_pool) { + kfree(adapter->tx_pool); + adapter->tx_pool = NULL; return -1; + } adapter->num_active_tx_pools = tx_subcrqs; -- cgit v1.2.3 From 154b3b2a6ffca445379063ef49f71895104d5a5e Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 23 Jun 2021 21:13:16 -0700 Subject: ibmvnic: parenthesize a check Parenthesize a check to be more explicit and to fix a sparse warning seen on some distros. Fixes: 91dc5d2553fbf ("ibmvnic: fix miscellaneous checks") Signed-off-by: Sukadev Bhattiprolu Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index ede65b32f821..1c572491441c 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -3266,7 +3266,7 @@ static int enable_scrq_irq(struct ibmvnic_adapter *adapter, /* H_EOI would fail with rc = H_FUNCTION when running * in XIVE mode which is expected, but not an error. */ - if (rc && rc != H_FUNCTION) + if (rc && (rc != H_FUNCTION)) dev_err(dev, "H_EOI FAILED irq 0x%llx. rc=%ld\n", val, rc); } -- cgit v1.2.3 From 1f7fe5121127e037b86592ba42ce36515ea0e3f7 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 24 Jun 2021 11:38:28 +0200 Subject: net: macsec: fix the length used to copy the key for offloading The key length used when offloading macsec to Ethernet or PHY drivers was set to MACSEC_KEYID_LEN (16), which is an issue as: - This was never meant to be the key length. - The key length can be > 16. Fix this by using MACSEC_MAX_KEY_LEN to store the key (the max length accepted in uAPI) and secy->key_len to copy it. Fixes: 3cf3227a21d1 ("net: macsec: hardware offloading infrastructure") Reported-by: Lior Nahmanson Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/macsec.c | 4 ++-- include/net/macsec.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 92425e1fd70c..93dc48b9b4f2 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1819,7 +1819,7 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) ctx.sa.rx_sa = rx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), - MACSEC_KEYID_LEN); + secy->key_len); err = macsec_offload(ops->mdo_add_rxsa, &ctx); if (err) @@ -2061,7 +2061,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) ctx.sa.tx_sa = tx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), - MACSEC_KEYID_LEN); + secy->key_len); err = macsec_offload(ops->mdo_add_txsa, &ctx); if (err) diff --git a/include/net/macsec.h b/include/net/macsec.h index 52874cdfe226..d6fa6b97f6ef 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -241,7 +241,7 @@ struct macsec_context { struct macsec_rx_sc *rx_sc; struct { unsigned char assoc_num; - u8 key[MACSEC_KEYID_LEN]; + u8 key[MACSEC_MAX_KEY_LEN]; union { struct macsec_rx_sa *rx_sa; struct macsec_tx_sa *tx_sa; -- cgit v1.2.3 From c309217f91f2d2097c2a0a832d9bff50b88c81dc Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 24 Jun 2021 11:38:29 +0200 Subject: net: phy: mscc: fix macsec key length The key length used to store the macsec key was set to MACSEC_KEYID_LEN (16), which is an issue as: - This was never meant to be the key length. - The key length can be > 16. Fix this by using MACSEC_MAX_KEY_LEN instead (the max length accepted in uAPI). Fixes: 28c5107aa904 ("net: phy: mscc: macsec support") Reported-by: Lior Nahmanson Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/phy/mscc/mscc_macsec.c | 2 +- drivers/net/phy/mscc/mscc_macsec.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/mscc/mscc_macsec.c b/drivers/net/phy/mscc/mscc_macsec.c index 10be266e48e8..b7b2521c73fb 100644 --- a/drivers/net/phy/mscc/mscc_macsec.c +++ b/drivers/net/phy/mscc/mscc_macsec.c @@ -501,7 +501,7 @@ static u32 vsc8584_macsec_flow_context_id(struct macsec_flow *flow) } /* Derive the AES key to get a key for the hash autentication */ -static int vsc8584_macsec_derive_key(const u8 key[MACSEC_KEYID_LEN], +static int vsc8584_macsec_derive_key(const u8 key[MACSEC_MAX_KEY_LEN], u16 key_len, u8 hkey[16]) { const u8 input[AES_BLOCK_SIZE] = {0}; diff --git a/drivers/net/phy/mscc/mscc_macsec.h b/drivers/net/phy/mscc/mscc_macsec.h index 9c6d25e36de2..453304bae778 100644 --- a/drivers/net/phy/mscc/mscc_macsec.h +++ b/drivers/net/phy/mscc/mscc_macsec.h @@ -81,7 +81,7 @@ struct macsec_flow { /* Highest takes precedence [0..15] */ u8 priority; - u8 key[MACSEC_KEYID_LEN]; + u8 key[MACSEC_MAX_KEY_LEN]; union { struct macsec_rx_sa *rx_sa; -- cgit v1.2.3 From d67fb4772d9a6cfd10f1109f0e7b1e6eb58c8e16 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 24 Jun 2021 11:38:30 +0200 Subject: net: atlantic: fix the macsec key length The key length used to store the macsec key was set to MACSEC_KEYID_LEN (16), which is an issue as: - This was never meant to be the key length. - The key length can be > 16. Fix this by using MACSEC_MAX_KEY_LEN instead (the max length accepted in uAPI). Fixes: 27736563ce32 ("net: atlantic: MACSec egress offload implementation") Fixes: 9ff40a751a6f ("net: atlantic: MACSec ingress offload implementation") Reported-by: Lior Nahmanson Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_macsec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.h b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.h index f5fba8b8cdea..a47e2710487e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_macsec.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_macsec.h @@ -91,7 +91,7 @@ struct aq_macsec_txsc { u32 hw_sc_idx; unsigned long tx_sa_idx_busy; const struct macsec_secy *sw_secy; - u8 tx_sa_key[MACSEC_NUM_AN][MACSEC_KEYID_LEN]; + u8 tx_sa_key[MACSEC_NUM_AN][MACSEC_MAX_KEY_LEN]; struct aq_macsec_tx_sc_stats stats; struct aq_macsec_tx_sa_stats tx_sa_stats[MACSEC_NUM_AN]; }; @@ -101,7 +101,7 @@ struct aq_macsec_rxsc { unsigned long rx_sa_idx_busy; const struct macsec_secy *sw_secy; const struct macsec_rx_sc *sw_rxsc; - u8 rx_sa_key[MACSEC_NUM_AN][MACSEC_KEYID_LEN]; + u8 rx_sa_key[MACSEC_NUM_AN][MACSEC_MAX_KEY_LEN]; struct aq_macsec_rx_sa_stats rx_sa_stats[MACSEC_NUM_AN]; }; -- cgit v1.2.3 From 624085a31c1ad6a80b1e53f686bf6ee92abbf6e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Jun 2021 03:07:20 -0700 Subject: ipv6: fix out-of-bound access in ip6_parse_tlv() First problem is that optlen is fetched without checking there is more than one byte to parse. Fix this by taking care of IPV6_TLV_PAD1 before fetching optlen (under appropriate sanity checks against len) Second problem is that IPV6_TLV_PADN checks of zero padding are performed before the check of remaining length. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Fixes: c1412fce7ecc ("net/ipv6/exthdrs.c: Strict PadN option checking") Signed-off-by: Eric Dumazet Cc: Paolo Abeni Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6f7da8f3e2e5..26882e165c9e 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -135,18 +135,23 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, len -= 2; while (len > 0) { - int optlen = nh[off + 1] + 2; - int i; + int optlen, i; - switch (nh[off]) { - case IPV6_TLV_PAD1: - optlen = 1; + if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; - break; + off++; + len--; + continue; + } + if (len < 2) + goto bad; + optlen = nh[off + 1] + 2; + if (optlen > len) + goto bad; - case IPV6_TLV_PADN: + if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. @@ -163,12 +168,7 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, if (nh[off + i] != 0) goto bad; } - break; - - default: /* Other TLV code so scan list */ - if (optlen > len) - goto bad; - + } else { tlv_count++; if (tlv_count > max_count) goto bad; @@ -188,7 +188,6 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, return false; padlen = 0; - break; } off += optlen; len -= optlen; -- cgit v1.2.3 From 2e7256f12cdb16eaa2515b6231d665044a07c51a Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Thu, 24 Jun 2021 12:02:48 -0700 Subject: e1000e: Check the PCIm state Complete to commit def4ec6dce393e ("e1000e: PCIm function state support") Check the PCIm state only on CSME systems. There is no point to do this check on non CSME systems. This patch fixes a generation a false-positive warning: "Error in exiting dmoff" Fixes: def4ec6dce39 ("e1000e: PCIm function state support") Signed-off-by: Sasha Neftin Tested-by: Dvora Fuxbrumer Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/e1000e/netdev.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 88e9035b75cf..dc0ded7e5e61 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5223,18 +5223,20 @@ static void e1000_watchdog_task(struct work_struct *work) pm_runtime_resume(netdev->dev.parent); /* Checking if MAC is in DMoff state*/ - pcim_state = er32(STATUS); - while (pcim_state & E1000_STATUS_PCIM_STATE) { - if (tries++ == dmoff_exit_timeout) { - e_dbg("Error in exiting dmoff\n"); - break; - } - usleep_range(10000, 20000); + if (er32(FWSM) & E1000_ICH_FWSM_FW_VALID) { pcim_state = er32(STATUS); - - /* Checking if MAC exited DMoff state */ - if (!(pcim_state & E1000_STATUS_PCIM_STATE)) - e1000_phy_hw_reset(&adapter->hw); + while (pcim_state & E1000_STATUS_PCIM_STATE) { + if (tries++ == dmoff_exit_timeout) { + e_dbg("Error in exiting dmoff\n"); + break; + } + usleep_range(10000, 20000); + pcim_state = er32(STATUS); + + /* Checking if MAC exited DMoff state */ + if (!(pcim_state & E1000_STATUS_PCIM_STATE)) + e1000_phy_hw_reset(&adapter->hw); + } } /* update snapshot of PHY registers on LSC */ -- cgit v1.2.3 From be7f62eebaff2f86c1467a2d33930a0a7a87675b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 24 Jun 2021 18:52:07 +0300 Subject: net: dsa: sja1105: fix NULL pointer dereference in sja1105_reload_cbs() priv->cbs is an array of priv->info->num_cbs_shapers elements of type struct sja1105_cbs_entry which only get allocated if CONFIG_NET_SCH_CBS is enabled. However, sja1105_reload_cbs() is called from sja1105_static_config_reload() which in turn is called for any of the items in sja1105_reset_reasons, therefore during the normal runtime of the driver and not just from a code path which can be triggered by the tc-cbs offload. The sja1105_reload_cbs() function does not contain a check whether the priv->cbs array is NULL or not, it just assumes it isn't and proceeds to iterate through the credit-based shaper elements. This leads to a NULL pointer dereference. The solution is to return success if the priv->cbs array has not been allocated, since sja1105_reload_cbs() has nothing to do. Fixes: 4d7525085a9b ("net: dsa: sja1105: offload the Credit-Based Shaper qdisc") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b88d9ef45a1f..ebe4d33cda27 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1798,6 +1798,12 @@ static int sja1105_reload_cbs(struct sja1105_private *priv) { int rc = 0, i; + /* The credit based shapers are only allocated if + * CONFIG_NET_SCH_CBS is enabled. + */ + if (!priv->cbs) + return 0; + for (i = 0; i < priv->info->num_cbs_shapers; i++) { struct sja1105_cbs_entry *cbs = &priv->cbs[i]; -- cgit v1.2.3 From ff70202b2d1ad522275c6aadc8c53519b6a22c57 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 24 Jun 2021 10:05:05 +0200 Subject: dev_forward_skb: do not scrub skb mark within the same name space The goal is to keep the mark during a bpf_redirect(), like it is done for legacy encapsulation / decapsulation, when there is no x-netns. This was initially done in commit 213dd74aee76 ("skbuff: Do not scrub skb mark within the same name space"). When the call to skb_scrub_packet() was added in dev_forward_skb() (commit 8b27f27797ca ("skb: allow skb_scrub_packet() to be used by tunnels")), the second argument (xnet) was set to true to force a call to skb_orphan(). At this time, the mark was always cleanned up by skb_scrub_packet(), whatever xnet value was. This call to skb_orphan() was removed later in commit 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb."). But this 'true' stayed here without any real reason. Let's correctly set xnet in ____dev_forward_skb(), this function has access to the previous interface and to the new interface. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5cbc950b34df..5ab2d1917ca1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4114,7 +4114,7 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, return NET_RX_DROP; } - skb_scrub_packet(skb, true); + skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev))); skb->priority = 0; return 0; } -- cgit v1.2.3 From 1db1a862a08f85edc36aad091236ac9b818e949e Mon Sep 17 00:00:00 2001 From: Bailey Forrest Date: Thu, 24 Jun 2021 19:55:41 -0700 Subject: gve: Fix swapped vars when fetching max queues Fixes: 893ce44df565 ("gve: Add basic driver framework for Compute Engine Virtual NIC") Signed-off-by: Bailey Forrest Signed-off-by: David S. Miller --- drivers/net/ethernet/google/gve/gve_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index bbc423e93122..79cefe85a799 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1295,8 +1295,8 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gve_write_version(®_bar->driver_version); /* Get max queues to alloc etherdev */ - max_rx_queues = ioread32be(®_bar->max_tx_queues); - max_tx_queues = ioread32be(®_bar->max_rx_queues); + max_tx_queues = ioread32be(®_bar->max_tx_queues); + max_rx_queues = ioread32be(®_bar->max_rx_queues); /* Alloc and setup the netdev and priv */ dev = alloc_etherdev_mqs(sizeof(*priv), max_tx_queues, max_rx_queues); if (!dev) { -- cgit v1.2.3 From d6765985a42a660f078896d5c5b27f97c580a490 Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Fri, 25 Jun 2021 10:27:45 +0200 Subject: Revert "be2net: disable bh with spin_lock in be_process_mcc" Patch was based on wrong presumption that be_poll can be called only from bh context. It reintroducing old regression (also reverted) and causing deadlock when we use netconsole with benet in bonding. Old revert: commit 072a9c486004 ("netpoll: revert 6bdb7fe3104 and fix be_poll() instead") [ 331.269715] bond0: (slave enp0s7f0): Releasing backup interface [ 331.270121] CPU: 4 PID: 1479 Comm: ifenslave Not tainted 5.13.0-rc7+ #2 [ 331.270122] Call Trace: [ 331.270122] [c00000001789f200] [c0000000008c505c] dump_stack+0x100/0x174 (unreliable) [ 331.270124] [c00000001789f240] [c008000001238b9c] be_poll+0x64/0xe90 [be2net] [ 331.270125] [c00000001789f330] [c000000000d1e6e4] netpoll_poll_dev+0x174/0x3d0 [ 331.270127] [c00000001789f400] [c008000001bc167c] bond_poll_controller+0xb4/0x130 [bonding] [ 331.270128] [c00000001789f450] [c000000000d1e624] netpoll_poll_dev+0xb4/0x3d0 [ 331.270129] [c00000001789f520] [c000000000d1ed88] netpoll_send_skb+0x448/0x470 [ 331.270130] [c00000001789f5d0] [c0080000011f14f8] write_msg+0x180/0x1b0 [netconsole] [ 331.270131] [c00000001789f640] [c000000000230c0c] console_unlock+0x54c/0x790 [ 331.270132] [c00000001789f7b0] [c000000000233098] vprintk_emit+0x2d8/0x450 [ 331.270133] [c00000001789f810] [c000000000234758] vprintk+0xc8/0x270 [ 331.270134] [c00000001789f850] [c000000000233c28] printk+0x40/0x54 [ 331.270135] [c00000001789f870] [c000000000ccf908] __netdev_printk+0x150/0x198 [ 331.270136] [c00000001789f910] [c000000000ccfdb4] netdev_info+0x68/0x94 [ 331.270137] [c00000001789f950] [c008000001bcbd70] __bond_release_one+0x188/0x6b0 [bonding] [ 331.270138] [c00000001789faa0] [c008000001bcc6f4] bond_do_ioctl+0x42c/0x490 [bonding] [ 331.270139] [c00000001789fb60] [c000000000d0d17c] dev_ifsioc+0x17c/0x400 [ 331.270140] [c00000001789fbc0] [c000000000d0db70] dev_ioctl+0x390/0x890 [ 331.270141] [c00000001789fc10] [c000000000c7c76c] sock_do_ioctl+0xac/0x1b0 [ 331.270142] [c00000001789fc90] [c000000000c7ffac] sock_ioctl+0x31c/0x6e0 [ 331.270143] [c00000001789fd60] [c0000000005b9728] sys_ioctl+0xf8/0x150 [ 331.270145] [c00000001789fdb0] [c0000000000336c0] system_call_exception+0x160/0x2f0 [ 331.270146] [c00000001789fe10] [c00000000000d35c] system_call_common+0xec/0x278 [ 331.270147] --- interrupt: c00 at 0x7fffa6c6ec00 [ 331.270147] NIP: 00007fffa6c6ec00 LR: 0000000105c4185c CTR: 0000000000000000 [ 331.270148] REGS: c00000001789fe80 TRAP: 0c00 Not tainted (5.13.0-rc7+) [ 331.270148] MSR: 800000000280f033 CR: 28000428 XER: 00000000 [ 331.270155] IRQMASK: 0 [ 331.270156] GPR00: 0000000000000036 00007fffd494d5b0 00007fffa6d57100 0000000000000003 [ 331.270158] GPR04: 0000000000008991 00007fffd494d6d0 0000000000000008 00007fffd494f28c [ 331.270161] GPR08: 0000000000000003 0000000000000000 0000000000000000 0000000000000000 [ 331.270164] GPR12: 0000000000000000 00007fffa6dfa220 0000000000000000 0000000000000000 [ 331.270167] GPR16: 0000000105c44880 0000000000000000 0000000105c60088 0000000105c60318 [ 331.270170] GPR20: 0000000105c602c0 0000000105c44560 0000000000000000 0000000000000000 [ 331.270172] GPR24: 00007fffd494dc50 00007fffd494d6a8 0000000105c60008 00007fffd494d6d0 [ 331.270175] GPR28: 00007fffd494f27e 0000000105c6026c 00007fffd494f284 0000000000000000 [ 331.270178] NIP [00007fffa6c6ec00] 0x7fffa6c6ec00 [ 331.270178] LR [0000000105c4185c] 0x105c4185c [ 331.270179] --- interrupt: c00 This reverts commit d0d006a43e9a7a796f6f178839c92fcc222c564d. Fixes: d0d006a43e9a7a ("be2net: disable bh with spin_lock in be_process_mcc") Signed-off-by: Petr Oros Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_cmds.c | 6 ++++-- drivers/net/ethernet/emulex/benet/be_main.c | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 701c12c9e033..649c5c429bd7 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -550,7 +550,7 @@ int be_process_mcc(struct be_adapter *adapter) int num = 0, status = 0; struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; - spin_lock_bh(&adapter->mcc_cq_lock); + spin_lock(&adapter->mcc_cq_lock); while ((compl = be_mcc_compl_get(adapter))) { if (compl->flags & CQE_FLAGS_ASYNC_MASK) { @@ -566,7 +566,7 @@ int be_process_mcc(struct be_adapter *adapter) if (num) be_cq_notify(adapter, mcc_obj->cq.id, mcc_obj->rearm_cq, num); - spin_unlock_bh(&adapter->mcc_cq_lock); + spin_unlock(&adapter->mcc_cq_lock); return status; } @@ -581,7 +581,9 @@ static int be_mcc_wait_compl(struct be_adapter *adapter) if (be_check_error(adapter, BE_ERROR_ANY)) return -EIO; + local_bh_disable(); status = be_process_mcc(adapter); + local_bh_enable(); if (atomic_read(&mcc_obj->q.used) == 0) break; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 7968568bbe21..361c1c87c183 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5501,7 +5501,9 @@ static void be_worker(struct work_struct *work) * mcc completions */ if (!netif_running(adapter->netdev)) { + local_bh_disable(); be_process_mcc(adapter); + local_bh_enable(); goto reschedule; } -- cgit v1.2.3 From fade56410c22cacafb1be9f911a0afd3701d8366 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 25 Jun 2021 19:21:39 +0300 Subject: net: lwtunnel: handle MTU calculation in forwading Commit 14972cbd34ff ("net: lwtunnel: Handle fragmentation") moved fragmentation logic away from lwtunnel by carry encap headroom and use it in output MTU calculation. But the forwarding part was not covered and created difference in MTU for output and forwarding and further to silent drops on ipv4 forwarding path. Fix it by taking into account lwtunnel encap headroom. The same commit also introduced difference in how to treat RTAX_MTU in IPv4 and IPv6 where latter explicitly removes lwtunnel encap headroom from route MTU. Make IPv4 version do the same. Fixes: 14972cbd34ff ("net: lwtunnel: Handle fragmentation") Suggested-by: David Ahern Signed-off-by: Vadim Fedorenko Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 12 ++++++++---- include/net/ip6_route.h | 16 ++++++++++++---- net/ipv4/route.c | 3 ++- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index e20874059f82..d9683bef8684 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -31,6 +31,7 @@ #include #include #include +#include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ @@ -445,22 +446,25 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, /* 'forwarding = true' case should always honour route mtu */ mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; + if (!mtu) + mtu = min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); - return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { + unsigned int mtu; + if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } - return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); + mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct net *net, struct nlattr *fc_mx, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f51a118bfce8..f14149df5a65 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -265,11 +265,18 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, static inline int ip6_skb_dst_mtu(struct sk_buff *skb) { + int mtu; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? - skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); + if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) { + mtu = READ_ONCE(skb_dst(skb)->dev->mtu); + mtu -= lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); + } else + mtu = dst_mtu(skb_dst(skb)); + + return mtu; } static inline bool ip6_sk_accept_pmtu(const struct sock *sk) @@ -317,7 +324,7 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) if (dst_metric_locked(dst, RTAX_MTU)) { mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; } mtu = IPV6_MIN_MTU; @@ -327,7 +334,8 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) mtu = idev->cnf.mtu6; rcu_read_unlock(); - return mtu; +out: + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } u32 ip6_mtu_from_fib6(const struct fib6_result *res, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a36ac98476f..78d1e5afc452 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1306,7 +1306,7 @@ INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) - return mtu; + goto out; mtu = READ_ONCE(dst->dev->mtu); @@ -1315,6 +1315,7 @@ INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } +out: mtu = min_t(unsigned int, mtu, IP_MAX_MTU); return mtu - lwtunnel_headroom(dst->lwtstate, mtu); -- cgit v1.2.3 From 3f2db250099f46988088800052cdf2332c7aba61 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 25 Jun 2021 23:23:48 +0300 Subject: net: sched: fix warning in tcindex_alloc_perfect_hash Syzbot reported warning in tcindex_alloc_perfect_hash. The problem was in too big cp->hash, which triggers warning in kmalloc. Since cp->hash comes from userspace, there is no need to warn if value is not correct Fixes: b9a24bb76bf6 ("net_sched: properly handle failure case of tcf_exts_init()") Reported-and-tested-by: syzbot+1071ad60cd7df39fdadb@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index c4007b9cd16d..5b274534264c 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -304,7 +304,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) int i, err = 0; cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!cp->perfect) return -ENOMEM; -- cgit v1.2.3 From 9ea3e52c5bc8bb4a084938dc1e3160643438927a Mon Sep 17 00:00:00 2001 From: gushengxian Date: Sat, 26 Jun 2021 04:56:06 -0700 Subject: flow_offload: action should not be NULL when it is referenced "action" should not be NULL when it is referenced. Signed-off-by: gushengxian <13145886936@163.com> Signed-off-by: gushengxian Signed-off-by: David S. Miller --- include/net/flow_offload.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index dc5c1e69cd9f..69c9eabf8325 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -319,12 +319,14 @@ flow_action_mixed_hw_stats_check(const struct flow_action *action, if (flow_offload_has_one_action(action)) return true; - flow_action_for_each(i, action_entry, action) { - if (i && action_entry->hw_stats != last_hw_stats) { - NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); - return false; + if (action) { + flow_action_for_each(i, action_entry, action) { + if (i && action_entry->hw_stats != last_hw_stats) { + NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); + return false; + } + last_hw_stats = action_entry->hw_stats; } - last_hw_stats = action_entry->hw_stats; } return true; } -- cgit v1.2.3 From 0c5dc070ff3d6246d22ddd931f23a6266249e3db Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:41 -0300 Subject: sctp: validate from_addr_param return Ilja reported that, simply putting it, nothing was validating that from_addr_param functions were operating on initialized memory. That is, the parameter itself was being validated by sctp_walk_params, but it doesn't check for types and their specific sizes and it could be a 0-length one, causing from_addr_param to potentially work over the next parameter or even uninitialized memory. The fix here is to, in all calls to from_addr_param, check if enough space is there for the wanted IP address type. Reported-by: Ilja Van Sprundel Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- net/sctp/bind_addr.c | 19 +++++++++++-------- net/sctp/input.c | 6 ++++-- net/sctp/ipv6.c | 7 ++++++- net/sctp/protocol.c | 7 ++++++- net/sctp/sm_make_chunk.c | 29 ++++++++++++++++------------- 6 files changed, 44 insertions(+), 26 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 1aa585216f34..d49593c72a55 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -461,7 +461,7 @@ struct sctp_af { int saddr); void (*from_sk) (union sctp_addr *, struct sock *sk); - void (*from_addr_param) (union sctp_addr *, + bool (*from_addr_param) (union sctp_addr *, union sctp_addr_param *, __be16 port, int iif); int (*to_addr_param) (const union sctp_addr *, diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 53e5ed79f63f..59e653b528b1 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -270,22 +270,19 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); - if (unlikely(!af)) { + if (unlikely(!af) || + !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; - sctp_bind_addr_clean(bp); - break; + goto out_err; } - af->from_addr_param(&addr, rawaddr, htons(port), 0); if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); - if (retval) { + if (retval) /* Can't finish building the list, clean up. */ - sctp_bind_addr_clean(bp); - break; - } + goto out_err; next: len = ntohs(param->length); @@ -294,6 +291,12 @@ next: } return retval; + +out_err: + if (retval) + sctp_bind_addr_clean(bp); + + return retval; } /******************************************************************** diff --git a/net/sctp/input.c b/net/sctp/input.c index d508f6f3dd08..8924e2e142c8 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1131,7 +1131,8 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, if (!af) continue; - af->from_addr_param(paddr, params.addr, sh->source, 0); + if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) + continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp); if (asoc) @@ -1174,7 +1175,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (unlikely(!af)) return NULL; - af->from_addr_param(&paddr, param, peer_port, 0); + if (af->from_addr_param(&paddr, param, peer_port, 0)) + return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index bd08807c9e44..5c6f5ced9cfa 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -551,15 +551,20 @@ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v6_from_addr_param(union sctp_addr *addr, +static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) + return false; + addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; + + return true; } /* Initialize an address parameter from a sctp_addr and return the length diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6f2bbfeec3a4..25192b378e2e 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -254,14 +254,19 @@ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) } /* Initialize a sctp_addr from an address parameter. */ -static void sctp_v4_from_addr_param(union sctp_addr *addr, +static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { + if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) + return false; + addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); + + return true; } /* Initialize an address parameter from a sctp_addr and return the length diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5b44d228b6ca..f33a870b483d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2346,11 +2346,13 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, /* Process the initialization parameters. */ sctp_walk_params(param, peer_init, init_hdr.params) { - if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || - param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { + if (!src_match && + (param.p->type == SCTP_PARAM_IPV4_ADDRESS || + param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, - chunk->sctp_hdr->source, 0); + if (!af->from_addr_param(&addr, param.addr, + chunk->sctp_hdr->source, 0)) + continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } @@ -2531,7 +2533,8 @@ static int sctp_process_param(struct sctp_association *asoc, break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); - af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) + break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) @@ -2632,15 +2635,13 @@ do_addr_param: addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - if (af == NULL) + if (!af) break; - af->from_addr_param(&addr, addr_param, - htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, + htons(asoc->peer.port), 0)) + break; - /* if the address is invalid, we can't process it. - * XXX: see spec for what to do. - */ if (!af->addr_valid(&addr, NULL, NULL)) break; @@ -3054,7 +3055,8 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; - af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) + return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. @@ -3331,7 +3333,8 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); - af->from_addr_param(&addr, addr_param, htons(bp->port), 0); + if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) + return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: -- cgit v1.2.3 From 50619dbf8db77e98d821d615af4f634d08e22698 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:42 -0300 Subject: sctp: add size validation when walking chunks The first chunk in a packet is ensured to be present at the beginning of sctp_rcv(), as a packet needs to have at least 1 chunk. But the second one, may not be completely available and ch->length can be over uninitialized memory. Fix here is by only trying to walk on the next chunk if there is enough to hold at least the header, and then proceed with the ch->length validation that is already there. Reported-by: Ilja Van Sprundel Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/input.c b/net/sctp/input.c index 8924e2e142c8..f72bff93745c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1247,7 +1247,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; - } while (ch_end < skb_tail_pointer(skb)); + } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } -- cgit v1.2.3 From b6ffe7671b24689c09faa5675dd58f93758a97ae Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:43 -0300 Subject: sctp: validate chunk size in __rcv_asconf_lookup In one of the fallbacks that SCTP has for identifying an association for an incoming packet, it looks for AddIp chunk (from ASCONF) and take a peek. Thing is, at this stage nothing was validating that the chunk actually had enough content for that, allowing the peek to happen over uninitialized memory. Similar check already exists in actual asconf handling in sctp_verify_asconf(). Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/input.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sctp/input.c b/net/sctp/input.c index f72bff93745c..96dea8097dbe 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1168,6 +1168,9 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( union sctp_addr_param *param; union sctp_addr paddr; + if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) + return NULL; + /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); -- cgit v1.2.3 From ef6c8d6ccf0c1dccdda092ebe8782777cd7803c9 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 28 Jun 2021 16:13:44 -0300 Subject: sctp: add param size validation for SCTP_PARAM_SET_PRIMARY When SCTP handles an INIT chunk, it calls for example: sctp_sf_do_5_1B_init sctp_verify_init sctp_verify_param sctp_process_init sctp_process_param handling of SCTP_PARAM_SET_PRIMARY sctp_verify_init() wasn't doing proper size validation and neither the later handling, allowing it to work over the chunk itself, possibly being uninitialized memory. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f33a870b483d..587fb3cb88e2 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2166,9 +2166,16 @@ static enum sctp_ierror sctp_verify_param(struct net *net, break; case SCTP_PARAM_SET_PRIMARY: - if (ep->asconf_enable) - break; - goto unhandled; + if (!ep->asconf_enable) + goto unhandled; + + if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + + sizeof(struct sctp_paramhdr)) { + sctp_process_inv_paramlength(asoc, param.p, + chunk, err_chunk); + retval = SCTP_IERROR_ABORT; + } + break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* Tell the peer, we won't support this param. */ -- cgit v1.2.3 From b856150c8098f12996ee81c3ab2a65adbaeeb3ec Mon Sep 17 00:00:00 2001 From: David Bauer Date: Sun, 27 Jun 2021 12:16:07 +0200 Subject: net: phy: at803x: mask 1000 Base-X link mode AR8031/AR8033 have different status registers for copper and fiber operation. However, the extended status register is the same for both operation modes. As a result of that, ESTATUS_1000_XFULL is set to 1 even when operating in copper TP mode. Remove this mode from the supported link modes, as this driver currently only supports copper operation. Signed-off-by: David Bauer Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 32af52dd5aed..d797c2c9ae3f 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -610,6 +610,34 @@ static void at803x_remove(struct phy_device *phydev) regulator_disable(priv->vddio); } +static int at803x_get_features(struct phy_device *phydev) +{ + int err; + + err = genphy_read_abilities(phydev); + if (err) + return err; + + if (!at803x_match_phy_id(phydev, ATH8031_PHY_ID)) + return 0; + + /* AR8031/AR8033 have different status registers + * for copper and fiber operation. However, the + * extended status register is the same for both + * operation modes. + * + * As a result of that, ESTATUS_1000_XFULL is set + * to 1 even when operating in copper TP mode. + * + * Remove this mode from the supported link modes, + * as this driver currently only supports copper + * operation. + */ + linkmode_clear_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, + phydev->supported); + return 0; +} + static int at803x_smarteee_config(struct phy_device *phydev) { struct at803x_priv *priv = phydev->priv; @@ -1225,7 +1253,7 @@ static struct phy_driver at803x_driver[] = { .resume = at803x_resume, .read_page = at803x_read_page, .write_page = at803x_write_page, - /* PHY_GBIT_FEATURES */ + .get_features = at803x_get_features, .read_status = at803x_read_status, .config_intr = &at803x_config_intr, .handle_interrupt = at803x_handle_interrupt, -- cgit v1.2.3 From a118ff661889ecee3ca90f8125bad8fb5bbc07d5 Mon Sep 17 00:00:00 2001 From: Paolo Pisati Date: Mon, 28 Jun 2021 16:54:24 +0200 Subject: selftests: net: devlink_port_split: check devlink returned an element before dereferencing it And thus avoid a Python stacktrace: ~/linux/tools/testing/selftests/net$ ./devlink_port_split.py Traceback (most recent call last): File "/home/linux/tools/testing/selftests/net/./devlink_port_split.py", line 277, in main() File "/home/linux/tools/testing/selftests/net/./devlink_port_split.py", line 242, in main dev = list(devs.keys())[0] IndexError: list index out of range Signed-off-by: Paolo Pisati Signed-off-by: David S. Miller --- tools/testing/selftests/net/devlink_port_split.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py index 834066d465fc..d162915311fd 100755 --- a/tools/testing/selftests/net/devlink_port_split.py +++ b/tools/testing/selftests/net/devlink_port_split.py @@ -239,6 +239,9 @@ def main(cmdline=None): assert stderr == "" devs = json.loads(stdout)['dev'] + if len(devs.keys()) == 0: + print("no devlink device found") + sys.exit(1) dev = list(devs.keys())[0] cmd = "devlink dev show %s" % dev -- cgit v1.2.3