From f2b18baca9539c6a3116d48b70972c7a2ba5d766 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Jan 2020 12:25:50 +0100 Subject: mac80211: use more bits for ack_frame_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that this wasn't a good idea, I hit a test failure in hwsim due to this. That particular failure was easily worked around, but it raised questions: if an AP needs to, for example, send action frames to each connected station, the current limit is nowhere near enough (especially if those stations are sleeping and the frames are queued for a while.) Shuffle around some bits to make more room for ack_frame_id to allow up to 8192 queued up frames, that's enough for queueing 4 frames to each connected station, even at the maximum of 2007 stations on a single AP. We take the bits from band (which currently only 2 but I leave 3 in case we add another band) and from the hw_queue, which can only need 4 since it has a limit of 16 queues. Fixes: 6912daed05e1 ("mac80211: Shrink the size of ack_frame_id to make room for tx_time_est") Signed-off-by: Johannes Berg Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20200115122549.b9a4ef9f4980.Ied52ed90150220b83a280009c590b65d125d087c@changeid Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- net/mac80211/tx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 000c742d0527..6aee699deb28 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3450,7 +3450,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, spin_lock_irqsave(&local->ack_status_lock, spin_flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x40, GFP_ATOMIC); + 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, spin_flags); if (id < 0) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 4bd1faf4f779..87def9cb91ff 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2442,7 +2442,7 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, spin_lock_irqsave(&local->ack_status_lock, flags); id = idr_alloc(&local->ack_status_frames, ack_skb, - 1, 0x40, GFP_ATOMIC); + 1, 0x2000, GFP_ATOMIC); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (id >= 0) { -- cgit v1.2.3 From 2bf973ff9b9aeceb8acda629ae65341820d4b35b Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 31 Jan 2020 13:12:51 +0200 Subject: mac80211: fix quiet mode activation in action frames Previously I intended to ignore quiet mode in probe response, however I ended up ignoring it instead for action frames. As a matter of fact, this path isn't invoked for probe responses to start with. Just revert this patch. Signed-off-by: Sara Sharon Fixes: 7976b1e9e3bf ("mac80211: ignore quiet mode in probe") Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20200131111300.891737-15-luca@coelho.fi Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5fa13176036f..e041af2f021a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include @@ -1311,7 +1311,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!res) { ch_switch.timestamp = timestamp; ch_switch.device_timestamp = device_timestamp; - ch_switch.block_tx = beacon ? csa_ie.mode : 0; + ch_switch.block_tx = csa_ie.mode; ch_switch.chandef = csa_ie.chandef; ch_switch.count = csa_ie.count; ch_switch.delay = csa_ie.max_switch_time; @@ -1404,7 +1404,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = true; sdata->csa_chandef = csa_ie.chandef; - sdata->csa_block_tx = ch_switch.block_tx; + sdata->csa_block_tx = csa_ie.mode; ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) @@ -1438,7 +1438,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, * reset when the disconnection worker runs. */ sdata->vif.csa_active = true; - sdata->csa_block_tx = ch_switch.block_tx; + sdata->csa_block_tx = csa_ie.mode; ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); mutex_unlock(&local->chanctx_mtx); -- cgit v1.2.3 From a04564c99bb4a92f805a58e56b2d22cc4978f152 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 31 Jan 2020 13:12:58 +0200 Subject: mac80211: consider more elements in parsing CRC We only use the parsing CRC for checking if a beacon changed, and elements with an ID > 63 cannot be represented in the filter. Thus, like we did before with WMM and Cisco vendor elements, just statically add these forgotten items to the CRC: - WLAN_EID_VHT_OPERATION - WLAN_EID_OPMODE_NOTIF I guess that in most cases when VHT/HE operation change, the HT operation also changed, and so the change was picked up, but we did notice that pure operating mode notification changes were ignored. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/20200131111300.891737-22-luca@coelho.fi [restrict to VHT for the mac80211 branch] Signed-off-by: Johannes Berg --- net/mac80211/util.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 32a7a53833c0..739e90555d8b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1063,16 +1063,22 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elem_parse_failed = true; break; case WLAN_EID_VHT_OPERATION: - if (elen >= sizeof(struct ieee80211_vht_operation)) + if (elen >= sizeof(struct ieee80211_vht_operation)) { elems->vht_operation = (void *)pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_OPMODE_NOTIF: - if (elen > 0) + if (elen > 0) { elems->opmode_notif = pos; - else - elem_parse_failed = true; + if (calc_crc) + crc = crc32_be(crc, pos - 2, elen + 2); + break; + } + elem_parse_failed = true; break; case WLAN_EID_MESH_ID: elems->mesh_id = pos; -- cgit v1.2.3 From bfb7bac3a8f47100ebe7961bd14e924c96e21ca7 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Mon, 3 Feb 2020 10:56:50 +0000 Subject: cfg80211: check wiphy driver existence for drvinfo report When preparing ethtool drvinfo, check if wiphy driver is defined before dereferencing it. Driver may not exist, e.g. if wiphy is attached to a virtual platform device. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200203105644.28875-1-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- net/wireless/ethtool.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index a9c0f368db5d..24e18405cdb4 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -7,9 +7,13 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { struct wireless_dev *wdev = dev->ieee80211_ptr; + struct device *pdev = wiphy_dev(wdev->wiphy); - strlcpy(info->driver, wiphy_dev(wdev->wiphy)->driver->name, - sizeof(info->driver)); + if (pdev->driver) + strlcpy(info->driver, pdev->driver->name, + sizeof(info->driver)); + else + strlcpy(info->driver, "N/A", sizeof(info->driver)); strlcpy(info->version, init_utsname()->release, sizeof(info->version)); -- cgit v1.2.3 From c4a3922d2d20c710f827d3a115ee338e8d0467df Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 2 Feb 2020 20:30:52 -0800 Subject: netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put() It is unnecessary to hold hashlimit_mutex for htable_destroy() as it is already removed from the global hashtable and its refcount is already zero. Also, switch hinfo->use to refcount_t so that we don't have to hold the mutex until it reaches zero in htable_put(). Reported-and-tested-by: syzbot+adf6c6c2be1c3a718121@syzkaller.appspotmail.com Acked-by: Florian Westphal Signed-off-by: Cong Wang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index bccd47cd7190..cc475a608f81 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ @@ -114,7 +115,7 @@ struct dsthash_ent { struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ - int use; + refcount_t use; u_int8_t family; bool rnd_initialized; @@ -315,7 +316,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, for (i = 0; i < hinfo->cfg.size; i++) INIT_HLIST_HEAD(&hinfo->hash[i]); - hinfo->use = 1; + refcount_set(&hinfo->use, 1); hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; @@ -420,7 +421,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { if (!strcmp(name, hinfo->name) && hinfo->family == family) { - hinfo->use++; + refcount_inc(&hinfo->use); return hinfo; } } @@ -429,12 +430,11 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net, static void htable_put(struct xt_hashlimit_htable *hinfo) { - mutex_lock(&hashlimit_mutex); - if (--hinfo->use == 0) { + if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { hlist_del(&hinfo->node); + mutex_unlock(&hashlimit_mutex); htable_destroy(hinfo); } - mutex_unlock(&hashlimit_mutex); } /* The algorithm used is the Simple Token Bucket Filter (TBF) -- cgit v1.2.3 From 8d0015a7ab76b8b1e89a3e5f5710a6e5103f2dd5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 2 Feb 2020 20:30:53 -0800 Subject: netfilter: xt_hashlimit: limit the max size of hashtable The user-specified hashtable size is unbound, this could easily lead to an OOM or a hung task as we hold the global mutex while allocating and initializing the new hashtable. Add a max value to cap both cfg->size and cfg->max, as suggested by Florian. Reported-and-tested-by: syzbot+adf6c6c2be1c3a718121@syzkaller.appspotmail.com Signed-off-by: Cong Wang Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index cc475a608f81..7a2c4b8408c4 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -837,6 +837,8 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3); } +#define HASHLIMIT_MAX_SIZE 1048576 + static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, struct xt_hashlimit_htable **hinfo, struct hashlimit_cfg3 *cfg, @@ -847,6 +849,14 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; + if (cfg->size > HASHLIMIT_MAX_SIZE) { + cfg->size = HASHLIMIT_MAX_SIZE; + pr_info_ratelimited("size too large, truncated to %u\n", cfg->size); + } + if (cfg->max > HASHLIMIT_MAX_SIZE) { + cfg->max = HASHLIMIT_MAX_SIZE; + pr_info_ratelimited("max too large, truncated to %u\n", cfg->max); + } if (par->family == NFPROTO_IPV4) { if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; -- cgit v1.2.3 From a7da92c2c8a1faf253a3b3e292fda6910deba540 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 13:06:18 +0100 Subject: netfilter: flowtable: skip offload setup if disabled nftables test case tests/shell/testcases/flowtable/0001flowtable_0 results in a crash. After the refactor, if we leave early via nf_flowtable_hw_offload(), then "struct flow_block_offload" is left in an uninitialized state, but later users assume its initialised. Fixes: a7965d58ddab02 ("netfilter: flowtable: add nf_flow_table_offload_cmd()") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 83e1db37c3b0..06f00cdc3891 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -847,9 +847,6 @@ static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, { int err; - if (!nf_flowtable_hw_offload(flowtable)) - return 0; - if (!dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -876,6 +873,9 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, struct flow_block_offload bo; int err; + if (!nf_flowtable_hw_offload(flowtable)) + return 0; + err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack); if (err < 0) return err; -- cgit v1.2.3 From 2437fd7baf299c7b8a39fa3e727755e84ee7c4ea Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Mon, 10 Feb 2020 16:11:09 +0800 Subject: tipc: make three functions static Fix the following sparse warning: net/tipc/node.c:281:6: warning: symbol 'tipc_node_free' was not declared. Should it be static? net/tipc/node.c:2801:5: warning: symbol '__tipc_nl_node_set_key' was not declared. Should it be static? net/tipc/node.c:2878:5: warning: symbol '__tipc_nl_node_flush_key' was not declared. Should it be static? Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Fixes: e1f32190cf7d ("tipc: add support for AEAD key setting via netlink") Signed-off-by: Chen Wandun Signed-off-by: David S. Miller --- net/tipc/node.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 99b28b69fc17..0c88778c88b5 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -278,7 +278,7 @@ struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) } #endif -void tipc_node_free(struct rcu_head *rp) +static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); @@ -2798,7 +2798,7 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) return 0; } -int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) +static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); @@ -2875,7 +2875,8 @@ int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) return err; } -int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) +static int __tipc_nl_node_flush_key(struct sk_buff *skb, + struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); -- cgit v1.2.3 From 5609e2bbefed38df060c2472af9cf0c469b8997f Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Mon, 10 Feb 2020 16:27:59 +0800 Subject: mptcp: make the symbol 'mptcp_sk_clone_lock' static Fix the following sparse warning: net/mptcp/protocol.c:646:13: warning: symbol 'mptcp_sk_clone_lock' was not declared. Should it be static? Fixes: b0519de8b3f1 ("mptcp: fix use-after-free for ipv6") Signed-off-by: Chen Wandun Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 73780b4cb108..030dee668e0a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -643,7 +643,7 @@ static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) } #endif -struct sock *mptcp_sk_clone_lock(const struct sock *sk) +static struct sock *mptcp_sk_clone_lock(const struct sock *sk) { struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); -- cgit v1.2.3 From 5391a87751a164b3194864126f3b016038abc9fe Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Mon, 10 Feb 2020 15:35:44 +0700 Subject: tipc: fix successful connect() but timed out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 9546a0b7ce00 ("tipc: fix wrong connect() return code"), we fixed the issue with the 'connect()' that returns zero even though the connecting has failed by waiting for the connection to be 'ESTABLISHED' really. However, the approach has one drawback in conjunction with our 'lightweight' connection setup mechanism that the following scenario can happen: (server) (client) +- accept()| | wait_for_conn() | | |connect() -------+ | |<-------[SYN]---------| > sleeping | | *CONNECTING | |--------->*ESTABLISHED | | |--------[ACK]-------->*ESTABLISHED > wakeup() send()|--------[DATA]------->|\ > wakeup() send()|--------[DATA]------->| | > wakeup() . . . . |-> recvq . . . . . | . send()|--------[DATA]------->|/ > wakeup() close()|--------[FIN]-------->*DISCONNECTING | *DISCONNECTING | | | ~~~~~~~~~~~~~~~~~~> schedule() | wait again . . | ETIMEDOUT Upon the receipt of the server 'ACK', the client becomes 'ESTABLISHED' and the 'wait_for_conn()' process is woken up but not run. Meanwhile, the server starts to send a number of data following by a 'close()' shortly without waiting any response from the client, which then forces the client socket to be 'DISCONNECTING' immediately. When the wait process is switched to be running, it continues to wait until the timer expires because of the unexpected socket state. The client 'connect()' will finally get ‘-ETIMEDOUT’ and force to release the socket whereas there remains the messages in its receive queue. Obviously the issue would not happen if the server had some delay prior to its 'close()' (or the number of 'DATA' messages is large enough), but any kind of delay would make the connection setup/shutdown "heavy". We solve this by simply allowing the 'connect()' returns zero in this particular case. The socket is already 'DISCONNECTING', so any further write will get '-EPIPE' but the socket is still able to read the messages existing in its receive queue. Note: This solution doesn't break the previous one as it deals with a different situation that the socket state is 'DISCONNECTING' but has no error (i.e. sk->sk_err = 0). Fixes: 9546a0b7ce00 ("tipc: fix wrong connect() return code") Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f9b4fb92c0b1..693e8902161e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2441,6 +2441,8 @@ static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) return -ETIMEDOUT; if (signal_pending(current)) return sock_intr_errno(*timeo_p); + if (sk->sk_state == TIPC_DISCONNECTING) + break; add_wait_queue(sk_sleep(sk), &wait); done = sk_wait_event(sk, timeo_p, tipc_sk_connected(sk), -- cgit v1.2.3 From 3d1e0b406de16508de96f4a07fc3f94cfc678372 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:04 +0100 Subject: netfilter: conntrack: remove two args from resolve_clash ctinfo is whats taken from the skb, i.e. ct = nf_ct_get(skb, &ctinfo). We do not pass 'ct' and instead re-fetch it from the skb. Just do the same for both netns and ctinfo. Also add a comment on what clash resolution is supposed to do. While at it, one indent level can be removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 69 +++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d1305423640f..5e332b01f3c0 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -894,31 +894,64 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } } -/* Resolve race on insertion if this protocol allows this. */ +/** + * nf_ct_resolve_clash - attempt to handle clash without packet drop + * + * @skb: skb that causes the clash + * @h: tuplehash of the clashing entry already in table + * + * A conntrack entry can be inserted to the connection tracking table + * if there is no existing entry with an identical tuple. + * + * If there is one, @skb (and the assocated, unconfirmed conntrack) has + * to be dropped. In case @skb is retransmitted, next conntrack lookup + * will find the already-existing entry. + * + * The major problem with such packet drop is the extra delay added by + * the packet loss -- it will take some time for a retransmit to occur + * (or the sender to time out when waiting for a reply). + * + * This function attempts to handle the situation without packet drop. + * + * If @skb has no NAT transformation or if the colliding entries are + * exactly the same, only the to-be-confirmed conntrack entry is discarded + * and @skb is associated with the conntrack entry already in the table. + * + * Returns NF_DROP if the clash could not be resolved. + */ static __cold noinline int -nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, - enum ip_conntrack_info ctinfo, - struct nf_conntrack_tuple_hash *h) +nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); const struct nf_conntrack_l4proto *l4proto; - enum ip_conntrack_info oldinfo; - struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); + enum ip_conntrack_info ctinfo; + struct nf_conn *loser_ct; + struct net *net; + + loser_ct = nf_ct_get(skb, &ctinfo); l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); - if (l4proto->allow_clash && - !nf_ct_is_dying(ct) && - atomic_inc_not_zero(&ct->ct_general.use)) { - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { - nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_conntrack_put(&loser_ct->ct_general); - nf_ct_set(skb, ct, oldinfo); - return NF_ACCEPT; - } - nf_ct_put(ct); + if (!l4proto->allow_clash) + goto drop; + + if (nf_ct_is_dying(ct)) + goto drop; + + if (!atomic_inc_not_zero(&ct->ct_general.use)) + goto drop; + + if (((ct->status & IPS_NAT_DONE_MASK) == 0) || + nf_ct_match(ct, loser_ct)) { + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + return NF_ACCEPT; } + + nf_ct_put(ct); +drop: + net = nf_ct_net(loser_ct); NF_CT_STAT_INC(net, drop); return NF_DROP; } @@ -1036,7 +1069,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); - ret = nf_ct_resolve_clash(net, skb, ctinfo, h); + ret = nf_ct_resolve_clash(skb, h); dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); -- cgit v1.2.3 From b1b32552c1d81f0cf6a8e79043a2a47e769ff071 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:05 +0100 Subject: netfilter: conntrack: place confirm-bit setting in a helper ... so it can be re-used from clash resolution in followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5e332b01f3c0..5fda5bd10160 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -894,6 +894,19 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } } +static void __nf_conntrack_insert_prepare(struct nf_conn *ct) +{ + struct nf_conn_tstamp *tstamp; + + atomic_inc(&ct->ct_general.use); + ct->status |= IPS_CONFIRMED; + + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_get_real_ns(); +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * @@ -965,7 +978,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; - struct nf_conn_tstamp *tstamp; struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; @@ -1042,13 +1054,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout += nfct_time_stamp; - atomic_inc(&ct->ct_general.use); - ct->status |= IPS_CONFIRMED; - /* set conntrack timestamp, if enabled. */ - tstamp = nf_conn_tstamp_find(ct); - if (tstamp) - tstamp->start = ktime_get_real_ns(); + __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers -- cgit v1.2.3 From bb89abe52bf426f1f40850c441efc77426cc31e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:06 +0100 Subject: netfilter: conntrack: split resolve_clash function Followup patch will need a helper function with the 'clashing entries refer to the identical tuple in both directions' resolution logic. This patch will add another resolve_clash helper where loser_ct must not be added to the dying list because it will be inserted into the table. Therefore this also moves the stat counters and dying-list insertion of the losing ct. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 58 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5fda5bd10160..3f069eb0f0fc 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -907,6 +907,39 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) tstamp->start = ktime_get_real_ns(); } +static int __nf_ct_resolve_clash(struct sk_buff *skb, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + enum ip_conntrack_info ctinfo; + struct nf_conn *loser_ct; + + loser_ct = nf_ct_get(skb, &ctinfo); + + if (nf_ct_is_dying(ct)) + return NF_DROP; + + if (!atomic_inc_not_zero(&ct->ct_general.use)) + return NF_DROP; + + if (((ct->status & IPS_NAT_DONE_MASK) == 0) || + nf_ct_match(ct, loser_ct)) { + struct net *net = nf_ct_net(ct); + + nf_ct_acct_merge(ct, ctinfo, loser_ct); + nf_ct_add_to_dying_list(loser_ct); + nf_conntrack_put(&loser_ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + + NF_CT_STAT_INC(net, insert_failed); + return NF_ACCEPT; + } + + nf_ct_put(ct); + return NF_DROP; +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * @@ -941,31 +974,23 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) enum ip_conntrack_info ctinfo; struct nf_conn *loser_ct; struct net *net; + int ret; loser_ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(loser_ct); l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); if (!l4proto->allow_clash) goto drop; - if (nf_ct_is_dying(ct)) - goto drop; - - if (!atomic_inc_not_zero(&ct->ct_general.use)) - goto drop; - - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || - nf_ct_match(ct, loser_ct)) { - nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_conntrack_put(&loser_ct->ct_general); - nf_ct_set(skb, ct, ctinfo); - return NF_ACCEPT; - } + ret = __nf_ct_resolve_clash(skb, h); + if (ret == NF_ACCEPT) + return ret; - nf_ct_put(ct); drop: - net = nf_ct_net(loser_ct); + nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); + NF_CT_STAT_INC(net, insert_failed); return NF_DROP; } @@ -1034,6 +1059,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (unlikely(nf_ct_is_dying(ct))) { nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, insert_failed); goto dying; } @@ -1075,11 +1101,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - nf_ct_add_to_dying_list(ct); ret = nf_ct_resolve_clash(skb, h); dying: nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); return ret; } -- cgit v1.2.3 From 30744a68626db6a0029aca9c646831c869c16d83 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 10 Feb 2020 16:27:12 +0100 Subject: xsk: Publish global consumer pointers when NAPI is finished The commit 4b638f13bab4 ("xsk: Eliminate the RX batch size") introduced a much more lazy way of updating the global consumer pointers from the kernel side, by only doing so when running out of entries in the fill or Tx rings (the rings consumed by the kernel). This can result in a deadlock with the user application if the kernel requires more than one entry to proceed and the application cannot put these entries in the fill ring because the kernel has not updated the global consumer pointer since the ring is not empty. Fix this by publishing the local kernel side consumer pointer whenever we have completed Rx or Tx processing in the kernel. This way, user space will have an up-to-date view of the consumer pointers whenever it gets to execute in the one core case (application and driver on the same core), or after a certain number of packets have been processed in the two core case (application and driver on different cores). A side effect of this patch is that the one core case gets better performance, but the two core case gets worse. The reason that the one core case improves is that updating the global consumer pointer is relatively cheap since the application by definition is not running when the kernel is (they are on the same core) and it is beneficial for the application, once it gets to run, to have pointers that are as up to date as possible since it then can operate on more packets and buffers. In the two core case, the most important performance aspect is to minimize the number of accesses to the global pointers since they are shared between two cores and bounces between the caches of those cores. This patch results in more updates to global state, which means lower performance in the two core case. Fixes: 4b638f13bab4 ("xsk: Eliminate the RX batch size") Reported-by: Ryan Goodfellow Reported-by: Maxim Mikityanskiy Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Jonathan Lemon Acked-by: Maxim Mikityanskiy Link: https://lore.kernel.org/bpf/1581348432-6747-1-git-send-email-magnus.karlsson@intel.com --- net/xdp/xsk.c | 2 ++ net/xdp/xsk_queue.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index df600487a68d..356f90e4522b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -217,6 +217,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); + __xskq_cons_release(xs->umem->fq); sock_def_readable(&xs->sk); } @@ -304,6 +305,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) rcu_read_lock(); list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index bec2af11853a..89a01ac4e079 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -271,7 +271,8 @@ static inline void xskq_cons_release(struct xsk_queue *q) { /* To improve performance, only update local state here. * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries(). + * from the ring in xskq_cons_get_entries() and whenever + * Rx or Tx processing are completed in the NAPI loop. */ q->cached_cons++; } -- cgit v1.2.3 From ad1e03b2b3d4430baaa109b77bc308dc73050de3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 10 Feb 2020 17:10:46 +0100 Subject: core: Don't skip generic XDP program execution for cloned SKBs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current generic XDP handler skips execution of XDP programs entirely if an SKB is marked as cloned. This leads to some surprising behaviour, as packets can end up being cloned in various ways, which will make an XDP program not see all the traffic on an interface. This was discovered by a simple test case where an XDP program that always returns XDP_DROP is installed on a veth device. When combining this with the Scapy packet sniffer (which uses an AF_PACKET) socket on the sending side, SKBs reliably end up in the cloned state, causing them to be passed through to the receiving interface instead of being dropped. A minimal reproducer script for this is included below. This patch fixed the issue by simply triggering the existing linearisation code for cloned SKBs instead of skipping the XDP program execution. This behaviour is in line with the behaviour of the native XDP implementation for the veth driver, which will reallocate and copy the SKB data if the SKB is marked as shared. Reproducer Python script (requires BCC and Scapy): from scapy.all import TCP, IP, Ether, sendp, sniff, AsyncSniffer, Raw, UDP from bcc import BPF import time, sys, subprocess, shlex SKB_MODE = (1 << 1) DRV_MODE = (1 << 2) PYTHON=sys.executable def client(): time.sleep(2) # Sniffing on the sender causes skb_cloned() to be set s = AsyncSniffer() s.start() for p in range(10): sendp(Ether(dst="aa:aa:aa:aa:aa:aa", src="cc:cc:cc:cc:cc:cc")/IP()/UDP()/Raw("Test"), verbose=False) time.sleep(0.1) s.stop() return 0 def server(mode): prog = BPF(text="int dummy_drop(struct xdp_md *ctx) {return XDP_DROP;}") func = prog.load_func("dummy_drop", BPF.XDP) prog.attach_xdp("a_to_b", func, mode) time.sleep(1) s = sniff(iface="a_to_b", count=10, timeout=15) if len(s): print(f"Got {len(s)} packets - should have gotten 0") return 1 else: print("Got no packets - as expected") return 0 if len(sys.argv) < 2: print(f"Usage: {sys.argv[0]} ") sys.exit(1) if sys.argv[1] == "client": sys.exit(client()) elif sys.argv[1] == "server": mode = SKB_MODE if sys.argv[2] == 'skb' else DRV_MODE sys.exit(server(mode)) else: try: mode = sys.argv[1] if mode not in ('skb', 'drv'): print(f"Usage: {sys.argv[0]} ") sys.exit(1) print(f"Running in {mode} mode") for cmd in [ 'ip netns add netns_a', 'ip netns add netns_b', 'ip -n netns_a link add a_to_b type veth peer name b_to_a netns netns_b', # Disable ipv6 to make sure there's no address autoconf traffic 'ip netns exec netns_a sysctl -qw net.ipv6.conf.a_to_b.disable_ipv6=1', 'ip netns exec netns_b sysctl -qw net.ipv6.conf.b_to_a.disable_ipv6=1', 'ip -n netns_a link set dev a_to_b address aa:aa:aa:aa:aa:aa', 'ip -n netns_b link set dev b_to_a address cc:cc:cc:cc:cc:cc', 'ip -n netns_a link set dev a_to_b up', 'ip -n netns_b link set dev b_to_a up']: subprocess.check_call(shlex.split(cmd)) server = subprocess.Popen(shlex.split(f"ip netns exec netns_a {PYTHON} {sys.argv[0]} server {mode}")) client = subprocess.Popen(shlex.split(f"ip netns exec netns_b {PYTHON} {sys.argv[0]} client")) client.wait() server.wait() sys.exit(server.returncode) finally: subprocess.run(shlex.split("ip netns delete netns_a")) subprocess.run(shlex.split("ip netns delete netns_b")) Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Reported-by: Stepan Horacek Suggested-by: Paolo Abeni Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a69e8bd7ed74..a6316b336128 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4527,14 +4527,14 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. */ - if (skb_cloned(skb) || skb_is_tc_redirected(skb)) + if (skb_is_tc_redirected(skb)) return XDP_PASS; /* XDP packets must be linear and must have sufficient headroom * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also * native XDP provides, thus we need to do it here as well. */ - if (skb_is_nonlinear(skb) || + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); int troom = skb->tail + skb->data_len - skb->end; -- cgit v1.2.3 From 457fed775c97ac2c0cd1672aaf2ff2c8a6235e87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Feb 2020 11:36:13 -0800 Subject: net/smc: fix leak of kernel memory to user space As nlmsg_put() does not clear the memory that is reserved, it this the caller responsability to make sure all of this memory will be written, in order to not reveal prior content. While we are at it, we can provide the socket cookie even if clsock is not set. syzbot reported : BUG: KMSAN: uninit-value in __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] BUG: KMSAN: uninit-value in __fswab32 include/uapi/linux/swab.h:59 [inline] BUG: KMSAN: uninit-value in __swab32p include/uapi/linux/swab.h:179 [inline] BUG: KMSAN: uninit-value in __be32_to_cpup include/uapi/linux/byteorder/little_endian.h:82 [inline] BUG: KMSAN: uninit-value in get_unaligned_be32 include/linux/unaligned/access_ok.h:30 [inline] BUG: KMSAN: uninit-value in ____bpf_skb_load_helper_32 net/core/filter.c:240 [inline] BUG: KMSAN: uninit-value in ____bpf_skb_load_helper_32_no_cache net/core/filter.c:255 [inline] BUG: KMSAN: uninit-value in bpf_skb_load_helper_32_no_cache+0x14a/0x390 net/core/filter.c:252 CPU: 1 PID: 5262 Comm: syz-executor.5 Not tainted 5.5.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] __fswab32 include/uapi/linux/swab.h:59 [inline] __swab32p include/uapi/linux/swab.h:179 [inline] __be32_to_cpup include/uapi/linux/byteorder/little_endian.h:82 [inline] get_unaligned_be32 include/linux/unaligned/access_ok.h:30 [inline] ____bpf_skb_load_helper_32 net/core/filter.c:240 [inline] ____bpf_skb_load_helper_32_no_cache net/core/filter.c:255 [inline] bpf_skb_load_helper_32_no_cache+0x14a/0x390 net/core/filter.c:252 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_kmalloc_large+0x73/0xc0 mm/kmsan/kmsan_hooks.c:128 kmalloc_large_node_hook mm/slub.c:1406 [inline] kmalloc_large_node+0x282/0x2c0 mm/slub.c:3841 __kmalloc_node_track_caller+0x44b/0x1200 mm/slub.c:4368 __kmalloc_reserve net/core/skbuff.c:141 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:209 alloc_skb include/linux/skbuff.h:1049 [inline] netlink_dump+0x44b/0x1ab0 net/netlink/af_netlink.c:2224 __netlink_dump_start+0xbb2/0xcf0 net/netlink/af_netlink.c:2352 netlink_dump_start include/linux/netlink.h:233 [inline] smc_diag_handler_dump+0x2ba/0x300 net/smc/smc_diag.c:242 sock_diag_rcv_msg+0x211/0x610 net/core/sock_diag.c:256 netlink_rcv_skb+0x451/0x650 net/netlink/af_netlink.c:2477 sock_diag_rcv+0x63/0x80 net/core/sock_diag.c:275 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0xf9e/0x1100 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x1248/0x14d0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] kernel_sendmsg+0x433/0x440 net/socket.c:679 sock_no_sendpage+0x235/0x300 net/core/sock.c:2740 kernel_sendpage net/socket.c:3776 [inline] sock_sendpage+0x1e1/0x2c0 net/socket.c:937 pipe_to_sendpage+0x38c/0x4c0 fs/splice.c:458 splice_from_pipe_feed fs/splice.c:512 [inline] __splice_from_pipe+0x539/0xed0 fs/splice.c:636 splice_from_pipe fs/splice.c:671 [inline] generic_splice_sendpage+0x1d5/0x2d0 fs/splice.c:844 do_splice_from fs/splice.c:863 [inline] do_splice fs/splice.c:1170 [inline] __do_sys_splice fs/splice.c:1447 [inline] __se_sys_splice+0x2380/0x3350 fs/splice.c:1427 __x64_sys_splice+0x6e/0x90 fs/splice.c:1427 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: f16a7dd5cf27 ("smc: netlink interface for SMC sockets") Signed-off-by: Eric Dumazet Cc: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_diag.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index f38727ecf8b2..e1f64f4ba236 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -39,16 +39,15 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); + memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; + sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; - sock_diag_save_cookie(sk, r->id.idiag_cookie); if (sk->sk_protocol == SMCPROTO_SMC) { - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) -- cgit v1.2.3 From ca1c671302825182629d3c1a60363cee6f5455bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 12 Feb 2020 11:12:30 -0500 Subject: xprtrdma: Fix DMA scatter-gather list mapping imbalance The @nents value that was passed to ib_dma_map_sg() has to be passed to the matching ib_dma_unmap_sg() call. If ib_dma_map_sg() choses to concatenate sg entries, it will return a different nents value than it was passed. The bug was exposed by recent changes to the AMD IOMMU driver, which enabled sg entry concatenation. Looking all the way back to commit 4143f34e01e9 ("xprtrdma: Port to new memory registration API") and reviewing other kernel ULPs, it's not clear that the frwr_map() logic was ever correct for this case. Reported-by: Andre Tomt Suggested-by: Robin Murphy Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org Reviewed-by: Jason Gunthorpe Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 095be887753e..125297c9aa3e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -288,8 +288,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, { struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_reg_wr *reg_wr; + int i, n, dma_nents; struct ib_mr *ibmr; - int i, n; u8 key; if (nsegs > ia->ri_max_frwr_depth) @@ -313,15 +313,16 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, break; } mr->mr_dir = rpcrdma_data_dir(writing); + mr->mr_nents = i; - mr->mr_nents = - ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, i, mr->mr_dir); - if (!mr->mr_nents) + dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + if (!dma_nents) goto out_dmamap_err; ibmr = mr->frwr.fr_mr; - n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); - if (unlikely(n != mr->mr_nents)) + n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); + if (n != dma_nents) goto out_mapmr_err; ibmr->iova &= 0x00000000ffffffff; -- cgit v1.2.3 From 304db6cb7610eb9104e1f911b235d87dc4802558 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 11 Feb 2020 10:13:44 +0800 Subject: page_pool: refill page when alloc.count of pool is zero "do {} while" in page_pool_refill_alloc_cache will always refill page once whether refill is true or false, and whether alloc.count of pool is less than PP_ALLOC_CACHE_REFILL or not this is wrong, and will cause overflow of pool->alloc.cache the caller of __page_pool_get_cached should provide guarantee that pool->alloc.cache is safe to access, so in_serving_softirq should be removed as suggested by Jesper Dangaard Brouer in https://patchwork.ozlabs.org/patch/1233713/ so fix this issue by calling page_pool_refill_alloc_cache() only when pool->alloc.count is zero Fixes: 44768decb7c0 ("page_pool: handle page recycle for NUMA_NO_NODE condition") Signed-off-by: Li RongQing Suggested-by: Jesper Dangaard Brouer Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- net/core/page_pool.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 9b7cbe35df37..10d2b255df5e 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -99,8 +99,7 @@ EXPORT_SYMBOL(page_pool_create); static void __page_pool_return_page(struct page_pool *pool, struct page *page); noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, - bool refill) +static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; struct page *page; @@ -141,8 +140,7 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, page = NULL; break; } - } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL && - refill); + } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) @@ -155,20 +153,16 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool, /* fast path */ static struct page *__page_pool_get_cached(struct page_pool *pool) { - bool refill = false; struct page *page; - /* Test for safe-context, caller should provide this guarantee */ - if (likely(in_serving_softirq())) { - if (likely(pool->alloc.count)) { - /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; - return page; - } - refill = true; + /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + } else { + page = page_pool_refill_alloc_cache(pool); } - page = page_pool_refill_alloc_cache(pool, refill); return page; } -- cgit v1.2.3 From 1afa3cc90f8fb745c777884d79eaa1001d6927a6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 11 Feb 2020 19:33:39 +0100 Subject: net/sched: matchall: add missing validation of TCA_MATCHALL_FLAGS unlike other classifiers that can be offloaded (i.e. users can set flags like 'skip_hw' and 'skip_sw'), 'cls_matchall' doesn't validate the size of netlink attribute 'TCA_MATCHALL_FLAGS' provided by user: add a proper entry to mall_policy. Fixes: b87f7936a932 ("net/sched: Add match-all classifier hw offloading.") Signed-off-by: Davide Caratti Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 039cc86974f4..610a0b728161 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -157,6 +157,7 @@ static void *mall_get(struct tcf_proto *tp, u32 handle) static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { [TCA_MATCHALL_UNSPEC] = { .type = NLA_UNSPEC }, [TCA_MATCHALL_CLASSID] = { .type = NLA_U32 }, + [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 }, }; static int mall_set_parms(struct net *net, struct tcf_proto *tp, -- cgit v1.2.3 From e2debf0852c4d66ba1a8bde12869b196094c70a7 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 11 Feb 2020 19:33:40 +0100 Subject: net/sched: flower: add missing validation of TCA_FLOWER_FLAGS unlike other classifiers that can be offloaded (i.e. users can set flags like 'skip_hw' and 'skip_sw'), 'cls_flower' doesn't validate the size of netlink attribute 'TCA_FLOWER_FLAGS' provided by user: add a proper entry to fl_policy. Fixes: 5b33f48842fa ("net/flower: Introduce hardware offload support") Signed-off-by: Davide Caratti Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f9c0d1e8d380..7e54d2ab5254 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -691,6 +691,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { .len = 128 / BITS_PER_BYTE }, [TCA_FLOWER_KEY_CT_LABELS_MASK] = { .type = NLA_BINARY, .len = 128 / BITS_PER_BYTE }, + [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, }; static const struct nla_policy -- cgit v1.2.3 From 0b41713b606694257b90d61ba7e2712d8457648b Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:05 +0100 Subject: icmp: introduce helper for nat'd source address in network device context This introduces a helper function to be called only by network drivers that wraps calls to icmp[v6]_send in a conntrack transformation, in case NAT has been used. We don't want to pollute the non-driver path, though, so we introduce this as a helper to be called by places that actually make use of this, as suggested by Florian. Signed-off-by: Jason A. Donenfeld Cc: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 33 +++++++++++++++++++++++++++++++++ net/ipv6/ip6_icmp.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 18068ed42f25..f369e7ce685b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -748,6 +748,39 @@ out:; } EXPORT_SYMBOL(__icmp_send); +#if IS_ENABLED(CONFIG_NF_NAT) +#include +void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + __be32 orig_ip; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + icmp_send(skb_in, type, code, info); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct iphdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct iphdr)))) + goto out; + + orig_ip = ip_hdr(skb_in)->saddr; + ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; + icmp_send(skb_in, type, code, info); + ip_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmp_ndo_send); +#endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 02045494c24c..e0086758b6ee 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -45,4 +45,38 @@ out: rcu_read_unlock(); } EXPORT_SYMBOL(icmpv6_send); + +#if IS_ENABLED(CONFIG_NF_NAT) +#include +void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + struct in6_addr orig_ip; + struct nf_conn *ct; + + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(ct->status & IPS_SRC_NAT)) { + icmpv6_send(skb_in, type, code, info); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) + goto out; + + orig_ip = ipv6_hdr(skb_in)->saddr; + ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; + icmpv6_send(skb_in, type, code, info); + ipv6_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmpv6_ndo_send); +#endif #endif -- cgit v1.2.3 From 45942ba890e6f35232727a5fa33d732681f4eb9f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 11 Feb 2020 20:47:09 +0100 Subject: xfrm: interface: use icmp_ndo_send helper Because xfrmi is calling icmp from network device context, it should use the ndo helper so that the rate limiting applies correctly. Signed-off-by: Jason A. Donenfeld Cc: Nicolas Dichtel Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_interface.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index dc651a628dcf..3361e3ac5714 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -300,10 +300,10 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); } dst_release(dst); -- cgit v1.2.3 From ea75080110a4c1fa011b0a73cb8f42227143ee3e Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 13 Feb 2020 13:16:16 +0000 Subject: cfg80211: add missing policy for NL80211_ATTR_STATUS_CODE The nl80211_policy is missing for NL80211_ATTR_STATUS_CODE attribute. As a result, for strictly validated commands, it's assumed to not be supported. Signed-off-by: Sergey Matyukevich Link: https://lore.kernel.org/r/20200213131608.10541-2-sergey.matyukevich.os@quantenna.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 123b8d720a59..cedf17d4933f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -437,6 +437,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, + [NL80211_ATTR_STATUS_CODE] = { .type = NLA_U16 }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, -- cgit v1.2.3 From 33181ea7f5a62a17fbe55f0f73428ecb5e686be8 Mon Sep 17 00:00:00 2001 From: Shay Bar Date: Mon, 10 Feb 2020 15:07:28 +0200 Subject: mac80211: fix wrong 160/80+80 MHz setting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this patch, STA's would set new width of 160/80+80 MHz based on AP capability only. This is wrong because STA may not support > 80MHz BW. Fix is to verify STA has 160/80+80 MHz capability before increasing its width to > 80MHz. The "support_80_80" and "support_160" setting is based on: "Table 9-272 — Setting of the Supported Channel Width Set subfield and Extended NSS BW Support subfield at a STA transmitting the VHT Capabilities Information field" From "Draft P802.11REVmd_D3.0.pdf" Signed-off-by: Aviad Brikman Signed-off-by: Shay Bar Link: https://lore.kernel.org/r/20200210130728.23674-1-shay.bar@celeno.com Signed-off-by: Johannes Berg --- net/mac80211/util.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 739e90555d8b..decd46b38393 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2993,10 +2993,22 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, int cf0, cf1; int ccfs0, ccfs1, ccfs2; int ccf0, ccf1; + u32 vht_cap; + bool support_80_80 = false; + bool support_160 = false; if (!oper || !htop) return false; + vht_cap = hw->wiphy->bands[chandef->chan->band]->vht_cap.cap; + support_160 = (vht_cap & (IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK | + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK)); + support_80_80 = ((vht_cap & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) || + (vht_cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && + vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) || + ((vht_cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) >> + IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT > 1)); ccfs0 = oper->center_freq_seg0_idx; ccfs1 = oper->center_freq_seg1_idx; ccfs2 = (le16_to_cpu(htop->operation_mode) & @@ -3024,10 +3036,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, unsigned int diff; diff = abs(ccf1 - ccf0); - if (diff == 8) { + if ((diff == 8) && support_160) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf1; - } else if (diff > 8) { + } else if ((diff > 8) && support_80_80) { new.width = NL80211_CHAN_WIDTH_80P80; new.center_freq2 = cf1; } -- cgit v1.2.3 From 67f562e3e147750a02b2a91d21a163fc44a1d13e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 14 Feb 2020 08:58:59 +0100 Subject: net/smc: transfer fasync_list in case of fallback SMC does not work together with FASTOPEN. If sendmsg() is called with flag MSG_FASTOPEN in SMC_INIT state, the SMC-socket switches to fallback mode. To handle the previous ioctl FIOASYNC call correctly in this case, it is necessary to transfer the socket wait queue fasync_list to the internal TCP socket. Reported-by: syzbot+4b1fe8105f8044a26162@syzkaller.appspotmail.com Fixes: ee9dfbef02d18 ("net/smc: handle sockopts forcing fallback") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cee5bf4a9bb9..90988a511cd5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -470,6 +470,8 @@ static void smc_switch_to_fallback(struct smc_sock *smc) if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; + smc->clcsock->wq.fasync_list = + smc->sk.sk_socket->wq.fasync_list; } } -- cgit v1.2.3 From 369537c97024dca99303a8d4d6ab38b4f54d3909 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Fri, 14 Feb 2020 08:59:00 +0100 Subject: net/smc: no peer ID in CLC decline for SMCD Just SMCR requires a CLC Peer ID, but not SMCD. The field should be zero for SMCD. Fixes: c758dfddc1b5 ("net/smc: add SMC-D support in CLC messages") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 0879f7bed967..86cccc24e52e 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -372,7 +372,9 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info) dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = SMC_CLC_V1; dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0; - memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); + if (smc->conn.lgr && !smc->conn.lgr->is_smcd) + memcpy(dclc.id_for_peer, local_systemid, + sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); -- cgit v1.2.3 From 5fdcce211b3a41c3fed229333027f59a94a2f265 Mon Sep 17 00:00:00 2001 From: William Dauchy Date: Thu, 13 Feb 2020 18:19:22 +0100 Subject: net, ip6_tunnel: enhance tunnel locate with link check With ipip, it is possible to create an extra interface explicitly attached to a given physical interface: # ip link show tunl0 4: tunl0@NONE: mtu 1480 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ipip 0.0.0.0 brd 0.0.0.0 # ip link add tunl1 type ipip dev eth0 # ip link show tunl1 6: tunl1@eth0: mtu 1480 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ipip 0.0.0.0 brd 0.0.0.0 But it is not possible with ip6tnl: # ip link show ip6tnl0 5: ip6tnl0@NONE: mtu 1452 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/tunnel6 :: brd :: # ip link add ip6tnl1 type ip6tnl dev eth0 RTNETLINK answers: File exists This patch aims to make it possible by adding link comparaison in both tunnel locate and lookup functions; we also modify mtu calculation when attached to an interface with a lower mtu. This permits to make use of x-netns communication by moving the newly created tunnel in a given netns. Signed-off-by: William Dauchy Reviewed-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 68 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b5dd20c4599b..5d65436ad5ad 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -121,6 +121,7 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * @@ -134,37 +135,56 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) static struct ip6_tnl * -ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) +ip6_tnl_lookup(struct net *net, int link, + const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); - struct ip6_tnl *t; + struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_equal(remote, &t->parms.raddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else + cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_any(&t->parms.raddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(local, &t->parms.laddr) || + !ipv6_addr_any(&t->parms.raddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else if (!cand) + cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(remote, &t->parms.raddr) && - ipv6_addr_any(&t->parms.laddr) && - (t->dev->flags & IFF_UP)) + if (!ipv6_addr_equal(remote, &t->parms.raddr) || + !ipv6_addr_any(&t->parms.laddr) || + !(t->dev->flags & IFF_UP)) + continue; + + if (link == t->parms.link) return t; + else if (!cand) + cand = t; } + if (cand) + return cand; + t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; @@ -351,7 +371,8 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) { + ipv6_addr_equal(remote, &t->parms.raddr) && + p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); @@ -485,7 +506,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, processing of the error. */ rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr); + t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; @@ -887,7 +908,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, int ret = -1; rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); + t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); @@ -1420,8 +1441,10 @@ tx_err: static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; + struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; + unsigned int mtu; int t_hlen; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); @@ -1457,22 +1480,25 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); + if (rt) { + tdev = rt->dst.dev; + ip6_rt_put(rt); + } - if (!rt) - return; + if (!tdev && p->link) + tdev = __dev_get_by_index(t->net, p->link); - if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + - t_hlen; + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + t_hlen; + mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); - dev->mtu = rt->dst.dev->mtu - t_hlen; + dev->mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; } - ip6_rt_put(rt); } } -- cgit v1.2.3 From 04fb91243a853dbde216d829c79d9632e52aa8d9 Mon Sep 17 00:00:00 2001 From: Per Forlin Date: Thu, 13 Feb 2020 15:37:09 +0100 Subject: net: dsa: tag_qca: Make sure there is headroom for tag Passing tag size to skb_cow_head will make sure there is enough headroom for the tag data. This change does not introduce any overhead in case there is already available headroom for tag. Signed-off-by: Per Forlin Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_qca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index c8a128c9e5e0..70db7c909f74 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -33,7 +33,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) struct dsa_port *dp = dsa_slave_to_port(dev); u16 *phdr, hdr; - if (skb_cow_head(skb, 0) < 0) + if (skb_cow_head(skb, QCA_HDR_LEN) < 0) return NULL; skb_push(skb, QCA_HDR_LEN); -- cgit v1.2.3 From ddc9abaf5d9924569afe09a605c9012089d0c25b Mon Sep 17 00:00:00 2001 From: Per Forlin Date: Thu, 13 Feb 2020 15:37:10 +0100 Subject: net: dsa: tag_ar9331: Make sure there is headroom for tag Passing tag size to skb_cow_head will make sure there is enough headroom for the tag data. This change does not introduce any overhead in case there is already available headroom for tag. Signed-off-by: Per Forlin Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_ar9331.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 466ffa92a474..55b00694cdba 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -31,7 +31,7 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, __le16 *phdr; u16 hdr; - if (skb_cow_head(skb, 0) < 0) + if (skb_cow_head(skb, AR9331_HDR_LEN) < 0) return NULL; phdr = skb_push(skb, AR9331_HDR_LEN); -- cgit v1.2.3 From e404b8c7cfb31654c9024d497cec58a501501692 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 12 Feb 2020 10:41:06 +0900 Subject: ipv6: Fix route replacement with dev-only route After commit 27596472473a ("ipv6: fix ECMP route replacement") it is no longer possible to replace an ECMP-able route by a non ECMP-able route. For example, ip route add 2001:db8::1/128 via fe80::1 dev dummy0 ip route replace 2001:db8::1/128 dev dummy0 does not work as expected. Tweak the replacement logic so that point 3 in the log of the above commit becomes: 3. If the new route is not ECMP-able, and no matching non-ECMP-able route exists, replace matching ECMP-able route (if any) or add the new route. We can now summarize the entire replace semantics to: When doing a replace, prefer replacing a matching route of the same "ECMP-able-ness" as the replace argument. If there is no such candidate, fallback to the first route found. Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Benjamin Poirier Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 58fbde244381..72abf892302f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1102,8 +1102,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, found++; break; } - if (rt_can_ecmp) - fallback_ins = fallback_ins ?: ins; + fallback_ins = fallback_ins ?: ins; goto next_iter; } @@ -1146,7 +1145,9 @@ next_iter: } if (fallback_ins && !found) { - /* No ECMP-able route found, replace first non-ECMP one */ + /* No matching route with same ecmp-able-ness found, replace + * first matching route + */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); -- cgit v1.2.3 From afecdb376bd81d7e16578f0cfe82a1aec7ae18f3 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 12 Feb 2020 10:41:07 +0900 Subject: ipv6: Fix nlmsg_flags when splitting a multipath route When splitting an RTA_MULTIPATH request into multiple routes and adding the second and later components, we must not simply remove NLM_F_REPLACE but instead replace it by NLM_F_CREATE. Otherwise, it may look like the netlink message was malformed. For example, ip route add 2001:db8::1/128 dev dummy0 ip route change 2001:db8::1/128 nexthop via fe80::30:1 dev dummy0 \ nexthop via fe80::30:2 dev dummy0 results in the following warnings: [ 1035.057019] IPv6: RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE [ 1035.057517] IPv6: NLM_F_CREATE should be set when creating new route This patch makes the nlmsg sequence look equivalent for __ip6_ins_rt() to what it would get if the multipath route had been added in multiple netlink operations: ip route add 2001:db8::1/128 dev dummy0 ip route change 2001:db8::1/128 nexthop via fe80::30:1 dev dummy0 ip route append 2001:db8::1/128 nexthop via fe80::30:2 dev dummy0 Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Benjamin Poirier Reviewed-by: Michal Kubecek Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4fbdc60b4e07..2931224b674e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5198,6 +5198,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); + cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; nhn++; } -- cgit v1.2.3 From 0d4597c8c5abdeeaf50774066c16683f30184dc8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 11 Feb 2020 19:03:55 -0800 Subject: net/rds: Track user mapped pages through special API Convert net/rds to use the newly introduces pin_user_pages() API, which properly sets FOLL_PIN. Setting FOLL_PIN is now required for code that requires tracking of pinned pages. Note that this effectively changes the code's behavior: it now ultimately calls set_page_dirty_lock(), instead of set_page_dirty(). This is probably more accurate. As Christoph Hellwig put it, "set_page_dirty() is only safe if we are dealing with a file backed page where we have reference on the inode it hangs off." [1] [1] https://lore.kernel.org/r/20190723153640.GB720@lst.de Cc: Hans Westgaard Ry Cc: Santosh Shilimkar Signed-off-by: Leon Romanovsky Signed-off-by: John Hubbard Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 3341eee87bf9..585e6b3b69ce 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -162,10 +162,9 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, if (write) gup_flags |= FOLL_WRITE; - ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages); + ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { - while (ret--) - put_page(pages[ret]); + unpin_user_pages(pages, ret); ret = -EFAULT; } @@ -300,8 +299,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * to release anything. */ if (!need_odp) { - for (i = 0 ; i < nents; i++) - put_page(sg_page(&sg[i])); + unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); @@ -325,7 +323,12 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, if (cookie_ret) *cookie_ret = cookie; - if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) { + if (args->cookie_addr && + put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { + if (!need_odp) { + unpin_user_pages(pages, nr_pages); + kfree(sg); + } ret = -EFAULT; goto out; } @@ -496,9 +499,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ - if (!ro->op_write) - set_page_dirty(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } @@ -515,8 +516,7 @@ void rds_atomic_free_op(struct rm_atomic_op *ao) /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ - set_page_dirty(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; @@ -944,7 +944,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, return ret; err: if (page) - put_page(page); + unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); -- cgit v1.2.3 From 540e585a79e9d643ede077b73bcc7aa2d7b4d919 Mon Sep 17 00:00:00 2001 From: Jethro Beekman Date: Wed, 12 Feb 2020 16:43:41 +0100 Subject: net: fib_rules: Correctly set table field when table number exceeds 8 bits In 709772e6e06564ed94ba740de70185ac3d792773, RT_TABLE_COMPAT was added to allow legacy software to deal with routing table numbers >= 256, but the same change to FIB rule queries was overlooked. Signed-off-by: Jethro Beekman Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3e7e15278c46..bd7eba9066f8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -974,7 +974,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, frh = nlmsg_data(nlh); frh->family = ops->family; - frh->table = rule->table; + frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT; if (nla_put_u32(skb, FRA_TABLE, rule->table)) goto nla_put_failure; if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) -- cgit v1.2.3 From 44bfa9c5e5f06c72540273813e4c66beb5a8c213 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Feb 2020 20:58:26 -0800 Subject: net: rtnetlink: fix bugs in rtnl_alt_ifname() Since IFLA_ALT_IFNAME is an NLA_STRING, we have no guarantee it is nul terminated. We should use nla_strdup() instead of kstrdup(), since this helper will make sure not accessing out-of-bounds data. BUG: KMSAN: uninit-value in strlen+0x5e/0xa0 lib/string.c:535 CPU: 1 PID: 19157 Comm: syz-executor.5 Not tainted 5.5.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 strlen+0x5e/0xa0 lib/string.c:535 kstrdup+0x7f/0x1a0 mm/util.c:59 rtnl_alt_ifname net/core/rtnetlink.c:3495 [inline] rtnl_linkprop+0x85d/0xc00 net/core/rtnetlink.c:3553 rtnl_newlinkprop+0x9d/0xb0 net/core/rtnetlink.c:3568 rtnetlink_rcv_msg+0x1153/0x1570 net/core/rtnetlink.c:5424 netlink_rcv_skb+0x451/0x650 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x50/0x60 net/core/rtnetlink.c:5442 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0xf9e/0x1100 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x1248/0x14d0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] ____sys_sendmsg+0x12b6/0x1350 net/socket.c:2330 ___sys_sendmsg net/socket.c:2384 [inline] __sys_sendmsg+0x451/0x5f0 net/socket.c:2417 __do_sys_sendmsg net/socket.c:2426 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2424 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2424 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45b3b9 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ff1c7b1ac78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007ff1c7b1b6d4 RCX: 000000000045b3b9 RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000003 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000000009cb R14: 00000000004cb3dd R15: 000000000075bf2c Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:82 slab_alloc_node mm/slub.c:2774 [inline] __kmalloc_node_track_caller+0xb40/0x1200 mm/slub.c:4382 __kmalloc_reserve net/core/skbuff.c:141 [inline] __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:209 alloc_skb include/linux/skbuff.h:1049 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1174 [inline] netlink_sendmsg+0x7d3/0x14d0 net/netlink/af_netlink.c:1892 sock_sendmsg_nosec net/socket.c:639 [inline] sock_sendmsg net/socket.c:659 [inline] ____sys_sendmsg+0x12b6/0x1350 net/socket.c:2330 ___sys_sendmsg net/socket.c:2384 [inline] __sys_sendmsg+0x451/0x5f0 net/socket.c:2417 __do_sys_sendmsg net/socket.c:2426 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2424 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2424 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 36fbf1e52bd3 ("net: rtnetlink: add linkprop commands to add and delete alternative ifnames") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reported-by: syzbot Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 09c44bf2e1d2..e1152f4ffe33 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3504,27 +3504,25 @@ static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, if (err) return err; - alt_ifname = nla_data(attr); + alt_ifname = nla_strdup(attr, GFP_KERNEL); + if (!alt_ifname) + return -ENOMEM; + if (cmd == RTM_NEWLINKPROP) { - alt_ifname = kstrdup(alt_ifname, GFP_KERNEL); - if (!alt_ifname) - return -ENOMEM; err = netdev_name_node_alt_create(dev, alt_ifname); - if (err) { - kfree(alt_ifname); - return err; - } + if (!err) + alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); - if (err) - return err; } else { - WARN_ON(1); - return 0; + WARN_ON_ONCE(1); + err = -EINVAL; } - *changed = true; - return 0; + kfree(alt_ifname); + if (!err) + *changed = true; + return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From e08ad80551b4b33c02f2fce1522f6c227d3976cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Feb 2020 07:53:53 -0800 Subject: net: add strict checks in netdev_name_node_alt_destroy() netdev_name_node_alt_destroy() does a lookup over all device names of a namespace. We need to make sure the name belongs to the device of interest, and that we do not destroy its primary name, since we rely on it being not deleted : dev->name_node would indeed point to freed memory. syzbot report was the following : BUG: KASAN: use-after-free in dev_net include/linux/netdevice.h:2206 [inline] BUG: KASAN: use-after-free in mld_force_mld_version net/ipv6/mcast.c:1172 [inline] BUG: KASAN: use-after-free in mld_in_v2_mode_only net/ipv6/mcast.c:1180 [inline] BUG: KASAN: use-after-free in mld_in_v1_mode+0x203/0x230 net/ipv6/mcast.c:1190 Read of size 8 at addr ffff88809886c588 by task swapper/1/0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:641 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:135 dev_net include/linux/netdevice.h:2206 [inline] mld_force_mld_version net/ipv6/mcast.c:1172 [inline] mld_in_v2_mode_only net/ipv6/mcast.c:1180 [inline] mld_in_v1_mode+0x203/0x230 net/ipv6/mcast.c:1190 mld_send_initial_cr net/ipv6/mcast.c:2083 [inline] mld_dad_timer_expire+0x24/0x230 net/ipv6/mcast.c:2118 call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 [inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x6c3/0x1790 kernel/time/timer.c:1786 __do_softirq+0x262/0x98c kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x19b/0x1e0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:546 [inline] smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1146 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829 RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 Code: 68 73 c5 f9 eb 8a cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 94 be 59 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 84 be 59 00 fb f4 cc 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 de 2a 74 f9 e8 09 RSP: 0018:ffffc90000d3fd68 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13 RAX: 1ffffffff136761a RBX: ffff8880a99fc340 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000006 RDI: ffff8880a99fcbd4 RBP: ffffc90000d3fd98 R08: ffff8880a99fc340 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffffffff8aa5a1c0 R14: 0000000000000000 R15: 0000000000000001 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:686 default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x3c8/0x6e0 kernel/sched/idle.c:269 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:361 start_secondary+0x2f4/0x410 arch/x86/kernel/smpboot.c:264 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242 Allocated by task 10229: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] __kasan_kmalloc mm/kasan/common.c:515 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:488 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:529 __do_kmalloc_node mm/slab.c:3616 [inline] __kmalloc_node+0x4e/0x70 mm/slab.c:3623 kmalloc_node include/linux/slab.h:578 [inline] kvmalloc_node+0x68/0x100 mm/util.c:574 kvmalloc include/linux/mm.h:645 [inline] kvzalloc include/linux/mm.h:653 [inline] alloc_netdev_mqs+0x98/0xe40 net/core/dev.c:9797 rtnl_create_link+0x22d/0xaf0 net/core/rtnetlink.c:3047 __rtnl_newlink+0xf9f/0x1790 net/core/rtnetlink.c:3309 rtnl_newlink+0x69/0xa0 net/core/rtnetlink.c:3377 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 __sys_sendto+0x262/0x380 net/socket.c:1998 __do_compat_sys_socketcall net/compat.c:771 [inline] __se_compat_sys_socketcall net/compat.c:719 [inline] __ia32_compat_sys_socketcall+0x530/0x710 net/compat.c:719 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 Freed by task 10229: save_stack+0x23/0x90 mm/kasan/common.c:72 set_track mm/kasan/common.c:80 [inline] kasan_set_free_info mm/kasan/common.c:337 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:476 kasan_slab_free+0xe/0x10 mm/kasan/common.c:485 __cache_free mm/slab.c:3426 [inline] kfree+0x10a/0x2c0 mm/slab.c:3757 __netdev_name_node_alt_destroy+0x1ff/0x2a0 net/core/dev.c:322 netdev_name_node_alt_destroy+0x57/0x80 net/core/dev.c:334 rtnl_alt_ifname net/core/rtnetlink.c:3518 [inline] rtnl_linkprop.isra.0+0x575/0x6f0 net/core/rtnetlink.c:3567 rtnl_dellinkprop+0x46/0x60 net/core/rtnetlink.c:3588 rtnetlink_rcv_msg+0x45e/0xaf0 net/core/rtnetlink.c:5438 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5456 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:672 ____sys_sendmsg+0x753/0x880 net/socket.c:2343 ___sys_sendmsg+0x100/0x170 net/socket.c:2397 __sys_sendmsg+0x105/0x1d0 net/socket.c:2430 __compat_sys_sendmsg net/compat.c:642 [inline] __do_compat_sys_sendmsg net/compat.c:649 [inline] __se_compat_sys_sendmsg net/compat.c:646 [inline] __ia32_compat_sys_sendmsg+0x7a/0xb0 net/compat.c:646 do_syscall_32_irqs_on arch/x86/entry/common.c:337 [inline] do_fast_syscall_32+0x27b/0xe16 arch/x86/entry/common.c:408 entry_SYSENTER_compat+0x70/0x7f arch/x86/entry/entry_64_compat.S:139 The buggy address belongs to the object at ffff88809886c000 which belongs to the cache kmalloc-4k of size 4096 The buggy address is located 1416 bytes inside of 4096-byte region [ffff88809886c000, ffff88809886d000) The buggy address belongs to the page: page:ffffea0002621b00 refcount:1 mapcount:0 mapping:ffff8880aa402000 index:0x0 compound_mapcount: 0 flags: 0xfffe0000010200(slab|head) raw: 00fffe0000010200 ffffea0002610d08 ffffea0002607608 ffff8880aa402000 raw: 0000000000000000 ffff88809886c000 0000000100000001 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88809886c480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88809886c500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88809886c580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88809886c600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88809886c680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 36fbf1e52bd3 ("net: rtnetlink: add linkprop commands to add and delete alternative ifnames") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jiri Pirko Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a6316b336128..b6d13f3f1e5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -331,6 +331,12 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) name_node = netdev_name_node_lookup(net, name); if (!name_node) return -ENOENT; + /* lookup might have found our primary name or a name belonging + * to another device. + */ + if (name_node == dev->name_node || name_node->dev != dev) + return -EINVAL; + __netdev_name_node_alt_destroy(name_node); return 0; -- cgit v1.2.3 From b6e4a1aeeb14cad595f70b31cc376903d322c821 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 14 Feb 2020 14:14:29 -0800 Subject: mptcp: Protect subflow socket options before connection completes Userspace should not be able to directly manipulate subflow socket options before a connection is established since it is not yet known if it will be an MPTCP subflow or a TCP fallback subflow. TCP fallback subflows can be more directly controlled by userspace because they are regular TCP connections, while MPTCP subflow sockets need to be configured for the specific needs of MPTCP. Use the same logic as sendmsg/recvmsg to ensure that socket option calls are only passed through to known TCP fallback subflows. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 030dee668e0a..e9aa6807b5be 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -755,60 +755,50 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - int ret = -EOPNOTSUPP; struct socket *ssock; - struct sock *ssk; pr_debug("msk=%p", msk); /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not defined. + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when TCP socket options are passed through + * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - release_sock(sk); - return ret; - } + ssock = __mptcp_tcp_fallback(msk); + if (ssock) + return tcp_setsockopt(ssock->sk, level, optname, optval, + optlen); - ssk = ssock->sk; - sock_hold(ssk); release_sock(sk); - ret = tcp_setsockopt(ssk, level, optname, optval, optlen); - sock_put(ssk); - - return ret; + return -EOPNOTSUPP; } static int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - int ret = -EOPNOTSUPP; struct socket *ssock; - struct sock *ssk; pr_debug("msk=%p", msk); - /* @@ the meaning of getsockopt() when the socket is connected and - * there are multiple subflows is not defined. + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when socket options are passed through + * to the one remaining subflow. */ lock_sock(sk); - ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (IS_ERR(ssock)) { - release_sock(sk); - return ret; - } + ssock = __mptcp_tcp_fallback(msk); + if (ssock) + return tcp_getsockopt(ssock->sk, level, optname, optval, + option); - ssk = ssock->sk; - sock_hold(ssk); release_sock(sk); - ret = tcp_getsockopt(ssk, level, optname, optval, option); - sock_put(ssk); - - return ret; + return -EOPNOTSUPP; } static int mptcp_get_port(struct sock *sk, unsigned short snum) -- cgit v1.2.3 From 6699170376ab941c1cc5c3bcefa766efc3575c73 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Sat, 15 Feb 2020 01:55:53 +0100 Subject: ethtool: fix application of verbose no_mask bitset A bitset without mask in a _SET request means we want exactly the bits in the bitset to be set. This works correctly for compact format but when verbose format is parsed, ethnl_update_bitset32_verbose() only sets the bits present in the request bitset but does not clear the rest. This can cause incorrect results like lion:~ # ethtool eth0 | grep Wake Supports Wake-on: pumbg Wake-on: g lion:~ # ethtool -s eth0 wol u lion:~ # ethtool eth0 | grep Wake Supports Wake-on: pumbg Wake-on: ug when the second ethtool command issues request ETHTOOL_MSG_WOL_SET ETHTOOL_A_WOL_HEADER ETHTOOL_A_HEADER_DEV_NAME = "eth0" ETHTOOL_A_WOL_MODES ETHTOOL_A_BITSET_NOMASK ETHTOOL_A_BITSET_BITS ETHTOOL_A_BITSET_BITS_BIT ETHTOOL_BITSET_BIT_INDEX = 1 Fix the logic by clearing the whole target bitmap before we start iterating through the request bits. Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling") Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/bitset.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index fce45dac4205..8977fe1f3946 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -447,7 +447,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits, "mask only allowed in compact bitset"); return -EINVAL; } + no_mask = tb[ETHTOOL_A_BITSET_NOMASK]; + if (no_mask) + ethnl_bitmap32_clear(bitmap, 0, nbits, mod); nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) { bool old_val, new_val; -- cgit v1.2.3 From 7151affeef8d527f50b4b68a871fd28bd660023f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 15 Feb 2020 10:50:21 +0000 Subject: net: export netdev_next_lower_dev_rcu() netdev_next_lower_dev_rcu() will be used to implement a function, which is to walk all lower interfaces. There are already functions that they walk their lower interface. (netdev_walk_all_lower_dev_rcu, netdev_walk_all_lower_dev()). But, there would be cases that couldn't be covered by given netdev_walk_all_lower_dev_{rcu}() function. So, some modules would want to implement own function, which is to walk all lower interfaces. In the next patch, netdev_next_lower_dev_rcu() will be used. In addition, this patch removes two unused prototypes in netdevice.h. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b6d13f3f1e5a..2577ebfed293 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,7 +146,6 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 -#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -7207,8 +7206,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, return 0; } -static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; @@ -7220,6 +7219,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { -- cgit v1.2.3 From 357b41caf949c57e426f1c5f18574b6b46583406 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sat, 15 Feb 2020 15:45:56 +0100 Subject: mptcp: select CRYPTO Without this modification and if CRYPTO is not selected, we have this warning: WARNING: unmet direct dependencies detected for CRYPTO_LIB_SHA256 Depends on [n]: CRYPTO [=n] Selected by [y]: - MPTCP [=y] && NET [=y] && INET [=y] MPTCP selects CRYPTO_LIB_SHA256 which seems to depend on CRYPTO. CRYPTO is now selected to avoid this issue. Even though the config system prints that warning, it looks like sha256.c is compiled and linked even without CONFIG_CRYPTO. Since MPTCP will end up needing CONFIG_CRYPTO anyway in future commits -- currently in preparation for net-next -- we propose to add it now to fix the warning. The dependency in the config system comes from the fact that CRYPTO_LIB_SHA256 is defined in "lib/crypto/Kconfig" which is sourced from "crypto/Kconfig" only if CRYPTO is selected. Fixes: 65492c5a6ab5 (mptcp: move from sha1 (v0) to sha256 (v1)) Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 49f6054e7f4e..a9ed3bf1d93f 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -4,6 +4,7 @@ config MPTCP depends on INET select SKB_EXTENSIONS select CRYPTO_LIB_SHA256 + select CRYPTO help Multipath TCP (MPTCP) connections send and receive data over multiple subflows in order to utilize multiple network paths. Each subflow -- cgit v1.2.3 From 8955b4357d6fc98734b53855b76ee37014a7e492 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 15 Feb 2020 13:41:12 -0800 Subject: skbuff: remove stale bit mask comments Remove stale comments since this flag is no longer a bit mask but is a bit field. Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 864cb9e9622f..1365a556152c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -467,7 +467,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, return NULL; } - /* use OR instead of assignment to avoid clearing of bits in mask */ if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -527,7 +526,6 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - /* use OR instead of assignment to avoid clearing of bits in mask */ if (nc->page.pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; -- cgit v1.2.3 From 6a757c07e51f80ac34325fcd558490d2d1439e1b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:07 +0100 Subject: netfilter: conntrack: allow insertion of clashing entries This patch further relaxes the need to drop an skb due to a clash with an existing conntrack entry. Current clash resolution handles the case where the clash occurs between two identical entries (distinct nf_conn objects with same tuples), i.e.: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 ... existing handling will discard the unconfirmed clashing entry and makes skb->_nfct point to the existing one. The skb can then be processed normally just as if the clash would not have existed in the first place. For other clashes, the skb needs to be dropped. This frequently happens with DNS resolvers that send A and AAAA queries back-to-back when NAT rules are present that cause packets to get different DNAT transformations applied, for example: -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.6:5353 -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.7:5353 In this case the A or AAAA query is dropped which incurs a costly delay during name resolution. This patch also allows this collision type: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.7:5353 In this case, clash is in original direction -- the reply direction is still unique. The change makes it so that when the 2nd colliding packet is received, the clashing conntrack is tagged with new IPS_NAT_CLASH_BIT, gets a fixed 1 second timeout and is inserted in the reply direction only. The entry is hidden from 'conntrack -L', it will time out quickly and it can be early dropped because it will never progress to the ASSURED state. To avoid special-casing the delete code path to special case the ORIGINAL hlist_nulls node, a new helper, "hlist_nulls_add_fake", is added so hlist_nulls_del() will work. Example: CPU A: CPU B: 1. 10.2.3.4:42 -> 10.8.8.8:53 (A) 2. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 3. Apply DNAT, reply changed to 10.0.0.6 4. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 5. Apply DNAT, reply changed to 10.0.0.7 6. confirm/commit to conntrack table, no collisions 7. commit clashing entry Reply comes in: 10.2.3.4:42 <- 10.0.0.6:5353 (A) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 10.2.3.4:42 <- 10.0.0.7:5353 (AAAA) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 The conntrack entry is deleted from table, as it has the NAT_CLASH bit set. In case of a retransmit from ORIGINAL dir, all further packets will get the DNAT transformation to 10.0.0.6. I tried to come up with other solutions but they all have worse problems. Alternatives considered were: 1. Confirm ct entries at allocation time, not in postrouting. a. will cause uneccesarry work when the skb that creates the conntrack is dropped by ruleset. b. in case nat is applied, ct entry would need to be moved in the table, which requires another spinlock pair to be taken. c. breaks the 'unconfirmed entry is private to cpu' assumption: we would need to guard all nfct->ext allocation requests with ct->lock spinlock. 2. Make the unconfirmed list a hash table instead of a pcpu list. Shares drawback c) of the first alternative. 3. Document this is expected and force users to rearrange their ruleset (e.g. by using "-m cluster" instead of "-m statistics"). nft has the 'jhash' expression which can be used instead of 'numgen'. Major drawback: doesn't fix what I consider a bug, not very realistic and I believe its reasonable to have the existing rulesets to 'just work'. 4. Document this is expected and force users to steer problematic packets to the same CPU -- this would serialize the "allocate new conntrack entry/nat table evaluation/perform nat/confirm entry", so no race can occur. Similar drawback to 3. Another advantage of this patch compared to 1) and 2) is that there are no changes to the hot path; things are handled in the udp tracker and the clash resolution path. Cc: rcu@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 76 +++++++++++++++++++++++++++++++++- net/netfilter/nf_conntrack_proto_udp.c | 20 +++++++-- 2 files changed, 90 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3f069eb0f0fc..1927fc296f95 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -940,11 +940,71 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, return NF_DROP; } +/** + * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry + * + * @skb: skb that causes the collision + * @repl_idx: hash slot for reply direction + * + * Called when origin or reply direction had a clash. + * The skb can be handled without packet drop provided the reply direction + * is unique or there the existing entry has the identical tuple in both + * directions. + * + * Caller must hold conntrack table locks to prevent concurrent updates. + * + * Returns NF_DROP if the clash could not be handled. + */ +static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) +{ + struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct net *net; + + zone = nf_ct_zone(loser_ct); + net = nf_ct_net(loser_ct); + + /* Reply direction must never result in a clash, unless both origin + * and reply tuples are identical. + */ + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { + if (nf_ct_key_equal(h, + &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) + return __nf_ct_resolve_clash(skb, h); + } + + /* We want the clashing entry to go away real soon: 1 second timeout. */ + loser_ct->timeout = nfct_time_stamp + HZ; + + /* IPS_NAT_CLASH removes the entry automatically on the first + * reply. Also prevents UDP tracker from moving the entry to + * ASSURED state, i.e. the entry can always be evicted under + * pressure. + */ + loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; + + __nf_conntrack_insert_prepare(loser_ct); + + /* fake add for ORIGINAL dir: we want lookups to only find the entry + * already in the table. This also hides the clashing entry from + * ctnetlink iteration, i.e. conntrack -L won't show them. + */ + hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + + hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &nf_conntrack_hash[repl_idx]); + return NF_ACCEPT; +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table + * @hash_reply: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. @@ -963,10 +1023,18 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, * exactly the same, only the to-be-confirmed conntrack entry is discarded * and @skb is associated with the conntrack entry already in the table. * + * Failing that, the new, unconfirmed conntrack is still added to the table + * provided that the collision only occurs in the ORIGINAL direction. + * The new entry will be added after the existing one in the hash list, + * so packets in the ORIGINAL direction will continue to match the existing + * entry. The new entry will also have a fixed timeout so it expires -- + * due to the collision, it will not see bidirectional traffic. + * * Returns NF_DROP if the clash could not be resolved. */ static __cold noinline int -nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) +nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, + u32 reply_hash) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -987,6 +1055,10 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) if (ret == NF_ACCEPT) return ret; + ret = nf_ct_resolve_clash_harder(skb, reply_hash); + if (ret == NF_ACCEPT) + return ret; + drop: nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); @@ -1101,7 +1173,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - ret = nf_ct_resolve_clash(skb, h); + ret = nf_ct_resolve_clash(skb, h, reply_hash); dying: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 7365b43f8f98..760ca2422816 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb, return false; } +static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, + struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + u32 extra_jiffies) +{ + if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && + ct->status & IPS_NAT_CLASH)) + nf_ct_kill(ct); + else + nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); +} + /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } -- cgit v1.2.3 From 8a9093c79863b58cc2f9874d7ae788f0d622a596 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 17 Feb 2020 15:38:09 -0500 Subject: net: sched: correct flower port blocking tc flower rules that are based on src or dst port blocking are sometimes ineffective due to uninitialized stack data. __skb_flow_dissect() extracts ports from the skb for tc flower to match against. However, the port dissection is not done when when the FLOW_DIS_IS_FRAGMENT bit is set in key_control->flags. All callers of __skb_flow_dissect(), zero-out the key_control field except for fl_classify() as used by the flower classifier. Thus, the FLOW_DIS_IS_FRAGMENT may be set on entry to __skb_flow_dissect(), since key_control is allocated on the stack and may not be initialized. Since key_basic and key_control are present for all flow keys, let's make sure they are initialized. Fixes: 62230715fd24 ("flow_dissector: do not dissect l4 ports for fragments") Co-developed-by: Eric Dumazet Signed-off-by: Eric Dumazet Acked-by: Cong Wang Signed-off-by: Jason Baron Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7e54d2ab5254..d32d4233d337 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -305,6 +305,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; list_for_each_entry_rcu(mask, &head->masks, list) { + flow_dissector_init_keys(&skb_key.control, &skb_key.basic); fl_clear_masked_range(&skb_key, mask); skb_flow_dissect_meta(skb, &mask->dissector, &skb_key); -- cgit v1.2.3 From 245709ec8be89af46ea7ef0444c9c80913999d99 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 18 Feb 2020 12:07:53 +0800 Subject: sctp: move the format error check out of __sctp_sf_do_9_1_abort When T2 timer is to be stopped, the asoc should also be deleted, otherwise, there will be no chance to call sctp_association_free and the asoc could last in memory forever. However, in sctp_sf_shutdown_sent_abort(), after adding the cmd SCTP_CMD_TIMER_STOP for T2 timer, it may return error due to the format error from __sctp_sf_do_9_1_abort() and miss adding SCTP_CMD_ASSOC_FAILED where the asoc will be deleted. This patch is to fix it by moving the format error check out of __sctp_sf_do_9_1_abort(), and do it before adding the cmd SCTP_CMD_TIMER_STOP for T2 timer. Thanks Hangbin for reporting this issue by the fuzz testing. v1->v2: - improve the comment in the code as Marcelo's suggestion. Fixes: 96ca468b86b0 ("sctp: check invalid value of length parameter in error cause") Reported-by: Hangbin Liu Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 748e3b19ec1d..6a16af4b1ef6 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -170,6 +170,16 @@ static inline bool sctp_chunk_length_valid(struct sctp_chunk *chunk, return true; } +/* Check for format error in an ABORT chunk */ +static inline bool sctp_err_chunk_valid(struct sctp_chunk *chunk) +{ + struct sctp_errhdr *err; + + sctp_walk_errors(err, chunk->chunk_hdr); + + return (void *)err == (void *)chunk->chunk_end; +} + /********************************************************** * These are the state functions for handling chunk events. **********************************************************/ @@ -2255,6 +2265,9 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } @@ -2298,6 +2311,9 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Stop the T2-shutdown timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); @@ -2565,6 +2581,9 @@ enum sctp_disposition sctp_sf_do_9_1_abort( sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + if (!sctp_err_chunk_valid(chunk)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return __sctp_sf_do_9_1_abort(net, ep, asoc, type, arg, commands); } @@ -2582,16 +2601,8 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort( /* See if we have an error cause code in the chunk. */ len = ntohs(chunk->chunk_hdr->length); - if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) { - struct sctp_errhdr *err; - - sctp_walk_errors(err, chunk->chunk_hdr); - if ((void *)err != (void *)chunk->chunk_end) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, - commands); - + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) error = ((struct sctp_errhdr *)chunk->skb->data)->cause; - } sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNRESET)); /* ASSOC_FAILED will DELETE_TCB. */ -- cgit v1.2.3 From d99bfed58d9698c0ea1dbf47e4fdf4b87cc7203f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Feb 2020 16:54:38 +0100 Subject: mptcp: fix bogus socket flag values Dan Carpenter reports static checker warnings due to bogus BIT() usage: net/mptcp/subflow.c:571 subflow_write_space() warn: test_bit() takes a bit number net/mptcp/subflow.c:694 subflow_state_change() warn: test_bit() takes a bit number net/mptcp/protocol.c:261 ssk_check_wmem() warn: test_bit() takes a bit number [..] This is harmless (we use bits 1 & 2 instead of 0 and 1), but would break eventually when adding BIT(5) (or 6, depends on size of 'long'). Just use 0 and 1, the values are only passed to test/set/clear_bit functions. Fixes: 648ef4b88673 ("mptcp: Implement MPTCP receive path") Reported-by: Dan Carpenter Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 8a99a2930284..9f8663b30456 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -56,8 +56,8 @@ #define MPTCP_DSS_FLAG_MASK (0x1F) /* MPTCP socket flags */ -#define MPTCP_DATA_READY BIT(0) -#define MPTCP_SEND_SPACE BIT(1) +#define MPTCP_DATA_READY 0 +#define MPTCP_SEND_SPACE 1 /* MPTCP connection sock */ struct mptcp_sock { -- cgit v1.2.3 From 379349e9bc3b42b8b2f8f7a03f64a97623fff323 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2020 18:15:44 +0100 Subject: Revert "net: dev: introduce support for sch BYPASS for lockless qdisc" This reverts commit ba27b4cdaaa66561aaedb2101876e563738d36fe Ahmed reported ouf-of-order issues bisected to commit ba27b4cdaaa6 ("net: dev: introduce support for sch BYPASS for lockless qdisc"). I can't find any working solution other than a plain revert. This will introduce some minor performance regressions for pfifo_fast qdisc. I plan to address them in net-next with more indirect call wrapper boilerplate for qdiscs. Reported-by: Ahmad Fatoum Fixes: ba27b4cdaaa6 ("net: dev: introduce support for sch BYPASS for lockless qdisc") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 2577ebfed293..e10bd680dc03 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3662,26 +3662,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && - qdisc_run_begin(q)) { - if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, - &q->state))) { - __qdisc_drop(skb, &to_free); - rc = NET_XMIT_DROP; - goto end_run; - } - qdisc_bstats_cpu_update(q, skb); - - rc = NET_XMIT_SUCCESS; - if (sch_direct_xmit(skb, q, dev, txq, NULL, true)) - __qdisc_run(q); - -end_run: - qdisc_run_end(q); - } else { - rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - qdisc_run(q); - } + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + qdisc_run(q); if (unlikely(to_free)) kfree_skb_list(to_free); -- cgit v1.2.3 From 8c70c3d72833a08214ff8c8df1f7d9778509888d Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 18 Feb 2020 23:47:18 +0530 Subject: net: netlabel: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index d2e4ab8d1cb1..77bb1bb22c3b 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -207,7 +207,8 @@ static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex) bkt = netlbl_unlhsh_hash(ifindex); bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt]; - list_for_each_entry_rcu(iter, bkt_list, list) + list_for_each_entry_rcu(iter, bkt_list, list, + lockdep_is_held(&netlbl_unlhsh_lock)) if (iter->valid && iter->ifindex == ifindex) return iter; -- cgit v1.2.3 From 9facfdb5467382a21fc0bd4211ced26c06f28832 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 00:11:32 +0530 Subject: netlabel_domainhash.c: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/netlabel/netlabel_domainhash.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f5d34da0646e..a1f2320ecc16 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -143,7 +143,8 @@ static struct netlbl_dom_map *netlbl_domhsh_search(const char *domain, if (domain != NULL) { bkt = netlbl_domhsh_hash(domain); bkt_list = &netlbl_domhsh_rcu_deref(netlbl_domhsh)->tbl[bkt]; - list_for_each_entry_rcu(iter, bkt_list, list) + list_for_each_entry_rcu(iter, bkt_list, list, + lockdep_is_held(&netlbl_domhsh_lock)) if (iter->valid && netlbl_family_match(iter->family, family) && strcmp(iter->domain, domain) == 0) -- cgit v1.2.3 From 7790614616458b6dd3d90652acfa6b7443ee7041 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:24:25 +0530 Subject: meter.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/meter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 3323b79ff548..5010d1ddd4bd 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -61,7 +61,8 @@ static struct dp_meter *lookup_meter(const struct datapath *dp, struct hlist_head *head; head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node) { + hlist_for_each_entry_rcu(meter, head, dp_hash_node, + lockdep_ovsl_is_held()) { if (meter->id == meter_id) return meter; } -- cgit v1.2.3 From fed48423f14d9fa184b262d7c35d9dc1c3698500 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:27:42 +0530 Subject: vport.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 5da9392b03d6..47febb4504f0 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -96,7 +96,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; - hlist_for_each_entry_rcu(vport, bucket, hash_node) + hlist_for_each_entry_rcu(vport, bucket, hash_node, + lockdep_ovsl_is_held()) if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; -- cgit v1.2.3 From 53742e69e85d2eb7ed56f58d277bc3e682f8949e Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:28:02 +0530 Subject: datapath.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 659c2a790fe7..c047afd12116 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -179,7 +179,8 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) struct hlist_head *head; head = vport_hash_bucket(dp, port_no); - hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + hlist_for_each_entry_rcu(vport, head, dp_hash_node, + lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } @@ -2042,7 +2043,8 @@ static unsigned int ovs_get_max_headroom(struct datapath *dp) int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) @@ -2061,7 +2063,8 @@ static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) dp->max_headroom = new_headroom; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) netdev_set_rx_headroom(vport->dev, new_headroom); } -- cgit v1.2.3 From a2cfb96cc3654c6d451020480a4bcfbbca564350 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 01:28:20 +0530 Subject: flow_table.c: Use built-in RCU list checking hlist_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/openvswitch/flow_table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 5904e93e5765..fd8a01ca7a2d 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -585,7 +585,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, head = find_bucket(ti, hash); (*n_mask_hit)++; - hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], + lockdep_ovsl_is_held()) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; @@ -769,7 +770,8 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, hash = ufid_hash(ufid); head = find_bucket(ti, hash); - hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) { + hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], + lockdep_ovsl_is_held()) { if (flow->ufid_table.hash == hash && ovs_flow_cmp_ufid(flow, ufid)) return flow; -- cgit v1.2.3 From bd97ad51a7eb1b02049deca56bc26d96cabbac8a Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 14 Feb 2020 18:14:13 +0100 Subject: netfilter: nft_set_pipapo: Fix mapping table example in comments In both insertion and lookup examples, the two element pointers of rule mapping tables were swapped. Fix that. Reported-by: Pablo Neira Ayuso Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index f0cb1e13af50..579600b39f39 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -203,7 +203,7 @@ * :: * * rule indices in last field: 0 1 - * map to elements: 0x42 0x66 + * map to elements: 0x66 0x42 * * * Matching @@ -298,7 +298,7 @@ * :: * * rule indices in last field: 0 1 - * map to elements: 0x42 0x66 + * map to elements: 0x66 0x42 * * the matching element is at 0x42. * -- cgit v1.2.3 From 9a7712048f9d43da5022e75eca3d6b81080e76d3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 14 Feb 2020 18:14:14 +0100 Subject: netfilter: nft_set_pipapo: Don't abuse unlikely() in pipapo_refill() I originally used unlikely() in the if (match_only) clause, which we hit on the mapping table for the last field in a set, to ensure we avoid branching to the rest of for loop body, which is executed more frequently. However, Pablo reports, this is confusing as it gives the impression that this is not a common case, and it's actually not the intended usage of unlikely(). I couldn't observe any statistical difference in matching rates on x864_64 and aarch64 without it, so just drop it. Reported-by: Pablo Neira Ayuso Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 579600b39f39..feac8553f6d9 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -503,7 +503,7 @@ static int pipapo_refill(unsigned long *map, int len, int rules, return -1; } - if (unlikely(match_only)) { + if (match_only) { bitmap_clear(map, i, 1); return i; } -- cgit v1.2.3 From a7a9456e8d28e81030f7cf6f1f59f907089916a9 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Wed, 19 Feb 2020 15:30:11 +0530 Subject: net: hsr: Pass lockdep expression to RCU lists node_db is traversed using list_for_each_entry_rcu outside an RCU read-side critical section but under the protection of hsr->list_lock. Hence, add corresponding lockdep expression to silence false-positive warnings, and harden RCU lists. Signed-off-by: Amol Grover Signed-off-by: David S. Miller --- net/hsr/hsr_framereg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 364ea2cc028e..3ba7f61be107 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -155,7 +155,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, new_node->seq_out[i] = seq_out; spin_lock_bh(&hsr->list_lock); - list_for_each_entry_rcu(node, node_db, mac_list) { + list_for_each_entry_rcu(node, node_db, mac_list, + lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) -- cgit v1.2.3 From 33c4acbe2f4e8f2866914b1fb90ce74fc7216c21 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Wed, 19 Feb 2020 20:47:46 +0530 Subject: bridge: br_stp: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled by default. Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 6856a6d9282b..1f14b8455345 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -63,7 +63,8 @@ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { struct net_bridge_port *p; - list_for_each_entry_rcu(p, &br->port_list, list) { + list_for_each_entry_rcu(p, &br->port_list, list, + lockdep_is_held(&br->lock)) { if (p->port_no == port_no) return p; } -- cgit v1.2.3 From 06f5201c6392f998a49ca9c9173e2930c8eb51d8 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Wed, 19 Feb 2020 09:40:22 +0530 Subject: net/tls: Fix to avoid gettig invalid tls record Current code doesn't check if tcp sequence number is starting from (/after) 1st record's start sequnce number. It only checks if seq number is before 1st record's end sequnce number. This problem will always be a possibility in re-transmit case. If a record which belongs to a requested seq number is already deleted, tls_get_record will start looking into list and as per the check it will look if seq number is before the end seq of 1st record, which will always be true and will return 1st record always, it should in fact return NULL. As part of the fix, start looking each record only if the sequence number lies in the list else return NULL. There is one more check added, driver look for the start marker record to handle tcp packets which are before the tls offload start sequence number, hence return 1st record if the record is tls start marker and seq number is before the 1st record's starting sequence number. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Rohit Maheshwari Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_device.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 1ba5a92832bb..1c5574e2e058 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -593,7 +593,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn) { u64 record_sn = context->hint_record_sn; - struct tls_record_info *info; + struct tls_record_info *info, *last; info = context->retransmit_hint; if (!info || @@ -605,6 +605,24 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, struct tls_record_info, list); if (!info) return NULL; + /* send the start_marker record if seq number is before the + * tls offload start marker sequence number. This record is + * required to handle TCP packets which are before TLS offload + * started. + * And if it's not start marker, look if this seq number + * belongs to the list. + */ + if (likely(!tls_record_is_start_marker(info))) { + /* we have the first record, get the last record to see + * if this seq number belongs to the list. + */ + last = list_last_entry(&context->records_list, + struct tls_record_info, list); + + if (!between(seq, tls_record_start_seq(info), + last->end_seq)) + return NULL; + } record_sn = context->unacked_record_sn; } -- cgit v1.2.3 From 303d0403b8c25e994e4a6e45389e173cf8706fb5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 19 Feb 2020 14:16:32 -0500 Subject: udp: rehash on disconnect As of the below commit, udp sockets bound to a specific address can coexist with one bound to the any addr for the same port. The commit also phased out the use of socket hashing based only on port (hslot), in favor of always hashing on {addr, port} (hslot2). The change broke the following behavior with disconnect (AF_UNSPEC): server binds to 0.0.0.0:1337 server connects to 127.0.0.1:80 server disconnects client connects to 127.0.0.1:1337 client sends "hello" server reads "hello" // times out, packet did not find sk On connect the server acquires a specific source addr suitable for routing to its destination. On disconnect it reverts to the any addr. The connect call triggers a rehash to a different hslot2. On disconnect, add the same to return to the original hslot2. Skip this step if the socket is going to be unhashed completely. Fixes: 4cdeeee9252a ("net: udp: prefer listeners bound to an address") Reported-by: Pavel Roskin Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db76b9609299..08a41f1e1cd2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1857,8 +1857,12 @@ int __udp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) { inet_reset_saddr(sk); + if (sk->sk_prot->rehash && + (sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + sk->sk_prot->rehash(sk); + } if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); -- cgit v1.2.3 From 161d179261f95ac56f61f94f89304e0620534230 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:04 -0800 Subject: net: core: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. net/core/skbuff.c: In function ‘skb_checksum_setup_ip’: net/core/skbuff.c:4809:7: warning: statement will never be executed [-Wswitch-unreachable] 4809 | int err; | ^~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1365a556152c..e1101a4f90a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4803,9 +4803,9 @@ static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, typeof(IPPROTO_IP) proto, unsigned int off) { - switch (proto) { - int err; + int err; + switch (proto) { case IPPROTO_TCP: err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr), off + MAX_TCP_HDR_LEN); -- cgit v1.2.3 From 46d30cb1045c2ab1ada269702c8c84d6446baf81 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:07 -0800 Subject: net: ip6_gre: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. net/ipv6/ip6_gre.c: In function ‘ip6gre_err’: net/ipv6/ip6_gre.c:440:32: warning: statement will never be executed [-Wswitch-unreachable] 440 | struct ipv6_tlv_tnl_enc_lim *tel; | ^~~ net/ipv6/ip6_tunnel.c: In function ‘ip6_tnl_err’: net/ipv6/ip6_tunnel.c:520:32: warning: statement will never be executed [-Wswitch-unreachable] 520 | struct ipv6_tlv_tnl_enc_lim *tel; | ^~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_tunnel.c | 13 +++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 55bfc5149d0c..781ca8c07a0d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -437,8 +437,6 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return -ENOENT; switch (type) { - struct ipv6_tlv_tnl_enc_lim *tel; - __u32 teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -452,7 +450,10 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, break; } return 0; - case ICMPV6_PARAMPROB: + case ICMPV6_PARAMPROB: { + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 teli; + teli = 0; if (code == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); @@ -468,6 +469,7 @@ static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->parms.name); } return 0; + } case ICMPV6_PKT_TOOBIG: ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5d65436ad5ad..4703b09808d0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -517,8 +517,6 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, err = 0; switch (*type) { - struct ipv6_tlv_tnl_enc_lim *tel; - __u32 mtu, teli; case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); @@ -531,7 +529,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; - case ICMPV6_PARAMPROB: + case ICMPV6_PARAMPROB: { + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 teli; + teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); @@ -548,7 +549,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, t->parms.name); } break; - case ICMPV6_PKT_TOOBIG: + } + case ICMPV6_PKT_TOOBIG: { + __u32 mtu; + ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; @@ -562,6 +566,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, rel_msg = 1; } break; + } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); -- cgit v1.2.3 From 16a556eeb7ed2dc3709fe2c5be76accdfa4901ab Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Feb 2020 22:23:09 -0800 Subject: openvswitch: Distribute switch variables for initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variables declared in a switch statement before any case statements cannot be automatically initialized with compiler instrumentation (as they are not part of any execution flow). With GCC's proposed automatic stack variable initialization feature, this triggers a warning (and they don't get initialized). Clang's automatic stack variable initialization (via CONFIG_INIT_STACK_ALL=y) doesn't throw a warning, but it also doesn't initialize such variables[1]. Note that these warnings (or silent skipping) happen before the dead-store elimination optimization phase, so even when the automatic initializations are later elided in favor of direct initializations, the warnings remain. To avoid these problems, move such variables into the "case" where they're used or lift them up into the main function body. net/openvswitch/flow_netlink.c: In function ‘validate_set’: net/openvswitch/flow_netlink.c:2711:29: warning: statement will never be executed [-Wswitch-unreachable] 2711 | const struct ovs_key_ipv4 *ipv4_key; | ^~~~~~~~ [1] https://bugs.llvm.org/show_bug.cgi?id=44916 Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 7da4230627f5..288122eec7c8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2708,10 +2708,6 @@ static int validate_set(const struct nlattr *a, return -EINVAL; switch (key_type) { - const struct ovs_key_ipv4 *ipv4_key; - const struct ovs_key_ipv6 *ipv6_key; - int err; - case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_CT_MARK: @@ -2723,7 +2719,9 @@ static int validate_set(const struct nlattr *a, return -EINVAL; break; - case OVS_KEY_ATTR_TUNNEL: + case OVS_KEY_ATTR_TUNNEL: { + int err; + if (masked) return -EINVAL; /* Masked tunnel set not supported. */ @@ -2732,8 +2730,10 @@ static int validate_set(const struct nlattr *a, if (err) return err; break; + } + case OVS_KEY_ATTR_IPV4: { + const struct ovs_key_ipv4 *ipv4_key; - case OVS_KEY_ATTR_IPV4: if (eth_type != htons(ETH_P_IP)) return -EINVAL; @@ -2753,8 +2753,10 @@ static int validate_set(const struct nlattr *a, return -EINVAL; } break; + } + case OVS_KEY_ATTR_IPV6: { + const struct ovs_key_ipv6 *ipv6_key; - case OVS_KEY_ATTR_IPV6: if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; @@ -2781,7 +2783,7 @@ static int validate_set(const struct nlattr *a, return -EINVAL; break; - + } case OVS_KEY_ATTR_TCP: if ((eth_type != htons(ETH_P_IP) && eth_type != htons(ETH_P_IPV6)) || -- cgit v1.2.3 From 98bda63e20daab95bdc084ce00459a4f622a0505 Mon Sep 17 00:00:00 2001 From: Roman Kiryanov Date: Wed, 19 Feb 2020 13:40:06 -0800 Subject: net: disable BRIDGE_NETFILTER by default The description says 'If unsure, say N.' but the module is built as M by default (once the dependencies are satisfied). When the module is selected (Y or M), it enables NETFILTER_FAMILY_BRIDGE and SKB_EXTENSIONS which alter kernel internal structures. We (Android Studio Emulator) currently do not use this module and think this it is more consistent to have it disabled by default as opposite to disabling it explicitly to prevent enabling NETFILTER_FAMILY_BRIDGE and SKB_EXTENSIONS. Signed-off-by: Roman Kiryanov Acked-by: Florian Westphal Signed-off-by: David S. Miller --- net/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index b0937a700f01..2eeb0e55f7c9 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -189,7 +189,6 @@ config BRIDGE_NETFILTER depends on NETFILTER_ADVANCED select NETFILTER_FAMILY_BRIDGE select SKB_EXTENSIONS - default m ---help--- Enabling this option will let arptables resp. iptables see bridged ARP resp. IP traffic. If you want a bridging firewall, you probably -- cgit v1.2.3 From 3a20773beeeeadec41477a5ba872175b778ff752 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 20 Feb 2020 16:42:13 +0200 Subject: net: netlink: cap max groups which will be considered in netlink_bind() Since nl_groups is a u32 we can't bind more groups via ->bind (netlink_bind) call, but netlink has supported more groups via setsockopt() for a long time and thus nlk->ngroups could be over 32. Recently I added support for per-vlan notifications and increased the groups to 33 for NETLINK_ROUTE which exposed an old bug in the netlink_bind() code causing out-of-bounds access on archs where unsigned long is 32 bits via test_bit() on a local variable. Fix this by capping the maximum groups in netlink_bind() to BITS_PER_TYPE(u32), effectively capping them at 32 which is the minimum of allocated groups and the maximum groups which can be bound via netlink_bind(). CC: Christophe Leroy CC: Richard Guy Briggs Fixes: 4f520900522f ("netlink: have netlink per-protocol bind function return an error code.") Reported-by: Erhard F. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4e31721e7293..edf3e285e242 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1014,7 +1014,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->netlink_bind && groups) { int group; - for (group = 0; group < nlk->ngroups; group++) { + /* nl_groups is a u32, so cap the maximum groups we can bind */ + for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); @@ -1033,7 +1034,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { - netlink_undo_bind(nlk->ngroups, groups, sk); + netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } -- cgit v1.2.3 From 9951ebfcdf2b97dbb28a5d930458424341e61aa2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Feb 2020 10:41:43 +0100 Subject: nl80211: fix potential leak in AP start If nl80211_parse_he_obss_pd() fails, we leak the previously allocated ACL memory. Free it in this case. Fixes: 796e90f42b7e ("cfg80211: add support for parsing OBBS_PD attributes") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200221104142.835aba4cdd14.I1923b55ba9989c57e13978f91f40bfdc45e60cbd@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cedf17d4933f..46be40e19e7f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4800,8 +4800,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_he_obss_pd( info->attrs[NL80211_ATTR_HE_OBSS_PD], ¶ms.he_obss_pd); - if (err) - return err; + goto out; } nl80211_calculate_ap_params(¶ms); @@ -4823,6 +4822,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) } wdev_unlock(wdev); +out: kfree(params.acl); return err; -- cgit v1.2.3 From a7ee7d44b57c9ae174088e53a668852b7f4f452d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Feb 2020 10:44:50 +0100 Subject: cfg80211: check reg_rule for NULL in handle_channel_custom() We may end up with a NULL reg_rule after the loop in handle_channel_custom() if the bandwidth didn't fit, check if this is the case and bail out if so. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200221104449.3b558a50201c.I4ad3725c4dacaefd2d18d3cc65ba6d18acd5dbfe@changeid Signed-off-by: Johannes Berg --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index fff9a74891fc..1a8218f1bbe0 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2276,7 +2276,7 @@ static void handle_channel_custom(struct wiphy *wiphy, break; } - if (IS_ERR(reg_rule)) { + if (IS_ERR_OR_NULL(reg_rule)) { pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", chan->center_freq); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { -- cgit v1.2.3 From 0daa63ed4c6c4302790ce67b7a90c0997ceb7514 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Fri, 21 Feb 2020 10:47:20 +0100 Subject: mac80211: Remove a redundant mutex unlock The below-mentioned commit changed the code to unlock *inside* the function, but previously the unlock was *outside*. It failed to remove the outer unlock, however, leading to double unlock. Fix this. Fixes: 33483a6b88e4 ("mac80211: fix missing unlock on error in ieee80211_mark_sta_auth()") Signed-off-by: Andrei Otcheretianski Link: https://lore.kernel.org/r/20200221104719.cce4741cf6eb.I671567b185c8a4c2409377e483fd149ce590f56d@changeid [rewrite commit message to better explain what happened] Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index e041af2f021a..88d7a692a965 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2959,7 +2959,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { if (!ieee80211_mark_sta_auth(sdata, bssid)) - goto out_err; + return; /* ignore frame -- wait for timeout */ } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); @@ -2967,10 +2967,6 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); - return; - out_err: - mutex_unlock(&sdata->local->sta_mtx); - /* ignore frame -- wait for timeout */ } #define case_WLAN(type) \ -- cgit v1.2.3 From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 11 Feb 2020 23:20:43 +0100 Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports In the case of huge hash:* types of sets, due to the single spinlock of a set the processing of the whole set under spinlock protection could take too long. There were four places where the whole hash table of the set was processed from bucket to bucket under holding the spinlock: - During resizing a set, the original set was locked to exclude kernel side add/del element operations (userspace add/del is excluded by the nfnetlink mutex). The original set is actually just read during the resize, so the spinlocking is replaced with rcu locking of regions. However, thus there can be parallel kernel side add/del of entries. In order not to loose those operations a backlog is added and replayed after the successful resize. - Garbage collection of timed out entries was also protected by the spinlock. In order not to lock too long, region locking is introduced and a single region is processed in one gc go. Also, the simple timer based gc running is replaced with a workqueue based solution. The internal book-keeping (number of elements, size of extensions) is moved to region level due to the region locking. - Adding elements: when the max number of the elements is reached, the gc was called to evict the timed out entries. The new approach is that the gc is called just for the matching region, assuming that if the region (proportionally) seems to be full, then the whole set does. We could scan the other regions to check every entry under rcu locking, but for huge sets it'd mean a slowdown at adding elements. - Listing the set header data: when the set was defined with timeout support, the garbage collector was called to clean up timed out entries to get the correct element numbers and set size values. Now the set is scanned to check non-timed out entries, without actually calling the gc for the whole set. Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe -> SOFTIRQ-unsafe lock order issues during working on the patch. Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_core.c | 34 +- net/netfilter/ipset/ip_set_hash_gen.h | 633 +++++++++++++++++++++++----------- 2 files changed, 462 insertions(+), 205 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 69c107f9ba8d..8dd17589217d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) return set; } +static inline void +ip_set_lock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_lock_bh(&set->lock); +} + +static inline void +ip_set_unlock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_unlock_bh(&set->lock); +} + int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) @@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->flush(set); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); } static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, @@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); retried = true; } while (ret == -EAGAIN && set->variant->resize && diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7480ce55b5c8..71e93eac0831 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -7,13 +7,21 @@ #include #include #include +#include #include -#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) -#define ipset_dereference_protected(p, set) \ - __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) - -#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) +#define __ipset_dereference(p) \ + rcu_dereference_protected(p, 1) +#define ipset_dereference_nfnl(p) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) +#define ipset_dereference_set(p, set) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&(set)->lock)) +#define ipset_dereference_bh_nfnl(p) \ + rcu_dereference_bh_check(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. @@ -72,11 +80,35 @@ struct hbucket { __aligned(__alignof__(u64)); }; +/* Region size for locking == 2^HTABLE_REGION_BITS */ +#define HTABLE_REGION_BITS 10 +#define ahash_numof_locks(htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 1 \ + : jhash_size((htable_bits) - HTABLE_REGION_BITS)) +#define ahash_sizeof_regions(htable_bits) \ + (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) +#define ahash_region(n, htable_bits) \ + ((n) % ahash_numof_locks(htable_bits)) +#define ahash_bucket_start(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 0 \ + : (h) * jhash_size(HTABLE_REGION_BITS)) +#define ahash_bucket_end(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \ + : ((h) + 1) * jhash_size(HTABLE_REGION_BITS)) + +struct htable_gc { + struct delayed_work dwork; + struct ip_set *set; /* Set the gc belongs to */ + u32 region; /* Last gc run position */ +}; + /* The hash table: the table size stored here in order to make resizing easy */ struct htable { atomic_t ref; /* References for resizing */ - atomic_t uref; /* References for dumping */ + atomic_t uref; /* References for dumping and gc */ u8 htable_bits; /* size of hash table == 2^htable_bits */ + u32 maxelem; /* Maxelem per region */ + struct ip_set_region *hregion; /* Region locks and ext sizes */ struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; @@ -162,6 +194,10 @@ htable_bits(u32 hashsize) #define NLEN 0 #endif /* IP_SET_HASH_WITH_NETS */ +#define SET_ELEM_EXPIRED(set, d) \ + (SET_WITH_TIMEOUT(set) && \ + ip_set_timeout_expired(ext_timeout(d, set))) + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -205,10 +241,12 @@ htable_bits(u32 hashsize) #undef mtype_test_cidrs #undef mtype_test #undef mtype_uref -#undef mtype_expire #undef mtype_resize +#undef mtype_ext_size +#undef mtype_resize_ad #undef mtype_head #undef mtype_list +#undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init #undef mtype_variant @@ -247,10 +285,12 @@ htable_bits(u32 hashsize) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_uref IPSET_TOKEN(MTYPE, _uref) -#define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) +#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size) +#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) @@ -275,8 +315,7 @@ htable_bits(u32 hashsize) /* The generic hash structure */ struct htype { struct htable __rcu *table; /* the hash table */ - struct timer_list gc; /* garbage collection when timeout enabled */ - struct ip_set *set; /* attached to this ip_set */ + struct htable_gc gc; /* gc workqueue */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK @@ -288,21 +327,33 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif + struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */ #endif }; +/* ADD|DEL entries saved during resize */ +struct mtype_resize_ad { + struct list_head list; + enum ipset_adt ad; /* ADD|DEL element */ + struct mtype_elem d; /* Element value */ + struct ip_set_ext ext; /* Extensions for ADD */ + struct ip_set_ext mext; /* Target extensions for ADD */ + u32 flags; /* Flags for ADD */ +}; + #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different * sized networks. cidr == real cidr + 1 to support /0. */ static void -mtype_add_cidr(struct htype *h, u8 cidr, u8 n) +mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { int i, j; + spin_lock_bh(&set->lock); /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) { if (j != -1) { @@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) j = i; } else if (h->nets[i].cidr[n] == cidr) { h->nets[CIDR_POS(cidr)].nets[n]++; - return; + goto unlock; } } if (j != -1) { @@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) } h->nets[i].cidr[n] = cidr; h->nets[CIDR_POS(cidr)].nets[n] = 1; +unlock: + spin_unlock_bh(&set->lock); } static void -mtype_del_cidr(struct htype *h, u8 cidr, u8 n) +mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { u8 i, j, net_end = NLEN - 1; + spin_lock_bh(&set->lock); for (i = 0; i < NLEN; i++) { if (h->nets[i].cidr[n] != cidr) continue; h->nets[CIDR_POS(cidr)].nets[n]--; if (h->nets[CIDR_POS(cidr)].nets[n] > 0) - return; + goto unlock; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + goto unlock; } +unlock: + spin_unlock_bh(&set->lock); } #endif @@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n) static size_t mtype_ahash_memsize(const struct htype *h, const struct htable *t) { - return sizeof(*h) + sizeof(*t); + return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits); } /* Get the ith element from the array block n */ @@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set) struct htype *h = set->data; struct htable *t; struct hbucket *n; - u32 i; - - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); - if (!n) - continue; - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - rcu_assign_pointer(hbucket(t, i), NULL); - kfree_rcu(n, rcu); + u32 r, i; + + t = ipset_dereference_nfnl(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + } + t->hregion[r].ext_size = 0; + t->hregion[r].elements = 0; + spin_unlock_bh(&t->hregion[r].lock); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(h->nets)); #endif - set->elements = 0; - set->ext_size = 0; } /* Destroy the hashtable part of the set */ @@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) kfree(n); } + ip_set_free(t->hregion); ip_set_free(t); } @@ -414,28 +476,21 @@ static void mtype_destroy(struct ip_set *set) { struct htype *h = set->data; + struct list_head *l, *lt; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&h->gc); + cancel_delayed_work_sync(&h->gc.dwork); - mtype_ahash_destroy(set, - __ipset_dereference_protected(h->table, 1), true); + mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + list_for_each_safe(l, lt, &h->ad) { + list_del(l); + kfree(l); + } kfree(h); set->data = NULL; } -static void -mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) -{ - struct htype *h = set->data; - - timer_setup(&h->gc, gc, 0); - mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); - pr_debug("gc initialized, run in every %u\n", - IPSET_GC_PERIOD(set->timeout)); -} - static bool mtype_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } -/* Delete expired elements from the hashtable */ static void -mtype_expire(struct ip_set *set, struct htype *h) +mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) { - struct htable *t; struct hbucket *n, *tmp; struct mtype_elem *data; u32 i, j, d; @@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h) #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif + u8 htable_bits = t->htable_bits; - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, htable_bits); + i < ahash_bucket_end(r, htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; for (j = 0, d = 0; j < n->pos; j++) { @@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h) smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, k)), k); #endif + t->hregion[r].elements--; ip_set_ext_destroy(set, data); - set->elements--; d++; } if (d >= AHASH_INIT_SIZE) { if (d >= n->size) { + t->hregion[r].ext_size -= + ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, i), NULL); kfree_rcu(n, rcu); continue; } tmp = kzalloc(sizeof(*tmp) + - (n->size - AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) - /* Still try to delete expired elements */ + /* Still try to delete expired elements. */ continue; tmp->size = n->size - AHASH_INIT_SIZE; for (j = 0, d = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); - memcpy(tmp->value + d * dsize, data, dsize); + memcpy(tmp->value + d * dsize, + data, dsize); set_bit(d, tmp->used); d++; } tmp->pos = d; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, i), tmp); kfree_rcu(n, rcu); } } + spin_unlock_bh(&t->hregion[r].lock); } static void -mtype_gc(struct timer_list *t) +mtype_gc(struct work_struct *work) { - struct htype *h = from_timer(h, t, gc); - struct ip_set *set = h->set; + struct htable_gc *gc; + struct ip_set *set; + struct htype *h; + struct htable *t; + u32 r, numof_locks; + unsigned int next_run; + + gc = container_of(work, struct htable_gc, dwork.work); + set = gc->set; + h = set->data; - pr_debug("called\n"); spin_lock_bh(&set->lock); - mtype_expire(set, h); + t = ipset_dereference_set(h->table, set); + atomic_inc(&t->uref); + numof_locks = ahash_numof_locks(t->htable_bits); + r = gc->region++; + if (r >= numof_locks) { + r = gc->region = 0; + } + next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; + if (next_run < HZ/10) + next_run = HZ/10; spin_unlock_bh(&set->lock); - h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; - add_timer(&h->gc); + mtype_gc_do(set, h, t, r); + + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by expire: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + + queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run); + +} + +static void +mtype_gc_init(struct htable_gc *gc) +{ + INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); + queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); + /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or * fail due to memory pressures. @@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t extsize, dsize = set->dsize; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried) struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j, key; + struct list_head *l, *lt; + struct mtype_resize_ad *x; + u32 i, j, r, nr, key; int ret; #ifdef IP_SET_HASH_WITH_NETS @@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried) if (!tmp) return -ENOMEM; #endif - rcu_read_lock_bh(); - orig = rcu_dereference_bh_nfnl(h->table); + orig = ipset_dereference_bh_nfnl(h->table); htable_bits = orig->htable_bits; - rcu_read_unlock_bh(); retry: ret = 0; @@ -583,88 +680,124 @@ retry: ret = -ENOMEM; goto out; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); + if (!t->hregion) { + kfree(t); + ret = -ENOMEM; + goto out; + } t->htable_bits = htable_bits; + t->maxelem = h->maxelem / ahash_numof_locks(htable_bits); + for (i = 0; i < ahash_numof_locks(htable_bits); i++) + spin_lock_init(&t->hregion[i].lock); - spin_lock_bh(&set->lock); - orig = __ipset_dereference_protected(h->table, 1); - /* There can't be another parallel resizing, but dumping is possible */ + /* There can't be another parallel resizing, + * but dumping, gc, kernel side add/del are possible + */ + orig = ipset_dereference_bh_nfnl(h->table); atomic_set(&orig->ref, 1); atomic_inc(&orig->uref); - extsize = 0; pr_debug("attempt to resize set %s from %u to %u, t %p\n", set->name, orig->htable_bits, htable_bits, orig); - for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(orig, i), 1); - if (!n) - continue; - for (j = 0; j < n->pos; j++) { - if (!test_bit(j, n->used)) + for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) { + /* Expire may replace a hbucket with another one */ + rcu_read_lock_bh(); + for (i = ahash_bucket_start(r, orig->htable_bits); + i < ahash_bucket_end(r, orig->htable_bits); i++) { + n = __ipset_dereference(hbucket(orig, i)); + if (!n) continue; - data = ahash_data(n, j, dsize); + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + if (SET_ELEM_EXPIRED(set, data)) + continue; #ifdef IP_SET_HASH_WITH_NETS - /* We have readers running parallel with us, - * so the live data cannot be modified. - */ - flags = 0; - memcpy(tmp, data, dsize); - data = tmp; - mtype_data_reset_flags(data, &flags); + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ + flags = 0; + memcpy(tmp, data, dsize); + data = tmp; + mtype_data_reset_flags(data, &flags); #endif - key = HKEY(data, h->initval, htable_bits); - m = __ipset_dereference_protected(hbucket(t, key), 1); - if (!m) { - m = kzalloc(sizeof(*m) + + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference(hbucket(t, key)); + nr = ahash_region(key, htable_bits); + if (!m) { + m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, GFP_ATOMIC); - if (!m) { - ret = -ENOMEM; - goto cleanup; - } - m->size = AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - RCU_INIT_POINTER(hbucket(t, key), m); - } else if (m->pos >= m->size) { - struct hbucket *ht; - - if (m->size >= AHASH_MAX(h)) { - ret = -EAGAIN; - } else { - ht = kzalloc(sizeof(*ht) + + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + (m->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); - if (!ht) - ret = -ENOMEM; + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - if (ret < 0) - goto cleanup; - memcpy(ht, m, sizeof(struct hbucket) + - m->size * dsize); - ht->size = m->size + AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - kfree(m); - m = ht; - RCU_INIT_POINTER(hbucket(t, key), ht); - } - d = ahash_data(m, m->pos, dsize); - memcpy(d, data, dsize); - set_bit(m->pos++, m->used); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); + t->hregion[nr].elements++; #ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(d, &flags); + mtype_data_reset_flags(d, &flags); #endif + } } + rcu_read_unlock_bh(); } - rcu_assign_pointer(h->table, t); - set->ext_size = extsize; - spin_unlock_bh(&set->lock); + /* There can't be any other writer. */ + rcu_assign_pointer(h->table, t); /* Give time to other readers of the set */ synchronize_rcu(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - /* If there's nobody else dumping the table, destroy it */ + /* Add/delete elements processed by the SET target during resize. + * Kernel-side add cannot trigger a resize and userspace actions + * are serialized by the mutex. + */ + list_for_each_safe(l, lt, &h->ad) { + x = list_entry(l, struct mtype_resize_ad, list); + if (x->ad == IPSET_ADD) { + mtype_add(set, &x->d, &x->ext, &x->mext, x->flags); + } else { + mtype_del(set, &x->d, NULL, NULL, 0); + } + list_del(l); + kfree(l); + } + /* If there's nobody else using the table, destroy it */ if (atomic_dec_and_test(&orig->uref)) { pr_debug("Table destroy by resize %p\n", orig); mtype_ahash_destroy(set, orig, false); @@ -677,15 +810,44 @@ out: return ret; cleanup: + rcu_read_unlock_bh(); atomic_set(&orig->ref, 0); atomic_dec(&orig->uref); - spin_unlock_bh(&set->lock); mtype_ahash_destroy(set, t, false); if (ret == -EAGAIN) goto retry; goto out; } +/* Get the current number of elements and ext_size in the set */ +static void +mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) +{ + struct htype *h = set->data; + const struct htable *t; + u32 i, j, r; + struct hbucket *n; + struct mtype_elem *data; + + t = rcu_dereference_bh(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, set->dsize); + if (!SET_ELEM_EXPIRED(set, data)) + (*elements)++; + } + } + *ext_size += t->hregion[r].ext_size; + } +} + /* Add an element to a hash and update the internal counters when succeeded, * otherwise report the proper error code. */ @@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n, *old = ERR_PTR(-ENOENT); - int i, j = -1; + int i, j = -1, ret; bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; - u32 key, multi = 0; + u32 r, key, multi = 0, elements, maxelem; - if (set->elements >= h->maxelem) { - if (SET_WITH_TIMEOUT(set)) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h); - if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + elements = t->hregion[r].elements; + maxelem = t->maxelem; + if (elements >= maxelem) { + u32 e; + if (SET_WITH_TIMEOUT(set)) { + rcu_read_unlock_bh(); + mtype_gc_do(set, h, t, r); + rcu_read_lock_bh(); + } + maxelem = h->maxelem; + elements = 0; + for (e = 0; e < ahash_numof_locks(t->htable_bits); e++) + elements += t->hregion[e].elements; + if (elements >= maxelem && SET_WITH_FORCEADD(set)) forceadd = true; } + rcu_read_unlock_bh(); - t = ipset_dereference_protected(h->table, set); - key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) { - if (forceadd || set->elements >= h->maxelem) + if (forceadd || elements >= maxelem) goto set_full; old = NULL; n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } n->size = AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } for (i = 0; i < n->pos; i++) { @@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { - if (flag_exist || - (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)))) { + if (flag_exist || SET_ELEM_EXPIRED(set, data)) { /* Just the extensions could be overwritten */ j = i; goto overwrite_extensions; } - return -IPSET_ERR_EXIST; + ret = -IPSET_ERR_EXIST; + goto unlock; } /* Reuse first timed out entry */ - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)) && - j == -1) { + if (SET_ELEM_EXPIRED(set, data) && j == -1) { j = i; reuse = true; } @@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), i); #endif ip_set_ext_destroy(set, data); - set->elements--; + t->hregion[r].elements--; } goto copy_data; } - if (set->elements >= h->maxelem) + if (elements >= maxelem) goto set_full; /* Create a new slot */ if (n->pos >= n->size) { @@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); - return -EAGAIN; + ret = -EAGAIN; + goto resize; } old = n; n = kzalloc(sizeof(*n) + (old->size + AHASH_INIT_SIZE) * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } memcpy(n, old, sizeof(struct hbucket) + old->size * set->dsize); n->size = old->size + AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); } copy_elem: j = n->pos++; data = ahash_data(n, j, set->dsize); copy_data: - set->elements++; + t->hregion[r].elements++; #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); + mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); #endif memcpy(data, d, sizeof(struct mtype_elem)); overwrite_extensions: @@ -820,13 +1000,41 @@ overwrite_extensions: if (old) kfree_rcu(old, rcu); } + ret = 0; +resize: + spin_unlock_bh(&t->hregion[r].lock); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side add, save values */ + struct mtype_resize_ad *x; + + x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); + if (!x) + /* Don't bother */ + goto out; + x->ad = IPSET_ADD; + memcpy(&x->d, value, sizeof(struct mtype_elem)); + memcpy(&x->ext, ext, sizeof(struct ip_set_ext)); + memcpy(&x->mext, mext, sizeof(struct ip_set_ext)); + x->flags = flags; + spin_lock_bh(&set->lock); + list_add_tail(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + goto out; - return 0; set_full: if (net_ratelimit()) pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - return -IPSET_ERR_HASH_FULL; + set->name, maxelem); + ret = -IPSET_ERR_HASH_FULL; +unlock: + spin_unlock_bh(&t->hregion[r].lock); +out: + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by add: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + return ret; } /* Delete an element from the hash and free up space if possible. @@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, j, k, ret = -IPSET_ERR_EXIST; + struct mtype_resize_ad *x = NULL; + int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; - t = ipset_dereference_protected(h->table, set); + /* Userspace add and resize is excluded by the mutex. + * Kernespace add does not trigger resize. + */ + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + rcu_read_unlock_bh(); + + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; for (i = 0, k = 0; i < n->pos; i++) { @@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set))) + if (SET_ELEM_EXPIRED(set, data)) goto out; ret = 0; @@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, smp_mb__after_atomic(); if (i + 1 == n->pos) n->pos--; - set->elements--; + t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), - j); + mtype_del_cidr(set, h, + NCIDR_PUT(DCIDR_GET(d->cidr, j)), j); #endif ip_set_ext_destroy(set, data); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side del, + * save values + */ + x = kzalloc(sizeof(struct mtype_resize_ad), + GFP_ATOMIC); + if (x) { + x->ad = IPSET_DEL; + memcpy(&x->d, value, + sizeof(struct mtype_elem)); + x->flags = flags; + } + } for (; i < n->pos; i++) { if (!test_bit(i, n->used)) k++; } if (n->pos == 0 && k == 0) { - set->ext_size -= ext_size(n->size, dsize); + t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); } else if (k >= AHASH_INIT_SIZE) { @@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, k++; } tmp->pos = k; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, key), tmp); kfree_rcu(n, rcu); } @@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } out: + spin_unlock_bh(&t->hregion[r].lock); + if (x) { + spin_lock_bh(&set->lock); + list_add(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by del: %p\n", t); + mtype_ahash_destroy(set, t, false); + } return ret; } @@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; + rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, @@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto out; } out: + rcu_read_unlock_bh(); return ret; } @@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u32 elements = 0; + size_t ext_size = 0; u8 htable_bits; - /* If any members have expired, set->elements will be wrong - * mytype_expire function will update it with the right count. - * we do not hold set->lock here, so grab it first. - * set->elements can still be incorrect in the case of a huge set, - * because elements might time out during the listing. - */ - if (SET_WITH_TIMEOUT(set)) { - spin_lock_bh(&set->lock); - mtype_expire(set, h); - spin_unlock_bh(&set->lock); - } - rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); - memsize = mtype_ahash_memsize(h, t) + set->ext_size; + t = rcu_dereference_bh(h->table); + mtype_ext_size(set, &elements, &ext_size); + memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size; htable_bits = t->htable_bits; rcu_read_unlock_bh(); @@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) #endif if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || - nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) + nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) if (start) { rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_bh_nfnl(h->table); atomic_inc(&t->uref); cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; rcu_read_unlock_bh(); } else if (cb->args[IPSET_CB_PRIVATE]) { t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { - /* Resizing didn't destroy the hash table */ - pr_debug("Table destroy by dump: %p\n", t); + pr_debug("Table destroy after resize " + " by dump: %p\n", t); mtype_ahash_destroy(set, t, false); } cb->args[IPSET_CB_PRIVATE] = 0; @@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + if (SET_ELEM_EXPIRED(set, e)) continue; pr_debug("list hash %lu hbucket %p i %u, data %p\n", cb->args[IPSET_CB_ARG0], n, i, e); @@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .region_lock = true, }; #ifdef IP_SET_EMIT_CREATE @@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, size_t hsize; struct htype *h; struct htable *t; + u32 i; pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); @@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, kfree(h); return -ENOMEM; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); + if (!t->hregion) { + kfree(t); + kfree(h); + return -ENOMEM; + } + h->gc.set = set; + for (i = 0; i < ahash_numof_locks(hbits); i++) + spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; #ifdef IP_SET_HASH_WITH_NETMASK h->netmask = netmask; @@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, get_random_bytes(&h->initval, sizeof(h->initval)); t->htable_bits = hbits; + t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); - h->set = set; + INIT_LIST_HEAD(&h->ad); set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { @@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) #endif - IPSET_TOKEN(HTYPE, 4_gc_init)(set, - IPSET_TOKEN(HTYPE, 4_gc)); + IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc); #ifndef IP_SET_PROTO_UNDEF else - IPSET_TOKEN(HTYPE, 6_gc_init)(set, - IPSET_TOKEN(HTYPE, 6_gc)); + IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc); #endif } pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", -- cgit v1.2.3 From 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sat, 22 Feb 2020 12:01:43 +0100 Subject: netfilter: ipset: Fix forceadd evaluation path When the forceadd option is enabled, the hash:* types should find and replace the first entry in the bucket with the new one if there are no reuseable (deleted or timed out) entries. However, the position index was just not set to zero and remained the invalid -1 if there were no reuseable entries. Reported-by: syzbot+6a86565c74ebe30aea18@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik --- net/netfilter/ipset/ip_set_hash_gen.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 71e93eac0831..e52d7b7597a0 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -931,6 +931,8 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } } if (reuse || forceadd) { + if (j == -1) + j = 0; data = ahash_data(n, j, set->dsize); if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS -- cgit v1.2.3 From 3e72dfdf8227b052393f71d820ec7599909dddc2 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 21 Feb 2020 12:28:38 +0100 Subject: ipv4: ensure rcu_read_lock() in cipso_v4_error() Similarly to commit c543cb4a5f07 ("ipv4: ensure rcu_read_lock() in ipv4_link_failure()"), __ip_options_compile() must be called under rcu protection. Fixes: 3da1ed7ac398 ("net: avoid use IPCB in cipso_v4_error") Suggested-by: Guillaume Nault Signed-off-by: Matteo Croce Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 376882215919..0bd10a1f477f 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1724,6 +1724,7 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; + int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; @@ -1735,7 +1736,11 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); - if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL)) + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); + rcu_read_unlock(); + + if (res) return; if (gateway) -- cgit v1.2.3 From 39f3b41aa7cae917f928ef9f31d09da28188e5ed Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Feb 2020 19:42:13 +0100 Subject: net: genetlink: return the error code when attribute parsing fails. Currently if attribute parsing fails and the genl family does not support parallel operation, the error code returned by __nlmsg_parse() is discarded by genl_family_rcv_msg_attrs_parse(). Be sure to report the error for all genl families. Fixes: c10e6cf85e7d ("net: genetlink: push attrbuf allocation and parsing to a separate function") Fixes: ab5b526da048 ("net: genetlink: always allocate separate attrs for dumpit ops") Signed-off-by: Paolo Abeni Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0522b2b1fd95..9f357aa22b94 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -497,8 +497,9 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, family->policy, validate, extack); - if (err && parallel) { - kfree(attrbuf); + if (err) { + if (parallel) + kfree(attrbuf); return ERR_PTR(err); } return attrbuf; -- cgit v1.2.3 From dad8cea7add96a353fa1898b5ccefbb72da66f29 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Feb 2020 11:21:15 -0500 Subject: tcp: fix TFO SYNACK undo to avoid double-timestamp-undo In a rare corner case the new logic for undo of SYNACK RTO could result in triggering the warning in tcp_fastretrans_alert() that says: WARN_ON(tp->retrans_out != 0); The warning looked like: WARNING: CPU: 1 PID: 1 at net/ipv4/tcp_input.c:2818 tcp_ack+0x13e0/0x3270 The sequence that tickles this bug is: - Fast Open server receives TFO SYN with data, sends SYNACK - (client receives SYNACK and sends ACK, but ACK is lost) - server app sends some data packets - (N of the first data packets are lost) - server receives client ACK that has a TS ECR matching first SYNACK, and also SACKs suggesting the first N data packets were lost - server performs TS undo of SYNACK RTO, then immediately enters recovery - buggy behavior then performed a *second* undo that caused the connection to be in CA_Open with retrans_out != 0 Basically, the incoming ACK packet with SACK blocks causes us to first undo the cwnd reduction from the SYNACK RTO, but then immediately enters fast recovery, which then makes us eligible for undo again. And then tcp_rcv_synrecv_state_fastopen() accidentally performs an undo using a "mash-up" of state from two different loss recovery phases: it uses the timestamp info from the ACK of the original SYNACK, and the undo_marker from the fast recovery. This fix refines the logic to only invoke the tcp_try_undo_loss() inside tcp_rcv_synrecv_state_fastopen() if the connection is still in CA_Loss. If peer SACKs triggered fast recovery, then tcp_rcv_synrecv_state_fastopen() can't safely undo. Fixes: 794200d66273 ("tcp: undo cwnd on Fast Open spurious SYNACK retransmit") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 316ebdf8151d..6b6b57000dad 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6124,7 +6124,11 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { struct request_sock *req; - tcp_try_undo_loss(sk, false); + /* If we are still handling the SYNACK RTO, see if timestamp ECR allows + * undo. If peer SACKs triggered fast recovery, we can't undo here. + */ + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) + tcp_try_undo_loss(sk, false); /* Reset rtx states to prevent spurious retransmits_timed_out() */ tcp_sk(sk)->retrans_stamp = 0; -- cgit v1.2.3 From 6132c1d9033d158bd0464e90bc46544fbe0bd6bc Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 23 Feb 2020 16:52:33 +0530 Subject: net: core: devlink.c: Hold devlink->lock from the beginning of devlink_dpipe_table_register() devlink_dpipe_table_find() should be called under either rcu_read_lock() or devlink->lock. devlink_dpipe_table_register() calls devlink_dpipe_table_find() without holding the lock and acquires it later. Therefore hold the devlink->lock from the beginning of devlink_dpipe_table_register(). Suggested-by: Jiri Pirko Signed-off-by: Madhuparna Bhowmik Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 549ee56b7a21..8d0b558be942 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6878,26 +6878,33 @@ int devlink_dpipe_table_register(struct devlink *devlink, void *priv, bool counter_control_extern) { struct devlink_dpipe_table *table; - - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) - return -EEXIST; + int err = 0; if (WARN_ON(!table_ops->size_get)) return -EINVAL; + mutex_lock(&devlink->lock); + + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) { + err = -EEXIST; + goto unlock; + } + table = kzalloc(sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; + if (!table) { + err = -ENOMEM; + goto unlock; + } table->name = table_name; table->table_ops = table_ops; table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); +unlock: mutex_unlock(&devlink->lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); -- cgit v1.2.3 From e3ae39edbce6dc933fb1393490d1b5d76d3edb90 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Feb 2020 09:38:15 +0100 Subject: nl80211: explicitly include if_vlan.h We use that here, and do seem to get it through some recursive include, but better include it explicitly. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200224093814.1b9c258fec67.I45ac150d4e11c72eb263abec9f1f0c7add9bef2b@changeid Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 46be40e19e7f..5b19e9fac4aa 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 253216ffb2a002a682c6f68bd3adff5b98b71de8 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sun, 23 Feb 2020 20:03:02 +0530 Subject: mac80211: rx: avoid RCU list traversal under mutex local->sta_mtx is held in __ieee80211_check_fast_rx_iface(). No need to use list_for_each_entry_rcu() as it also requires a cond argument to avoid false lockdep warnings when not used in RCU read-side section (with CONFIG_PROVE_RCU_LIST). Therefore use list_for_each_entry(); Signed-off-by: Madhuparna Bhowmik Link: https://lore.kernel.org/r/20200223143302.15390-1-madhuparnabhowmik10@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 0e05ff037672..0ba98ad9bc85 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4114,7 +4114,7 @@ void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&local->sta_mtx); - list_for_each_entry_rcu(sta, &local->sta_list, list) { + list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata && (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) continue; -- cgit v1.2.3 From 823d81b0fa2cd83a640734e74caee338b5d3c093 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 24 Feb 2020 18:46:22 +0200 Subject: net: bridge: fix stale eth hdr pointer in br_dev_xmit In br_dev_xmit() we perform vlan filtering in br_allowed_ingress() but if the packet has the vlan header inside (e.g. bridge with disabled tx-vlan-offload) then the vlan filtering code will use skb_vlan_untag() to extract the vid before filtering which in turn calls pskb_may_pull() and we may end up with a stale eth pointer. Moreover the cached eth header pointer will generally be wrong after that operation. Remove the eth header caching and just use eth_hdr() directly, the compiler does the right thing and calculates it only once so we don't lose anything. Fixes: 057658cb33fb ("bridge: suppress arp pkts on BR_NEIGH_SUPPRESS ports") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_device.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index dc3d2c1dd9d5..0e3dbc5f3c34 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -34,7 +34,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; const unsigned char *dest; - struct ethhdr *eth; u16 vid = 0; rcu_read_lock(); @@ -54,15 +53,14 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) BR_INPUT_SKB_CB(skb)->frag_max_size = 0; skb_reset_mac_header(skb); - eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) goto out; if (IS_ENABLED(CONFIG_INET) && - (eth->h_proto == htons(ETH_P_ARP) || - eth->h_proto == htons(ETH_P_RARP)) && + (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) || + eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); } else if (IS_ENABLED(CONFIG_IPV6) && -- cgit v1.2.3 From 212d58c106fd0f2704664be2bb173e14cb4e86d3 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 21 Feb 2020 03:04:21 +0100 Subject: nft_set_pipapo: Actually fetch key data in nft_pipapo_remove() Phil reports that adding elements, flushing and re-adding them right away: nft add table t '{ set s { type ipv4_addr . inet_service; flags interval; }; }' nft add element t s '{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }' nft flush set t s nft add element t s '{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }' triggers, almost reliably, a crash like this one: [ 71.319848] general protection fault, probably for non-canonical address 0x6f6b6e696c2e756e: 0000 [#1] PREEMPT SMP PTI [ 71.321540] CPU: 3 PID: 1201 Comm: kworker/3:2 Not tainted 5.6.0-rc1-00377-g2bb07f4e1d861 #192 [ 71.322746] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190711_202441-buildvm-armv7-10.arm.fedoraproject.org-2.fc31 04/01/2014 [ 71.324430] Workqueue: events nf_tables_trans_destroy_work [nf_tables] [ 71.325387] RIP: 0010:nft_set_elem_destroy+0xa5/0x110 [nf_tables] [ 71.326164] Code: 89 d4 84 c0 74 0e 8b 77 44 0f b6 f8 48 01 df e8 41 ff ff ff 45 84 e4 74 36 44 0f b6 63 08 45 84 e4 74 2c 49 01 dc 49 8b 04 24 <48> 8b 40 38 48 85 c0 74 4f 48 89 e7 4c 8b [ 71.328423] RSP: 0018:ffffc9000226fd90 EFLAGS: 00010282 [ 71.329225] RAX: 6f6b6e696c2e756e RBX: ffff88813ab79f60 RCX: ffff88813931b5a0 [ 71.330365] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88813ab79f9a [ 71.331473] RBP: ffff88813ab79f60 R08: 0000000000000008 R09: 0000000000000000 [ 71.332627] R10: 000000000000021c R11: 0000000000000000 R12: ffff88813ab79fc2 [ 71.333615] R13: ffff88813b3adf50 R14: dead000000000100 R15: ffff88813931b8a0 [ 71.334596] FS: 0000000000000000(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000 [ 71.335780] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 71.336577] CR2: 000055ac683710f0 CR3: 000000013a222003 CR4: 0000000000360ee0 [ 71.337533] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 71.338557] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 71.339718] Call Trace: [ 71.340093] nft_pipapo_destroy+0x7a/0x170 [nf_tables_set] [ 71.340973] nft_set_destroy+0x20/0x50 [nf_tables] [ 71.341879] nf_tables_trans_destroy_work+0x246/0x260 [nf_tables] [ 71.342916] process_one_work+0x1d5/0x3c0 [ 71.343601] worker_thread+0x4a/0x3c0 [ 71.344229] kthread+0xfb/0x130 [ 71.344780] ? process_one_work+0x3c0/0x3c0 [ 71.345477] ? kthread_park+0x90/0x90 [ 71.346129] ret_from_fork+0x35/0x40 [ 71.346748] Modules linked in: nf_tables_set nf_tables nfnetlink 8021q [last unloaded: nfnetlink] [ 71.348153] ---[ end trace 2eaa8149ca759bcc ]--- [ 71.349066] RIP: 0010:nft_set_elem_destroy+0xa5/0x110 [nf_tables] [ 71.350016] Code: 89 d4 84 c0 74 0e 8b 77 44 0f b6 f8 48 01 df e8 41 ff ff ff 45 84 e4 74 36 44 0f b6 63 08 45 84 e4 74 2c 49 01 dc 49 8b 04 24 <48> 8b 40 38 48 85 c0 74 4f 48 89 e7 4c 8b [ 71.350017] RSP: 0018:ffffc9000226fd90 EFLAGS: 00010282 [ 71.350019] RAX: 6f6b6e696c2e756e RBX: ffff88813ab79f60 RCX: ffff88813931b5a0 [ 71.350019] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff88813ab79f9a [ 71.350020] RBP: ffff88813ab79f60 R08: 0000000000000008 R09: 0000000000000000 [ 71.350021] R10: 000000000000021c R11: 0000000000000000 R12: ffff88813ab79fc2 [ 71.350022] R13: ffff88813b3adf50 R14: dead000000000100 R15: ffff88813931b8a0 [ 71.350025] FS: 0000000000000000(0000) GS:ffff88813bd80000(0000) knlGS:0000000000000000 [ 71.350026] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 71.350027] CR2: 000055ac683710f0 CR3: 000000013a222003 CR4: 0000000000360ee0 [ 71.350028] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 71.350028] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 71.350030] Kernel panic - not syncing: Fatal exception [ 71.350412] Kernel Offset: disabled [ 71.365922] ---[ end Kernel panic - not syncing: Fatal exception ]--- which is caused by dangling elements that have been deactivated, but never removed. On a flush operation, nft_pipapo_walk() walks through all the elements in the mapping table, which are then deactivated by nft_flush_set(), one by one, and added to the commit list for removal. Element data is then freed. On transaction commit, nft_pipapo_remove() is called, and failed to remove these elements, leading to the stale references in the mapping. The first symptom of this, revealed by KASan, is a one-byte use-after-free in subsequent calls to nft_pipapo_walk(), which is usually not enough to trigger a panic. When stale elements are used more heavily, though, such as double-free via nft_pipapo_destroy() as in Phil's case, the problem becomes more noticeable. The issue comes from that fact that, on a flush operation, nft_pipapo_remove() won't get the actual key data via elem->key, elements to be deleted upon commit won't be found by the lookup via pipapo_get(), and removal will be skipped. Key data should be fetched via nft_set_ext_key(), instead. Reported-by: Phil Sutter Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Signed-off-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_pipapo.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index feac8553f6d9..4fc0c924ed5d 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1766,11 +1766,13 @@ static bool pipapo_match_field(struct nft_pipapo_field *f, static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { - const u8 *data = (const u8 *)elem->key.val.data; struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_elem *e = elem->priv; int rules_f0, first_rule = 0; - struct nft_pipapo_elem *e; + const u8 *data; + + data = (const u8 *)nft_set_ext_key(&e->ext); e = pipapo_get(net, set, data, 0); if (IS_ERR(e)) -- cgit v1.2.3 From 6e11d1578fba8d09d03a286740ffcf336d53928c Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Mon, 24 Feb 2020 10:56:00 -0800 Subject: net: Fix Tx hash bound checking Fixes the lower and upper bounds when there are multiple TCs and traffic is on the the same TC on the same device. The lower bound is represented by 'qoffset' and the upper limit for hash value is 'qcount + qoffset'. This gives a clean Rx to Tx queue mapping when there are multiple TCs, as the queue indices for upper TCs will be offset by 'qoffset'. v2: Fixed commit description based on comments. Fixes: 1b837d489e06 ("net: Revoke export for __skb_tx_hash, update it to just be static skb_tx_hash") Fixes: eadec877ce9c ("net: Add support for subordinate traffic classes to netdev_pick_tx") Signed-off-by: Amritha Nambiar Reviewed-by: Alexander Duyck Reviewed-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/core/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e10bd680dc03..c6c985fe7b1b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3076,6 +3076,8 @@ static u16 skb_tx_hash(const struct net_device *dev, if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; while (unlikely(hash >= qcount)) hash -= qcount; return hash + qoffset; -- cgit v1.2.3 From e34f1753eebc428c312527662eb1b529cf260240 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Mon, 24 Feb 2020 20:42:12 +0100 Subject: ethtool: limit bitset size Syzbot reported that ethnl_compact_sanity_checks() can be tricked into reading past the end of ETHTOOL_A_BITSET_VALUE and ETHTOOL_A_BITSET_MASK attributes and even the message by passing a value between (u32)(-31) and (u32)(-1) as ETHTOOL_A_BITSET_SIZE. The problem is that DIV_ROUND_UP(attr_nbits, 32) is 0 for such values so that zero length ETHTOOL_A_BITSET_VALUE will pass the length check but ethnl_bitmap32_not_zero() check would try to access up to 512 MB of attribute "payload". Prevent this overflow byt limiting the bitset size. Technically, compact bitset format would allow bitset sizes up to almost 2^18 (so that the nest size does not exceed U16_MAX) but bitsets used by ethtool are much shorter. S16_MAX, the largest value which can be directly used as an upper limit in policy, should be a reasonable compromise. Fixes: 10b518d4e6dd ("ethtool: netlink bitset handling") Reported-by: syzbot+7fd4ed5b4234ab1fdccd@syzkaller.appspotmail.com Reported-by: syzbot+709b7a64d57978247e44@syzkaller.appspotmail.com Reported-by: syzbot+983cb8fb2d17a7af549d@syzkaller.appspotmail.com Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ethtool/bitset.c | 3 ++- net/ethtool/bitset.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c index 8977fe1f3946..ef9197541cb3 100644 --- a/net/ethtool/bitset.c +++ b/net/ethtool/bitset.c @@ -305,7 +305,8 @@ nla_put_failure: static const struct nla_policy bitset_policy[ETHTOOL_A_BITSET_MAX + 1] = { [ETHTOOL_A_BITSET_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, - [ETHTOOL_A_BITSET_SIZE] = { .type = NLA_U32 }, + [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32, + ETHNL_MAX_BITSET_SIZE), [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h index b8247e34109d..b849f9d19676 100644 --- a/net/ethtool/bitset.h +++ b/net/ethtool/bitset.h @@ -3,6 +3,8 @@ #ifndef _NET_ETHTOOL_BITSET_H #define _NET_ETHTOOL_BITSET_H +#define ETHNL_MAX_BITSET_SIZE S16_MAX + typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN]; int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact); -- cgit v1.2.3 From 99b79c3900d4627672c85d9f344b5b0f06bc2a4d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 12 Feb 2020 22:53:52 -0800 Subject: netfilter: xt_hashlimit: unregister proc file before releasing mutex Before releasing the global mutex, we only unlink the hashtable from the hash list, its proc file is still not unregistered at this point. So syzbot could trigger a race condition where a parallel htable_create() could register the same file immediately after the mutex is released. Move htable_remove_proc_entry() back to mutex protection to fix this. And, fold htable_destroy() into htable_put() to make the code slightly easier to understand. Reported-and-tested-by: syzbot+d195fd3b9a364ddd6731@syzkaller.appspotmail.com Fixes: c4a3922d2d20 ("netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put()") Signed-off-by: Cong Wang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_hashlimit.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 7a2c4b8408c4..8c835ad63729 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -402,15 +402,6 @@ static void htable_remove_proc_entry(struct xt_hashlimit_htable *hinfo) remove_proc_entry(hinfo->name, parent); } -static void htable_destroy(struct xt_hashlimit_htable *hinfo) -{ - cancel_delayed_work_sync(&hinfo->gc_work); - htable_remove_proc_entry(hinfo); - htable_selective_cleanup(hinfo, true); - kfree(hinfo->name); - vfree(hinfo); -} - static struct xt_hashlimit_htable *htable_find_get(struct net *net, const char *name, u_int8_t family) @@ -432,8 +423,13 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) { if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { hlist_del(&hinfo->node); + htable_remove_proc_entry(hinfo); mutex_unlock(&hashlimit_mutex); - htable_destroy(hinfo); + + cancel_delayed_work_sync(&hinfo->gc_work); + htable_selective_cleanup(hinfo, true); + kfree(hinfo->name); + vfree(hinfo); } } -- cgit v1.2.3 From 2eb51c75dcb354f8aef03d7648318b24630632e1 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Tue, 25 Feb 2020 17:57:45 +0530 Subject: net: core: devlink.c: Use built-in RCU list checking list_for_each_entry_rcu() has built-in RCU and lock checking. Pass cond argument to list_for_each_entry_rcu() to silence false lockdep warning when CONFIG_PROVE_RCU_LIST is enabled. The devlink->lock is held when devlink_dpipe_table_find() is called in non RCU read side section. Therefore, pass struct devlink to devlink_dpipe_table_find() for lockdep checking. Signed-off-by: Madhuparna Bhowmik Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 8d0b558be942..5e220809844c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2103,11 +2103,11 @@ err_action_values_put: static struct devlink_dpipe_table * devlink_dpipe_table_find(struct list_head *dpipe_tables, - const char *table_name) + const char *table_name, struct devlink *devlink) { struct devlink_dpipe_table *table; - - list_for_each_entry_rcu(table, dpipe_tables, list) { + list_for_each_entry_rcu(table, dpipe_tables, list, + lockdep_is_held(&devlink->lock)) { if (!strcmp(table->name, table_name)) return table; } @@ -2226,7 +2226,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) return -EINVAL; @@ -2382,7 +2382,7 @@ static int devlink_dpipe_table_counters_set(struct devlink *devlink, struct devlink_dpipe_table *table; table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) return -EINVAL; @@ -6854,7 +6854,7 @@ bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, rcu_read_lock(); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); enabled = false; if (table) enabled = table->counters_enabled; @@ -6885,7 +6885,8 @@ int devlink_dpipe_table_register(struct devlink *devlink, mutex_lock(&devlink->lock); - if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) { + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name, + devlink)) { err = -EEXIST; goto unlock; } @@ -6921,7 +6922,7 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) goto unlock; list_del_rcu(&table->list); @@ -7078,7 +7079,7 @@ int devlink_dpipe_table_resource_set(struct devlink *devlink, mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, - table_name); + table_name, devlink); if (!table) { err = -EINVAL; goto out; -- cgit v1.2.3 From 1521a67e6016664941f0917d50cb20053a8826a2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 25 Feb 2020 13:54:12 +0100 Subject: sched: act: count in the size of action flags bitfield The put of the flags was added by the commit referenced in fixes tag, however the size of the message was not extended accordingly. Fix this by adding size of the flags bitfield to the message size. Fixes: e38226786022 ("net: sched: update action implementations to support flags") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 90a31b15585f..8c466a712cda 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -186,6 +186,7 @@ static size_t tcf_action_shared_attrs_size(const struct tc_action *act) + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) /* TCA_STATS_PKT64 */ -- cgit v1.2.3 From 51e3dfa8906ace90c809235b3d3afebc166b6433 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 25 Feb 2020 16:34:36 +0100 Subject: net/smc: fix cleanup for linkgroup setup failures If an SMC connection to a certain peer is setup the first time, a new linkgroup is created. In case of setup failures, such a linkgroup is unusable and should disappear. As a first step the linkgroup is removed from the linkgroup list in smc_lgr_forget(). There are 2 problems: smc_listen_decline() might be called before linkgroup creation resulting in a crash due to calling smc_lgr_forget() with parameter NULL. If a setup failure occurs after linkgroup creation, the connection is never unregistered from the linkgroup, preventing linkgroup freeing. This patch introduces an enhanced smc_lgr_cleanup_early() function which * contains a linkgroup check for early smc_listen_decline() invocations * invokes smc_conn_free() to guarantee unregistering of the connection. * schedules fast linkgroup removal of the unusable linkgroup And the unused function smcd_conn_free() is removed from smc_core.h. Fixes: 3b2dec2603d5b ("net/smc: restructure client and server code in af_smc") Fixes: 2a0674fffb6bc ("net/smc: improve abnormal termination of link groups") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/af_smc.c | 25 +++++++++++++++---------- net/smc/smc_core.c | 12 ++++++++++++ net/smc/smc_core.h | 2 +- 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 90988a511cd5..6fd44bdb0fc3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -512,15 +512,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) static int smc_connect_abort(struct smc_sock *smc, int reason_code, int local_contact) { + bool is_smcd = smc->conn.lgr->is_smcd; + if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - if (smc->conn.lgr->is_smcd) + smc_lgr_cleanup_early(&smc->conn); + else + smc_conn_free(&smc->conn); + if (is_smcd) /* there is only one lgr role for SMC-D; use server lock */ mutex_unlock(&smc_server_lgr_pending); else mutex_unlock(&smc_client_lgr_pending); - smc_conn_free(&smc->conn); smc->connect_nonblock = 0; return reason_code; } @@ -1091,7 +1094,6 @@ static void smc_listen_out_err(struct smc_sock *new_smc) if (newsmcsk->sk_state == SMC_INIT) sock_put(&new_smc->sk); /* passive closing */ newsmcsk->sk_state = SMC_CLOSED; - smc_conn_free(&new_smc->conn); smc_listen_out(new_smc); } @@ -1102,12 +1104,13 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, { /* RDMA setup failed, switch back to TCP */ if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); if (reason_code < 0) { /* error, no fallback possible */ smc_listen_out_err(new_smc); return; } - smc_conn_free(&new_smc->conn); smc_switch_to_fallback(new_smc); new_smc->fallback_rsn = reason_code; if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) { @@ -1170,16 +1173,18 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, new_smc->conn.lgr->vlan_id, new_smc->conn.lgr->smcd)) { if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - smc_conn_free(&new_smc->conn); + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_SMCDNOTALK; } /* Create send and receive buffers */ if (smc_buf_create(new_smc, true)) { if (ini->cln_first_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - smc_conn_free(&new_smc->conn); + smc_lgr_cleanup_early(&new_smc->conn); + else + smc_conn_free(&new_smc->conn); return SMC_CLC_DECL_MEM; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2249de5379ee..5b085efa3bce 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -162,6 +162,18 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) conn->lgr = NULL; } +void smc_lgr_cleanup_early(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) + return; + + smc_conn_free(conn); + smc_lgr_forget(lgr); + smc_lgr_schedule_free_work_fast(lgr); +} + /* Send delete link, either as client to request the initiation * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c472e12951d1..234ae25f0025 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -296,6 +296,7 @@ struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; void smc_lgr_forget(struct smc_link_group *lgr); +void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate(struct smc_link_group *lgr, bool soft); void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, @@ -316,7 +317,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); -void smcd_conn_free(struct smc_connection *conn); void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); -- cgit v1.2.3 From b6f6118901d1e867ac9177bbff3b00b185bd4fdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Feb 2020 11:52:29 -0800 Subject: ipv6: restrict IPV6_ADDRFORM operation IPV6_ADDRFORM is able to transform IPv6 socket to IPv4 one. While this operation sounds illogical, we have to support it. One of the things it does for TCP socket is to switch sk->sk_prot to tcp_prot. We now have other layers playing with sk->sk_prot, so we should make sure to not interfere with them. This patch makes sure sk_prot is the default pointer for TCP IPv6 socket. syzbot reported : BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD a0113067 P4D a0113067 PUD a8771067 PMD 0 Oops: 0010 [#1] PREEMPT SMP KASAN CPU: 0 PID: 10686 Comm: syz-executor.0 Not tainted 5.6.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:0x0 Code: Bad RIP value. RSP: 0018:ffffc9000281fce0 EFLAGS: 00010246 RAX: 1ffffffff15f48ac RBX: ffffffff8afa4560 RCX: dffffc0000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a69a8f40 RBP: ffffc9000281fd10 R08: ffffffff86ed9b0c R09: ffffed1014d351f5 R10: ffffed1014d351f5 R11: 0000000000000000 R12: ffff8880920d3098 R13: 1ffff1101241a613 R14: ffff8880a69a8f40 R15: 0000000000000000 FS: 00007f2ae75db700(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000a3b85000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: inet_release+0x165/0x1c0 net/ipv4/af_inet.c:427 __sock_release net/socket.c:605 [inline] sock_close+0xe1/0x260 net/socket.c:1283 __fput+0x2e4/0x740 fs/file_table.c:280 ____fput+0x15/0x20 fs/file_table.c:313 task_work_run+0x176/0x1b0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop arch/x86/entry/common.c:164 [inline] prepare_exit_to_usermode+0x480/0x5b0 arch/x86/entry/common.c:195 syscall_return_slowpath+0x113/0x4a0 arch/x86/entry/common.c:278 do_syscall_64+0x11f/0x1c0 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x45c429 Code: ad b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f2ae75dac78 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: 0000000000000000 RBX: 00007f2ae75db6d4 RCX: 000000000045c429 RDX: 0000000000000001 RSI: 000000000000011a RDI: 0000000000000004 RBP: 000000000076bf20 R08: 0000000000000038 R09: 0000000000000000 R10: 0000000020000180 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000000a9d R14: 00000000004ccfb4 R15: 000000000076bf2c Modules linked in: CR2: 0000000000000000 ---[ end trace 82567b5207e87bae ]--- RIP: 0010:0x0 Code: Bad RIP value. RSP: 0018:ffffc9000281fce0 EFLAGS: 00010246 RAX: 1ffffffff15f48ac RBX: ffffffff8afa4560 RCX: dffffc0000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a69a8f40 RBP: ffffc9000281fd10 R08: ffffffff86ed9b0c R09: ffffed1014d351f5 R10: ffffed1014d351f5 R11: 0000000000000000 R12: ffff8880920d3098 R13: 1ffff1101241a613 R14: ffff8880a69a8f40 R15: 0000000000000000 FS: 00007f2ae75db700(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000a3b85000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Eric Dumazet Reported-by: syzbot+1938db17e275e85dc328@syzkaller.appspotmail.com Cc: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 79fc012dd2ca..debdaeba5d8c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -183,9 +183,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } else if (sk->sk_protocol != IPPROTO_TCP) + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } break; - + } else { + break; + } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; -- cgit v1.2.3 From dc24f8b4ecd3d6c4153a1ec1bc2006ab32a41b8d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 26 Feb 2020 12:19:03 +0100 Subject: mptcp: add dummy icsk_sync_mss() syzbot noted that the master MPTCP socket lacks the icsk_sync_mss callback, and was able to trigger a null pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 8e171067 P4D 8e171067 PUD 93fa2067 PMD 0 Oops: 0010 [#1] PREEMPT SMP KASAN CPU: 0 PID: 8984 Comm: syz-executor066 Not tainted 5.6.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:0x0 Code: Bad RIP value. RSP: 0018:ffffc900020b7b80 EFLAGS: 00010246 RAX: 1ffff110124ba600 RBX: 0000000000000000 RCX: ffff88809fefa600 RDX: ffff8880994cdb18 RSI: 0000000000000000 RDI: ffff8880925d3140 RBP: ffffc900020b7bd8 R08: ffffffff870225be R09: fffffbfff140652a R10: fffffbfff140652a R11: 0000000000000000 R12: ffff8880925d35d0 R13: ffff8880925d3140 R14: dffffc0000000000 R15: 1ffff110124ba6ba FS: 0000000001a0b880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 00000000a6d6f000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: cipso_v4_sock_setattr+0x34b/0x470 net/ipv4/cipso_ipv4.c:1888 netlbl_sock_setattr+0x2a7/0x310 net/netlabel/netlabel_kapi.c:989 smack_netlabel security/smack/smack_lsm.c:2425 [inline] smack_inode_setsecurity+0x3da/0x4a0 security/smack/smack_lsm.c:2716 security_inode_setsecurity+0xb2/0x140 security/security.c:1364 __vfs_setxattr_noperm+0x16f/0x3e0 fs/xattr.c:197 vfs_setxattr fs/xattr.c:224 [inline] setxattr+0x335/0x430 fs/xattr.c:451 __do_sys_fsetxattr fs/xattr.c:506 [inline] __se_sys_fsetxattr+0x130/0x1b0 fs/xattr.c:495 __x64_sys_fsetxattr+0xbf/0xd0 fs/xattr.c:495 do_syscall_64+0xf7/0x1c0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440199 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffcadc19e48 EFLAGS: 00000246 ORIG_RAX: 00000000000000be RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440199 RDX: 0000000020000200 RSI: 00000000200001c0 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000003 R09: 00000000004002c8 R10: 0000000000000009 R11: 0000000000000246 R12: 0000000000401a20 R13: 0000000000401ab0 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: CR2: 0000000000000000 Address the issue adding a dummy icsk_sync_mss callback. To properly sync the subflows mss and options list we need some additional infrastructure, which will land to net-next. Reported-by: syzbot+f4dfece964792d80b139@syzkaller.appspotmail.com Fixes: 2303f994b3e1 ("mptcp: Associate MPTCP context with TCP socket") Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e9aa6807b5be..3c19a8efdcea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -543,6 +543,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, } } +static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) +{ + return 0; +} + static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -551,6 +556,7 @@ static int __mptcp_init_sock(struct sock *sk) __set_bit(MPTCP_SEND_SPACE, &msk->flags); msk->first = NULL; + inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; return 0; } -- cgit v1.2.3 From a2f2ef4a54c0d97aa6a8386f4ff23f36ebb488cf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 26 Feb 2020 17:52:46 +0100 Subject: net/smc: check for valid ib_client_data In smc_ib_remove_dev() check if the provided ib device was actually initialized for SMC before. Reported-by: syzbot+84484ccebdd4e5451d91@syzkaller.appspotmail.com Fixes: a4cf0443c414 ("smc: introduce SMC as an IB-client") Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 548632621f4b..d6ba186f67e2 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -573,6 +573,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) struct smc_ib_device *smcibdev; smcibdev = ib_get_client_data(ibdev, &smc_ib_client); + if (!smcibdev || smcibdev->ibdev != ibdev) + return; ib_set_client_data(ibdev, &smc_ib_client, NULL); spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ -- cgit v1.2.3 From 3a12500ed5dd21a63da779ac73503f11085bbc1c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 26 Feb 2020 18:29:53 +0100 Subject: unix: define and set show_fdinfo only if procfs is enabled Follow the pattern used with other *_show_fdinfo functions and only define unix_show_fdinfo and set it in proto_ops if CONFIG_PROCFS is set. Fixes: 3c32da19a858 ("unix: Show number of pending scm files of receive queue in fdinfo") Signed-off-by: Tobias Klauser Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 62c12cb5763e..aa6e2530e1ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -682,6 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } +#ifdef CONFIG_PROCFS static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; @@ -692,6 +693,9 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) seq_printf(m, "scm_fds: %u\n", READ_ONCE(u->scm_stat.nr_fds)); } } +#else +#define unix_show_fdinfo NULL +#endif static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, -- cgit v1.2.3 From 5c05a164d441a1792791175e4959ea9df12f7e2b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 27 Feb 2020 11:52:35 -0800 Subject: unix: It's CONFIG_PROC_FS not CONFIG_PROCFS Fixes: 3a12500ed5dd ("unix: define and set show_fdinfo only if procfs is enabled") Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aa6e2530e1ec..68debcb28fa4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -682,7 +682,7 @@ static int unix_set_peek_off(struct sock *sk, int val) return 0; } -#ifdef CONFIG_PROCFS +#ifdef CONFIG_PROC_FS static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; -- cgit v1.2.3 From 3f74957fcbeab703297ed0f135430414ed7e0dd0 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Wed, 26 Feb 2020 11:58:18 +0100 Subject: vsock: fix potential deadlock in transport->release() Some transports (hyperv, virtio) acquire the sock lock during the .release() callback. In the vsock_stream_connect() we call vsock_assign_transport(); if the socket was previously assigned to another transport, the vsk->transport->release() is called, but the sock lock is already held in the vsock_stream_connect(), causing a deadlock reported by syzbot: INFO: task syz-executor280:9768 blocked for more than 143 seconds. Not tainted 5.6.0-rc1-syzkaller #0 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor280 D27912 9768 9766 0x00000000 Call Trace: context_switch kernel/sched/core.c:3386 [inline] __schedule+0x934/0x1f90 kernel/sched/core.c:4082 schedule+0xdc/0x2b0 kernel/sched/core.c:4156 __lock_sock+0x165/0x290 net/core/sock.c:2413 lock_sock_nested+0xfe/0x120 net/core/sock.c:2938 virtio_transport_release+0xc4/0xd60 net/vmw_vsock/virtio_transport_common.c:832 vsock_assign_transport+0xf3/0x3b0 net/vmw_vsock/af_vsock.c:454 vsock_stream_connect+0x2b3/0xc70 net/vmw_vsock/af_vsock.c:1288 __sys_connect_file+0x161/0x1c0 net/socket.c:1857 __sys_connect+0x174/0x1b0 net/socket.c:1874 __do_sys_connect net/socket.c:1885 [inline] __se_sys_connect net/socket.c:1882 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1882 do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe To avoid this issue, this patch remove the lock acquiring in the .release() callback of hyperv and virtio transports, and it holds the lock when we call vsk->transport->release() in the vsock core. Reported-by: syzbot+731710996d79d0d58fbc@syzkaller.appspotmail.com Fixes: 408624af4c89 ("vsock: use local transport when it is loaded") Signed-off-by: Stefano Garzarella Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- net/vmw_vsock/af_vsock.c | 20 ++++++++++++-------- net/vmw_vsock/hyperv_transport.c | 3 --- net/vmw_vsock/virtio_transport_common.c | 2 -- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9c5b2a91baad..a5f28708e0e7 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -451,6 +451,12 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (vsk->transport == new_transport) return 0; + /* transport->release() must be called with sock lock acquired. + * This path can only be taken during vsock_stream_connect(), + * where we have already held the sock lock. + * In the other cases, this function is called on a new socket + * which is not assigned to any transport. + */ vsk->transport->release(vsk); vsock_deassign_transport(vsk); } @@ -753,20 +759,18 @@ static void __vsock_release(struct sock *sk, int level) vsk = vsock_sk(sk); pending = NULL; /* Compiler warning. */ - /* The release call is supposed to use lock_sock_nested() - * rather than lock_sock(), if a sock lock should be acquired. - */ - if (vsk->transport) - vsk->transport->release(vsk); - else if (sk->sk_type == SOCK_STREAM) - vsock_remove_sock(vsk); - /* When "level" is SINGLE_DEPTH_NESTING, use the nested * version to avoid the warning "possible recursive locking * detected". When "level" is 0, lock_sock_nested(sk, level) * is the same as lock_sock(sk). */ lock_sock_nested(sk, level); + + if (vsk->transport) + vsk->transport->release(vsk); + else if (sk->sk_type == SOCK_STREAM) + vsock_remove_sock(vsk); + sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 3492c021925f..630b851f8150 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -526,12 +526,9 @@ static bool hvs_close_lock_held(struct vsock_sock *vsk) static void hvs_release(struct vsock_sock *vsk) { - struct sock *sk = sk_vsock(vsk); bool remove_sock; - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); remove_sock = hvs_close_lock_held(vsk); - release_sock(sk); if (remove_sock) vsock_remove_sock(vsk); } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index d9f0c9c5425a..f3c4bab2f737 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -829,7 +829,6 @@ void virtio_transport_release(struct vsock_sock *vsk) struct sock *sk = &vsk->sk; bool remove_sock = true; - lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type == SOCK_STREAM) remove_sock = virtio_transport_close(vsk); @@ -837,7 +836,6 @@ void virtio_transport_release(struct vsock_sock *vsk) list_del(&pkt->list); virtio_transport_free_pkt(pkt); } - release_sock(sk); if (remove_sock) vsock_remove_sock(vsk); -- cgit v1.2.3