From 10b68487869031828aede7313c2befc53d6d30ec Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 27 Oct 2014 11:56:06 +0100 Subject: mac80211: flush keys for AP mode on ieee80211_do_stop Userspace can add keys to an AP mode interface before start_ap has been called. If there have been no calls to start_ap/stop_ap in the mean time, the keys will still be around when the interface is brought down. Signed-off-by: Felix Fietkau [adjust comments, fix AP_VLAN case] Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index af237223a8cd..3b9e2b7b3f30 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -898,6 +898,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); RCU_INIT_POINTER(sdata->vif.chanctx_conf, NULL); + /* see comment in the default case below */ + ieee80211_free_keys(sdata, true); /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: @@ -923,17 +925,16 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, /* * When we get here, the interface is marked down. * Free the remaining keys, if there are any - * (shouldn't be, except maybe in WDS mode?) + * (which can happen in AP mode if userspace sets + * keys before the interface is operating, and maybe + * also in WDS mode) * * Force the key freeing to always synchronize_net() * to wait for the RX path in case it is using this - * interface enqueuing frames * at this very time on + * interface enqueuing frames at this very time on * another CPU. */ ieee80211_free_keys(sdata, true); - - /* fall through */ - case NL80211_IFTYPE_AP: skb_queue_purge(&sdata->skb_queue); } -- cgit v1.2.3 From 84469a45a1bedec9918e94ab2f78c5dc0739e4a7 Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Tue, 28 Oct 2014 13:33:04 +0200 Subject: mac80211: use secondary channel offset IE also beacons during CSA If we are switching from an HT40+ to an HT40- channel (or vice-versa), we need the secondary channel offset IE to specify what is the post-CSA offset to be used. This applies both to beacons and to probe responses. In ieee80211_parse_ch_switch_ie() we were ignoring this IE from beacons and using the *current* HT information IE instead. This was causing us to use the same offset as before the switch. Fix that by using the secondary channel offset IE also for beacons and don't ever use the pre-switch offset. Additionally, remove the "beacon" argument from ieee80211_parse_ch_switch_ie(), since it's not needed anymore. Cc: stable@vger.kernel.org Reported-by: Jouni Malinen Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 2 +- net/mac80211/ieee80211_i.h | 3 +-- net/mac80211/mesh.c | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/spectmgmt.c | 18 ++++++------------ 5 files changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 56b53571c807..509bc157ce55 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -805,7 +805,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, memset(¶ms, 0, sizeof(params)); memset(&csa_ie, 0, sizeof(csa_ie)); - err = ieee80211_parse_ch_switch_ie(sdata, elems, beacon, + err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, sta_flags, ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c2aaec4dfcf0..8c68da30595d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1642,7 +1642,6 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * ieee80211_parse_ch_switch_ie - parses channel switch IEs * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame - * @beacon: indicates if the frame was a beacon or probe response * @current_band: indicates the current band * @sta_flags: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted. Only the @@ -1656,7 +1655,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * Return: 0 on success, <0 on error and >0 if there is nothing to parse. */ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, - struct ieee802_11_elems *elems, bool beacon, + struct ieee802_11_elems *elems, enum ieee80211_band current_band, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index e9f99c1e3fad..0c8b2a77d312 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -874,7 +874,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, memset(¶ms, 0, sizeof(params)); memset(&csa_ie, 0, sizeof(csa_ie)); - err = ieee80211_parse_ch_switch_ie(sdata, elems, beacon, band, + err = ieee80211_parse_ch_switch_ie(sdata, elems, band, sta_flags, sdata->vif.addr, &csa_ie); if (err < 0) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2de88704278b..08f51c6d0953 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1072,7 +1072,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, current_band = cbss->channel->band; memset(&csa_ie, 0, sizeof(csa_ie)); - res = ieee80211_parse_ch_switch_ie(sdata, elems, beacon, current_band, + res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, ifmgd->flags, ifmgd->associated->bssid, &csa_ie); if (res < 0) diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 6ab009070084..efeba56c913b 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -22,7 +22,7 @@ #include "wme.h" int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, - struct ieee802_11_elems *elems, bool beacon, + struct ieee802_11_elems *elems, enum ieee80211_band current_band, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) @@ -91,19 +91,13 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, return -EINVAL; } - if (!beacon && sec_chan_offs) { + if (sec_chan_offs) { secondary_channel_offset = sec_chan_offs->sec_chan_offs; - } else if (beacon && ht_oper) { - secondary_channel_offset = - ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET; } else if (!(sta_flags & IEEE80211_STA_DISABLE_HT)) { - /* If it's not a beacon, HT is enabled and the IE not present, - * it's 20 MHz, 802.11-2012 8.5.2.6: - * This element [the Secondary Channel Offset Element] is - * present when switching to a 40 MHz channel. It may be - * present when switching to a 20 MHz channel (in which - * case the secondary channel offset is set to SCN). - */ + /* If the secondary channel offset IE is not present, + * we can't know what's the post-CSA offset, so the + * best we can do is use 20MHz. + */ secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE; } -- cgit v1.2.3 From ff1e417c7c239b7abfe70aa90460a77eaafc7f83 Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Tue, 28 Oct 2014 13:33:05 +0200 Subject: mac80211: schedule the actual switch of the station before CSA count 0 Due to the time it takes to process the beacon that started the CSA process, we may be late for the switch if we try to reach exactly beacon 0. To avoid that, use count - 1 when calculating the switch time. Cc: stable@vger.kernel.org Reported-by: Jouni Malinen Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 08f51c6d0953..93af0f1c9d99 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1168,7 +1168,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, ieee80211_queue_work(&local->hw, &ifmgd->chswitch_work); else mod_timer(&ifmgd->chswitch_timer, - TU_TO_EXP_TIME(csa_ie.count * cbss->beacon_interval)); + TU_TO_EXP_TIME((csa_ie.count - 1) * + cbss->beacon_interval)); } static bool -- cgit v1.2.3 From 46238845bd609a5c0fbe076e1b82b4c5b33360b2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 21 Oct 2014 20:56:42 +0200 Subject: mac80211: properly flush delayed scan work on interface removal When an interface is deleted, an ongoing hardware scan is canceled and the driver must abort the scan, at the very least reporting completion while the interface is removed. However, if it scheduled the work that might only run after everything is said and done, which leads to cfg80211 warning that the scan isn't reported as finished yet; this is no fault of the driver, it already did, but mac80211 hasn't processed it. To fix this situation, flush the delayed work when the interface being removed is the one that was executing the scan. Cc: stable@vger.kernel.org Reported-by: Sujith Manoharan Tested-by: Sujith Manoharan Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 3b9e2b7b3f30..653f5eb07a27 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -766,10 +766,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, int i, flushed; struct ps_data *ps; struct cfg80211_chan_def chandef; + bool cancel_scan; clear_bit(SDATA_STATE_RUNNING, &sdata->state); - if (rcu_access_pointer(local->scan_sdata) == sdata) + cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata; + if (cancel_scan) ieee80211_scan_cancel(local); /* @@ -992,6 +994,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_ps(local, -1); + if (cancel_scan) + flush_delayed_work(&local->scan_work); + if (local->open_count == 0) { ieee80211_stop_device(local); -- cgit v1.2.3 From b8fff407a180286aa683d543d878d98d9fc57b13 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 3 Nov 2014 13:57:46 +0100 Subject: mac80211: fix use-after-free in defragmentation Upon receiving the last fragment, all but the first fragment are freed, but the multicast check for statistics at the end of the function refers to the current skb (the last fragment) causing a use-after-free bug. Since multicast frames cannot be fragmented and we check for this early in the function, just modify that check to also do the accounting to fix the issue. Cc: stable@vger.kernel.org Reported-by: Yosef Khyal Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index b04ca4049c95..a37f9af634cb 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1678,11 +1678,14 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) sc = le16_to_cpu(hdr->seq_ctrl); frag = sc & IEEE80211_SCTL_FRAG; - if (likely((!ieee80211_has_morefrags(fc) && frag == 0) || - is_multicast_ether_addr(hdr->addr1))) { - /* not fragmented */ + if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) + goto out; + + if (is_multicast_ether_addr(hdr->addr1)) { + rx->local->dot11MulticastReceivedFrameCount++; goto out; } + I802_DEBUG_INC(rx->local->rx_handlers_fragments); if (skb_linearize(rx->skb)) @@ -1775,10 +1778,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) out: if (rx->sta) rx->sta->rx_packets++; - if (is_multicast_ether_addr(hdr->addr1)) - rx->local->dot11MulticastReceivedFrameCount++; - else - ieee80211_led_rx(rx->local); + ieee80211_led_rx(rx->local); return RX_CONTINUE; } -- cgit v1.2.3 From c1207c049b204b0a96535dc5416aee331b51e0e1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 2 Nov 2014 18:19:15 -0800 Subject: netfilter: nft_reject_bridge: Fix powerpc build error Fix: net/bridge/netfilter/nft_reject_bridge.c: In function 'nft_reject_br_send_v6_unreach': net/bridge/netfilter/nft_reject_bridge.c:240:3: error: implicit declaration of function 'csum_ipv6_magic' csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, ^ make[3]: *** [net/bridge/netfilter/nft_reject_bridge.o] Error 1 Seen with powerpc:allmodconfig. Fixes: 523b929d5446 ("netfilter: nft_reject_bridge: don't use IP stack to reject traffic") Cc: Pablo Neira Ayuso Signed-off-by: Guenter Roeck Signed-off-by: David S. Miller --- net/bridge/netfilter/nft_reject_bridge.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 654c9018e3e7..48da2c54a69e 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "../br_private.h" -- cgit v1.2.3 From 6c6151daaf2d8dc2046d9926539feed5f66bf74e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 3 Nov 2014 09:19:27 +0100 Subject: ip6_tunnel: Use ip6_tnl_dev_init as the ndo_init function. ip6_tnl_dev_init() sets the dev->iflink via a call to ip6_tnl_link_config(). After that, register_netdevice() sets dev->iflink = -1. So we loose the iflink configuration for ipv6 tunnels. Fix this by using ip6_tnl_dev_init() as the ndo_init function. Then ip6_tnl_dev_init() is called after dev->iflink is set to -1 from register_netdevice(). Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9409887fb664..9cb94cfa0ae7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -272,9 +272,6 @@ static int ip6_tnl_create2(struct net_device *dev) int err; t = netdev_priv(dev); - err = ip6_tnl_dev_init(dev); - if (err < 0) - goto out; err = register_netdevice(dev); if (err < 0) @@ -1462,6 +1459,7 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) static const struct net_device_ops ip6_tnl_netdev_ops = { + .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_xmit, .ndo_do_ioctl = ip6_tnl_ioctl, @@ -1546,16 +1544,10 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - int err = ip6_tnl_dev_init_gen(dev); - - if (err) - return err; t->parms.proto = IPPROTO_IPV6; dev_hold(dev); - ip6_tnl_link_config(t); - rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } -- cgit v1.2.3 From 16a0231bf7dc3fb37e9b1f1cb1a277dc220b5c5e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 3 Nov 2014 09:19:28 +0100 Subject: vti6: Use vti6_dev_init as the ndo_init function. vti6_dev_init() sets the dev->iflink via a call to vti6_link_config(). After that, register_netdevice() sets dev->iflink = -1. So we loose the iflink configuration for vti6 tunnels. Fix this by using vti6_dev_init() as the ndo_init function. Then vti6_dev_init() is called after dev->iflink is set to -1 from register_netdevice(). Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index d440bb585524..31089d153fd3 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -172,10 +172,6 @@ static int vti6_tnl_create2(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; - err = vti6_dev_init(dev); - if (err < 0) - goto out; - err = register_netdevice(dev); if (err < 0) goto out; @@ -783,6 +779,7 @@ static int vti6_change_mtu(struct net_device *dev, int new_mtu) } static const struct net_device_ops vti6_netdev_ops = { + .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_do_ioctl = vti6_ioctl, @@ -852,16 +849,10 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); - int err = vti6_dev_init_gen(dev); - - if (err) - return err; t->parms.proto = IPPROTO_IPV6; dev_hold(dev); - vti6_link_config(t); - rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } -- cgit v1.2.3 From ebe084aafb7e93adf210e80043c9f69adf56820d Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 3 Nov 2014 09:19:29 +0100 Subject: sit: Use ipip6_tunnel_init as the ndo_init function. ipip6_tunnel_init() sets the dev->iflink via a call to ipip6_tunnel_bind_dev(). After that, register_netdevice() sets dev->iflink = -1. So we loose the iflink configuration for ipv6 tunnels. Fix this by using ipip6_tunnel_init() as the ndo_init function. Then ipip6_tunnel_init() is called after dev->iflink is set to -1 from register_netdevice(). Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/sit.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 58e5b4710127..a24557a1c1d8 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -195,10 +195,8 @@ static int ipip6_tunnel_create(struct net_device *dev) struct sit_net *sitn = net_generic(net, sit_net_id); int err; - err = ipip6_tunnel_init(dev); - if (err < 0) - goto out; - ipip6_tunnel_clone_6rd(dev, sitn); + memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + memcpy(dev->broadcast, &t->parms.iph.daddr, 4); if ((__force u16)t->parms.i_flags & SIT_ISATAP) dev->priv_flags |= IFF_ISATAP; @@ -207,7 +205,8 @@ static int ipip6_tunnel_create(struct net_device *dev) if (err < 0) goto out; - strcpy(t->parms.name, dev->name); + ipip6_tunnel_clone_6rd(dev, sitn); + dev->rtnl_link_ops = &sit_link_ops; dev_hold(dev); @@ -1330,6 +1329,7 @@ static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) } static const struct net_device_ops ipip6_netdev_ops = { + .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, @@ -1378,9 +1378,7 @@ static int ipip6_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); - memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); @@ -1405,7 +1403,6 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); iph->version = 4; iph->protocol = IPPROTO_IPV6; -- cgit v1.2.3 From f03eb128e3f4276f46442d14f3b8f864f3775821 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 3 Nov 2014 09:19:30 +0100 Subject: gre6: Move the setting of dev->iflink into the ndo_init functions. Otherwise it gets overwritten by register_netdev(). Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 12c3c8ef3849..4564e1fca3eb 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -961,8 +961,6 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) else dev->flags &= ~IFF_POINTOPOINT; - dev->iflink = p->link; - /* Precalculate GRE options length */ if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { if (t->parms.o_flags&GRE_CSUM) @@ -1272,6 +1270,7 @@ static int ip6gre_tunnel_init(struct net_device *dev) u64_stats_init(&ip6gre_tunnel_stats->syncp); } + dev->iflink = tunnel->parms.link; return 0; } @@ -1481,6 +1480,8 @@ static int ip6gre_tap_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; + dev->iflink = tunnel->parms.link; + return 0; } -- cgit v1.2.3 From 45cac46e51da75628ac2a593c70f5144abb9b31d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 3 Nov 2014 19:38:37 -0800 Subject: geneve: Set GSO type on transmit. Geneve does not currently set the inner protocol type when transmitting packets. This causes GSO segmentation to fail on NICs that do not support Geneve offloading. CC: Andy Zhou Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 065cd94c640c..6e5266cf403d 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -144,6 +144,8 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, tos, ttl, df, src_port, dst_port, xnet); } -- cgit v1.2.3 From d3ca9eafc0ed97b8f56fdf23655cfece89c48354 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 3 Nov 2014 19:38:38 -0800 Subject: geneve: Unregister pernet subsys on module unload. The pernet ops aren't ever unregistered, which causes a memory leak and an OOPs if the module is ever reinserted. Fixes: 0b5e8b8eeae4 ("net: Add Geneve tunneling protocol driver") CC: Andy Zhou Signed-off-by: Jesse Gross Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/geneve.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 6e5266cf403d..dedb21e99914 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -366,6 +366,7 @@ late_initcall(geneve_init_module); static void __exit geneve_cleanup_module(void) { destroy_workqueue(geneve_wq); + unregister_pernet_subsys(&geneve_net_ops); } module_exit(geneve_cleanup_module); -- cgit v1.2.3 From 1f37bf87aa7523d28e7e4c4f7bb5dba98faa3e00 Mon Sep 17 00:00:00 2001 From: Marcelo Leitner Date: Tue, 4 Nov 2014 17:15:08 -0200 Subject: tcp: zero retrans_stamp if all retrans were acked Ueki Kohei reported that when we are using NewReno with connections that have a very low traffic, we may timeout the connection too early if a second loss occurs after the first one was successfully acked but no data was transfered later. Below is his description of it: When SACK is disabled, and a socket suffers multiple separate TCP retransmissions, that socket's ETIMEDOUT value is calculated from the time of the *first* retransmission instead of the *latest* retransmission. This happens because the tcp_sock's retrans_stamp is set once then never cleared. Take the following connection: Linux remote-machine | | send#1---->(*1)|--------> data#1 --------->| | | | RTO : : | | | ---(*2)|----> data#1(retrans) ---->| | (*3)|<---------- ACK <----------| | | | | : : | : : | : : 16 minutes (or more) : | : : | : : | : : | | | send#2---->(*4)|--------> data#2 --------->| | | | RTO : : | | | ---(*5)|----> data#2(retrans) ---->| | | | | | | RTO*2 : : | | | | | | ETIMEDOUT<----(*6)| | (*1) One data packet sent. (*2) Because no ACK packet is received, the packet is retransmitted. (*3) The ACK packet is received. The transmitted packet is acknowledged. At this point the first "retransmission event" has passed and been recovered from. Any future retransmission is a completely new "event". (*4) After 16 minutes (to correspond with retries2=15), a new data packet is sent. Note: No data is transmitted between (*3) and (*4). The socket's timeout SHOULD be calculated from this point in time, but instead it's calculated from the prior "event" 16 minutes ago. (*5) Because no ACK packet is received, the packet is retransmitted. (*6) At the time of the 2nd retransmission, the socket returns ETIMEDOUT. Therefore, now we clear retrans_stamp as soon as all data during the loss window is fully acked. Reported-by: Ueki Kohei Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 60 +++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a12b455928e5..88fa2d160685 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2315,6 +2315,35 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) /* Undo procedures. */ +/* We can clear retrans_stamp when there are no retransmissions in the + * window. It would seem that it is trivially available for us in + * tp->retrans_out, however, that kind of assumptions doesn't consider + * what will happen if errors occur when sending retransmission for the + * second time. ...It could the that such segment has only + * TCPCB_EVER_RETRANS set at the present time. It seems that checking + * the head skb is enough except for some reneging corner cases that + * are not worth the effort. + * + * Main reason for all this complexity is the fact that connection dying + * time now depends on the validity of the retrans_stamp, in particular, + * that successive retransmissions of a segment must not advance + * retrans_stamp under any conditions. + */ +static bool tcp_any_retrans_done(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (tp->retrans_out) + return true; + + skb = tcp_write_queue_head(sk); + if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) + return true; + + return false; +} + #if FASTRETRANS_DEBUG > 1 static void DBGUNDO(struct sock *sk, const char *msg) { @@ -2410,6 +2439,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ tcp_moderate_cwnd(tp); + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; return true; } tcp_set_ca_state(sk, TCP_CA_Open); @@ -2430,35 +2461,6 @@ static bool tcp_try_undo_dsack(struct sock *sk) return false; } -/* We can clear retrans_stamp when there are no retransmissions in the - * window. It would seem that it is trivially available for us in - * tp->retrans_out, however, that kind of assumptions doesn't consider - * what will happen if errors occur when sending retransmission for the - * second time. ...It could the that such segment has only - * TCPCB_EVER_RETRANS set at the present time. It seems that checking - * the head skb is enough except for some reneging corner cases that - * are not worth the effort. - * - * Main reason for all this complexity is the fact that connection dying - * time now depends on the validity of the retrans_stamp, in particular, - * that successive retransmissions of a segment must not advance - * retrans_stamp under any conditions. - */ -static bool tcp_any_retrans_done(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if (tp->retrans_out) - return true; - - skb = tcp_write_queue_head(sk); - if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) - return true; - - return false; -} - /* Undo during loss recovery after partial ACK or using F-RTO. */ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { -- cgit v1.2.3 From 4f031fa9f188b2b0641ac20087d9e16bcfb4e49d Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Thu, 6 Nov 2014 11:52:13 +0100 Subject: mac80211: Fix regression that triggers a kernel BUG with CCMP Commit 7ec7c4a9a686c608315739ab6a2b0527a240883c (mac80211: port CCMP to cryptoapi's CCM driver) introduced a regression when decrypting empty packets (data_len == 0). This will lead to backtraces like: (scatterwalk_start) from [] (scatterwalk_map_and_copy+0x2c/0xa8) (scatterwalk_map_and_copy) from [] (crypto_ccm_decrypt+0x7c/0x25c) (crypto_ccm_decrypt) from [] (ieee80211_aes_ccm_decrypt+0x160/0x170) (ieee80211_aes_ccm_decrypt) from [] (ieee80211_crypto_ccmp_decrypt+0x1ac/0x238) (ieee80211_crypto_ccmp_decrypt) from [] (ieee80211_rx_handlers+0x870/0x1d24) (ieee80211_rx_handlers) from [] (ieee80211_prepare_and_rx_handle+0x8a0/0x91c) (ieee80211_prepare_and_rx_handle) from [] (ieee80211_rx+0x568/0x730) (ieee80211_rx) from [] (__carl9170_rx+0x94c/0xa20) (__carl9170_rx) from [] (carl9170_rx_stream+0x1fc/0x320) (carl9170_rx_stream) from [] (carl9170_usb_tasklet+0x80/0xc8) (carl9170_usb_tasklet) from [] (tasklet_hi_action+0x88/0xcc) (tasklet_hi_action) from [] (__do_softirq+0xcc/0x200) (__do_softirq) from [] (irq_exit+0x80/0xe0) (irq_exit) from [] (handle_IRQ+0x64/0x80) (handle_IRQ) from [] (__irq_svc+0x40/0x4c) (__irq_svc) from [] (arch_cpu_idle+0x2c/0x34) Such packets can appear for example when using the carl9170 wireless driver because hardware sometimes generates garbage when the internal FIFO overruns. This patch adds an additional length check. Cc: stable@vger.kernel.org Fixes: 7ec7c4a9a686 ("mac80211: port CCMP to cryptoapi's CCM driver") Acked-by: Ard Biesheuvel Signed-off-by: Ronald Wahl Signed-off-by: Johannes Berg --- net/mac80211/aes_ccm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c index ec24378caaaf..09d9caaec591 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aes_ccm.c @@ -53,6 +53,9 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, __aligned(__alignof__(struct aead_request)); struct aead_request *aead_req = (void *) aead_req_data; + if (data_len == 0) + return -EINVAL; + memset(aead_req, 0, sizeof(aead_req_data)); sg_init_one(&pt, data, data_len); -- cgit v1.2.3 From b31f65fb4383a49bdcfa465176754b37e44e1e17 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 5 Nov 2014 19:47:28 +0100 Subject: net: dsa: slave: Fix autoneg for phys on switch MDIO bus When the ports phys are connected to the switches internal MDIO bus, we need to connect the phy to the slave netdev, otherwise auto-negotiation etc, does not work. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/slave.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6d1817449c36..ab03e00ffe8f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -489,11 +489,14 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p, /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ - if (!p->phy) + if (!p->phy) { p->phy = ds->slave_mii_bus->phy_map[p->port]; - else + phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, + p->phy_interface); + } else { pr_info("attached PHY at address %d [%s]\n", p->phy->addr, p->phy->drv->name); + } } int dsa_slave_suspend(struct net_device *slave_dev) -- cgit v1.2.3 From 6b96686ecffcbea85dcb502e4584e4a20a2bfb29 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Nov 2014 15:34:54 +0100 Subject: netfilter: nft_masq: fix uninitialized range in nft_masq_{ipv4, ipv6}_eval When transferring from the original range in nf_nat_masquerade_{ipv4,ipv6}() we copy over values from stack in from min_proto/max_proto due to uninitialized range variable in both, nft_masq_{ipv4,ipv6}_eval. As we only initialize flags at this time from nft_masq struct, just zero out the rest. Fixes: 9ba1f726bec09 ("netfilter: nf_tables: add new nft_masq expression") Signed-off-by: Daniel Borkmann Acked-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_masq_ipv4.c | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index c1023c445920..665de06561cd 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -24,6 +24,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, struct nf_nat_range range; unsigned int verdict; + memset(&range, 0, sizeof(range)); range.flags = priv->flags; verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 8a7ac685076d..529c119cbb14 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -25,6 +25,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, struct nf_nat_range range; unsigned int verdict; + memset(&range, 0, sizeof(range)); range.flags = priv->flags; verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); -- cgit v1.2.3 From cfdf1e1ba5bf55e095cf4bcaa9585c4759f239e8 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 10 Nov 2014 11:45:13 -0800 Subject: udptunnel: Add SKB_GSO_UDP_TUNNEL during gro_complete. When doing GRO processing for UDP tunnels, we never add SKB_GSO_UDP_TUNNEL to gso_type - only the type of the inner protocol is added (such as SKB_GSO_TCPV4). The result is that if the packet is later resegmented we will do GSO but not treat it as a tunnel. This results in UDP fragmentation of the outer header instead of (i.e.) TCP segmentation of the inner header as was originally on the wire. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/ipv4/fou.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 32e78924e246..606c520ffd5a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -133,6 +133,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff) int err = -ENOSYS; const struct net_offload **offloads; + udp_tunnel_gro_complete(skb, nhoff); + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); -- cgit v1.2.3 From 2196937e12b1b4ba139806d132647e1651d655df Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Nov 2014 17:11:21 +0100 Subject: netfilter: ipset: small potential read beyond the end of buffer We could be reading 8 bytes into a 4 byte buffer here. It seems harmless but adding a check is the right thing to do and it silences a static checker warning. Signed-off-by: Dan Carpenter Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 86f9d76b1464..d259da3ce67a 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1863,6 +1863,12 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ struct ip_set_req_version *req_version = data; + + if (*len < sizeof(struct ip_set_req_version)) { + ret = -EINVAL; + goto done; + } + if (req_version->version != IPSET_PROTOCOL) { ret = -EPROTO; goto done; -- cgit v1.2.3 From e40607cbe270a9e8360907cb1e62ddf0736e4864 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Nov 2014 17:54:26 +0100 Subject: net: sctp: fix NULL pointer dereference in af->from_addr_param on malformed packet An SCTP server doing ASCONF will panic on malformed INIT ping-of-death in the form of: ------------ INIT[PARAM: SET_PRIMARY_IP] ------------> While the INIT chunk parameter verification dissects through many things in order to detect malformed input, it misses to actually check parameters inside of parameters. E.g. RFC5061, section 4.2.4 proposes a 'set primary IP address' parameter in ASCONF, which has as a subparameter an address parameter. So an attacker may send a parameter type other than SCTP_PARAM_IPV4_ADDRESS or SCTP_PARAM_IPV6_ADDRESS, param_type2af() will subsequently return 0 and thus sctp_get_af_specific() returns NULL, too, which we then happily dereference unconditionally through af->from_addr_param(). The trace for the log: BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 IP: [] sctp_process_init+0x492/0x990 [sctp] PGD 0 Oops: 0000 [#1] SMP [...] Pid: 0, comm: swapper Not tainted 2.6.32-504.el6.x86_64 #1 Bochs Bochs RIP: 0010:[] [] sctp_process_init+0x492/0x990 [sctp] [...] Call Trace: [] ? sctp_bind_addr_copy+0x5d/0xe0 [sctp] [] sctp_sf_do_5_1B_init+0x21b/0x340 [sctp] [] sctp_do_sm+0x71/0x1210 [sctp] [] ? sctp_endpoint_lookup_assoc+0xc9/0xf0 [sctp] [] sctp_endpoint_bh_rcv+0x116/0x230 [sctp] [] sctp_inq_push+0x56/0x80 [sctp] [] sctp_rcv+0x982/0xa10 [sctp] [] ? ipt_local_in_hook+0x23/0x28 [iptable_filter] [] ? nf_iterate+0x69/0xb0 [] ? ip_local_deliver_finish+0x0/0x2d0 [] ? nf_hook_slow+0x76/0x120 [] ? ip_local_deliver_finish+0x0/0x2d0 [...] A minimal way to address this is to check for NULL as we do on all other such occasions where we know sctp_get_af_specific() could possibly return with NULL. Fixes: d6de3097592b ("[SCTP]: Add the handling of "Set Primary IP Address" parameter to INIT") Signed-off-by: Daniel Borkmann Cc: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ab734be8cb20..9f32741abb1c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2609,6 +2609,9 @@ do_addr_param: addr_param = param.v + sizeof(sctp_addip_param_t); af = sctp_get_af_specific(param_type2af(param.p->type)); + if (af == NULL) + break; + af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0); -- cgit v1.2.3 From 4184b2a79a7612a9272ce20d639934584a1f3786 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Nov 2014 18:00:09 +0100 Subject: net: sctp: fix memory leak in auth key management A very minimal and simple user space application allocating an SCTP socket, setting SCTP_AUTH_KEY setsockopt(2) on it and then closing the socket again will leak the memory containing the authentication key from user space: unreferenced object 0xffff8800837047c0 (size 16): comm "a.out", pid 2789, jiffies 4296954322 (age 192.258s) hex dump (first 16 bytes): 01 00 00 00 04 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0xe8/0x270 [] sctp_auth_create_key+0x23/0x50 [sctp] [] sctp_auth_set_key+0xa1/0x140 [sctp] [] sctp_setsockopt+0xd03/0x1180 [sctp] [] sock_common_setsockopt+0x14/0x20 [] SyS_setsockopt+0x71/0xd0 [] system_call_fastpath+0x12/0x17 [] 0xffffffffffffffff This is bad because of two things, we can bring down a machine from user space when auth_enable=1, but also we would leave security sensitive keying material in memory without clearing it after use. The issue is that sctp_auth_create_key() already sets the refcount to 1, but after allocation sctp_auth_set_key() does an additional refcount on it, and thus leaving it around when we free the socket. Fixes: 65b07e5d0d0 ("[SCTP]: API updates to suport SCTP-AUTH extensions.") Signed-off-by: Daniel Borkmann Cc: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/auth.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 0e8529113dc5..fb7976aee61c 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -862,8 +862,6 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, list_add(&cur_key->key_list, sh_keys); cur_key->key = key; - sctp_auth_key_hold(key); - return 0; nomem: if (!replace) -- cgit v1.2.3 From 5337b5b75cd9bd3624a6820e3c2a084d2480061c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 10 Nov 2014 17:54:25 -0800 Subject: ipv6: fix IPV6_PKTINFO with v4 mapped Use IS_ENABLED(CONFIG_IPV6), to enable this code if IPv6 is a module. Signed-off-by: Eric Dumazet Fixes: c8e6ad0829a7 ("ipv6: honor IPV6_PKTINFO with v4 mapped addresses on sendmsg") Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9ad4555..9daf2177dc00 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -195,7 +195,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; -#if defined(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_IPV6) if (allow_ipv6 && cmsg->cmsg_level == SOL_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { -- cgit v1.2.3 From 50656d9df63d69ce399c8be62d4473b039dac36a Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Tue, 4 Nov 2014 16:37:40 -0800 Subject: ipvs: Keep skb->sk when allocating headroom on tunnel xmit ip_vs_prepare_tunneled_skb() ignores ->sk when allocating a new skb, either unconditionally setting ->sk to NULL or allowing the uninitialized ->sk from a newly allocated skb to leak through to the caller. This patch properly copies ->sk and increments its reference count. Signed-off-by: Calvin Owens Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 437a3663ad03..bd90bf8107da 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -846,6 +846,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto error; + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } -- cgit v1.2.3 From 2daf1b4d18e3add229d1a3b5c554331d99ac6c7e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 7 Nov 2014 18:48:33 +0100 Subject: netfilter: nft_compat: use current net namespace Instead of init_net when using xtables over nftables compat. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 9d6d6f60a80f..b92f129beade 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -117,7 +117,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, struct xt_target *target, void *info, union nft_entry *entry, u8 proto, bool inv) { - par->net = &init_net; + par->net = ctx->net; par->table = ctx->table->name; switch (ctx->afi->family) { case AF_INET: @@ -324,7 +324,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, union nft_entry *entry, u8 proto, bool inv) { - par->net = &init_net; + par->net = ctx->net; par->table = ctx->table->name; switch (ctx->afi->family) { case AF_INET: -- cgit v1.2.3 From c918687f5e3962375a19de6ded3c1be85ebdbcd6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Nov 2014 20:53:55 +0100 Subject: netfilter: nft_compat: relax chain type validation Check for nat chain dependency only, which is the one that can actually crash the kernel. Don't care if mangle, filter and security specific match and targets are used out of their scope, they are harmless. This restores iptables-compat with mangle specific match/target when used out of the OUTPUT chain, that are actually emulated through filter chains, which broke when performing strict validation. Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index b92f129beade..70dc96516305 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -21,45 +21,17 @@ #include #include -static const struct { - const char *name; - u8 type; -} table_to_chaintype[] = { - { "filter", NFT_CHAIN_T_DEFAULT }, - { "raw", NFT_CHAIN_T_DEFAULT }, - { "security", NFT_CHAIN_T_DEFAULT }, - { "mangle", NFT_CHAIN_T_ROUTE }, - { "nat", NFT_CHAIN_T_NAT }, - { }, -}; - -static int nft_compat_table_to_chaintype(const char *table) -{ - int i; - - for (i = 0; table_to_chaintype[i].name != NULL; i++) { - if (strcmp(table_to_chaintype[i].name, table) == 0) - return table_to_chaintype[i].type; - } - - return -1; -} - static int nft_compat_chain_validate_dependency(const char *tablename, const struct nft_chain *chain) { - enum nft_chain_type type; const struct nft_base_chain *basechain; if (!tablename || !(chain->flags & NFT_BASE_CHAIN)) return 0; - type = nft_compat_table_to_chaintype(tablename); - if (type < 0) - return -EINVAL; - basechain = nft_base_chain(chain); - if (basechain->type->type != type) + if (strcmp(tablename, "nat") == 0 && + basechain->type->type != NFT_CHAIN_T_NAT) return -EINVAL; return 0; -- cgit v1.2.3 From afefb6f928ed42d5db452ee9251ce6de62673c67 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Nov 2014 19:08:21 +0100 Subject: netfilter: nft_compat: use the match->table to validate dependencies Instead of the match->name, which is of course not relevant. Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 70dc96516305..265e190f2218 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -346,7 +346,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, union nft_entry e = {}; int ret; - ret = nft_compat_chain_validate_dependency(match->name, ctx->chain); + ret = nft_compat_chain_validate_dependency(match->table, ctx->chain); if (ret < 0) goto err; @@ -420,7 +420,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, if (!(hook_mask & match->hooks)) return -EINVAL; - ret = nft_compat_chain_validate_dependency(match->name, + ret = nft_compat_chain_validate_dependency(match->table, ctx->chain); if (ret < 0) return ret; -- cgit v1.2.3 From b326dd37b94e29bf6a15940f4fa66aa21a678ab1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 10 Nov 2014 21:14:12 +0100 Subject: netfilter: nf_tables: restore synchronous object release from commit/abort The existing xtables matches and targets, when used from nft_compat, may sleep from the destroy path, ie. when removing rules. Since the objects are released via call_rcu from softirq context, this results in lockdep splats and possible lockups that may be hard to reproduce. Patrick also indicated that delayed object release via call_rcu can cause us problems in the ordering of event notifications when anonymous sets are in place. So, this patch restores the synchronous object release from the commit and abort paths. This includes a call to synchronize_rcu() to make sure that no packets are walking on the objects that are going to be released. This is slowier though, but it's simple and it resolves the aforementioned problems. This is a partial revert of c7c32e7 ("netfilter: nf_tables: defer all object release via rcu") that was introduced in 3.16 to speed up interaction with userspace. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 11ab4b078f3b..66e8425dbfe7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3484,13 +3484,8 @@ static void nft_chain_commit_update(struct nft_trans *trans) } } -/* Schedule objects for release via rcu to make sure no packets are accesing - * removed rules. - */ -static void nf_tables_commit_release_rcu(struct rcu_head *rt) +static void nf_tables_commit_release(struct nft_trans *trans) { - struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); - switch (trans->msg_type) { case NFT_MSG_DELTABLE: nf_tables_table_destroy(&trans->ctx); @@ -3612,10 +3607,11 @@ static int nf_tables_commit(struct sk_buff *skb) } } + synchronize_rcu(); + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { list_del(&trans->list); - trans->ctx.nla = NULL; - call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); + nf_tables_commit_release(trans); } nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); @@ -3623,13 +3619,8 @@ static int nf_tables_commit(struct sk_buff *skb) return 0; } -/* Schedule objects for release via rcu to make sure no packets are accesing - * aborted rules. - */ -static void nf_tables_abort_release_rcu(struct rcu_head *rt) +static void nf_tables_abort_release(struct nft_trans *trans) { - struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head); - switch (trans->msg_type) { case NFT_MSG_NEWTABLE: nf_tables_table_destroy(&trans->ctx); @@ -3725,11 +3716,12 @@ static int nf_tables_abort(struct sk_buff *skb) } } + synchronize_rcu(); + list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { list_del(&trans->list); - trans->ctx.nla = NULL; - call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu); + nf_tables_abort_release(trans); } return 0; -- cgit v1.2.3 From 6251edd932ce3faadbfe27b0a0fe79780e0972e9 Mon Sep 17 00:00:00 2001 From: Hiroaki SHIMODA Date: Thu, 13 Nov 2014 04:24:10 +0900 Subject: netlink: Properly unbind in error conditions. Even if netlink_kernel_cfg::unbind is implemented the unbind() method is not called, because cfg->unbind is omitted in __netlink_kernel_create(). And fix wrong argument of test_bit() and off by one problem. At this point, no unbind() method is implemented, so there is no real issue. Fixes: 4f520900522f ("netlink: have netlink per-protocol bind function return an error code.") Signed-off-by: Hiroaki SHIMODA Cc: Richard Guy Briggs Acked-by: Richard Guy Briggs Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1de72de273e..0007b8180397 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1440,7 +1440,7 @@ static void netlink_unbind(int group, long unsigned int groups, return; for (undo = 0; undo < group; undo++) - if (test_bit(group, &groups)) + if (test_bit(undo, &groups)) nlk->netlink_unbind(undo); } @@ -1492,7 +1492,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, netlink_insert(sk, net, nladdr->nl_pid) : netlink_autobind(sock); if (err) { - netlink_unbind(nlk->ngroups - 1, groups, nlk); + netlink_unbind(nlk->ngroups, groups, nlk); return err; } } @@ -2509,6 +2509,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; + nl_table[unit].unbind = cfg->unbind; nl_table[unit].flags = cfg->flags; if (cfg->compare) nl_table[unit].compare = cfg->compare; -- cgit v1.2.3 From b3ecba096729f521312d1863ad22530695527aed Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 13 Nov 2014 07:30:46 -0500 Subject: sunrpc: fix sleeping under rcu_read_lock in gss_stringify_acceptor Bruce reported that he was seeing the following BUG pop: BUG: sleeping function called from invalid context at mm/slab.c:2846 in_atomic(): 0, irqs_disabled(): 0, pid: 4539, name: mount.nfs 2 locks held by mount.nfs/4539: #0: (nfs_clid_init_mutex){+.+.+.}, at: [] nfs4_discover_server_trunking+0x4a/0x2f0 [nfsv4] #1: (rcu_read_lock){......}, at: [] gss_stringify_acceptor+0x5/0xb0 [auth_rpcgss] Preemption disabled at:[] printk+0x4d/0x4f CPU: 3 PID: 4539 Comm: mount.nfs Not tainted 3.18.0-rc1-00013-g5b095e9 #3393 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 ffff880021499390 ffff8800381476a8 ffffffff81a534cf 0000000000000001 0000000000000000 ffff8800381476c8 ffffffff81097854 00000000000000d0 0000000000000018 ffff880038147718 ffffffff8118e4f3 0000000020479f00 Call Trace: [] dump_stack+0x4f/0x7c [] __might_sleep+0x114/0x180 [] __kmalloc+0x1a3/0x280 [] gss_stringify_acceptor+0x58/0xb0 [auth_rpcgss] [] ? gss_stringify_acceptor+0x5/0xb0 [auth_rpcgss] [] rpcauth_stringify_acceptor+0x18/0x30 [sunrpc] [] nfs4_proc_setclientid+0x199/0x380 [nfsv4] [] ? nfs4_proc_setclientid+0x200/0x380 [nfsv4] [] nfs40_discover_server_trunking+0xda/0x150 [nfsv4] [] ? nfs40_discover_server_trunking+0x5/0x150 [nfsv4] [] nfs4_discover_server_trunking+0x7f/0x2f0 [nfsv4] [] nfs4_init_client+0x104/0x2f0 [nfsv4] [] nfs_get_client+0x314/0x3f0 [nfs] [] ? nfs_get_client+0xe0/0x3f0 [nfs] [] nfs4_set_client+0x8a/0x110 [nfsv4] [] ? __rpc_init_priority_wait_queue+0xa8/0xf0 [sunrpc] [] nfs4_create_server+0x12f/0x390 [nfsv4] [] nfs4_remote_mount+0x32/0x60 [nfsv4] [] mount_fs+0x39/0x1b0 [] ? __alloc_percpu+0x15/0x20 [] vfs_kern_mount+0x6b/0x150 [] nfs_do_root_mount+0x86/0xc0 [nfsv4] [] nfs4_try_mount+0x44/0xc0 [nfsv4] [] ? get_nfs_version+0x27/0x90 [nfs] [] nfs_fs_mount+0x47d/0xd60 [nfs] [] ? mutex_unlock+0xe/0x10 [] ? nfs_remount+0x430/0x430 [nfs] [] ? nfs_clone_super+0x140/0x140 [nfs] [] mount_fs+0x39/0x1b0 [] ? __alloc_percpu+0x15/0x20 [] vfs_kern_mount+0x6b/0x150 [] do_mount+0x210/0xbe0 [] ? copy_mount_options+0x3a/0x160 [] SyS_mount+0x6f/0xb0 [] system_call_fastpath+0x12/0x17 Sleeping under the rcu_read_lock is bad. This patch fixes it by dropping the rcu_read_lock before doing the allocation and then reacquiring it and redoing the dereference before doing the copy. If we find that the string has somehow grown in the meantime, we'll reallocate and try again. Cc: # v3.17+ Reported-by: "J. Bruce Fields" Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index afb292cd797d..53ed8d3f8897 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1353,6 +1353,7 @@ gss_stringify_acceptor(struct rpc_cred *cred) char *string = NULL; struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx; + unsigned int len; struct xdr_netobj *acceptor; rcu_read_lock(); @@ -1360,15 +1361,39 @@ gss_stringify_acceptor(struct rpc_cred *cred) if (!ctx) goto out; - acceptor = &ctx->gc_acceptor; + len = ctx->gc_acceptor.len; + rcu_read_unlock(); /* no point if there's no string */ - if (!acceptor->len) - goto out; - - string = kmalloc(acceptor->len + 1, GFP_KERNEL); + if (!len) + return NULL; +realloc: + string = kmalloc(len + 1, GFP_KERNEL); if (!string) + return NULL; + + rcu_read_lock(); + ctx = rcu_dereference(gss_cred->gc_ctx); + + /* did the ctx disappear or was it replaced by one with no acceptor? */ + if (!ctx || !ctx->gc_acceptor.len) { + kfree(string); + string = NULL; goto out; + } + + acceptor = &ctx->gc_acceptor; + + /* + * Did we find a new acceptor that's longer than the original? Allocate + * a longer buffer and try again. + */ + if (len < acceptor->len) { + len = acceptor->len; + rcu_read_unlock(); + kfree(string); + goto realloc; + } memcpy(string, acceptor->data, acceptor->len); string[acceptor->len] = '\0'; -- cgit v1.2.3 From aaef31703a0cf6a733e651885bfb49edc3ac6774 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 23 Oct 2014 00:25:22 +0400 Subject: libceph: do not crash on large auth tickets Large (greater than 32k, the value of PAGE_ALLOC_COSTLY_ORDER) auth tickets will have their buffers vmalloc'ed, which leads to the following crash in crypto: [ 28.685082] BUG: unable to handle kernel paging request at ffffeb04000032c0 [ 28.686032] IP: [] scatterwalk_pagedone+0x22/0x80 [ 28.686032] PGD 0 [ 28.688088] Oops: 0000 [#1] PREEMPT SMP [ 28.688088] Modules linked in: [ 28.688088] CPU: 0 PID: 878 Comm: kworker/0:2 Not tainted 3.17.0-vm+ #305 [ 28.688088] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 28.688088] Workqueue: ceph-msgr con_work [ 28.688088] task: ffff88011a7f9030 ti: ffff8800d903c000 task.ti: ffff8800d903c000 [ 28.688088] RIP: 0010:[] [] scatterwalk_pagedone+0x22/0x80 [ 28.688088] RSP: 0018:ffff8800d903f688 EFLAGS: 00010286 [ 28.688088] RAX: ffffeb04000032c0 RBX: ffff8800d903f718 RCX: ffffeb04000032c0 [ 28.688088] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8800d903f750 [ 28.688088] RBP: ffff8800d903f688 R08: 00000000000007de R09: ffff8800d903f880 [ 28.688088] R10: 18df467c72d6257b R11: 0000000000000000 R12: 0000000000000010 [ 28.688088] R13: ffff8800d903f750 R14: ffff8800d903f8a0 R15: 0000000000000000 [ 28.688088] FS: 00007f50a41c7700(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000 [ 28.688088] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 28.688088] CR2: ffffeb04000032c0 CR3: 00000000da3f3000 CR4: 00000000000006b0 [ 28.688088] Stack: [ 28.688088] ffff8800d903f698 ffffffff81392ca8 ffff8800d903f6e8 ffffffff81395d32 [ 28.688088] ffff8800dac96000 ffff880000000000 ffff8800d903f980 ffff880119b7e020 [ 28.688088] ffff880119b7e010 0000000000000000 0000000000000010 0000000000000010 [ 28.688088] Call Trace: [ 28.688088] [] scatterwalk_done+0x38/0x40 [ 28.688088] [] scatterwalk_done+0x38/0x40 [ 28.688088] [] blkcipher_walk_done+0x182/0x220 [ 28.688088] [] crypto_cbc_encrypt+0x15f/0x180 [ 28.688088] [] ? crypto_aes_set_key+0x30/0x30 [ 28.688088] [] ceph_aes_encrypt2+0x29c/0x2e0 [ 28.688088] [] ceph_encrypt2+0x93/0xb0 [ 28.688088] [] ceph_x_encrypt+0x4a/0x60 [ 28.688088] [] ? ceph_buffer_new+0x5d/0xf0 [ 28.688088] [] ceph_x_build_authorizer.isra.6+0x297/0x360 [ 28.688088] [] ? kmem_cache_alloc_trace+0x11b/0x1c0 [ 28.688088] [] ? ceph_auth_create_authorizer+0x36/0x80 [ 28.688088] [] ceph_x_create_authorizer+0x63/0xd0 [ 28.688088] [] ceph_auth_create_authorizer+0x54/0x80 [ 28.688088] [] get_authorizer+0x80/0xd0 [ 28.688088] [] prepare_write_connect+0x18b/0x2b0 [ 28.688088] [] try_read+0x1e59/0x1f10 This is because we set up crypto scatterlists as if all buffers were kmalloc'ed. Fix it. Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crypto.c | 169 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 132 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 62fc5e7a9acf..790fe89d90c0 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -90,11 +90,82 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) static const u8 *aes_iv = (u8 *)CEPH_AES_IV; +/* + * Should be used for buffers allocated with ceph_kvmalloc(). + * Currently these are encrypt out-buffer (ceph_buffer) and decrypt + * in-buffer (msg front). + * + * Dispose of @sgt with teardown_sgtable(). + * + * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() + * in cases where a single sg is sufficient. No attempt to reduce the + * number of sgs by squeezing physically contiguous pages together is + * made though, for simplicity. + */ +static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len) +{ + struct scatterlist *sg; + const bool is_vmalloc = is_vmalloc_addr(buf); + unsigned int off = offset_in_page(buf); + unsigned int chunk_cnt = 1; + unsigned int chunk_len = PAGE_ALIGN(off + buf_len); + int i; + int ret; + + if (buf_len == 0) { + memset(sgt, 0, sizeof(*sgt)); + return -EINVAL; + } + + if (is_vmalloc) { + chunk_cnt = chunk_len >> PAGE_SHIFT; + chunk_len = PAGE_SIZE; + } + + if (chunk_cnt > 1) { + ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); + if (ret) + return ret; + } else { + WARN_ON(chunk_cnt != 1); + sg_init_table(prealloc_sg, 1); + sgt->sgl = prealloc_sg; + sgt->nents = sgt->orig_nents = 1; + } + + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { + struct page *page; + unsigned int len = min(chunk_len - off, buf_len); + + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + sg_set_page(sg, page, len, off); + + off = 0; + buf += len; + buf_len -= len; + } + WARN_ON(buf_len != 0); + + return 0; +} + +static void teardown_sgtable(struct sg_table *sgt) +{ + if (sgt->orig_nents > 1) + sg_free_table(sgt); +} + static int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, const void *src, size_t src_len) { - struct scatterlist sg_in[2], sg_out[1]; + struct scatterlist sg_in[2], prealloc_sg; + struct sg_table sg_out; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; int ret; @@ -110,16 +181,18 @@ static int ceph_aes_encrypt(const void *key, int key_len, *dst_len = src_len + zero_padding; - crypto_blkcipher_setkey((void *)tfm, key, key_len); sg_init_table(sg_in, 2); sg_set_buf(&sg_in[0], src, src_len); sg_set_buf(&sg_in[1], pad, zero_padding); - sg_init_table(sg_out, 1); - sg_set_buf(sg_out, dst, *dst_len); + ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len); + if (ret) + goto out_tfm; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); iv = crypto_blkcipher_crt(tfm)->iv; ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); + /* print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, key, key_len, 1); @@ -128,16 +201,22 @@ static int ceph_aes_encrypt(const void *key, int key_len, print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, pad, zero_padding, 1); */ - ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, + ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in, src_len + zero_padding); - crypto_free_blkcipher(tfm); - if (ret < 0) + if (ret < 0) { pr_err("ceph_aes_crypt failed %d\n", ret); + goto out_sg; + } /* print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, dst, *dst_len, 1); */ - return 0; + +out_sg: + teardown_sgtable(&sg_out); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; } static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, @@ -145,7 +224,8 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, const void *src1, size_t src1_len, const void *src2, size_t src2_len) { - struct scatterlist sg_in[3], sg_out[1]; + struct scatterlist sg_in[3], prealloc_sg; + struct sg_table sg_out; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; int ret; @@ -161,17 +241,19 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, *dst_len = src1_len + src2_len + zero_padding; - crypto_blkcipher_setkey((void *)tfm, key, key_len); sg_init_table(sg_in, 3); sg_set_buf(&sg_in[0], src1, src1_len); sg_set_buf(&sg_in[1], src2, src2_len); sg_set_buf(&sg_in[2], pad, zero_padding); - sg_init_table(sg_out, 1); - sg_set_buf(sg_out, dst, *dst_len); + ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len); + if (ret) + goto out_tfm; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); iv = crypto_blkcipher_crt(tfm)->iv; ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); + /* print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, key, key_len, 1); @@ -182,23 +264,30 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, pad, zero_padding, 1); */ - ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, + ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in, src1_len + src2_len + zero_padding); - crypto_free_blkcipher(tfm); - if (ret < 0) + if (ret < 0) { pr_err("ceph_aes_crypt2 failed %d\n", ret); + goto out_sg; + } /* print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, dst, *dst_len, 1); */ - return 0; + +out_sg: + teardown_sgtable(&sg_out); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; } static int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, const void *src, size_t src_len) { - struct scatterlist sg_in[1], sg_out[2]; + struct sg_table sg_in; + struct scatterlist sg_out[2], prealloc_sg; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); struct blkcipher_desc desc = { .tfm = tfm }; char pad[16]; @@ -210,16 +299,16 @@ static int ceph_aes_decrypt(const void *key, int key_len, if (IS_ERR(tfm)) return PTR_ERR(tfm); - crypto_blkcipher_setkey((void *)tfm, key, key_len); - sg_init_table(sg_in, 1); sg_init_table(sg_out, 2); - sg_set_buf(sg_in, src, src_len); sg_set_buf(&sg_out[0], dst, *dst_len); sg_set_buf(&sg_out[1], pad, sizeof(pad)); + ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len); + if (ret) + goto out_tfm; + crypto_blkcipher_setkey((void *)tfm, key, key_len); iv = crypto_blkcipher_crt(tfm)->iv; ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); /* @@ -228,12 +317,10 @@ static int ceph_aes_decrypt(const void *key, int key_len, print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, src, src_len, 1); */ - - ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); - crypto_free_blkcipher(tfm); + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len); if (ret < 0) { pr_err("ceph_aes_decrypt failed %d\n", ret); - return ret; + goto out_sg; } if (src_len <= *dst_len) @@ -251,7 +338,12 @@ static int ceph_aes_decrypt(const void *key, int key_len, print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1, dst, *dst_len, 1); */ - return 0; + +out_sg: + teardown_sgtable(&sg_in); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; } static int ceph_aes_decrypt2(const void *key, int key_len, @@ -259,7 +351,8 @@ static int ceph_aes_decrypt2(const void *key, int key_len, void *dst2, size_t *dst2_len, const void *src, size_t src_len) { - struct scatterlist sg_in[1], sg_out[3]; + struct sg_table sg_in; + struct scatterlist sg_out[3], prealloc_sg; struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); struct blkcipher_desc desc = { .tfm = tfm }; char pad[16]; @@ -271,17 +364,17 @@ static int ceph_aes_decrypt2(const void *key, int key_len, if (IS_ERR(tfm)) return PTR_ERR(tfm); - sg_init_table(sg_in, 1); - sg_set_buf(sg_in, src, src_len); sg_init_table(sg_out, 3); sg_set_buf(&sg_out[0], dst1, *dst1_len); sg_set_buf(&sg_out[1], dst2, *dst2_len); sg_set_buf(&sg_out[2], pad, sizeof(pad)); + ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len); + if (ret) + goto out_tfm; crypto_blkcipher_setkey((void *)tfm, key, key_len); iv = crypto_blkcipher_crt(tfm)->iv; ivsize = crypto_blkcipher_ivsize(tfm); - memcpy(iv, aes_iv, ivsize); /* @@ -290,12 +383,10 @@ static int ceph_aes_decrypt2(const void *key, int key_len, print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, src, src_len, 1); */ - - ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); - crypto_free_blkcipher(tfm); + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len); if (ret < 0) { pr_err("ceph_aes_decrypt failed %d\n", ret); - return ret; + goto out_sg; } if (src_len <= *dst1_len) @@ -325,7 +416,11 @@ static int ceph_aes_decrypt2(const void *key, int key_len, dst2, *dst2_len, 1); */ - return 0; +out_sg: + teardown_sgtable(&sg_in); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; } -- cgit v1.2.3 From a390de0208e7f2f8fdb2fbf970240e4f7b308037 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 4 Nov 2014 18:32:14 +0300 Subject: libceph: unlink from o_linger_requests when clearing r_osd Requests have to be unlinked from both osd->o_requests (normal requests) and osd->o_linger_requests (linger requests) lists when clearing req->r_osd. Otherwise __unregister_linger_request() gets confused and we trip over a !list_empty(&osd->o_linger_requests) assert in __remove_osd(). MON=1 OSD=1: # cat remove-osd.sh #!/bin/bash rbd create --size 1 test DEV=$(rbd map test) ceph osd out 0 sleep 3 rbd map dne/dne # obtain a new osdmap as a side effect rbd unmap $DEV & # will block sleep 3 ceph osd in 0 Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f3fc54eac09d..75abaa87abac 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1395,6 +1395,7 @@ static int __map_request(struct ceph_osd_client *osdc, if (req->r_osd) { __cancel_request(req); list_del_init(&req->r_osd_item); + list_del_init(&req->r_linger_osd_item); req->r_osd = NULL; } -- cgit v1.2.3 From ba9d114ec5578e6e99a4dfa37ff8ae688040fd64 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 5 Nov 2014 15:45:58 +0300 Subject: libceph: clear r_req_lru_item in __unregister_linger_request() kick_requests() can put linger requests on the notarget list. This means we need to clear the much-overloaded req->r_req_lru_item in __unregister_linger_request() as well, or we get an assertion failure in ceph_osdc_release_request() - !list_empty(&req->r_req_lru_item). AFAICT the assumption was that registered linger requests cannot be on any of req->r_req_lru_item lists, but that's clearly not the case. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 75abaa87abac..decc3b74e65f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1254,6 +1254,8 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, if (list_empty(&req->r_osd_item)) req->r_osd = NULL; } + + list_del_init(&req->r_req_lru_item); /* can be on notarget */ ceph_osdc_put_request(req); } -- cgit v1.2.3 From cc9f1f518cec079289d11d732efa490306b1ddad Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 5 Nov 2014 19:33:44 +0300 Subject: libceph: change from BUG to WARN for __remove_osd() asserts No reason to use BUG_ON for osd request list assertions. Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index decc3b74e65f..6f164289bde8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1007,8 +1007,8 @@ static void put_osd(struct ceph_osd *osd) static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { dout("__remove_osd %p\n", osd); - BUG_ON(!list_empty(&osd->o_requests)); - BUG_ON(!list_empty(&osd->o_linger_requests)); + WARN_ON(!list_empty(&osd->o_requests)); + WARN_ON(!list_empty(&osd->o_linger_requests)); rb_erase(&osd->o_node, &osdc->osds); list_del_init(&osd->o_osd_lru); -- cgit v1.2.3 From 5195c14c8b27cc0b18220ddbf0e5ad3328a04187 Mon Sep 17 00:00:00 2001 From: bill bonaparte Date: Thu, 6 Nov 2014 14:36:48 +0100 Subject: netfilter: conntrack: fix race in __nf_conntrack_confirm against get_next_corpse After removal of the central spinlock nf_conntrack_lock, in commit 93bb0ceb75be2 ("netfilter: conntrack: remove central spinlock nf_conntrack_lock"), it is possible to race against get_next_corpse(). The race is against the get_next_corpse() cleanup on the "unconfirmed" list (a per-cpu list with seperate locking), which set the DYING bit. Fix this race, in __nf_conntrack_confirm(), by removing the CT from unconfirmed list before checking the DYING bit. In case race occured, re-add the CT to the dying list. While at this, fix coding style of the comment that has been updated. Fixes: 93bb0ceb75be2 ("netfilter: conntrack: remove central spinlock nf_conntrack_lock") Reported-by: bill bonaparte Signed-off-by: bill bonaparte Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5016a6929085..2c699757bccf 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -611,12 +611,16 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); - /* We have to check the DYING flag inside the lock to prevent - a race against nf_ct_get_next_corpse() possibly called from - user context, else we insert an already 'dead' hash, blocking - further use of that particular connection -JM */ + + /* We have to check the DYING flag after unlink to prevent + * a race against nf_ct_get_next_corpse() possibly called from + * user context, else we insert an already 'dead' hash, blocking + * further use of that particular connection -JM. + */ + nf_ct_del_from_dying_or_unconfirmed_list(ct); if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return NF_ACCEPT; @@ -636,8 +640,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; - nf_ct_del_from_dying_or_unconfirmed_list(ct); - /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ -- cgit v1.2.3 From ab64f16ff2e83371927c57a0380fd3c0fee5c1c1 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 11 Nov 2014 13:40:49 -0800 Subject: openvswitch: Fix memory leak. Need to free memory in case of sample action error. Introduced by commit 651887b0c22cffcfce7eb9c ("openvswitch: Sample action without side effects"). Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 006886dbee36..00e447a17f64 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -722,8 +722,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, key, a); - if (unlikely(err)) /* skb already freed. */ - return err; break; } -- cgit v1.2.3 From 856447d0209c2214d806b30bd3b0d873db5998bd Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 11 Nov 2014 14:32:20 -0800 Subject: openvswitch: Fix checksum calculation when modifying ICMPv6 packets. The checksum of ICMPv6 packets uses the IP pseudoheader as part of the calculation, unlike ICMP in IPv4. This was not implemented, which means that modifying the IP addresses of an ICMPv6 packet would cause the checksum to no longer be correct as the psuedoheader did not match. Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action"). Reported-by: Neal Shrader Signed-off-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/openvswitch/actions.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 00e447a17f64..8c4229b11c34 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -246,11 +246,11 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, { int transport_len = skb->len - skb_transport_offset(skb); - if (l4_proto == IPPROTO_TCP) { + if (l4_proto == NEXTHDR_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, addr, new_addr, 1); - } else if (l4_proto == IPPROTO_UDP) { + } else if (l4_proto == NEXTHDR_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); @@ -261,6 +261,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, uh->check = CSUM_MANGLED_0; } } + } else if (l4_proto == NEXTHDR_ICMP) { + if (likely(transport_len >= sizeof(struct icmp6hdr))) + inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, + skb, addr, new_addr, 1); } } -- cgit v1.2.3 From 19e7a3df7261c9b7ebced8163c383712d5b6ac6b Mon Sep 17 00:00:00 2001 From: Daniele Di Proietto Date: Tue, 11 Nov 2014 14:51:22 -0800 Subject: openvswitch: Fix NDP flow mask validation match_validate() enforce that a mask matching on NDP attributes has also an exact match on ICMPv6 type. The ICMPv6 type, which is 8-bit wide, is stored in the 'tp.src' field of 'struct sw_flow_key', which is 16-bit wide. Therefore, an exact match on ICMPv6 type should only check the first 8 bits. This commit fixes a bug that prevented flows with an exact match on NDP field from being installed Introduced by commit 03f0d916aa03 ("openvswitch: Mega flow implementation"). Signed-off-by: Daniele Di Proietto Signed-off-by: Pravin B Shelar --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 939bcb32100f..dda040e693a3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -145,7 +145,7 @@ static bool match_validate(const struct sw_flow_match *match, if (match->key->eth.type == htons(ETH_P_ARP) || match->key->eth.type == htons(ETH_P_RARP)) { key_expected |= 1 << OVS_KEY_ATTR_ARP; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + if (match->mask && (match->mask->key.tp.src == htons(0xff))) mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } -- cgit v1.2.3 From 8ec609d8b561468691b60347ff594bd443ea58c0 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 11 Nov 2014 15:55:16 -0800 Subject: openvswitch: Convert dp rcu read operation to locked operations dp read operations depends on ovs_dp_cmd_fill_info(). This API needs to looup vport to find dp name, but vport lookup can fail. Therefore to keep vport reference alive we need to take ovs lock. Introduced by commit 6093ae9abac1 ("openvswitch: Minimize dp and vport critical sections"). Signed-off-by: Pravin B Shelar Acked-by: Andy Zhou --- net/openvswitch/datapath.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index e6d7255183eb..f9e556b56086 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1265,7 +1265,7 @@ static size_t ovs_dp_cmd_msg_size(void) return msgsize; } -/* Called with ovs_mutex or RCU read lock. */ +/* Called with ovs_mutex. */ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { @@ -1555,7 +1555,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) if (!reply) return -ENOMEM; - rcu_read_lock(); + ovs_lock(); dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); @@ -1564,12 +1564,12 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); - rcu_read_unlock(); + ovs_unlock(); return genlmsg_reply(reply, info); err_unlock_free: - rcu_read_unlock(); + ovs_unlock(); kfree_skb(reply); return err; } @@ -1581,8 +1581,8 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int skip = cb->args[0]; int i = 0; - rcu_read_lock(); - list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) { + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, @@ -1590,7 +1590,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) break; i++; } - rcu_read_unlock(); + ovs_unlock(); cb->args[0] = i; -- cgit v1.2.3 From fecaef85f7188ad1822210e2c7a7625c9a32a8e4 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Tue, 11 Nov 2014 14:36:30 -0800 Subject: openvswitch: Validate IPv6 flow key and mask values. Reject flow label key and mask values with invalid bits set. Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action"). Signed-off-by: Jarno Rajahalme Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- net/openvswitch/flow_netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index dda040e693a3..fa4ec2e4a78b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -689,6 +689,13 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; } + + if (ipv6_key->ipv6_label & htonl(0xFFF00000)) { + OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n", + ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ipv6.label, ipv6_key->ipv6_label, is_mask); SW_FLOW_KEY_PUT(match, ip.proto, -- cgit v1.2.3 From 49dd18ba4615eaa72f15c9087dea1c2ab4744cf5 Mon Sep 17 00:00:00 2001 From: Panu Matilainen Date: Fri, 14 Nov 2014 13:14:32 +0200 Subject: ipv4: Fix incorrect error code when adding an unreachable route Trying to add an unreachable route incorrectly returns -ESRCH if if custom FIB rules are present: [root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4 RTNETLINK answers: Network is unreachable [root@localhost ~]# ip rule add to 55.66.77.88 table 200 [root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4 RTNETLINK answers: No such process [root@localhost ~]# Commit 83886b6b636173b206f475929e58fac75c6f2446 ("[NET]: Change "not found" return value for rule lookup") changed fib_rules_lookup() to use -ESRCH as a "not found" code internally, but for user space it should be translated into -ENETUNREACH. Handle the translation centrally in ipv4-specific fib_lookup(), leaving the DECnet case alone. On a related note, commit b7a71b51ee37d919e4098cd961d59a883fd272d8 ("ipv4: removed redundant conditional") removed a similar translation from ip_route_input_slow() prematurely AIUI. Fixes: b7a71b51ee37 ("ipv4: removed redundant conditional") Signed-off-by: Panu Matilainen Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index f2e15738534d..8f7bd56955b0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -62,6 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) else res->tclassid = 0; #endif + + if (err == -ESRCH) + err = -ENETUNREACH; + return err; } EXPORT_SYMBOL_GPL(__fib_lookup); -- cgit v1.2.3 From 52cff74eef5dd7bdab759300e7d1ca36eba18254 Mon Sep 17 00:00:00 2001 From: Anish Bhatt Date: Fri, 14 Nov 2014 16:38:31 -0800 Subject: dcbnl : Disable software interrupts before taking dcb_lock Solves possible lockup issues that can be seen from firmware DCB agents calling into the DCB app api. DCB firmware event queues can be tied in with NAPI so that dcb events are generated in softIRQ context. This can results in calls to dcb_*app() functions which try to take the dcb_lock. If the the event triggers while we also have the dcb_lock because lldpad or some other agent happened to be issuing a get/set command we could see a cpu lockup. This code was not originally written with firmware agents in mind, hence grabbing dcb_lock from softIRQ context was not considered. Signed-off-by: Anish Bhatt Signed-off-by: David S. Miller --- net/dcb/dcbnl.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index ca11d283bbeb..93ea80196f0e 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1080,13 +1080,13 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) if (!app) return -EMSGSIZE; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app), &itr->app); if (err) { - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } @@ -1097,7 +1097,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) else dcbx = -EOPNOTSUPP; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); /* get peer info if available */ @@ -1234,7 +1234,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) } /* local app */ - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; @@ -1271,7 +1271,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) else dcbx = -EOPNOTSUPP; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { @@ -1326,7 +1326,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) return 0; dcb_unlock: - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); nla_put_failure: return err; } @@ -1762,10 +1762,10 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) struct dcb_app_type *itr; u8 prio = 0; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) prio = itr->app.priority; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); return prio; } @@ -1789,7 +1789,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) { if (new->priority) @@ -1804,7 +1804,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new) if (new->priority) err = dcb_app_add(new, dev->ifindex); out: - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; @@ -1823,10 +1823,10 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) struct dcb_app_type *itr; u8 prio = 0; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); if ((itr = dcb_app_lookup(app, dev->ifindex, 0))) prio |= 1 << itr->app.priority; - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); return prio; } @@ -1850,7 +1850,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; @@ -1859,7 +1859,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) err = dcb_app_add(new, dev->ifindex); out: - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; @@ -1882,7 +1882,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); @@ -1890,7 +1890,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) err = 0; } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; @@ -1902,12 +1902,12 @@ static void dcb_flushapp(void) struct dcb_app_type *app; struct dcb_app_type *tmp; - spin_lock(&dcb_lock); + spin_lock_bh(&dcb_lock); list_for_each_entry_safe(app, tmp, &dcb_app_list, list) { list_del(&app->list); kfree(app); } - spin_unlock(&dcb_lock); + spin_unlock_bh(&dcb_lock); } static int __init dcbnl_init(void) -- cgit v1.2.3 From feb91a02ccb09661507f170b2a444aec94f307f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Nov 2014 20:27:38 +0100 Subject: ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs It has been reported that generating an MLD listener report on devices with large MTUs (e.g. 9000) and a high number of IPv6 addresses can trigger a skb_over_panic(): skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20 head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0 dev:port1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:100! invalid opcode: 0000 [#1] SMP Modules linked in: ixgbe(O) CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4 [...] Call Trace: [] ? skb_put+0x3a/0x3b [] ? add_grhead+0x45/0x8e [] ? add_grec+0x394/0x3d4 [] ? mld_ifc_timer_expire+0x195/0x20d [] ? mld_dad_timer_expire+0x45/0x45 [] ? call_timer_fn.isra.29+0x12/0x68 [] ? run_timer_softirq+0x163/0x182 [] ? __do_softirq+0xe0/0x21d [] ? irq_exit+0x4e/0xd3 [] ? smp_apic_timer_interrupt+0x3b/0x46 [] ? apic_timer_interrupt+0x6a/0x70 mld_newpack() skb allocations are usually requested with dev->mtu in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations") we have changed the limit in order to be less likely to fail. However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb) macros, which determine if we may end up doing an skb_put() for adding another record. To avoid possible fragmentation, we check the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong assumption as the actual max allocation size can be much smaller. The IGMP case doesn't have this issue as commit 57e1ab6eaddc ("igmp: refine skb allocations") stores the allocation size in the cb[]. Set a reserved_tailroom to make it fit into the MTU and use skb_availroom() helper instead. This also allows to get rid of igmp_skb_size(). Reported-by: Wei Liu Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations") Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Hannes Frederic Sowa Cc: David L Stevens Acked-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 11 +++++------ net/ipv6/mcast.c | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index fb70e3ecc3e4..bb15d0e03d4f 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) return scount; } -#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb)) - -static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; @@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu; while (1) { skb = alloc_skb(size + hlen + tlen, @@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) return NULL; } skb->priority = TC_PRIO_CONTROL; - igmp_skb_size(skb) = size; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, @@ -354,6 +352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) skb_dst_set(skb, &rt->dst); skb->dev = dev; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); skb_reset_network_header(skb); @@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9648de2b6745..ed2c4e400b46 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1550,7 +1550,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, hdr->daddr = *daddr; } -static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) +static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) { struct net_device *dev = idev->dev; struct net *net = dev_net(dev); @@ -1561,13 +1561,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) const struct in6_addr *saddr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; + unsigned int size = mtu + hlen + tlen; int err; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; /* we assume size > sizeof(ra) here */ - size += hlen + tlen; /* limit our allocations to order-0 page */ size = min_t(int, size, SKB_MAX_ORDER(0, 0)); skb = sock_alloc_send_skb(sk, size, 1, &err); @@ -1576,6 +1576,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size) return NULL; skb->priority = TC_PRIO_CONTROL; + skb->reserved_tailroom = skb_end_offset(skb) - + min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { @@ -1690,8 +1692,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, return skb; } -#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ - skb_tailroom(skb)) : 0) +#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) -- cgit v1.2.3 From 97840cb67ff5ac8add836684f011fd838518d698 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 14 Nov 2014 18:14:33 +0100 Subject: netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind Make sure the netlink group exists, otherwise you can trigger an out of bound array memory access from the netlink_bind() path. This splat can only be triggered only by superuser. [ 180.203600] UBSan: Undefined behaviour in ../net/netfilter/nfnetlink.c:467:28 [ 180.204249] index 9 is out of range for type 'int [9]' [ 180.204697] CPU: 0 PID: 1771 Comm: trinity-main Not tainted 3.18.0-rc4-mm1+ #122 [ 180.205365] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org +04/01/2014 [ 180.206498] 0000000000000018 0000000000000000 0000000000000009 ffff88007bdf7da8 [ 180.207220] ffffffff82b0ef5f 0000000000000092 ffffffff845ae2e0 ffff88007bdf7db8 [ 180.207887] ffffffff8199e489 ffff88007bdf7e18 ffffffff8199ea22 0000003900000000 [ 180.208639] Call Trace: [ 180.208857] dump_stack (lib/dump_stack.c:52) [ 180.209370] ubsan_epilogue (lib/ubsan.c:174) [ 180.209849] __ubsan_handle_out_of_bounds (lib/ubsan.c:400) [ 180.210512] nfnetlink_bind (net/netfilter/nfnetlink.c:467) [ 180.210986] netlink_bind (net/netlink/af_netlink.c:1483) [ 180.211495] SYSC_bind (net/socket.c:1541) Moreover, define the missing nf_tables and nf_acct multicast groups too. Reported-by: Andrey Ryabinin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 6c5a915cfa75..13c2e17bbe27 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -47,6 +47,8 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_CONNTRACK_EXP_NEW] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_UPDATE] = NFNL_SUBSYS_CTNETLINK_EXP, [NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP, + [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, + [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, }; void nfnl_lock(__u8 subsys_id) @@ -464,7 +466,12 @@ static void nfnetlink_rcv(struct sk_buff *skb) static int nfnetlink_bind(int group) { const struct nfnetlink_subsystem *ss; - int type = nfnl_group2type[group]; + int type; + + if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX) + return -EINVAL; + + type = nfnl_group2type[group]; rcu_read_lock(); ss = nfnetlink_get_subsys(type); @@ -514,6 +521,9 @@ static int __init nfnetlink_init(void) { int i; + for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++) + BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE); + for (i=0; i Date: Mon, 17 Nov 2014 12:20:28 +0100 Subject: bridge: fix netfilter/NF_BR_LOCAL_OUT for own, locally generated queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ebtables on the OUTPUT chain (NF_BR_LOCAL_OUT) would not work as expected for both locally generated IGMP and MLD queries. The IP header specific filter options are off by 14 Bytes for netfilter (actual output on interfaces is fine). NF_HOOK() expects the skb->data to point to the IP header, not the ethernet one (while dev_queue_xmit() does not). Luckily there is an br_dev_queue_push_xmit() helper function already - let's just use that. Introduced by eb1d16414339a6e113d89e2cca2556005d7ce919 ("bridge: Add core IGMP snooping support") Ebtables example: $ ebtables -I OUTPUT -p IPv6 -o eth1 --logical-out br0 \ --log --log-level 6 --log-ip6 --log-prefix="~EBT: " -j DROP before (broken): ~EBT: IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \ MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \ SRC=64a4:39c2:86dd:6000:0000:0020:0001:fe80 IPv6 \ DST=0000:0000:0000:0004:64ff:fea4:39c2:ff02, \ IPv6 priority=0x3, Next Header=2 after (working): ~EBT: IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \ MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \ SRC=fe80:0000:0000:0000:0004:64ff:fea4:39c2 IPv6 \ DST=ff02:0000:0000:0000:0000:0000:0000:0001, \ IPv6 priority=0x0, Next Header=0 Signed-off-by: Linus Lüssing Acked-by: Herbert Xu Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_multicast.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 648d79ccf462..c465876c7861 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -813,10 +813,9 @@ static void __br_multicast_send_query(struct net_bridge *br, return; if (port) { - __skb_push(skb, sizeof(struct ethhdr)); skb->dev = port->dev; NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, - dev_queue_xmit); + br_dev_queue_push_xmit); } else { br_multicast_select_own_querier(br, ip, skb); netif_rx(skb); -- cgit v1.2.3 From 280ba51d60be6f4ca3347eaa60783314f38df72e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 18 Nov 2014 22:35:31 +0100 Subject: mac80211: minstrel_ht: fix a crash in rate sorting The commit 5935839ad73583781b8bbe8d91412f6826e218a4 "mac80211: improve minstrel_ht rate sorting by throughput & probability" introduced a crash on rate sorting that occurs when the rate added to the sorting array is faster than all the previous rates. Due to an off-by-one error, it reads the rate index from tp_list[-1], which contains uninitialized stack garbage, and then uses the resulting index for accessing the group rate stats, leading to a crash if the garbage value is big enough. Cc: Thomas Huehn Reported-by: Jouni Malinen Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index df90ce2db00c..408fd8ab4eef 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -252,19 +252,16 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u8 index, cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp; cur_prob = mi->groups[cur_group].rates[cur_idx].probability; - tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; - tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; - tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; - tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; - - while (j > 0 && (cur_thr > tmp_thr || - (cur_thr == tmp_thr && cur_prob > tmp_prob))) { - j--; + do { tmp_group = tp_list[j - 1] / MCS_GROUP_RATES; tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES; tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp; tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability; - } + if (cur_thr < tmp_thr || + (cur_thr == tmp_thr && cur_prob <= tmp_prob)) + break; + j--; + } while (j > 0); if (j < MAX_THR_RATES - 1) { memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * -- cgit v1.2.3 From 093a1468b6edb0e568be7311b8d2228d205702db Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Nov 2014 18:04:04 -0500 Subject: SUNRPC: Fix locking around callback channel reply receive Both xprt_lookup_rqst() and xprt_complete_rqst() require that you take the transport lock in order to avoid races with xprt_transmit(). Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 3f959c681885..f9c052d508f0 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1019,17 +1019,12 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) xid = *p++; calldir = *p; - if (bc_xprt) - req = xprt_lookup_rqst(bc_xprt, xid); - - if (!req) { - printk(KERN_NOTICE - "%s: Got unrecognized reply: " - "calldir 0x%x xpt_bc_xprt %p xid %08x\n", - __func__, ntohl(calldir), - bc_xprt, ntohl(xid)); + if (!bc_xprt) return -EAGAIN; - } + spin_lock_bh(&bc_xprt->transport_lock); + req = xprt_lookup_rqst(bc_xprt, xid); + if (!req) + goto unlock_notfound; memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); /* @@ -1040,11 +1035,21 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) dst = &req->rq_private_buf.head[0]; src = &rqstp->rq_arg.head[0]; if (dst->iov_len < src->iov_len) - return -EAGAIN; /* whatever; just giving up. */ + goto unlock_eagain; /* whatever; just giving up. */ memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; + spin_unlock_bh(&bc_xprt->transport_lock); return 0; +unlock_notfound: + printk(KERN_NOTICE + "%s: Got unrecognized reply: " + "calldir 0x%x xpt_bc_xprt %p xid %08x\n", + __func__, ntohl(calldir), + bc_xprt, ntohl(xid)); +unlock_eagain: + spin_unlock_bh(&bc_xprt->transport_lock); + return -EAGAIN; } static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len) -- cgit v1.2.3 From ffb1388a364d135810337182d6800a0c7ee44f48 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Wed, 19 Nov 2014 09:35:39 +0800 Subject: ipv6: delete protocol and unregister rtnetlink when cleanup pim6_protocol was added when initiation, but it not deleted. Similarly, unregister RTNL_FAMILY_IP6MR rtnetlink. Signed-off-by: Duan Jiong Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0171f08325c3..1a01d79b8698 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1439,6 +1439,10 @@ reg_pernet_fail: void ip6_mr_cleanup(void) { + rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); +#ifdef CONFIG_IPV6_PIMSM_V2 + inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); +#endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); -- cgit v1.2.3 From d3052bb5d306b29c1e7d9e5998c5ac4ca1ff0ca9 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 19 Nov 2014 13:54:49 -0800 Subject: openvswitch: Don't validate IPv6 label masks. When userspace doesn't provide a mask, OVS datapath generates a fully unwildcarded mask for the flow by copying the flow and setting all bits in all fields. For IPv6 label, this creates a mask that matches on the upper 12 bits, causing the following error: openvswitch: netlink: Invalid IPv6 flow label value (value=ffffffff, max=fffff) This patch ignores the label validation check for masks, avoiding this error. Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index fa4ec2e4a78b..089b195c064a 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -690,7 +690,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, return -EINVAL; } - if (ipv6_key->ipv6_label & htonl(0xFFF00000)) { + if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) { OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n", ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); return -EINVAL; -- cgit v1.2.3 From 01462405f0c093b2f8dfddafcadcda6c9e4c5cdf Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 19 Nov 2014 23:05:49 +0100 Subject: ipx: fix locking regression in ipx_sendmsg and ipx_recvmsg This fixes an old regression introduced by commit b0d0d915 (ipx: remove the BKL). When a recvmsg syscall blocks waiting for new data, no data can be sent on the same socket with sendmsg because ipx_recvmsg() sleeps with the socket locked. This breaks mars-nwe (NetWare emulator): - the ncpserv process reads the request using recvmsg - ncpserv forks and spawns nwconn - ncpserv calls a (blocking) recvmsg and waits for new requests - nwconn deadlocks in sendmsg on the same socket Commit b0d0d915 has simply replaced BKL locking with lock_sock/release_sock. Unlike now, BKL got unlocked while sleeping, so a blocking recvmsg did not block a concurrent sendmsg. Only keep the socket locked while actually working with the socket data and release it prior to calling skb_recv_datagram(). Signed-off-by: Jiri Bohac Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 91729b807c7d..1b095ca37aa4 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1764,6 +1764,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, struct ipxhdr *ipx = NULL; struct sk_buff *skb; int copied, rc; + bool locked = true; lock_sock(sk); /* put the autobinding in */ @@ -1790,6 +1791,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, if (sock_flag(sk, SOCK_ZAPPED)) goto out; + release_sock(sk); + locked = false; skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc); if (!skb) { @@ -1826,7 +1829,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, out_free: skb_free_datagram(sk, skb); out: - release_sock(sk); + if (locked) + release_sock(sk); return rc; } -- cgit v1.2.3 From e7820e39b7d19b9fe1928fc19de9361b44150ca6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2014 11:47:16 -0800 Subject: net: Revert "net: avoid one atomic operation in skb_clone()" Not sure what I was thinking, but doing anything after releasing a refcount is suicidal or/and embarrassing. By the time we set skb->fclone to SKB_FCLONE_FREE, another cpu could have released last reference and freed whole skb. We potentially corrupt memory or trap if CONFIG_DEBUG_PAGEALLOC is set. Reported-by: Chris Mason Fixes: ce1a4ea3f1258 ("net: avoid one atomic operation in skb_clone()") Signed-off-by: Eric Dumazet Cc: Sabrina Dubroca Signed-off-by: David S. Miller --- net/core/skbuff.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c16615bfb61e..32e31c299631 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -552,20 +552,13 @@ static void kfree_skbmem(struct sk_buff *skb) case SKB_FCLONE_CLONE: fclones = container_of(skb, struct sk_buff_fclones, skb2); - /* Warning : We must perform the atomic_dec_and_test() before - * setting skb->fclone back to SKB_FCLONE_FREE, otherwise - * skb_clone() could set clone_ref to 2 before our decrement. - * Anyway, if we are going to free the structure, no need to - * rewrite skb->fclone. + /* The clone portion is available for + * fast-cloning again. */ - if (atomic_dec_and_test(&fclones->fclone_ref)) { + skb->fclone = SKB_FCLONE_FREE; + + if (atomic_dec_and_test(&fclones->fclone_ref)) kmem_cache_free(skbuff_fclone_cache, fclones); - } else { - /* The clone portion is available for - * fast-cloning again. - */ - skb->fclone = SKB_FCLONE_FREE; - } break; } } @@ -887,11 +880,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (skb->fclone == SKB_FCLONE_ORIG && n->fclone == SKB_FCLONE_FREE) { n->fclone = SKB_FCLONE_CLONE; - /* As our fastclone was free, clone_ref must be 1 at this point. - * We could use atomic_inc() here, but it is faster - * to set the final value. - */ - atomic_set(&fclones->fclone_ref, 2); + atomic_inc(&fclones->fclone_ref); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; -- cgit v1.2.3 From 0c228e833c88e3aa029250f5db77d5968c5ce5b5 Mon Sep 17 00:00:00 2001 From: Calvin Owens Date: Thu, 20 Nov 2014 15:09:53 -0800 Subject: tcp: Restore RFC5961-compliant behavior for SYN packets Commit c3ae62af8e755 ("tcp: should drop incoming frames without ACK flag set") was created to mitigate a security vulnerability in which a local attacker is able to inject data into locally-opened sockets by using TCP protocol statistics in procfs to quickly find the correct sequence number. This broke the RFC5961 requirement to send a challenge ACK in response to spurious RST packets, which was subsequently fixed by commit 7b514a886ba50 ("tcp: accept RST without ACK flag"). Unfortunately, the RFC5961 requirement that spurious SYN packets be handled in a similar manner remains broken. RFC5961 section 4 states that: ... the handling of the SYN in the synchronized state SHOULD be performed as follows: 1) If the SYN bit is set, irrespective of the sequence number, TCP MUST send an ACK (also referred to as challenge ACK) to the remote peer: After sending the acknowledgment, TCP MUST drop the unacceptable segment and stop processing further. By sending an ACK, the remote peer is challenged to confirm the loss of the previous connection and the request to start a new connection. A legitimate peer, after restart, would not have a TCB in the synchronized state. Thus, when the ACK arrives, the peer should send a RST segment back with the sequence number derived from the ACK field that caused the RST. This RST will confirm that the remote peer has indeed closed the previous connection. Upon receipt of a valid RST, the local TCP endpoint MUST terminate its connection. The local TCP endpoint should then rely on SYN retransmission from the remote end to re-establish the connection. This patch lets SYN packets through the discard added in c3ae62af8e755, so that spurious SYN packets are properly dealt with as per the RFC. The challenge ACK is sent unconditionally and is rate-limited, so the original vulnerability is not reintroduced by this patch. Signed-off-by: Calvin Owens Acked-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 88fa2d160685..d107ee246a1d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5231,7 +5231,7 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) goto csum_error; - if (!th->ack && !th->rst) + if (!th->ack && !th->rst && !th->syn) goto discard; /* @@ -5650,7 +5650,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; } - if (!th->ack && !th->rst) + if (!th->ack && !th->rst && !th->syn) goto discard; if (!tcp_validate_incoming(sk, skb, th, 0)) -- cgit v1.2.3 From b6fef4c6b8c1994ffe050dcdb9391427bf0d9882 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 21 Nov 2014 19:37:09 -0800 Subject: ipv6: Do not treat a GSO_TCPV4 request from UDP tunnel over IPv6 as invalid This patch adds SKB_GSO_TCPV4 to the list of supported GSO types handled by the IPv6 GSO offloads. Without this change VXLAN tunnels running over IPv6 do not currently handle IPv4 TCP TSO requests correctly and end up handing the non-segmented frame off to the device. Below is the before and after for a simple netperf TCP_STREAM test between two endpoints tunneling IPv4 over a VXLAN tunnel running on IPv6 on top of a 1Gb/s network adapter. Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 10^6bits/sec 87380 16384 16384 10.29 0.88 Before 87380 16384 16384 10.03 895.69 After Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv6/ip6_offload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index a071563a7e6e..01e12d0d8fcc 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -69,7 +69,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int nhoff; if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_UDP | + ~(SKB_GSO_TCPV4 | + SKB_GSO_UDP | SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | -- cgit v1.2.3 From 20ea60ca9952bd19d4b0d74719daba305aef5178 Mon Sep 17 00:00:00 2001 From: lucien Date: Sun, 23 Nov 2014 15:04:11 +0800 Subject: ip_tunnel: the lack of vti_link_ops' dellink() cause kernel panic Now the vti_link_ops do not point the .dellink, for fb tunnel device (ip_vti0), the net_device will be removed as the default .dellink is unregister_netdevice_queue,but the tunnel still in the tunnel list, then if we add a new vti tunnel, in ip_tunnel_find(): hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && link == t->parms.link && ==> type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; } the panic will happen, cause dev of ip_tunnel *t is null: [ 3835.072977] IP: [] ip_tunnel_find+0x9d/0xc0 [ip_tunnel] [ 3835.073008] PGD b2c21067 PUD b7277067 PMD 0 [ 3835.073008] Oops: 0000 [#1] SMP ..... [ 3835.073008] Stack: [ 3835.073008] ffff8800b72d77f0 ffffffffa0411924 ffff8800bb956000 ffff8800b72d78e0 [ 3835.073008] ffff8800b72d78a0 0000000000000000 ffffffffa040d100 ffff8800b72d7858 [ 3835.073008] ffffffffa040b2e3 0000000000000000 0000000000000000 0000000000000000 [ 3835.073008] Call Trace: [ 3835.073008] [] ip_tunnel_newlink+0x64/0x160 [ip_tunnel] [ 3835.073008] [] vti_newlink+0x43/0x70 [ip_vti] [ 3835.073008] [] rtnl_newlink+0x4fa/0x5f0 [ 3835.073008] [] ? nla_strlcpy+0x5b/0x70 [ 3835.073008] [] ? rtnl_link_ops_get+0x40/0x60 [ 3835.073008] [] ? rtnl_newlink+0x13f/0x5f0 [ 3835.073008] [] rtnetlink_rcv_msg+0xa4/0x270 [ 3835.073008] [] ? sock_has_perm+0x75/0x90 [ 3835.073008] [] ? rtnetlink_rcv+0x30/0x30 [ 3835.073008] [] netlink_rcv_skb+0xa9/0xc0 [ 3835.073008] [] rtnetlink_rcv+0x28/0x30 .... modprobe ip_vti ip link del ip_vti0 type vti ip link add ip_vti0 type vti rmmod ip_vti do that one or more times, kernel will panic. fix it by assigning ip_tunnel_dellink to vti_link_ops' dellink, in which we skip the unregister of fb tunnel device. do the same on ip6_vti. Signed-off-by: Xin Long Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 1 + net/ipv6/ip6_vti.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 3e861011e4a3..1a7e979e80ba 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -528,6 +528,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = { .validate = vti_tunnel_validate, .newlink = vti_newlink, .changelink = vti_changelink, + .dellink = ip_tunnel_dellink, .get_size = vti_get_size, .fill_info = vti_fill_info, }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 31089d153fd3..bcda14de7f84 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -905,6 +905,15 @@ static int vti6_newlink(struct net *src_net, struct net_device *dev, return vti6_tnl_create2(dev); } +static void vti6_dellink(struct net_device *dev, struct list_head *head) +{ + struct net *net = dev_net(dev); + struct vti6_net *ip6n = net_generic(net, vti6_net_id); + + if (dev != ip6n->fb_tnl_dev) + unregister_netdevice_queue(dev, head); +} + static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { @@ -980,6 +989,7 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .setup = vti6_dev_setup, .validate = vti6_validate, .newlink = vti6_newlink, + .dellink = vti6_dellink, .changelink = vti6_changelink, .get_size = vti6_get_size, .fill_info = vti6_fill_info, @@ -1020,6 +1030,7 @@ static int __net_init vti6_init_net(struct net *net) if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); + ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops; err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) -- cgit v1.2.3 From be6572fdb1bfbe23b2624d477de50af50b02f5d6 Mon Sep 17 00:00:00 2001 From: Yuri Chislov Date: Mon, 24 Nov 2014 11:25:15 +0100 Subject: ipv6: gre: fix wrong skb->protocol in WCCP When using GRE redirection in WCCP, it sets the wrong skb->protocol, that is, ETH_P_IP instead of ETH_P_IPV6 for the encapuslated traffic. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Cc: Dmitry Kozlov Signed-off-by: Yuri Chislov Tested-by: Yuri Chislov Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4564e1fca3eb..0e32d2e1bdbf 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -502,11 +502,11 @@ static int ip6gre_rcv(struct sk_buff *skb) skb->protocol = gre_proto; /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP + * - Change protocol to IPv6 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header */ if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { - skb->protocol = htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IPV6); if ((*(h + offset) & 0xF0) != 0x40) offset += 4; } -- cgit v1.2.3 From 6e58040b848162d77243cf499d026c6253278e29 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 24 Nov 2014 13:32:16 +0200 Subject: af_packet: fix sparse warning af_packet produces lots of these: net/packet/af_packet.c:384:39: warning: incorrect type in return expression (different modifiers) net/packet/af_packet.c:384:39: expected struct page [pure] * net/packet/af_packet.c:384:39: got struct page * this seems to be because sparse does not realize that _pure refers to function, not the returned pointer. Tweak code slightly to avoid the warning. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 87d20f48ff06..07c04a841ba0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -378,7 +378,7 @@ static void unregister_prot_hook(struct sock *sk, bool sync) __unregister_prot_hook(sk, sync); } -static inline __pure struct page *pgv_to_page(void *addr) +static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); -- cgit v1.2.3 From 91a0b603469069cdcce4d572b7525ffc9fd352a6 Mon Sep 17 00:00:00 2001 From: Jane Zhou Date: Mon, 24 Nov 2014 11:44:08 -0800 Subject: net/ping: handle protocol mismatching scenario ping_lookup() may return a wrong sock if sk_buff's and sock's protocols dont' match. For example, sk_buff's protocol is ETH_P_IPV6, but sock's sk_family is AF_INET, in that case, if sk->sk_bound_dev_if is zero, a wrong sock will be returned. the fix is to "continue" the searching, if no matching, return NULL. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: netdev@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Jane Zhou Signed-off-by: Yiwei Zhao Signed-off-by: David S. Miller --- net/ipv4/ping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c9804139..85a02a75ad83 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -217,6 +217,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) &ipv6_hdr(skb)->daddr)) continue; #endif + } else { + continue; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) -- cgit v1.2.3 From f3750817a9034bd8cbb5ba583cd544e7cf14c7d8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 24 Nov 2014 20:08:32 -0800 Subject: ip6_udp_tunnel: Fix checksum calculation The UDP checksum calculation for VXLAN tunnels is currently using the socket addresses instead of the actual packet source and destination addresses. As a result the checksum calculated is incorrect in some cases. Also uh->check was being set twice, first it was set to 0, and then it is set again in udp6_set_csum. This change removes the redundant assignment to 0. Fixes: acbf74a7 ("vxlan: Refactor vxlan driver to make use of the common UDP tunnel functions.") Cc: Andy Zhou Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv6/ip6_udp_tunnel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index b04ed72c4542..8db6c98fe218 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -79,15 +79,13 @@ int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst, uh->source = src_port; uh->len = htons(skb->len); - uh->check = 0; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_set(skb, dst); - udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr, - &sk->sk_v6_daddr, skb->len); + udp6_set_csum(udp_get_no_check6_tx(sk), skb, saddr, daddr, skb->len); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); -- cgit v1.2.3 From 43612d7c04f1a4f5e60104143918fcdf018b66ee Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 25 Nov 2014 19:54:47 +0100 Subject: Revert "netfilter: conntrack: fix race in __nf_conntrack_confirm against get_next_corpse" This reverts commit 5195c14c8b27cc0b18220ddbf0e5ad3328a04187. If the conntrack clashes with an existing one, it is left out of the unconfirmed list, thus, crashing when dropping the packet and releasing the conntrack since golden rule is that conntracks are always placed in any of the existing lists for traceability reasons. Reported-by: Daniel Borkmann Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=88841 Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2c699757bccf..5016a6929085 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -611,16 +611,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); pr_debug("Confirming conntrack %p\n", ct); - - /* We have to check the DYING flag after unlink to prevent - * a race against nf_ct_get_next_corpse() possibly called from - * user context, else we insert an already 'dead' hash, blocking - * further use of that particular connection -JM. - */ - nf_ct_del_from_dying_or_unconfirmed_list(ct); + /* We have to check the DYING flag inside the lock to prevent + a race against nf_ct_get_next_corpse() possibly called from + user context, else we insert an already 'dead' hash, blocking + further use of that particular connection -JM */ if (unlikely(nf_ct_is_dying(ct))) { - nf_ct_add_to_dying_list(ct); nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); return NF_ACCEPT; @@ -640,6 +636,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) goto out; + nf_ct_del_from_dying_or_unconfirmed_list(ct); + /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ -- cgit v1.2.3 From c3658e8d0f10147fc86018be7f11668246c156d3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Nov 2014 07:40:04 -0800 Subject: tcp: fix possible NULL dereference in tcp_vX_send_reset() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit ca777eff51f7 ("tcp: remove dst refcount false sharing for prequeue mode") we have to relax check against skb dst in tcp_v[46]_send_reset() if prequeue dropped the dst. If a socket is provided, a full lookup was done to find this socket, so the dst test can be skipped. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=88191 Reported-by: Jaša Bartelj Signed-off-by: Eric Dumazet Reported-by: Daniel Borkmann Fixes: ca777eff51f7 ("tcp: remove dst refcount false sharing for prequeue mode") Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 ++++- net/ipv6/tcp_ipv6.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9c7d7621466b..147be2024290 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -598,7 +598,10 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (skb_rtable(skb)->rt_type != RTN_LOCAL) + /* If sk not NULL, it means we did a successful lookup and incoming + * route had to be correct. prequeue might have dropped our dst. + */ + if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL) return; /* Swap the send and the receive. */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ace29b60813c..dc495ae2ead0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -903,7 +903,10 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) if (th->rst) return; - if (!ipv6_unicast_destination(skb)) + /* If sk not NULL, it means we did a successful lookup and incoming + * route had to be correct. prequeue might have dropped our dst. + */ + if (!sk && !ipv6_unicast_destination(skb)) return; #ifdef CONFIG_TCP_MD5SIG -- cgit v1.2.3 From 6e8d1c55454574a22b3e8e263b1a12888909b033 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 26 Nov 2014 13:42:16 +0100 Subject: bridge: Validate IFLA_BRIDGE_FLAGS attribute length Payload is currently accessed blindly and may exceed valid message boundaries. Fixes: 407af3299 ("bridge: Add netlink interface to configure vlans on bridge ports") Cc: Vlad Yasevich Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a6882686ca3a..5a853f8fad6c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2798,6 +2798,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + have_flags = true; flags = nla_get_u16(attr); break; @@ -2868,6 +2871,9 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + if (nla_len(attr) < sizeof(flags)) + return -EINVAL; + have_flags = true; flags = nla_get_u16(attr); break; -- cgit v1.2.3 From 6f705d8cfc0af3607aed890c53d86255e315c6f4 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 26 Nov 2014 13:42:19 +0100 Subject: bridge: Add missing policy entry for IFLA_BRPORT_FAST_LEAVE Fixes: c2d3babf ("bridge: implement multicast fast leave") Cc: David S. Miller Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ff9706647f2..e5ec470b851f 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -280,6 +280,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MODE] = { .type = NLA_U8 }, [IFLA_BRPORT_GUARD] = { .type = NLA_U8 }, [IFLA_BRPORT_PROTECT] = { .type = NLA_U8 }, + [IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 }, [IFLA_BRPORT_LEARNING] = { .type = NLA_U8 }, [IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 }, }; -- cgit v1.2.3 From aa68c20ff32f9a6fb3ca7f93ed9beae01899d00d Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 26 Nov 2014 13:42:20 +0100 Subject: bridge: Sanitize IFLA_EXT_MASK for AF_BRIDGE:RTM_GETLINK Only search for IFLA_EXT_MASK if the message actually carries a ifinfomsg header and validate minimal length requirements for IFLA_EXT_MASK. Fixes: 6cbdceeb ("bridge: Dump vlan information from a bridge port") Cc: Vlad Yasevich Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5a853f8fad6c..b9b7dfaf202b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2685,13 +2685,20 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = cb->nlh->nlmsg_seq; - struct nlattr *extfilt; u32 filter_mask = 0; - extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), - IFLA_EXT_MASK); - if (extfilt) - filter_mask = nla_get_u32(extfilt); + if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) { + struct nlattr *extfilt; + + extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg), + IFLA_EXT_MASK); + if (extfilt) { + if (nla_len(extfilt) < sizeof(filter_mask)) + return -EINVAL; + + filter_mask = nla_get_u32(extfilt); + } + } rcu_read_lock(); for_each_netdev_rcu(net, dev) { -- cgit v1.2.3 From f4713a3dfad045d46afcb9c2a7d0bba288920ed4 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 26 Nov 2014 14:53:02 -0500 Subject: net-timestamp: make tcp_recvmsg call ipv6_recv_error for AF_INET6 socks TCP timestamping introduced MSG_ERRQUEUE handling for TCP sockets. If the socket is of family AF_INET6, call ipv6_recv_error instead of ip_recv_error. This change is more complex than a single branch due to the loadable ipv6 module. It reuses a pre-existing indirect function call from ping. The ping code is safe to call, because it is part of the core ipv6 module and always present when AF_INET6 sockets are active. Fixes: 4ed2d765 (net-timestamp: TCP timestamping) Signed-off-by: Willem de Bruijn ---- It may also be worthwhile to add WARN_ON_ONCE(sk->family == AF_INET6) to ip_recv_error. Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 11 +++++++++++ net/ipv4/ping.c | 12 ++---------- net/ipv4/tcp.c | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8b7fe5b03906..e67da4e6c324 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1386,6 +1386,17 @@ out: return pp; } +int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) +{ + if (sk->sk_family == AF_INET) + return ip_recv_error(sk, msg, len, addr_len); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); +#endif + return -EINVAL; +} + static int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 85a02a75ad83..5d740cccf69e 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -855,16 +855,8 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (flags & MSG_OOB) goto out; - if (flags & MSG_ERRQUEUE) { - if (family == AF_INET) { - return ip_recv_error(sk, msg, len, addr_len); -#if IS_ENABLED(CONFIG_IPV6) - } else if (family == AF_INET6) { - return pingv6_ops.ipv6_recv_error(sk, msg, len, - addr_len); -#endif - } - } + if (flags & MSG_ERRQUEUE) + return inet_recv_error(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39ec0c379545..38c2bcb8dd5d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1598,7 +1598,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, u32 urg_hole = 0; if (unlikely(flags & MSG_ERRQUEUE)) - return ip_recv_error(sk, msg, len, addr_len); + return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) && (sk->sk_state == TCP_ESTABLISHED)) -- cgit v1.2.3