From 10b68487869031828aede7313c2befc53d6d30ec Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Mon, 27 Oct 2014 11:56:06 +0100
Subject: mac80211: flush keys for AP mode on ieee80211_do_stop

Userspace can add keys to an AP mode interface before start_ap has been
called. If there have been no calls to start_ap/stop_ap in the mean
time, the keys will still be around when the interface is brought down.

Signed-off-by: Felix Fietkau <nbd@openwrt.org>
[adjust comments, fix AP_VLAN case]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index af237223a8cd..3b9e2b7b3f30 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -898,6 +898,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 		list_del(&sdata->u.vlan.list);
 		mutex_unlock(&local->mtx);
 		RCU_INIT_POINTER(sdata->vif.chanctx_conf, NULL);
+		/* see comment in the default case below */
+		ieee80211_free_keys(sdata, true);
 		/* no need to tell driver */
 		break;
 	case NL80211_IFTYPE_MONITOR:
@@ -923,17 +925,16 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 		/*
 		 * When we get here, the interface is marked down.
 		 * Free the remaining keys, if there are any
-		 * (shouldn't be, except maybe in WDS mode?)
+		 * (which can happen in AP mode if userspace sets
+		 * keys before the interface is operating, and maybe
+		 * also in WDS mode)
 		 *
 		 * Force the key freeing to always synchronize_net()
 		 * to wait for the RX path in case it is using this
-		 * interface enqueuing frames * at this very time on
+		 * interface enqueuing frames at this very time on
 		 * another CPU.
 		 */
 		ieee80211_free_keys(sdata, true);
-
-		/* fall through */
-	case NL80211_IFTYPE_AP:
 		skb_queue_purge(&sdata->skb_queue);
 	}
 
-- 
cgit v1.2.3


From 84469a45a1bedec9918e94ab2f78c5dc0739e4a7 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <luciano.coelho@intel.com>
Date: Tue, 28 Oct 2014 13:33:04 +0200
Subject: mac80211: use secondary channel offset IE also beacons during CSA

If we are switching from an HT40+ to an HT40- channel (or vice-versa),
we need the secondary channel offset IE to specify what is the
post-CSA offset to be used.  This applies both to beacons and to probe
responses.

In ieee80211_parse_ch_switch_ie() we were ignoring this IE from
beacons and using the *current* HT information IE instead.  This was
causing us to use the same offset as before the switch.

Fix that by using the secondary channel offset IE also for beacons and
don't ever use the pre-switch offset.  Additionally, remove the
"beacon" argument from ieee80211_parse_ch_switch_ie(), since it's not
needed anymore.

Cc: stable@vger.kernel.org
Reported-by: Jouni Malinen <j@w1.fi>
Signed-off-by: Luciano Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ibss.c        |  2 +-
 net/mac80211/ieee80211_i.h |  3 +--
 net/mac80211/mesh.c        |  2 +-
 net/mac80211/mlme.c        |  2 +-
 net/mac80211/spectmgmt.c   | 18 ++++++------------
 5 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 56b53571c807..509bc157ce55 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -805,7 +805,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 
 	memset(&params, 0, sizeof(params));
 	memset(&csa_ie, 0, sizeof(csa_ie));
-	err = ieee80211_parse_ch_switch_ie(sdata, elems, beacon,
+	err = ieee80211_parse_ch_switch_ie(sdata, elems,
 					   ifibss->chandef.chan->band,
 					   sta_flags, ifibss->bssid, &csa_ie);
 	/* can't switch to destination channel, fail */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c2aaec4dfcf0..8c68da30595d 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1642,7 +1642,6 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
  * ieee80211_parse_ch_switch_ie - parses channel switch IEs
  * @sdata: the sdata of the interface which has received the frame
  * @elems: parsed 802.11 elements received with the frame
- * @beacon: indicates if the frame was a beacon or probe response
  * @current_band: indicates the current band
  * @sta_flags: contains information about own capabilities and restrictions
  *	to decide which channel switch announcements can be accepted. Only the
@@ -1656,7 +1655,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
  * Return: 0 on success, <0 on error and >0 if there is nothing to parse.
  */
 int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
-				 struct ieee802_11_elems *elems, bool beacon,
+				 struct ieee802_11_elems *elems,
 				 enum ieee80211_band current_band,
 				 u32 sta_flags, u8 *bssid,
 				 struct ieee80211_csa_ie *csa_ie);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index e9f99c1e3fad..0c8b2a77d312 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -874,7 +874,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata,
 
 	memset(&params, 0, sizeof(params));
 	memset(&csa_ie, 0, sizeof(csa_ie));
-	err = ieee80211_parse_ch_switch_ie(sdata, elems, beacon, band,
+	err = ieee80211_parse_ch_switch_ie(sdata, elems, band,
 					   sta_flags, sdata->vif.addr,
 					   &csa_ie);
 	if (err < 0)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2de88704278b..08f51c6d0953 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1072,7 +1072,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 
 	current_band = cbss->channel->band;
 	memset(&csa_ie, 0, sizeof(csa_ie));
-	res = ieee80211_parse_ch_switch_ie(sdata, elems, beacon, current_band,
+	res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band,
 					   ifmgd->flags,
 					   ifmgd->associated->bssid, &csa_ie);
 	if (res	< 0)
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 6ab009070084..efeba56c913b 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -22,7 +22,7 @@
 #include "wme.h"
 
 int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
-				 struct ieee802_11_elems *elems, bool beacon,
+				 struct ieee802_11_elems *elems,
 				 enum ieee80211_band current_band,
 				 u32 sta_flags, u8 *bssid,
 				 struct ieee80211_csa_ie *csa_ie)
@@ -91,19 +91,13 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata,
 		return -EINVAL;
 	}
 
-	if (!beacon && sec_chan_offs) {
+	if (sec_chan_offs) {
 		secondary_channel_offset = sec_chan_offs->sec_chan_offs;
-	} else if (beacon && ht_oper) {
-		secondary_channel_offset =
-			ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET;
 	} else if (!(sta_flags & IEEE80211_STA_DISABLE_HT)) {
-		/* If it's not a beacon, HT is enabled and the IE not present,
-		 * it's 20 MHz, 802.11-2012 8.5.2.6:
-		 *	This element [the Secondary Channel Offset Element] is
-		 *	present when switching to a 40 MHz channel. It may be
-		 *	present when switching to a 20 MHz channel (in which
-		 *	case the secondary channel offset is set to SCN).
-		 */
+		/* If the secondary channel offset IE is not present,
+		 * we can't know what's the post-CSA offset, so the
+		 * best we can do is use 20MHz.
+		*/
 		secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
 	}
 
-- 
cgit v1.2.3


From ff1e417c7c239b7abfe70aa90460a77eaafc7f83 Mon Sep 17 00:00:00 2001
From: Luciano Coelho <luciano.coelho@intel.com>
Date: Tue, 28 Oct 2014 13:33:05 +0200
Subject: mac80211: schedule the actual switch of the station before CSA count
 0

Due to the time it takes to process the beacon that started the CSA
process, we may be late for the switch if we try to reach exactly
beacon 0.  To avoid that, use count - 1 when calculating the switch time.

Cc: stable@vger.kernel.org
Reported-by: Jouni Malinen <j@w1.fi>
Signed-off-by: Luciano Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 08f51c6d0953..93af0f1c9d99 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1168,7 +1168,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
 		ieee80211_queue_work(&local->hw, &ifmgd->chswitch_work);
 	else
 		mod_timer(&ifmgd->chswitch_timer,
-			  TU_TO_EXP_TIME(csa_ie.count * cbss->beacon_interval));
+			  TU_TO_EXP_TIME((csa_ie.count - 1) *
+					 cbss->beacon_interval));
 }
 
 static bool
-- 
cgit v1.2.3


From 46238845bd609a5c0fbe076e1b82b4c5b33360b2 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes@sipsolutions.net>
Date: Tue, 21 Oct 2014 20:56:42 +0200
Subject: mac80211: properly flush delayed scan work on interface removal

When an interface is deleted, an ongoing hardware scan is canceled and
the driver must abort the scan, at the very least reporting completion
while the interface is removed.

However, if it scheduled the work that might only run after everything
is said and done, which leads to cfg80211 warning that the scan isn't
reported as finished yet; this is no fault of the driver, it already
did, but mac80211 hasn't processed it.

To fix this situation, flush the delayed work when the interface being
removed is the one that was executing the scan.

Cc: stable@vger.kernel.org
Reported-by: Sujith Manoharan <sujith@msujith.org>
Tested-by: Sujith Manoharan <sujith@msujith.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 3b9e2b7b3f30..653f5eb07a27 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -766,10 +766,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 	int i, flushed;
 	struct ps_data *ps;
 	struct cfg80211_chan_def chandef;
+	bool cancel_scan;
 
 	clear_bit(SDATA_STATE_RUNNING, &sdata->state);
 
-	if (rcu_access_pointer(local->scan_sdata) == sdata)
+	cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata;
+	if (cancel_scan)
 		ieee80211_scan_cancel(local);
 
 	/*
@@ -992,6 +994,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
 
 	ieee80211_recalc_ps(local, -1);
 
+	if (cancel_scan)
+		flush_delayed_work(&local->scan_work);
+
 	if (local->open_count == 0) {
 		ieee80211_stop_device(local);
 
-- 
cgit v1.2.3


From b8fff407a180286aa683d543d878d98d9fc57b13 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 3 Nov 2014 13:57:46 +0100
Subject: mac80211: fix use-after-free in defragmentation

Upon receiving the last fragment, all but the first fragment
are freed, but the multicast check for statistics at the end
of the function refers to the current skb (the last fragment)
causing a use-after-free bug.

Since multicast frames cannot be fragmented and we check for
this early in the function, just modify that check to also
do the accounting to fix the issue.

Cc: stable@vger.kernel.org
Reported-by: Yosef Khyal <yosefx.khyal@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b04ca4049c95..a37f9af634cb 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1678,11 +1678,14 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
 	sc = le16_to_cpu(hdr->seq_ctrl);
 	frag = sc & IEEE80211_SCTL_FRAG;
 
-	if (likely((!ieee80211_has_morefrags(fc) && frag == 0) ||
-		   is_multicast_ether_addr(hdr->addr1))) {
-		/* not fragmented */
+	if (likely(!ieee80211_has_morefrags(fc) && frag == 0))
+		goto out;
+
+	if (is_multicast_ether_addr(hdr->addr1)) {
+		rx->local->dot11MulticastReceivedFrameCount++;
 		goto out;
 	}
+
 	I802_DEBUG_INC(rx->local->rx_handlers_fragments);
 
 	if (skb_linearize(rx->skb))
@@ -1775,10 +1778,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
  out:
 	if (rx->sta)
 		rx->sta->rx_packets++;
-	if (is_multicast_ether_addr(hdr->addr1))
-		rx->local->dot11MulticastReceivedFrameCount++;
-	else
-		ieee80211_led_rx(rx->local);
+	ieee80211_led_rx(rx->local);
 	return RX_CONTINUE;
 }
 
-- 
cgit v1.2.3


From c1207c049b204b0a96535dc5416aee331b51e0e1 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sun, 2 Nov 2014 18:19:15 -0800
Subject: netfilter: nft_reject_bridge: Fix powerpc build error

Fix:
net/bridge/netfilter/nft_reject_bridge.c:
In function 'nft_reject_br_send_v6_unreach':
net/bridge/netfilter/nft_reject_bridge.c:240:3:
	error: implicit declaration of function 'csum_ipv6_magic'
   csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
   ^
make[3]: *** [net/bridge/netfilter/nft_reject_bridge.o] Error 1

Seen with powerpc:allmodconfig.

Fixes: 523b929d5446 ("netfilter: nft_reject_bridge: don't use IP stack to reject traffic")
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/nft_reject_bridge.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 654c9018e3e7..48da2c54a69e 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -18,6 +18,7 @@
 #include <net/netfilter/ipv6/nf_reject.h>
 #include <linux/ip.h>
 #include <net/ip.h>
+#include <net/ip6_checksum.h>
 #include <linux/netfilter_bridge.h>
 #include "../br_private.h"
 
-- 
cgit v1.2.3


From 6c6151daaf2d8dc2046d9926539feed5f66bf74e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 3 Nov 2014 09:19:27 +0100
Subject: ip6_tunnel: Use ip6_tnl_dev_init as the ndo_init function.

ip6_tnl_dev_init() sets the dev->iflink via a call to
ip6_tnl_link_config(). After that, register_netdevice()
sets dev->iflink = -1. So we loose the iflink configuration
for ipv6 tunnels. Fix this by using ip6_tnl_dev_init() as the
ndo_init function. Then ip6_tnl_dev_init() is called after
dev->iflink is set to -1 from register_netdevice().

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9409887fb664..9cb94cfa0ae7 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -272,9 +272,6 @@ static int ip6_tnl_create2(struct net_device *dev)
 	int err;
 
 	t = netdev_priv(dev);
-	err = ip6_tnl_dev_init(dev);
-	if (err < 0)
-		goto out;
 
 	err = register_netdevice(dev);
 	if (err < 0)
@@ -1462,6 +1459,7 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
 
 
 static const struct net_device_ops ip6_tnl_netdev_ops = {
+	.ndo_init	= ip6_tnl_dev_init,
 	.ndo_uninit	= ip6_tnl_dev_uninit,
 	.ndo_start_xmit = ip6_tnl_xmit,
 	.ndo_do_ioctl	= ip6_tnl_ioctl,
@@ -1546,16 +1544,10 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct net *net = dev_net(dev);
 	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
-	int err = ip6_tnl_dev_init_gen(dev);
-
-	if (err)
-		return err;
 
 	t->parms.proto = IPPROTO_IPV6;
 	dev_hold(dev);
 
-	ip6_tnl_link_config(t);
-
 	rcu_assign_pointer(ip6n->tnls_wc[0], t);
 	return 0;
 }
-- 
cgit v1.2.3


From 16a0231bf7dc3fb37e9b1f1cb1a277dc220b5c5e Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 3 Nov 2014 09:19:28 +0100
Subject: vti6: Use vti6_dev_init as the ndo_init function.

vti6_dev_init() sets the dev->iflink via a call to
vti6_link_config(). After that, register_netdevice()
sets dev->iflink = -1. So we loose the iflink configuration
for vti6 tunnels. Fix this by using vti6_dev_init() as the
ndo_init function. Then vti6_dev_init() is called after
dev->iflink is set to -1 from register_netdevice().

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_vti.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index d440bb585524..31089d153fd3 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -172,10 +172,6 @@ static int vti6_tnl_create2(struct net_device *dev)
 	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
 	int err;
 
-	err = vti6_dev_init(dev);
-	if (err < 0)
-		goto out;
-
 	err = register_netdevice(dev);
 	if (err < 0)
 		goto out;
@@ -783,6 +779,7 @@ static int vti6_change_mtu(struct net_device *dev, int new_mtu)
 }
 
 static const struct net_device_ops vti6_netdev_ops = {
+	.ndo_init	= vti6_dev_init,
 	.ndo_uninit	= vti6_dev_uninit,
 	.ndo_start_xmit = vti6_tnl_xmit,
 	.ndo_do_ioctl	= vti6_ioctl,
@@ -852,16 +849,10 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev)
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct net *net = dev_net(dev);
 	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
-	int err = vti6_dev_init_gen(dev);
-
-	if (err)
-		return err;
 
 	t->parms.proto = IPPROTO_IPV6;
 	dev_hold(dev);
 
-	vti6_link_config(t);
-
 	rcu_assign_pointer(ip6n->tnls_wc[0], t);
 	return 0;
 }
-- 
cgit v1.2.3


From ebe084aafb7e93adf210e80043c9f69adf56820d Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 3 Nov 2014 09:19:29 +0100
Subject: sit: Use ipip6_tunnel_init as the ndo_init function.

ipip6_tunnel_init() sets the dev->iflink via a call to
ipip6_tunnel_bind_dev(). After that, register_netdevice()
sets dev->iflink = -1. So we loose the iflink configuration
for ipv6 tunnels. Fix this by using ipip6_tunnel_init() as the
ndo_init function. Then ipip6_tunnel_init() is called after
dev->iflink is set to -1 from register_netdevice().

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 58e5b4710127..a24557a1c1d8 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -195,10 +195,8 @@ static int ipip6_tunnel_create(struct net_device *dev)
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 	int err;
 
-	err = ipip6_tunnel_init(dev);
-	if (err < 0)
-		goto out;
-	ipip6_tunnel_clone_6rd(dev, sitn);
+	memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
+	memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
 
 	if ((__force u16)t->parms.i_flags & SIT_ISATAP)
 		dev->priv_flags |= IFF_ISATAP;
@@ -207,7 +205,8 @@ static int ipip6_tunnel_create(struct net_device *dev)
 	if (err < 0)
 		goto out;
 
-	strcpy(t->parms.name, dev->name);
+	ipip6_tunnel_clone_6rd(dev, sitn);
+
 	dev->rtnl_link_ops = &sit_link_ops;
 
 	dev_hold(dev);
@@ -1330,6 +1329,7 @@ static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 }
 
 static const struct net_device_ops ipip6_netdev_ops = {
+	.ndo_init	= ipip6_tunnel_init,
 	.ndo_uninit	= ipip6_tunnel_uninit,
 	.ndo_start_xmit	= sit_tunnel_xmit,
 	.ndo_do_ioctl	= ipip6_tunnel_ioctl,
@@ -1378,9 +1378,7 @@ static int ipip6_tunnel_init(struct net_device *dev)
 
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
-
-	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
-	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+	strcpy(tunnel->parms.name, dev->name);
 
 	ipip6_tunnel_bind_dev(dev);
 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
@@ -1405,7 +1403,6 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 
 	tunnel->dev = dev;
 	tunnel->net = dev_net(dev);
-	strcpy(tunnel->parms.name, dev->name);
 
 	iph->version		= 4;
 	iph->protocol		= IPPROTO_IPV6;
-- 
cgit v1.2.3


From f03eb128e3f4276f46442d14f3b8f864f3775821 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 3 Nov 2014 09:19:30 +0100
Subject: gre6: Move the setting of dev->iflink into the ndo_init functions.

Otherwise it gets overwritten by register_netdev().

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 12c3c8ef3849..4564e1fca3eb 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -961,8 +961,6 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 	else
 		dev->flags &= ~IFF_POINTOPOINT;
 
-	dev->iflink = p->link;
-
 	/* Precalculate GRE options length */
 	if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
 		if (t->parms.o_flags&GRE_CSUM)
@@ -1272,6 +1270,7 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 		u64_stats_init(&ip6gre_tunnel_stats->syncp);
 	}
 
+	dev->iflink = tunnel->parms.link;
 
 	return 0;
 }
@@ -1481,6 +1480,8 @@ static int ip6gre_tap_init(struct net_device *dev)
 	if (!dev->tstats)
 		return -ENOMEM;
 
+	dev->iflink = tunnel->parms.link;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 45cac46e51da75628ac2a593c70f5144abb9b31d Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Mon, 3 Nov 2014 19:38:37 -0800
Subject: geneve: Set GSO type on transmit.

Geneve does not currently set the inner protocol type when
transmitting packets. This causes GSO segmentation to fail on NICs
that do not support Geneve offloading.

CC: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/geneve.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 065cd94c640c..6e5266cf403d 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -144,6 +144,8 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
 	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
 
+	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
 	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
 				   tos, ttl, df, src_port, dst_port, xnet);
 }
-- 
cgit v1.2.3


From d3ca9eafc0ed97b8f56fdf23655cfece89c48354 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Mon, 3 Nov 2014 19:38:38 -0800
Subject: geneve: Unregister pernet subsys on module unload.

The pernet ops aren't ever unregistered, which causes a memory
leak and an OOPs if the module is ever reinserted.

Fixes: 0b5e8b8eeae4 ("net: Add Geneve tunneling protocol driver")
CC: Andy Zhou <azhou@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/geneve.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 6e5266cf403d..dedb21e99914 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -366,6 +366,7 @@ late_initcall(geneve_init_module);
 static void __exit geneve_cleanup_module(void)
 {
 	destroy_workqueue(geneve_wq);
+	unregister_pernet_subsys(&geneve_net_ops);
 }
 module_exit(geneve_cleanup_module);
 
-- 
cgit v1.2.3


From 1f37bf87aa7523d28e7e4c4f7bb5dba98faa3e00 Mon Sep 17 00:00:00 2001
From: Marcelo Leitner <mleitner@redhat.com>
Date: Tue, 4 Nov 2014 17:15:08 -0200
Subject: tcp: zero retrans_stamp if all retrans were acked

Ueki Kohei reported that when we are using NewReno with connections that
have a very low traffic, we may timeout the connection too early if a
second loss occurs after the first one was successfully acked but no
data was transfered later. Below is his description of it:

When SACK is disabled, and a socket suffers multiple separate TCP
retransmissions, that socket's ETIMEDOUT value is calculated from the
time of the *first* retransmission instead of the *latest*
retransmission.

This happens because the tcp_sock's retrans_stamp is set once then never
cleared.

Take the following connection:

                      Linux                    remote-machine
                        |                           |
         send#1---->(*1)|--------> data#1 --------->|
                  |     |                           |
                 RTO    :                           :
                  |     |                           |
                 ---(*2)|----> data#1(retrans) ---->|
                  | (*3)|<---------- ACK <----------|
                  |     |                           |
                  |     :                           :
                  |     :                           :
                  |     :                           :
                16 minutes (or more)                :
                  |     :                           :
                  |     :                           :
                  |     :                           :
                  |     |                           |
         send#2---->(*4)|--------> data#2 --------->|
                  |     |                           |
                 RTO    :                           :
                  |     |                           |
                 ---(*5)|----> data#2(retrans) ---->|
                  |     |                           |
                  |     |                           |
                RTO*2   :                           :
                  |     |                           |
                  |     |                           |
      ETIMEDOUT<----(*6)|                           |

(*1) One data packet sent.
(*2) Because no ACK packet is received, the packet is retransmitted.
(*3) The ACK packet is received. The transmitted packet is acknowledged.

At this point the first "retransmission event" has passed and been
recovered from. Any future retransmission is a completely new "event".

(*4) After 16 minutes (to correspond with retries2=15), a new data
packet is sent. Note: No data is transmitted between (*3) and (*4).

The socket's timeout SHOULD be calculated from this point in time, but
instead it's calculated from the prior "event" 16 minutes ago.

(*5) Because no ACK packet is received, the packet is retransmitted.
(*6) At the time of the 2nd retransmission, the socket returns
ETIMEDOUT.

Therefore, now we clear retrans_stamp as soon as all data during the
loss window is fully acked.

Reported-by: Ueki Kohei
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Tested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 60 +++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a12b455928e5..88fa2d160685 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2315,6 +2315,35 @@ static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
 
 /* Undo procedures. */
 
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static bool tcp_any_retrans_done(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (tp->retrans_out)
+		return true;
+
+	skb = tcp_write_queue_head(sk);
+	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+		return true;
+
+	return false;
+}
+
 #if FASTRETRANS_DEBUG > 1
 static void DBGUNDO(struct sock *sk, const char *msg)
 {
@@ -2410,6 +2439,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
 		 * is ACKed. For Reno it is MUST to prevent false
 		 * fast retransmits (RFC2582). SACK TCP is safe. */
 		tcp_moderate_cwnd(tp);
+		if (!tcp_any_retrans_done(sk))
+			tp->retrans_stamp = 0;
 		return true;
 	}
 	tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2430,35 +2461,6 @@ static bool tcp_try_undo_dsack(struct sock *sk)
 	return false;
 }
 
-/* We can clear retrans_stamp when there are no retransmissions in the
- * window. It would seem that it is trivially available for us in
- * tp->retrans_out, however, that kind of assumptions doesn't consider
- * what will happen if errors occur when sending retransmission for the
- * second time. ...It could the that such segment has only
- * TCPCB_EVER_RETRANS set at the present time. It seems that checking
- * the head skb is enough except for some reneging corner cases that
- * are not worth the effort.
- *
- * Main reason for all this complexity is the fact that connection dying
- * time now depends on the validity of the retrans_stamp, in particular,
- * that successive retransmissions of a segment must not advance
- * retrans_stamp under any conditions.
- */
-static bool tcp_any_retrans_done(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-
-	if (tp->retrans_out)
-		return true;
-
-	skb = tcp_write_queue_head(sk);
-	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
-		return true;
-
-	return false;
-}
-
 /* Undo during loss recovery after partial ACK or using F-RTO. */
 static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
 {
-- 
cgit v1.2.3


From 4f031fa9f188b2b0641ac20087d9e16bcfb4e49d Mon Sep 17 00:00:00 2001
From: Ronald Wahl <ronald.wahl@raritan.com>
Date: Thu, 6 Nov 2014 11:52:13 +0100
Subject: mac80211: Fix regression that triggers a kernel BUG with CCMP

Commit 7ec7c4a9a686c608315739ab6a2b0527a240883c (mac80211: port CCMP to
cryptoapi's CCM driver) introduced a regression when decrypting empty
packets (data_len == 0). This will lead to backtraces like:

(scatterwalk_start) from [<c01312f4>] (scatterwalk_map_and_copy+0x2c/0xa8)
(scatterwalk_map_and_copy) from [<c013a5a0>] (crypto_ccm_decrypt+0x7c/0x25c)
(crypto_ccm_decrypt) from [<c032886c>] (ieee80211_aes_ccm_decrypt+0x160/0x170)
(ieee80211_aes_ccm_decrypt) from [<c031c628>] (ieee80211_crypto_ccmp_decrypt+0x1ac/0x238)
(ieee80211_crypto_ccmp_decrypt) from [<c032ef28>] (ieee80211_rx_handlers+0x870/0x1d24)
(ieee80211_rx_handlers) from [<c0330c7c>] (ieee80211_prepare_and_rx_handle+0x8a0/0x91c)
(ieee80211_prepare_and_rx_handle) from [<c0331260>] (ieee80211_rx+0x568/0x730)
(ieee80211_rx) from [<c01d3054>] (__carl9170_rx+0x94c/0xa20)
(__carl9170_rx) from [<c01d3324>] (carl9170_rx_stream+0x1fc/0x320)
(carl9170_rx_stream) from [<c01cbccc>] (carl9170_usb_tasklet+0x80/0xc8)
(carl9170_usb_tasklet) from [<c00199dc>] (tasklet_hi_action+0x88/0xcc)
(tasklet_hi_action) from [<c00193c8>] (__do_softirq+0xcc/0x200)
(__do_softirq) from [<c0019734>] (irq_exit+0x80/0xe0)
(irq_exit) from [<c0009c10>] (handle_IRQ+0x64/0x80)
(handle_IRQ) from [<c000c3a0>] (__irq_svc+0x40/0x4c)
(__irq_svc) from [<c0009d44>] (arch_cpu_idle+0x2c/0x34)

Such packets can appear for example when using the carl9170 wireless driver
because hardware sometimes generates garbage when the internal FIFO overruns.

This patch adds an additional length check.

Cc: stable@vger.kernel.org
Fixes: 7ec7c4a9a686 ("mac80211: port CCMP to cryptoapi's CCM driver")
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Ronald Wahl <ronald.wahl@raritan.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/aes_ccm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c
index ec24378caaaf..09d9caaec591 100644
--- a/net/mac80211/aes_ccm.c
+++ b/net/mac80211/aes_ccm.c
@@ -53,6 +53,9 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad,
 		__aligned(__alignof__(struct aead_request));
 	struct aead_request *aead_req = (void *) aead_req_data;
 
+	if (data_len == 0)
+		return -EINVAL;
+
 	memset(aead_req, 0, sizeof(aead_req_data));
 
 	sg_init_one(&pt, data, data_len);
-- 
cgit v1.2.3


From b31f65fb4383a49bdcfa465176754b37e44e1e17 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 5 Nov 2014 19:47:28 +0100
Subject: net: dsa: slave: Fix autoneg for phys on switch MDIO bus

When the ports phys are connected to the switches internal MDIO bus,
we need to connect the phy to the slave netdev, otherwise
auto-negotiation etc, does not work.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6d1817449c36..ab03e00ffe8f 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -489,11 +489,14 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
 	/* We could not connect to a designated PHY, so use the switch internal
 	 * MDIO bus instead
 	 */
-	if (!p->phy)
+	if (!p->phy) {
 		p->phy = ds->slave_mii_bus->phy_map[p->port];
-	else
+		phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
+				   p->phy_interface);
+	} else {
 		pr_info("attached PHY at address %d [%s]\n",
 			p->phy->addr, p->phy->drv->name);
+	}
 }
 
 int dsa_slave_suspend(struct net_device *slave_dev)
-- 
cgit v1.2.3


From 6b96686ecffcbea85dcb502e4584e4a20a2bfb29 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 7 Nov 2014 15:34:54 +0100
Subject: netfilter: nft_masq: fix uninitialized range in nft_masq_{ipv4,
 ipv6}_eval

When transferring from the original range in nf_nat_masquerade_{ipv4,ipv6}()
we copy over values from stack in from min_proto/max_proto due to uninitialized
range variable in both, nft_masq_{ipv4,ipv6}_eval. As we only initialize
flags at this time from nft_masq struct, just zero out the rest.

Fixes: 9ba1f726bec09 ("netfilter: nf_tables: add new nft_masq expression")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nft_masq_ipv4.c | 1 +
 net/ipv6/netfilter/nft_masq_ipv6.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index c1023c445920..665de06561cd 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -24,6 +24,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 	struct nf_nat_range range;
 	unsigned int verdict;
 
+	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
 
 	verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index 8a7ac685076d..529c119cbb14 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -25,6 +25,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 	struct nf_nat_range range;
 	unsigned int verdict;
 
+	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
 
 	verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
-- 
cgit v1.2.3


From cfdf1e1ba5bf55e095cf4bcaa9585c4759f239e8 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Mon, 10 Nov 2014 11:45:13 -0800
Subject: udptunnel: Add SKB_GSO_UDP_TUNNEL during gro_complete.

When doing GRO processing for UDP tunnels, we never add
SKB_GSO_UDP_TUNNEL to gso_type - only the type of the inner protocol
is added (such as SKB_GSO_TCPV4). The result is that if the packet is
later resegmented we will do GSO but not treat it as a tunnel. This
results in UDP fragmentation of the outer header instead of (i.e.) TCP
segmentation of the inner header as was originally on the wire.

Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 32e78924e246..606c520ffd5a 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -133,6 +133,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff)
 	int err = -ENOSYS;
 	const struct net_offload **offloads;
 
+	udp_tunnel_gro_complete(skb, nhoff);
+
 	rcu_read_lock();
 	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
 	ops = rcu_dereference(offloads[proto]);
-- 
cgit v1.2.3


From 2196937e12b1b4ba139806d132647e1651d655df Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 10 Nov 2014 17:11:21 +0100
Subject: netfilter: ipset: small potential read beyond the end of buffer

We could be reading 8 bytes into a 4 byte buffer here.  It seems
harmless but adding a check is the right thing to do and it silences a
static checker warning.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 86f9d76b1464..d259da3ce67a 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1863,6 +1863,12 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 	if (*op < IP_SET_OP_VERSION) {
 		/* Check the version at the beginning of operations */
 		struct ip_set_req_version *req_version = data;
+
+		if (*len < sizeof(struct ip_set_req_version)) {
+			ret = -EINVAL;
+			goto done;
+		}
+
 		if (req_version->version != IPSET_PROTOCOL) {
 			ret = -EPROTO;
 			goto done;
-- 
cgit v1.2.3


From e40607cbe270a9e8360907cb1e62ddf0736e4864 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 10 Nov 2014 17:54:26 +0100
Subject: net: sctp: fix NULL pointer dereference in af->from_addr_param on
 malformed packet

An SCTP server doing ASCONF will panic on malformed INIT ping-of-death
in the form of:

  ------------ INIT[PARAM: SET_PRIMARY_IP] ------------>

While the INIT chunk parameter verification dissects through many things
in order to detect malformed input, it misses to actually check parameters
inside of parameters. E.g. RFC5061, section 4.2.4 proposes a 'set primary
IP address' parameter in ASCONF, which has as a subparameter an address
parameter.

So an attacker may send a parameter type other than SCTP_PARAM_IPV4_ADDRESS
or SCTP_PARAM_IPV6_ADDRESS, param_type2af() will subsequently return 0
and thus sctp_get_af_specific() returns NULL, too, which we then happily
dereference unconditionally through af->from_addr_param().

The trace for the log:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000078
IP: [<ffffffffa01e9c62>] sctp_process_init+0x492/0x990 [sctp]
PGD 0
Oops: 0000 [#1] SMP
[...]
Pid: 0, comm: swapper Not tainted 2.6.32-504.el6.x86_64 #1 Bochs Bochs
RIP: 0010:[<ffffffffa01e9c62>]  [<ffffffffa01e9c62>] sctp_process_init+0x492/0x990 [sctp]
[...]
Call Trace:
 <IRQ>
 [<ffffffffa01f2add>] ? sctp_bind_addr_copy+0x5d/0xe0 [sctp]
 [<ffffffffa01e1fcb>] sctp_sf_do_5_1B_init+0x21b/0x340 [sctp]
 [<ffffffffa01e3751>] sctp_do_sm+0x71/0x1210 [sctp]
 [<ffffffffa01e5c09>] ? sctp_endpoint_lookup_assoc+0xc9/0xf0 [sctp]
 [<ffffffffa01e61f6>] sctp_endpoint_bh_rcv+0x116/0x230 [sctp]
 [<ffffffffa01ee986>] sctp_inq_push+0x56/0x80 [sctp]
 [<ffffffffa01fcc42>] sctp_rcv+0x982/0xa10 [sctp]
 [<ffffffffa01d5123>] ? ipt_local_in_hook+0x23/0x28 [iptable_filter]
 [<ffffffff8148bdc9>] ? nf_iterate+0x69/0xb0
 [<ffffffff81496d10>] ? ip_local_deliver_finish+0x0/0x2d0
 [<ffffffff8148bf86>] ? nf_hook_slow+0x76/0x120
 [<ffffffff81496d10>] ? ip_local_deliver_finish+0x0/0x2d0
[...]

A minimal way to address this is to check for NULL as we do on all
other such occasions where we know sctp_get_af_specific() could
possibly return with NULL.

Fixes: d6de3097592b ("[SCTP]: Add the handling of "Set Primary IP Address" parameter to INIT")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_make_chunk.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index ab734be8cb20..9f32741abb1c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2609,6 +2609,9 @@ do_addr_param:
 		addr_param = param.v + sizeof(sctp_addip_param_t);
 
 		af = sctp_get_af_specific(param_type2af(param.p->type));
+		if (af == NULL)
+			break;
+
 		af->from_addr_param(&addr, addr_param,
 				    htons(asoc->peer.port), 0);
 
-- 
cgit v1.2.3


From 4184b2a79a7612a9272ce20d639934584a1f3786 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 10 Nov 2014 18:00:09 +0100
Subject: net: sctp: fix memory leak in auth key management

A very minimal and simple user space application allocating an SCTP
socket, setting SCTP_AUTH_KEY setsockopt(2) on it and then closing
the socket again will leak the memory containing the authentication
key from user space:

unreferenced object 0xffff8800837047c0 (size 16):
  comm "a.out", pid 2789, jiffies 4296954322 (age 192.258s)
  hex dump (first 16 bytes):
    01 00 00 00 04 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff816d7e8e>] kmemleak_alloc+0x4e/0xb0
    [<ffffffff811c88d8>] __kmalloc+0xe8/0x270
    [<ffffffffa0870c23>] sctp_auth_create_key+0x23/0x50 [sctp]
    [<ffffffffa08718b1>] sctp_auth_set_key+0xa1/0x140 [sctp]
    [<ffffffffa086b383>] sctp_setsockopt+0xd03/0x1180 [sctp]
    [<ffffffff815bfd94>] sock_common_setsockopt+0x14/0x20
    [<ffffffff815beb61>] SyS_setsockopt+0x71/0xd0
    [<ffffffff816e58a9>] system_call_fastpath+0x12/0x17
    [<ffffffffffffffff>] 0xffffffffffffffff

This is bad because of two things, we can bring down a machine from
user space when auth_enable=1, but also we would leave security sensitive
keying material in memory without clearing it after use. The issue is
that sctp_auth_create_key() already sets the refcount to 1, but after
allocation sctp_auth_set_key() does an additional refcount on it, and
thus leaving it around when we free the socket.

Fixes: 65b07e5d0d0 ("[SCTP]: API updates to suport SCTP-AUTH extensions.")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/auth.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 0e8529113dc5..fb7976aee61c 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -862,8 +862,6 @@ int sctp_auth_set_key(struct sctp_endpoint *ep,
 		list_add(&cur_key->key_list, sh_keys);
 
 	cur_key->key = key;
-	sctp_auth_key_hold(key);
-
 	return 0;
 nomem:
 	if (!replace)
-- 
cgit v1.2.3


From 5337b5b75cd9bd3624a6820e3c2a084d2480061c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 10 Nov 2014 17:54:25 -0800
Subject: ipv6: fix IPV6_PKTINFO with v4 mapped

Use IS_ENABLED(CONFIG_IPV6), to enable this code if IPv6 is
a module.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Fixes: c8e6ad0829a7 ("ipv6: honor IPV6_PKTINFO with v4 mapped addresses on sendmsg")
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c373a9ad4555..9daf2177dc00 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -195,7 +195,7 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
 	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 		if (!CMSG_OK(msg, cmsg))
 			return -EINVAL;
-#if defined(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 		if (allow_ipv6 &&
 		    cmsg->cmsg_level == SOL_IPV6 &&
 		    cmsg->cmsg_type == IPV6_PKTINFO) {
-- 
cgit v1.2.3


From 50656d9df63d69ce399c8be62d4473b039dac36a Mon Sep 17 00:00:00 2001
From: Calvin Owens <calvinowens@fb.com>
Date: Tue, 4 Nov 2014 16:37:40 -0800
Subject: ipvs: Keep skb->sk when allocating headroom on tunnel xmit

ip_vs_prepare_tunneled_skb() ignores ->sk when allocating a new
skb, either unconditionally setting ->sk to NULL or allowing
the uninitialized ->sk from a newly allocated skb to leak through
to the caller.

This patch properly copies ->sk and increments its reference count.

Signed-off-by: Calvin Owens <calvinowens@fb.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 437a3663ad03..bd90bf8107da 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -846,6 +846,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 		new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb)
 			goto error;
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
 		consume_skb(skb);
 		skb = new_skb;
 	}
-- 
cgit v1.2.3


From 2daf1b4d18e3add229d1a3b5c554331d99ac6c7e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 7 Nov 2014 18:48:33 +0100
Subject: netfilter: nft_compat: use current net namespace

Instead of init_net when using xtables over nftables compat.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 9d6d6f60a80f..b92f129beade 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -117,7 +117,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par,
 			   struct xt_target *target, void *info,
 			   union nft_entry *entry, u8 proto, bool inv)
 {
-	par->net	= &init_net;
+	par->net	= ctx->net;
 	par->table	= ctx->table->name;
 	switch (ctx->afi->family) {
 	case AF_INET:
@@ -324,7 +324,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
 			  struct xt_match *match, void *info,
 			  union nft_entry *entry, u8 proto, bool inv)
 {
-	par->net	= &init_net;
+	par->net	= ctx->net;
 	par->table	= ctx->table->name;
 	switch (ctx->afi->family) {
 	case AF_INET:
-- 
cgit v1.2.3


From c918687f5e3962375a19de6ded3c1be85ebdbcd6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Nov 2014 20:53:55 +0100
Subject: netfilter: nft_compat: relax chain type validation

Check for nat chain dependency only, which is the one that can
actually crash the kernel. Don't care if mangle, filter and security
specific match and targets are used out of their scope, they are
harmless.

This restores iptables-compat with mangle specific match/target when
used out of the OUTPUT chain, that are actually emulated through filter
chains, which broke when performing strict validation.

Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 32 ++------------------------------
 1 file changed, 2 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index b92f129beade..70dc96516305 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -21,45 +21,17 @@
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/nf_tables.h>
 
-static const struct {
-       const char	*name;
-       u8		type;
-} table_to_chaintype[] = {
-       { "filter",     NFT_CHAIN_T_DEFAULT },
-       { "raw",        NFT_CHAIN_T_DEFAULT },
-       { "security",   NFT_CHAIN_T_DEFAULT },
-       { "mangle",     NFT_CHAIN_T_ROUTE },
-       { "nat",        NFT_CHAIN_T_NAT },
-       { },
-};
-
-static int nft_compat_table_to_chaintype(const char *table)
-{
-	int i;
-
-	for (i = 0; table_to_chaintype[i].name != NULL; i++) {
-		if (strcmp(table_to_chaintype[i].name, table) == 0)
-			return table_to_chaintype[i].type;
-	}
-
-	return -1;
-}
-
 static int nft_compat_chain_validate_dependency(const char *tablename,
 						const struct nft_chain *chain)
 {
-	enum nft_chain_type type;
 	const struct nft_base_chain *basechain;
 
 	if (!tablename || !(chain->flags & NFT_BASE_CHAIN))
 		return 0;
 
-	type = nft_compat_table_to_chaintype(tablename);
-	if (type < 0)
-		return -EINVAL;
-
 	basechain = nft_base_chain(chain);
-	if (basechain->type->type != type)
+	if (strcmp(tablename, "nat") == 0 &&
+	    basechain->type->type != NFT_CHAIN_T_NAT)
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.2.3


From afefb6f928ed42d5db452ee9251ce6de62673c67 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Nov 2014 19:08:21 +0100
Subject: netfilter: nft_compat: use the match->table to validate dependencies

Instead of the match->name, which is of course not relevant.

Fixes: f3f5dde ("netfilter: nft_compat: validate chain type in match/target")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 70dc96516305..265e190f2218 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -346,7 +346,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 	union nft_entry e = {};
 	int ret;
 
-	ret = nft_compat_chain_validate_dependency(match->name, ctx->chain);
+	ret = nft_compat_chain_validate_dependency(match->table, ctx->chain);
 	if (ret < 0)
 		goto err;
 
@@ -420,7 +420,7 @@ static int nft_match_validate(const struct nft_ctx *ctx,
 		if (!(hook_mask & match->hooks))
 			return -EINVAL;
 
-		ret = nft_compat_chain_validate_dependency(match->name,
+		ret = nft_compat_chain_validate_dependency(match->table,
 							   ctx->chain);
 		if (ret < 0)
 			return ret;
-- 
cgit v1.2.3


From b326dd37b94e29bf6a15940f4fa66aa21a678ab1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Nov 2014 21:14:12 +0100
Subject: netfilter: nf_tables: restore synchronous object release from
 commit/abort

The existing xtables matches and targets, when used from nft_compat, may
sleep from the destroy path, ie. when removing rules. Since the objects
are released via call_rcu from softirq context, this results in lockdep
splats and possible lockups that may be hard to reproduce.

Patrick also indicated that delayed object release via call_rcu can
cause us problems in the ordering of event notifications when anonymous
sets are in place.

So, this patch restores the synchronous object release from the commit
and abort paths. This includes a call to synchronize_rcu() to make sure
that no packets are walking on the objects that are going to be
released. This is slowier though, but it's simple and it resolves the
aforementioned problems.

This is a partial revert of c7c32e7 ("netfilter: nf_tables: defer all
object release via rcu") that was introduced in 3.16 to speed up
interaction with userspace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 11ab4b078f3b..66e8425dbfe7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3484,13 +3484,8 @@ static void nft_chain_commit_update(struct nft_trans *trans)
 	}
 }
 
-/* Schedule objects for release via rcu to make sure no packets are accesing
- * removed rules.
- */
-static void nf_tables_commit_release_rcu(struct rcu_head *rt)
+static void nf_tables_commit_release(struct nft_trans *trans)
 {
-	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
-
 	switch (trans->msg_type) {
 	case NFT_MSG_DELTABLE:
 		nf_tables_table_destroy(&trans->ctx);
@@ -3612,10 +3607,11 @@ static int nf_tables_commit(struct sk_buff *skb)
 		}
 	}
 
+	synchronize_rcu();
+
 	list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
 		list_del(&trans->list);
-		trans->ctx.nla = NULL;
-		call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu);
+		nf_tables_commit_release(trans);
 	}
 
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
@@ -3623,13 +3619,8 @@ static int nf_tables_commit(struct sk_buff *skb)
 	return 0;
 }
 
-/* Schedule objects for release via rcu to make sure no packets are accesing
- * aborted rules.
- */
-static void nf_tables_abort_release_rcu(struct rcu_head *rt)
+static void nf_tables_abort_release(struct nft_trans *trans)
 {
-	struct nft_trans *trans = container_of(rt, struct nft_trans, rcu_head);
-
 	switch (trans->msg_type) {
 	case NFT_MSG_NEWTABLE:
 		nf_tables_table_destroy(&trans->ctx);
@@ -3725,11 +3716,12 @@ static int nf_tables_abort(struct sk_buff *skb)
 		}
 	}
 
+	synchronize_rcu();
+
 	list_for_each_entry_safe_reverse(trans, next,
 					 &net->nft.commit_list, list) {
 		list_del(&trans->list);
-		trans->ctx.nla = NULL;
-		call_rcu(&trans->rcu_head, nf_tables_abort_release_rcu);
+		nf_tables_abort_release(trans);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 6251edd932ce3faadbfe27b0a0fe79780e0972e9 Mon Sep 17 00:00:00 2001
From: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Date: Thu, 13 Nov 2014 04:24:10 +0900
Subject: netlink: Properly unbind in error conditions.

Even if netlink_kernel_cfg::unbind is implemented the unbind() method is
not called, because cfg->unbind is omitted in __netlink_kernel_create().
And fix wrong argument of test_bit() and off by one problem.

At this point, no unbind() method is implemented, so there is no real
issue.

Fixes: 4f520900522f ("netlink: have netlink per-protocol bind function return an error code.")
Signed-off-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Cc: Richard Guy Briggs <rgb@redhat.com>
Acked-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1de72de273e..0007b8180397 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1440,7 +1440,7 @@ static void netlink_unbind(int group, long unsigned int groups,
 		return;
 
 	for (undo = 0; undo < group; undo++)
-		if (test_bit(group, &groups))
+		if (test_bit(undo, &groups))
 			nlk->netlink_unbind(undo);
 }
 
@@ -1492,7 +1492,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
 			netlink_insert(sk, net, nladdr->nl_pid) :
 			netlink_autobind(sock);
 		if (err) {
-			netlink_unbind(nlk->ngroups - 1, groups, nlk);
+			netlink_unbind(nlk->ngroups, groups, nlk);
 			return err;
 		}
 	}
@@ -2509,6 +2509,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module,
 		nl_table[unit].module = module;
 		if (cfg) {
 			nl_table[unit].bind = cfg->bind;
+			nl_table[unit].unbind = cfg->unbind;
 			nl_table[unit].flags = cfg->flags;
 			if (cfg->compare)
 				nl_table[unit].compare = cfg->compare;
-- 
cgit v1.2.3


From b3ecba096729f521312d1863ad22530695527aed Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Thu, 13 Nov 2014 07:30:46 -0500
Subject: sunrpc: fix sleeping under rcu_read_lock in gss_stringify_acceptor

Bruce reported that he was seeing the following BUG pop:

    BUG: sleeping function called from invalid context at mm/slab.c:2846
    in_atomic(): 0, irqs_disabled(): 0, pid: 4539, name: mount.nfs
    2 locks held by mount.nfs/4539:
    #0:  (nfs_clid_init_mutex){+.+.+.}, at: [<ffffffffa01c0a9a>] nfs4_discover_server_trunking+0x4a/0x2f0 [nfsv4]
    #1:  (rcu_read_lock){......}, at: [<ffffffffa00e3185>] gss_stringify_acceptor+0x5/0xb0 [auth_rpcgss]
    Preemption disabled at:[<ffffffff81a4f082>] printk+0x4d/0x4f

    CPU: 3 PID: 4539 Comm: mount.nfs Not tainted 3.18.0-rc1-00013-g5b095e9 #3393
    Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
    ffff880021499390 ffff8800381476a8 ffffffff81a534cf 0000000000000001
    0000000000000000 ffff8800381476c8 ffffffff81097854 00000000000000d0
    0000000000000018 ffff880038147718 ffffffff8118e4f3 0000000020479f00
    Call Trace:
    [<ffffffff81a534cf>] dump_stack+0x4f/0x7c
    [<ffffffff81097854>] __might_sleep+0x114/0x180
    [<ffffffff8118e4f3>] __kmalloc+0x1a3/0x280
    [<ffffffffa00e31d8>] gss_stringify_acceptor+0x58/0xb0 [auth_rpcgss]
    [<ffffffffa00e3185>] ? gss_stringify_acceptor+0x5/0xb0 [auth_rpcgss]
    [<ffffffffa006b438>] rpcauth_stringify_acceptor+0x18/0x30 [sunrpc]
    [<ffffffffa01b0469>] nfs4_proc_setclientid+0x199/0x380 [nfsv4]
    [<ffffffffa01b04d0>] ? nfs4_proc_setclientid+0x200/0x380 [nfsv4]
    [<ffffffffa01bdf1a>] nfs40_discover_server_trunking+0xda/0x150 [nfsv4]
    [<ffffffffa01bde45>] ? nfs40_discover_server_trunking+0x5/0x150 [nfsv4]
    [<ffffffffa01c0acf>] nfs4_discover_server_trunking+0x7f/0x2f0 [nfsv4]
    [<ffffffffa01c8e24>] nfs4_init_client+0x104/0x2f0 [nfsv4]
    [<ffffffffa01539b4>] nfs_get_client+0x314/0x3f0 [nfs]
    [<ffffffffa0153780>] ? nfs_get_client+0xe0/0x3f0 [nfs]
    [<ffffffffa01c83aa>] nfs4_set_client+0x8a/0x110 [nfsv4]
    [<ffffffffa0069708>] ? __rpc_init_priority_wait_queue+0xa8/0xf0 [sunrpc]
    [<ffffffffa01c9b2f>] nfs4_create_server+0x12f/0x390 [nfsv4]
    [<ffffffffa01c1472>] nfs4_remote_mount+0x32/0x60 [nfsv4]
    [<ffffffff81196489>] mount_fs+0x39/0x1b0
    [<ffffffff81166145>] ? __alloc_percpu+0x15/0x20
    [<ffffffff811b276b>] vfs_kern_mount+0x6b/0x150
    [<ffffffffa01c1396>] nfs_do_root_mount+0x86/0xc0 [nfsv4]
    [<ffffffffa01c1784>] nfs4_try_mount+0x44/0xc0 [nfsv4]
    [<ffffffffa01549b7>] ? get_nfs_version+0x27/0x90 [nfs]
    [<ffffffffa0161a2d>] nfs_fs_mount+0x47d/0xd60 [nfs]
    [<ffffffff81a59c5e>] ? mutex_unlock+0xe/0x10
    [<ffffffffa01606a0>] ? nfs_remount+0x430/0x430 [nfs]
    [<ffffffffa01609c0>] ? nfs_clone_super+0x140/0x140 [nfs]
    [<ffffffff81196489>] mount_fs+0x39/0x1b0
    [<ffffffff81166145>] ? __alloc_percpu+0x15/0x20
    [<ffffffff811b276b>] vfs_kern_mount+0x6b/0x150
    [<ffffffff811b5830>] do_mount+0x210/0xbe0
    [<ffffffff811b54ca>] ? copy_mount_options+0x3a/0x160
    [<ffffffff811b651f>] SyS_mount+0x6f/0xb0
    [<ffffffff81a5c852>] system_call_fastpath+0x12/0x17

Sleeping under the rcu_read_lock is bad. This patch fixes it by dropping
the rcu_read_lock before doing the allocation and then reacquiring it
and redoing the dereference before doing the copy. If we find that the
string has somehow grown in the meantime, we'll reallocate and try again.

Cc: <stable@vger.kernel.org> # v3.17+
Reported-by: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 net/sunrpc/auth_gss/auth_gss.c | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index afb292cd797d..53ed8d3f8897 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1353,6 +1353,7 @@ gss_stringify_acceptor(struct rpc_cred *cred)
 	char *string = NULL;
 	struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base);
 	struct gss_cl_ctx *ctx;
+	unsigned int len;
 	struct xdr_netobj *acceptor;
 
 	rcu_read_lock();
@@ -1360,15 +1361,39 @@ gss_stringify_acceptor(struct rpc_cred *cred)
 	if (!ctx)
 		goto out;
 
-	acceptor = &ctx->gc_acceptor;
+	len = ctx->gc_acceptor.len;
+	rcu_read_unlock();
 
 	/* no point if there's no string */
-	if (!acceptor->len)
-		goto out;
-
-	string = kmalloc(acceptor->len + 1, GFP_KERNEL);
+	if (!len)
+		return NULL;
+realloc:
+	string = kmalloc(len + 1, GFP_KERNEL);
 	if (!string)
+		return NULL;
+
+	rcu_read_lock();
+	ctx = rcu_dereference(gss_cred->gc_ctx);
+
+	/* did the ctx disappear or was it replaced by one with no acceptor? */
+	if (!ctx || !ctx->gc_acceptor.len) {
+		kfree(string);
+		string = NULL;
 		goto out;
+	}
+
+	acceptor = &ctx->gc_acceptor;
+
+	/*
+	 * Did we find a new acceptor that's longer than the original? Allocate
+	 * a longer buffer and try again.
+	 */
+	if (len < acceptor->len) {
+		len = acceptor->len;
+		rcu_read_unlock();
+		kfree(string);
+		goto realloc;
+	}
 
 	memcpy(string, acceptor->data, acceptor->len);
 	string[acceptor->len] = '\0';
-- 
cgit v1.2.3


From aaef31703a0cf6a733e651885bfb49edc3ac6774 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@redhat.com>
Date: Thu, 23 Oct 2014 00:25:22 +0400
Subject: libceph: do not crash on large auth tickets

Large (greater than 32k, the value of PAGE_ALLOC_COSTLY_ORDER) auth
tickets will have their buffers vmalloc'ed, which leads to the
following crash in crypto:

[   28.685082] BUG: unable to handle kernel paging request at ffffeb04000032c0
[   28.686032] IP: [<ffffffff81392b42>] scatterwalk_pagedone+0x22/0x80
[   28.686032] PGD 0
[   28.688088] Oops: 0000 [#1] PREEMPT SMP
[   28.688088] Modules linked in:
[   28.688088] CPU: 0 PID: 878 Comm: kworker/0:2 Not tainted 3.17.0-vm+ #305
[   28.688088] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[   28.688088] Workqueue: ceph-msgr con_work
[   28.688088] task: ffff88011a7f9030 ti: ffff8800d903c000 task.ti: ffff8800d903c000
[   28.688088] RIP: 0010:[<ffffffff81392b42>]  [<ffffffff81392b42>] scatterwalk_pagedone+0x22/0x80
[   28.688088] RSP: 0018:ffff8800d903f688  EFLAGS: 00010286
[   28.688088] RAX: ffffeb04000032c0 RBX: ffff8800d903f718 RCX: ffffeb04000032c0
[   28.688088] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8800d903f750
[   28.688088] RBP: ffff8800d903f688 R08: 00000000000007de R09: ffff8800d903f880
[   28.688088] R10: 18df467c72d6257b R11: 0000000000000000 R12: 0000000000000010
[   28.688088] R13: ffff8800d903f750 R14: ffff8800d903f8a0 R15: 0000000000000000
[   28.688088] FS:  00007f50a41c7700(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000
[   28.688088] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   28.688088] CR2: ffffeb04000032c0 CR3: 00000000da3f3000 CR4: 00000000000006b0
[   28.688088] Stack:
[   28.688088]  ffff8800d903f698 ffffffff81392ca8 ffff8800d903f6e8 ffffffff81395d32
[   28.688088]  ffff8800dac96000 ffff880000000000 ffff8800d903f980 ffff880119b7e020
[   28.688088]  ffff880119b7e010 0000000000000000 0000000000000010 0000000000000010
[   28.688088] Call Trace:
[   28.688088]  [<ffffffff81392ca8>] scatterwalk_done+0x38/0x40
[   28.688088]  [<ffffffff81392ca8>] scatterwalk_done+0x38/0x40
[   28.688088]  [<ffffffff81395d32>] blkcipher_walk_done+0x182/0x220
[   28.688088]  [<ffffffff813990bf>] crypto_cbc_encrypt+0x15f/0x180
[   28.688088]  [<ffffffff81399780>] ? crypto_aes_set_key+0x30/0x30
[   28.688088]  [<ffffffff8156c40c>] ceph_aes_encrypt2+0x29c/0x2e0
[   28.688088]  [<ffffffff8156d2a3>] ceph_encrypt2+0x93/0xb0
[   28.688088]  [<ffffffff8156d7da>] ceph_x_encrypt+0x4a/0x60
[   28.688088]  [<ffffffff8155b39d>] ? ceph_buffer_new+0x5d/0xf0
[   28.688088]  [<ffffffff8156e837>] ceph_x_build_authorizer.isra.6+0x297/0x360
[   28.688088]  [<ffffffff8112089b>] ? kmem_cache_alloc_trace+0x11b/0x1c0
[   28.688088]  [<ffffffff8156b496>] ? ceph_auth_create_authorizer+0x36/0x80
[   28.688088]  [<ffffffff8156ed83>] ceph_x_create_authorizer+0x63/0xd0
[   28.688088]  [<ffffffff8156b4b4>] ceph_auth_create_authorizer+0x54/0x80
[   28.688088]  [<ffffffff8155f7c0>] get_authorizer+0x80/0xd0
[   28.688088]  [<ffffffff81555a8b>] prepare_write_connect+0x18b/0x2b0
[   28.688088]  [<ffffffff81559289>] try_read+0x1e59/0x1f10

This is because we set up crypto scatterlists as if all buffers were
kmalloc'ed.  Fix it.

Cc: stable@vger.kernel.org
Signed-off-by: Ilya Dryomov <idryomov@redhat.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 net/ceph/crypto.c | 169 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 132 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 62fc5e7a9acf..790fe89d90c0 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -90,11 +90,82 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
 
 static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
 
+/*
+ * Should be used for buffers allocated with ceph_kvmalloc().
+ * Currently these are encrypt out-buffer (ceph_buffer) and decrypt
+ * in-buffer (msg front).
+ *
+ * Dispose of @sgt with teardown_sgtable().
+ *
+ * @prealloc_sg is to avoid memory allocation inside sg_alloc_table()
+ * in cases where a single sg is sufficient.  No attempt to reduce the
+ * number of sgs by squeezing physically contiguous pages together is
+ * made though, for simplicity.
+ */
+static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg,
+			 const void *buf, unsigned int buf_len)
+{
+	struct scatterlist *sg;
+	const bool is_vmalloc = is_vmalloc_addr(buf);
+	unsigned int off = offset_in_page(buf);
+	unsigned int chunk_cnt = 1;
+	unsigned int chunk_len = PAGE_ALIGN(off + buf_len);
+	int i;
+	int ret;
+
+	if (buf_len == 0) {
+		memset(sgt, 0, sizeof(*sgt));
+		return -EINVAL;
+	}
+
+	if (is_vmalloc) {
+		chunk_cnt = chunk_len >> PAGE_SHIFT;
+		chunk_len = PAGE_SIZE;
+	}
+
+	if (chunk_cnt > 1) {
+		ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS);
+		if (ret)
+			return ret;
+	} else {
+		WARN_ON(chunk_cnt != 1);
+		sg_init_table(prealloc_sg, 1);
+		sgt->sgl = prealloc_sg;
+		sgt->nents = sgt->orig_nents = 1;
+	}
+
+	for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) {
+		struct page *page;
+		unsigned int len = min(chunk_len - off, buf_len);
+
+		if (is_vmalloc)
+			page = vmalloc_to_page(buf);
+		else
+			page = virt_to_page(buf);
+
+		sg_set_page(sg, page, len, off);
+
+		off = 0;
+		buf += len;
+		buf_len -= len;
+	}
+	WARN_ON(buf_len != 0);
+
+	return 0;
+}
+
+static void teardown_sgtable(struct sg_table *sgt)
+{
+	if (sgt->orig_nents > 1)
+		sg_free_table(sgt);
+}
+
 static int ceph_aes_encrypt(const void *key, int key_len,
 			    void *dst, size_t *dst_len,
 			    const void *src, size_t src_len)
 {
-	struct scatterlist sg_in[2], sg_out[1];
+	struct scatterlist sg_in[2], prealloc_sg;
+	struct sg_table sg_out;
 	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
 	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
 	int ret;
@@ -110,16 +181,18 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 
 	*dst_len = src_len + zero_padding;
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
 	sg_init_table(sg_in, 2);
 	sg_set_buf(&sg_in[0], src, src_len);
 	sg_set_buf(&sg_in[1], pad, zero_padding);
-	sg_init_table(sg_out, 1);
-	sg_set_buf(sg_out, dst, *dst_len);
+	ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
+	if (ret)
+		goto out_tfm;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
 	iv = crypto_blkcipher_crt(tfm)->iv;
 	ivsize = crypto_blkcipher_ivsize(tfm);
-
 	memcpy(iv, aes_iv, ivsize);
+
 	/*
 	print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
 		       key, key_len, 1);
@@ -128,16 +201,22 @@ static int ceph_aes_encrypt(const void *key, int key_len,
 	print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
 			pad, zero_padding, 1);
 	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+	ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
 				     src_len + zero_padding);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0)
+	if (ret < 0) {
 		pr_err("ceph_aes_crypt failed %d\n", ret);
+		goto out_sg;
+	}
 	/*
 	print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
 		       dst, *dst_len, 1);
 	*/
-	return 0;
+
+out_sg:
+	teardown_sgtable(&sg_out);
+out_tfm:
+	crypto_free_blkcipher(tfm);
+	return ret;
 }
 
 static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
@@ -145,7 +224,8 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 			     const void *src1, size_t src1_len,
 			     const void *src2, size_t src2_len)
 {
-	struct scatterlist sg_in[3], sg_out[1];
+	struct scatterlist sg_in[3], prealloc_sg;
+	struct sg_table sg_out;
 	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
 	struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
 	int ret;
@@ -161,17 +241,19 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 
 	*dst_len = src1_len + src2_len + zero_padding;
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
 	sg_init_table(sg_in, 3);
 	sg_set_buf(&sg_in[0], src1, src1_len);
 	sg_set_buf(&sg_in[1], src2, src2_len);
 	sg_set_buf(&sg_in[2], pad, zero_padding);
-	sg_init_table(sg_out, 1);
-	sg_set_buf(sg_out, dst, *dst_len);
+	ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len);
+	if (ret)
+		goto out_tfm;
+
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
 	iv = crypto_blkcipher_crt(tfm)->iv;
 	ivsize = crypto_blkcipher_ivsize(tfm);
-
 	memcpy(iv, aes_iv, ivsize);
+
 	/*
 	print_hex_dump(KERN_ERR, "enc  key: ", DUMP_PREFIX_NONE, 16, 1,
 		       key, key_len, 1);
@@ -182,23 +264,30 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
 	print_hex_dump(KERN_ERR, "enc  pad: ", DUMP_PREFIX_NONE, 16, 1,
 			pad, zero_padding, 1);
 	*/
-	ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+	ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
 				     src1_len + src2_len + zero_padding);
-	crypto_free_blkcipher(tfm);
-	if (ret < 0)
+	if (ret < 0) {
 		pr_err("ceph_aes_crypt2 failed %d\n", ret);
+		goto out_sg;
+	}
 	/*
 	print_hex_dump(KERN_ERR, "enc  out: ", DUMP_PREFIX_NONE, 16, 1,
 		       dst, *dst_len, 1);
 	*/
-	return 0;
+
+out_sg:
+	teardown_sgtable(&sg_out);
+out_tfm:
+	crypto_free_blkcipher(tfm);
+	return ret;
 }
 
 static int ceph_aes_decrypt(const void *key, int key_len,
 			    void *dst, size_t *dst_len,
 			    const void *src, size_t src_len)
 {
-	struct scatterlist sg_in[1], sg_out[2];
+	struct sg_table sg_in;
+	struct scatterlist sg_out[2], prealloc_sg;
 	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
 	struct blkcipher_desc desc = { .tfm = tfm };
 	char pad[16];
@@ -210,16 +299,16 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 	if (IS_ERR(tfm))
 		return PTR_ERR(tfm);
 
-	crypto_blkcipher_setkey((void *)tfm, key, key_len);
-	sg_init_table(sg_in, 1);
 	sg_init_table(sg_out, 2);
-	sg_set_buf(sg_in, src, src_len);
 	sg_set_buf(&sg_out[0], dst, *dst_len);
 	sg_set_buf(&sg_out[1], pad, sizeof(pad));
+	ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
+	if (ret)
+		goto out_tfm;
 
+	crypto_blkcipher_setkey((void *)tfm, key, key_len);
 	iv = crypto_blkcipher_crt(tfm)->iv;
 	ivsize = crypto_blkcipher_ivsize(tfm);
-
 	memcpy(iv, aes_iv, ivsize);
 
 	/*
@@ -228,12 +317,10 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 	print_hex_dump(KERN_ERR, "dec  in: ", DUMP_PREFIX_NONE, 16, 1,
 		       src, src_len, 1);
 	*/
-
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-	crypto_free_blkcipher(tfm);
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
 	if (ret < 0) {
 		pr_err("ceph_aes_decrypt failed %d\n", ret);
-		return ret;
+		goto out_sg;
 	}
 
 	if (src_len <= *dst_len)
@@ -251,7 +338,12 @@ static int ceph_aes_decrypt(const void *key, int key_len,
 	print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
 		       dst, *dst_len, 1);
 	*/
-	return 0;
+
+out_sg:
+	teardown_sgtable(&sg_in);
+out_tfm:
+	crypto_free_blkcipher(tfm);
+	return ret;
 }
 
 static int ceph_aes_decrypt2(const void *key, int key_len,
@@ -259,7 +351,8 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 			     void *dst2, size_t *dst2_len,
 			     const void *src, size_t src_len)
 {
-	struct scatterlist sg_in[1], sg_out[3];
+	struct sg_table sg_in;
+	struct scatterlist sg_out[3], prealloc_sg;
 	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
 	struct blkcipher_desc desc = { .tfm = tfm };
 	char pad[16];
@@ -271,17 +364,17 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 	if (IS_ERR(tfm))
 		return PTR_ERR(tfm);
 
-	sg_init_table(sg_in, 1);
-	sg_set_buf(sg_in, src, src_len);
 	sg_init_table(sg_out, 3);
 	sg_set_buf(&sg_out[0], dst1, *dst1_len);
 	sg_set_buf(&sg_out[1], dst2, *dst2_len);
 	sg_set_buf(&sg_out[2], pad, sizeof(pad));
+	ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len);
+	if (ret)
+		goto out_tfm;
 
 	crypto_blkcipher_setkey((void *)tfm, key, key_len);
 	iv = crypto_blkcipher_crt(tfm)->iv;
 	ivsize = crypto_blkcipher_ivsize(tfm);
-
 	memcpy(iv, aes_iv, ivsize);
 
 	/*
@@ -290,12 +383,10 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 	print_hex_dump(KERN_ERR, "dec   in: ", DUMP_PREFIX_NONE, 16, 1,
 		       src, src_len, 1);
 	*/
-
-	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
-	crypto_free_blkcipher(tfm);
+	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
 	if (ret < 0) {
 		pr_err("ceph_aes_decrypt failed %d\n", ret);
-		return ret;
+		goto out_sg;
 	}
 
 	if (src_len <= *dst1_len)
@@ -325,7 +416,11 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
 		       dst2, *dst2_len, 1);
 	*/
 
-	return 0;
+out_sg:
+	teardown_sgtable(&sg_in);
+out_tfm:
+	crypto_free_blkcipher(tfm);
+	return ret;
 }
 
 
-- 
cgit v1.2.3


From a390de0208e7f2f8fdb2fbf970240e4f7b308037 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@redhat.com>
Date: Tue, 4 Nov 2014 18:32:14 +0300
Subject: libceph: unlink from o_linger_requests when clearing r_osd

Requests have to be unlinked from both osd->o_requests (normal
requests) and osd->o_linger_requests (linger requests) lists when
clearing req->r_osd.  Otherwise __unregister_linger_request() gets
confused and we trip over a !list_empty(&osd->o_linger_requests)
assert in __remove_osd().

MON=1 OSD=1:

    # cat remove-osd.sh
    #!/bin/bash
    rbd create --size 1 test
    DEV=$(rbd map test)
    ceph osd out 0
    sleep 3
    rbd map dne/dne # obtain a new osdmap as a side effect
    rbd unmap $DEV & # will block
    sleep 3
    ceph osd in 0

Signed-off-by: Ilya Dryomov <idryomov@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 net/ceph/osd_client.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f3fc54eac09d..75abaa87abac 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1395,6 +1395,7 @@ static int __map_request(struct ceph_osd_client *osdc,
 	if (req->r_osd) {
 		__cancel_request(req);
 		list_del_init(&req->r_osd_item);
+		list_del_init(&req->r_linger_osd_item);
 		req->r_osd = NULL;
 	}
 
-- 
cgit v1.2.3


From ba9d114ec5578e6e99a4dfa37ff8ae688040fd64 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@redhat.com>
Date: Wed, 5 Nov 2014 15:45:58 +0300
Subject: libceph: clear r_req_lru_item in __unregister_linger_request()

kick_requests() can put linger requests on the notarget list.  This
means we need to clear the much-overloaded req->r_req_lru_item in
__unregister_linger_request() as well, or we get an assertion failure
in ceph_osdc_release_request() - !list_empty(&req->r_req_lru_item).

AFAICT the assumption was that registered linger requests cannot be on
any of req->r_req_lru_item lists, but that's clearly not the case.

Signed-off-by: Ilya Dryomov <idryomov@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 net/ceph/osd_client.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 75abaa87abac..decc3b74e65f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1254,6 +1254,8 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
 		if (list_empty(&req->r_osd_item))
 			req->r_osd = NULL;
 	}
+
+	list_del_init(&req->r_req_lru_item); /* can be on notarget */
 	ceph_osdc_put_request(req);
 }
 
-- 
cgit v1.2.3


From cc9f1f518cec079289d11d732efa490306b1ddad Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@redhat.com>
Date: Wed, 5 Nov 2014 19:33:44 +0300
Subject: libceph: change from BUG to WARN for __remove_osd() asserts

No reason to use BUG_ON for osd request list assertions.

Signed-off-by: Ilya Dryomov <idryomov@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 net/ceph/osd_client.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index decc3b74e65f..6f164289bde8 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1007,8 +1007,8 @@ static void put_osd(struct ceph_osd *osd)
 static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
 	dout("__remove_osd %p\n", osd);
-	BUG_ON(!list_empty(&osd->o_requests));
-	BUG_ON(!list_empty(&osd->o_linger_requests));
+	WARN_ON(!list_empty(&osd->o_requests));
+	WARN_ON(!list_empty(&osd->o_linger_requests));
 
 	rb_erase(&osd->o_node, &osdc->osds);
 	list_del_init(&osd->o_osd_lru);
-- 
cgit v1.2.3


From 5195c14c8b27cc0b18220ddbf0e5ad3328a04187 Mon Sep 17 00:00:00 2001
From: bill bonaparte <programme110@gmail.com>
Date: Thu, 6 Nov 2014 14:36:48 +0100
Subject: netfilter: conntrack: fix race in __nf_conntrack_confirm against
 get_next_corpse

After removal of the central spinlock nf_conntrack_lock, in
commit 93bb0ceb75be2 ("netfilter: conntrack: remove central
spinlock nf_conntrack_lock"), it is possible to race against
get_next_corpse().

The race is against the get_next_corpse() cleanup on
the "unconfirmed" list (a per-cpu list with seperate locking),
which set the DYING bit.

Fix this race, in __nf_conntrack_confirm(), by removing the CT
from unconfirmed list before checking the DYING bit.  In case
race occured, re-add the CT to the dying list.

While at this, fix coding style of the comment that has been
updated.

Fixes: 93bb0ceb75be2 ("netfilter: conntrack: remove central spinlock nf_conntrack_lock")
Reported-by: bill bonaparte <programme110@gmail.com>
Signed-off-by: bill bonaparte <programme110@gmail.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5016a6929085..2c699757bccf 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -611,12 +611,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-	/* We have to check the DYING flag inside the lock to prevent
-	   a race against nf_ct_get_next_corpse() possibly called from
-	   user context, else we insert an already 'dead' hash, blocking
-	   further use of that particular connection -JM */
+
+	/* We have to check the DYING flag after unlink to prevent
+	 * a race against nf_ct_get_next_corpse() possibly called from
+	 * user context, else we insert an already 'dead' hash, blocking
+	 * further use of that particular connection -JM.
+	 */
+	nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
 	if (unlikely(nf_ct_is_dying(ct))) {
+		nf_ct_add_to_dying_list(ct);
 		nf_conntrack_double_unlock(hash, reply_hash);
 		local_bh_enable();
 		return NF_ACCEPT;
@@ -636,8 +640,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
 			goto out;
 
-	nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
 	/* Timer relative to confirmation time, not original
 	   setting time, otherwise we'd get timer wrap in
 	   weird delay cases. */
-- 
cgit v1.2.3


From ab64f16ff2e83371927c57a0380fd3c0fee5c1c1 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 11 Nov 2014 13:40:49 -0800
Subject: openvswitch: Fix memory leak.

Need to free memory in case of sample action error.

Introduced by commit 651887b0c22cffcfce7eb9c ("openvswitch: Sample
action without side effects").

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 006886dbee36..00e447a17f64 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -722,8 +722,6 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 
 		case OVS_ACTION_ATTR_SAMPLE:
 			err = sample(dp, skb, key, a);
-			if (unlikely(err)) /* skb already freed. */
-				return err;
 			break;
 		}
 
-- 
cgit v1.2.3


From 856447d0209c2214d806b30bd3b0d873db5998bd Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@nicira.com>
Date: Tue, 11 Nov 2014 14:32:20 -0800
Subject: openvswitch: Fix checksum calculation when modifying ICMPv6 packets.

The checksum of ICMPv6 packets uses the IP pseudoheader as part of
the calculation, unlike ICMP in IPv4. This was not implemented,
which means that modifying the IP addresses of an ICMPv6 packet
would cause the checksum to no longer be correct as the psuedoheader
did not match.
Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action").

Reported-by: Neal Shrader <icosahedral@gmail.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/actions.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 00e447a17f64..8c4229b11c34 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -246,11 +246,11 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 {
 	int transport_len = skb->len - skb_transport_offset(skb);
 
-	if (l4_proto == IPPROTO_TCP) {
+	if (l4_proto == NEXTHDR_TCP) {
 		if (likely(transport_len >= sizeof(struct tcphdr)))
 			inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
 						  addr, new_addr, 1);
-	} else if (l4_proto == IPPROTO_UDP) {
+	} else if (l4_proto == NEXTHDR_UDP) {
 		if (likely(transport_len >= sizeof(struct udphdr))) {
 			struct udphdr *uh = udp_hdr(skb);
 
@@ -261,6 +261,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
 					uh->check = CSUM_MANGLED_0;
 			}
 		}
+	} else if (l4_proto == NEXTHDR_ICMP) {
+		if (likely(transport_len >= sizeof(struct icmp6hdr)))
+			inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
+						  skb, addr, new_addr, 1);
 	}
 }
 
-- 
cgit v1.2.3


From 19e7a3df7261c9b7ebced8163c383712d5b6ac6b Mon Sep 17 00:00:00 2001
From: Daniele Di Proietto <ddiproietto@vmware.com>
Date: Tue, 11 Nov 2014 14:51:22 -0800
Subject: openvswitch: Fix NDP flow mask validation

match_validate() enforce that a mask matching on NDP attributes has also an
exact match on ICMPv6 type.
The ICMPv6 type, which is 8-bit wide, is stored in the 'tp.src' field of
'struct sw_flow_key', which is 16-bit wide.
Therefore, an exact match on ICMPv6 type should only check the first 8 bits.

This commit fixes a bug that prevented flows with an exact match on NDP field
from being installed
Introduced by commit 03f0d916aa03 ("openvswitch: Mega flow implementation").

Signed-off-by: Daniele Di Proietto <ddiproietto@vmware.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/flow_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 939bcb32100f..dda040e693a3 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -145,7 +145,7 @@ static bool match_validate(const struct sw_flow_match *match,
 	if (match->key->eth.type == htons(ETH_P_ARP)
 			|| match->key->eth.type == htons(ETH_P_RARP)) {
 		key_expected |= 1 << OVS_KEY_ATTR_ARP;
-		if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+		if (match->mask && (match->mask->key.tp.src == htons(0xff)))
 			mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
 	}
 
-- 
cgit v1.2.3


From 8ec609d8b561468691b60347ff594bd443ea58c0 Mon Sep 17 00:00:00 2001
From: Pravin B Shelar <pshelar@nicira.com>
Date: Tue, 11 Nov 2014 15:55:16 -0800
Subject: openvswitch: Convert dp rcu read operation to locked operations

dp read operations depends on ovs_dp_cmd_fill_info(). This API
needs to looup vport to find dp name, but vport lookup can
fail. Therefore to keep vport reference alive we need to
take ovs lock.

Introduced by commit 6093ae9abac1 ("openvswitch: Minimize
dp and vport critical sections").

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Acked-by: Andy Zhou <azhou@nicira.com>
---
 net/openvswitch/datapath.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e6d7255183eb..f9e556b56086 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1265,7 +1265,7 @@ static size_t ovs_dp_cmd_msg_size(void)
 	return msgsize;
 }
 
-/* Called with ovs_mutex or RCU read lock. */
+/* Called with ovs_mutex. */
 static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb,
 				u32 portid, u32 seq, u32 flags, u8 cmd)
 {
@@ -1555,7 +1555,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	if (!reply)
 		return -ENOMEM;
 
-	rcu_read_lock();
+	ovs_lock();
 	dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs);
 	if (IS_ERR(dp)) {
 		err = PTR_ERR(dp);
@@ -1564,12 +1564,12 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid,
 				   info->snd_seq, 0, OVS_DP_CMD_NEW);
 	BUG_ON(err < 0);
-	rcu_read_unlock();
+	ovs_unlock();
 
 	return genlmsg_reply(reply, info);
 
 err_unlock_free:
-	rcu_read_unlock();
+	ovs_unlock();
 	kfree_skb(reply);
 	return err;
 }
@@ -1581,8 +1581,8 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	int skip = cb->args[0];
 	int i = 0;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(dp, &ovs_net->dps, list_node) {
+	ovs_lock();
+	list_for_each_entry(dp, &ovs_net->dps, list_node) {
 		if (i >= skip &&
 		    ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid,
 					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
@@ -1590,7 +1590,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			break;
 		i++;
 	}
-	rcu_read_unlock();
+	ovs_unlock();
 
 	cb->args[0] = i;
 
-- 
cgit v1.2.3


From fecaef85f7188ad1822210e2c7a7625c9a32a8e4 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jrajahalme@nicira.com>
Date: Tue, 11 Nov 2014 14:36:30 -0800
Subject: openvswitch: Validate IPv6 flow key and mask values.

Reject flow label key and mask values with invalid bits set.
Introduced by commit 3fdbd1ce11e5 ("openvswitch: add ipv6 'set'
action").

Signed-off-by: Jarno Rajahalme <jrajahalme@nicira.com>
Acked-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
---
 net/openvswitch/flow_netlink.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dda040e693a3..fa4ec2e4a78b 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -689,6 +689,13 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 				ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX);
 			return -EINVAL;
 		}
+
+		if (ipv6_key->ipv6_label & htonl(0xFFF00000)) {
+			OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n",
+				  ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
+			return -EINVAL;
+		}
+
 		SW_FLOW_KEY_PUT(match, ipv6.label,
 				ipv6_key->ipv6_label, is_mask);
 		SW_FLOW_KEY_PUT(match, ip.proto,
-- 
cgit v1.2.3


From 49dd18ba4615eaa72f15c9087dea1c2ab4744cf5 Mon Sep 17 00:00:00 2001
From: Panu Matilainen <pmatilai@redhat.com>
Date: Fri, 14 Nov 2014 13:14:32 +0200
Subject: ipv4: Fix incorrect error code when adding an unreachable route

Trying to add an unreachable route incorrectly returns -ESRCH if
if custom FIB rules are present:

[root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4
RTNETLINK answers: Network is unreachable
[root@localhost ~]# ip rule add to 55.66.77.88 table 200
[root@localhost ~]# ip route add 74.125.31.199 dev eth0 via 1.2.3.4
RTNETLINK answers: No such process
[root@localhost ~]#

Commit 83886b6b636173b206f475929e58fac75c6f2446 ("[NET]: Change "not found"
return value for rule lookup") changed fib_rules_lookup()
to use -ESRCH as a "not found" code internally, but for user space it
should be translated into -ENETUNREACH. Handle the translation centrally in
ipv4-specific fib_lookup(), leaving the DECnet case alone.

On a related note, commit b7a71b51ee37d919e4098cd961d59a883fd272d8
("ipv4: removed redundant conditional") removed a similar translation from
ip_route_input_slow() prematurely AIUI.

Fixes: b7a71b51ee37 ("ipv4: removed redundant conditional")
Signed-off-by: Panu Matilainen <pmatilai@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_rules.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f2e15738534d..8f7bd56955b0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -62,6 +62,10 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
 	else
 		res->tclassid = 0;
 #endif
+
+	if (err == -ESRCH)
+		err = -ENETUNREACH;
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(__fib_lookup);
-- 
cgit v1.2.3


From 52cff74eef5dd7bdab759300e7d1ca36eba18254 Mon Sep 17 00:00:00 2001
From: Anish Bhatt <anish@chelsio.com>
Date: Fri, 14 Nov 2014 16:38:31 -0800
Subject: dcbnl : Disable software interrupts before taking dcb_lock

Solves possible lockup issues that can be seen from firmware DCB agents calling
into the DCB app api.

DCB firmware event queues can be tied in with NAPI so that dcb events are
generated in softIRQ context. This can results in calls to dcb_*app()
functions which try to take the dcb_lock.

If the the event triggers while we also have the dcb_lock because lldpad or
some other agent happened to be issuing a  get/set command we could see a cpu
lockup.

This code was not originally written with firmware agents in mind, hence
grabbing dcb_lock from softIRQ context was not considered.

Signed-off-by: Anish Bhatt <anish@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dcb/dcbnl.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index ca11d283bbeb..93ea80196f0e 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1080,13 +1080,13 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 	if (!app)
 		return -EMSGSIZE;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	list_for_each_entry(itr, &dcb_app_list, list) {
 		if (itr->ifindex == netdev->ifindex) {
 			err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app),
 					 &itr->app);
 			if (err) {
-				spin_unlock(&dcb_lock);
+				spin_unlock_bh(&dcb_lock);
 				return -EMSGSIZE;
 			}
 		}
@@ -1097,7 +1097,7 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
 	else
 		dcbx = -EOPNOTSUPP;
 
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	nla_nest_end(skb, app);
 
 	/* get peer info if available */
@@ -1234,7 +1234,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
 	}
 
 	/* local app */
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE);
 	if (!app)
 		goto dcb_unlock;
@@ -1271,7 +1271,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
 	else
 		dcbx = -EOPNOTSUPP;
 
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 
 	/* features flags */
 	if (ops->getfeatcfg) {
@@ -1326,7 +1326,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
 	return 0;
 
 dcb_unlock:
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 nla_put_failure:
 	return err;
 }
@@ -1762,10 +1762,10 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
 	struct dcb_app_type *itr;
 	u8 prio = 0;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
 		prio = itr->app.priority;
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 
 	return prio;
 }
@@ -1789,7 +1789,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
 	if (dev->dcbnl_ops->getdcbx)
 		event.dcbx = dev->dcbnl_ops->getdcbx(dev);
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	/* Search for existing match and replace */
 	if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) {
 		if (new->priority)
@@ -1804,7 +1804,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
 	if (new->priority)
 		err = dcb_app_add(new, dev->ifindex);
 out:
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	if (!err)
 		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return err;
@@ -1823,10 +1823,10 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
 	struct dcb_app_type *itr;
 	u8 prio = 0;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
 		prio |= 1 << itr->app.priority;
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 
 	return prio;
 }
@@ -1850,7 +1850,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
 	if (dev->dcbnl_ops->getdcbx)
 		event.dcbx = dev->dcbnl_ops->getdcbx(dev);
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	/* Search for existing match and abort if found */
 	if (dcb_app_lookup(new, dev->ifindex, new->priority)) {
 		err = -EEXIST;
@@ -1859,7 +1859,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
 
 	err = dcb_app_add(new, dev->ifindex);
 out:
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	if (!err)
 		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return err;
@@ -1882,7 +1882,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
 	if (dev->dcbnl_ops->getdcbx)
 		event.dcbx = dev->dcbnl_ops->getdcbx(dev);
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	/* Search for existing match and remove it. */
 	if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) {
 		list_del(&itr->list);
@@ -1890,7 +1890,7 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
 		err = 0;
 	}
 
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 	if (!err)
 		call_dcbevent_notifiers(DCB_APP_EVENT, &event);
 	return err;
@@ -1902,12 +1902,12 @@ static void dcb_flushapp(void)
 	struct dcb_app_type *app;
 	struct dcb_app_type *tmp;
 
-	spin_lock(&dcb_lock);
+	spin_lock_bh(&dcb_lock);
 	list_for_each_entry_safe(app, tmp, &dcb_app_list, list) {
 		list_del(&app->list);
 		kfree(app);
 	}
-	spin_unlock(&dcb_lock);
+	spin_unlock_bh(&dcb_lock);
 }
 
 static int __init dcbnl_init(void)
-- 
cgit v1.2.3


From feb91a02ccb09661507f170b2a444aec94f307f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 5 Nov 2014 20:27:38 +0100
Subject: ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs

It has been reported that generating an MLD listener report on
devices with large MTUs (e.g. 9000) and a high number of IPv6
addresses can trigger a skb_over_panic():

skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20
head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0
dev:port1
 ------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:100!
invalid opcode: 0000 [#1] SMP
Modules linked in: ixgbe(O)
CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4
[...]
Call Trace:
 <IRQ>
 [<ffffffff80578226>] ? skb_put+0x3a/0x3b
 [<ffffffff80612a5d>] ? add_grhead+0x45/0x8e
 [<ffffffff80612e3a>] ? add_grec+0x394/0x3d4
 [<ffffffff80613222>] ? mld_ifc_timer_expire+0x195/0x20d
 [<ffffffff8061308d>] ? mld_dad_timer_expire+0x45/0x45
 [<ffffffff80255b5d>] ? call_timer_fn.isra.29+0x12/0x68
 [<ffffffff80255d16>] ? run_timer_softirq+0x163/0x182
 [<ffffffff80250e6f>] ? __do_softirq+0xe0/0x21d
 [<ffffffff8025112b>] ? irq_exit+0x4e/0xd3
 [<ffffffff802214bb>] ? smp_apic_timer_interrupt+0x3b/0x46
 [<ffffffff8063f10a>] ? apic_timer_interrupt+0x6a/0x70

mld_newpack() skb allocations are usually requested with dev->mtu
in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations")
we have changed the limit in order to be less likely to fail.

However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb)
macros, which determine if we may end up doing an skb_put() for
adding another record. To avoid possible fragmentation, we check
the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong
assumption as the actual max allocation size can be much smaller.

The IGMP case doesn't have this issue as commit 57e1ab6eaddc
("igmp: refine skb allocations") stores the allocation size in
the cb[].

Set a reserved_tailroom to make it fit into the MTU and use
skb_availroom() helper instead. This also allows to get rid of
igmp_skb_size().

Reported-by: Wei Liu <lw1a2.jing@gmail.com>
Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: David L Stevens <david.stevens@oracle.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c  | 11 +++++------
 net/ipv6/mcast.c |  9 +++++----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index fb70e3ecc3e4..bb15d0e03d4f 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -318,9 +318,7 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
 	return scount;
 }
 
-#define igmp_skb_size(skb) (*(unsigned int *)((skb)->cb))
-
-static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
 {
 	struct sk_buff *skb;
 	struct rtable *rt;
@@ -330,6 +328,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	struct flowi4 fl4;
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
+	unsigned int size = mtu;
 
 	while (1) {
 		skb = alloc_skb(size + hlen + tlen,
@@ -341,7 +340,6 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 			return NULL;
 	}
 	skb->priority = TC_PRIO_CONTROL;
-	igmp_skb_size(skb) = size;
 
 	rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
 				   0, 0,
@@ -354,6 +352,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
 	skb_dst_set(skb, &rt->dst);
 	skb->dev = dev;
 
+	skb->reserved_tailroom = skb_end_offset(skb) -
+				 min(mtu, skb_end_offset(skb));
 	skb_reserve(skb, hlen);
 
 	skb_reset_network_header(skb);
@@ -423,8 +423,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
 	return skb;
 }
 
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? igmp_skb_size(skb) - (skb)->len : \
-	skb_tailroom(skb)) : 0)
+#define AVAILABLE(skb)	((skb) ? skb_availroom(skb) : 0)
 
 static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 	int type, int gdeleted, int sdeleted)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 9648de2b6745..ed2c4e400b46 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1550,7 +1550,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
 	hdr->daddr = *daddr;
 }
 
-static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
+static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
 {
 	struct net_device *dev = idev->dev;
 	struct net *net = dev_net(dev);
@@ -1561,13 +1561,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
 	const struct in6_addr *saddr;
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
+	unsigned int size = mtu + hlen + tlen;
 	int err;
 	u8 ra[8] = { IPPROTO_ICMPV6, 0,
 		     IPV6_TLV_ROUTERALERT, 2, 0, 0,
 		     IPV6_TLV_PADN, 0 };
 
 	/* we assume size > sizeof(ra) here */
-	size += hlen + tlen;
 	/* limit our allocations to order-0 page */
 	size = min_t(int, size, SKB_MAX_ORDER(0, 0));
 	skb = sock_alloc_send_skb(sk, size, 1, &err);
@@ -1576,6 +1576,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, int size)
 		return NULL;
 
 	skb->priority = TC_PRIO_CONTROL;
+	skb->reserved_tailroom = skb_end_offset(skb) -
+				 min(mtu, skb_end_offset(skb));
 	skb_reserve(skb, hlen);
 
 	if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
@@ -1690,8 +1692,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 	return skb;
 }
 
-#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
-	skb_tailroom(skb)) : 0)
+#define AVAILABLE(skb)	((skb) ? skb_availroom(skb) : 0)
 
 static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
 	int type, int gdeleted, int sdeleted, int crsend)
-- 
cgit v1.2.3


From 97840cb67ff5ac8add836684f011fd838518d698 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 14 Nov 2014 18:14:33 +0100
Subject: netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind

Make sure the netlink group exists, otherwise you can trigger an out
of bound array memory access from the netlink_bind() path. This splat
can only be triggered only by superuser.

[  180.203600] UBSan: Undefined behaviour in ../net/netfilter/nfnetlink.c:467:28
[  180.204249] index 9 is out of range for type 'int [9]'
[  180.204697] CPU: 0 PID: 1771 Comm: trinity-main Not tainted 3.18.0-rc4-mm1+ #122
[  180.205365] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org
+04/01/2014
[  180.206498]  0000000000000018 0000000000000000 0000000000000009 ffff88007bdf7da8
[  180.207220]  ffffffff82b0ef5f 0000000000000092 ffffffff845ae2e0 ffff88007bdf7db8
[  180.207887]  ffffffff8199e489 ffff88007bdf7e18 ffffffff8199ea22 0000003900000000
[  180.208639] Call Trace:
[  180.208857] dump_stack (lib/dump_stack.c:52)
[  180.209370] ubsan_epilogue (lib/ubsan.c:174)
[  180.209849] __ubsan_handle_out_of_bounds (lib/ubsan.c:400)
[  180.210512] nfnetlink_bind (net/netfilter/nfnetlink.c:467)
[  180.210986] netlink_bind (net/netlink/af_netlink.c:1483)
[  180.211495] SYSC_bind (net/socket.c:1541)

Moreover, define the missing nf_tables and nf_acct multicast groups too.

Reported-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 6c5a915cfa75..13c2e17bbe27 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -47,6 +47,8 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
 	[NFNLGRP_CONNTRACK_EXP_NEW]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_UPDATE]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+	[NFNLGRP_NFTABLES]		= NFNL_SUBSYS_NFTABLES,
+	[NFNLGRP_ACCT_QUOTA]		= NFNL_SUBSYS_ACCT,
 };
 
 void nfnl_lock(__u8 subsys_id)
@@ -464,7 +466,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int nfnetlink_bind(int group)
 {
 	const struct nfnetlink_subsystem *ss;
-	int type = nfnl_group2type[group];
+	int type;
+
+	if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+		return -EINVAL;
+
+	type = nfnl_group2type[group];
 
 	rcu_read_lock();
 	ss = nfnetlink_get_subsys(type);
@@ -514,6 +521,9 @@ static int __init nfnetlink_init(void)
 {
 	int i;
 
+	for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++)
+		BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
+
 	for (i=0; i<NFNL_SUBSYS_COUNT; i++)
 		mutex_init(&table[i].mutex);
 
-- 
cgit v1.2.3


From f0b4eeced518c632210ef2aea44fc92cc9e86cce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@web.de>
Date: Mon, 17 Nov 2014 12:20:28 +0100
Subject: bridge: fix netfilter/NF_BR_LOCAL_OUT for own, locally generated
 queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ebtables on the OUTPUT chain (NF_BR_LOCAL_OUT) would not work as expected
for both locally generated IGMP and MLD queries. The IP header specific
filter options are off by 14 Bytes for netfilter (actual output on
interfaces is fine).

NF_HOOK() expects the skb->data to point to the IP header, not the
ethernet one (while dev_queue_xmit() does not). Luckily there is an
br_dev_queue_push_xmit() helper function already - let's just use that.

Introduced by eb1d16414339a6e113d89e2cca2556005d7ce919
("bridge: Add core IGMP snooping support")

Ebtables example:

$ ebtables -I OUTPUT -p IPv6 -o eth1 --logical-out br0 \
	--log --log-level 6 --log-ip6 --log-prefix="~EBT: " -j DROP

before (broken):

~EBT:  IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \
	MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \
	SRC=64a4:39c2:86dd:6000:0000:0020:0001:fe80 IPv6 \
	DST=0000:0000:0000:0004:64ff:fea4:39c2:ff02, \
	IPv6 priority=0x3, Next Header=2

after (working):

~EBT:  IN= OUT=eth1 MAC source = 02:04:64:a4:39:c2 \
	MAC dest = 33:33:00:00:00:01 proto = 0x86dd IPv6 \
	SRC=fe80:0000:0000:0000:0004:64ff:fea4:39c2 IPv6 \
	DST=ff02:0000:0000:0000:0000:0000:0000:0001, \
	IPv6 priority=0x0, Next Header=0

Signed-off-by: Linus Lüssing <linus.luessing@web.de>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/br_multicast.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 648d79ccf462..c465876c7861 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -813,10 +813,9 @@ static void __br_multicast_send_query(struct net_bridge *br,
 		return;
 
 	if (port) {
-		__skb_push(skb, sizeof(struct ethhdr));
 		skb->dev = port->dev;
 		NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
-			dev_queue_xmit);
+			br_dev_queue_push_xmit);
 	} else {
 		br_multicast_select_own_querier(br, ip, skb);
 		netif_rx(skb);
-- 
cgit v1.2.3


From 280ba51d60be6f4ca3347eaa60783314f38df72e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@openwrt.org>
Date: Tue, 18 Nov 2014 22:35:31 +0100
Subject: mac80211: minstrel_ht: fix a crash in rate sorting

The commit 5935839ad73583781b8bbe8d91412f6826e218a4
"mac80211: improve minstrel_ht rate sorting by throughput & probability"

introduced a crash on rate sorting that occurs when the rate added to
the sorting array is faster than all the previous rates. Due to an
off-by-one error, it reads the rate index from tp_list[-1], which
contains uninitialized stack garbage, and then uses the resulting index
for accessing the group rate stats, leading to a crash if the garbage
value is big enough.

Cc: Thomas Huehn <thomas@net.t-labs.tu-berlin.de>
Reported-by: Jouni Malinen <j@w1.fi>
Signed-off-by: Felix Fietkau <nbd@openwrt.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rc80211_minstrel_ht.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index df90ce2db00c..408fd8ab4eef 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -252,19 +252,16 @@ minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u8 index,
 	cur_thr = mi->groups[cur_group].rates[cur_idx].cur_tp;
 	cur_prob = mi->groups[cur_group].rates[cur_idx].probability;
 
-	tmp_group = tp_list[j - 1] / MCS_GROUP_RATES;
-	tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES;
-	tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp;
-	tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability;
-
-	while (j > 0 && (cur_thr > tmp_thr ||
-	      (cur_thr == tmp_thr && cur_prob > tmp_prob))) {
-		j--;
+	do {
 		tmp_group = tp_list[j - 1] / MCS_GROUP_RATES;
 		tmp_idx = tp_list[j - 1] % MCS_GROUP_RATES;
 		tmp_thr = mi->groups[tmp_group].rates[tmp_idx].cur_tp;
 		tmp_prob = mi->groups[tmp_group].rates[tmp_idx].probability;
-	}
+		if (cur_thr < tmp_thr ||
+		    (cur_thr == tmp_thr && cur_prob <= tmp_prob))
+			break;
+		j--;
+	} while (j > 0);
 
 	if (j < MAX_THR_RATES - 1) {
 		memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) *
-- 
cgit v1.2.3


From 093a1468b6edb0e568be7311b8d2228d205702db Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 12 Nov 2014 18:04:04 -0500
Subject: SUNRPC: Fix locking around callback channel reply receive

Both xprt_lookup_rqst() and xprt_complete_rqst() require that you
take the transport lock in order to avoid races with xprt_transmit().

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: stable@vger.kernel.org
Reviewed-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svcsock.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 3f959c681885..f9c052d508f0 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1019,17 +1019,12 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	xid = *p++;
 	calldir = *p;
 
-	if (bc_xprt)
-		req = xprt_lookup_rqst(bc_xprt, xid);
-
-	if (!req) {
-		printk(KERN_NOTICE
-			"%s: Got unrecognized reply: "
-			"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
-			__func__, ntohl(calldir),
-			bc_xprt, ntohl(xid));
+	if (!bc_xprt)
 		return -EAGAIN;
-	}
+	spin_lock_bh(&bc_xprt->transport_lock);
+	req = xprt_lookup_rqst(bc_xprt, xid);
+	if (!req)
+		goto unlock_notfound;
 
 	memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
 	/*
@@ -1040,11 +1035,21 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
 	dst = &req->rq_private_buf.head[0];
 	src = &rqstp->rq_arg.head[0];
 	if (dst->iov_len < src->iov_len)
-		return -EAGAIN; /* whatever; just giving up. */
+		goto unlock_eagain; /* whatever; just giving up. */
 	memcpy(dst->iov_base, src->iov_base, src->iov_len);
 	xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
 	rqstp->rq_arg.len = 0;
+	spin_unlock_bh(&bc_xprt->transport_lock);
 	return 0;
+unlock_notfound:
+	printk(KERN_NOTICE
+		"%s: Got unrecognized reply: "
+		"calldir 0x%x xpt_bc_xprt %p xid %08x\n",
+		__func__, ntohl(calldir),
+		bc_xprt, ntohl(xid));
+unlock_eagain:
+	spin_unlock_bh(&bc_xprt->transport_lock);
+	return -EAGAIN;
 }
 
 static int copy_pages_to_kvecs(struct kvec *vec, struct page **pages, int len)
-- 
cgit v1.2.3


From ffb1388a364d135810337182d6800a0c7ee44f48 Mon Sep 17 00:00:00 2001
From: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Date: Wed, 19 Nov 2014 09:35:39 +0800
Subject: ipv6: delete protocol and unregister rtnetlink when cleanup

pim6_protocol was added when initiation, but it not deleted.
Similarly, unregister RTNL_FAMILY_IP6MR rtnetlink.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Reviewed-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0171f08325c3..1a01d79b8698 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1439,6 +1439,10 @@ reg_pernet_fail:
 
 void ip6_mr_cleanup(void)
 {
+	rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE);
+#ifdef CONFIG_IPV6_PIMSM_V2
+	inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
+#endif
 	unregister_netdevice_notifier(&ip6_mr_notifier);
 	unregister_pernet_subsys(&ip6mr_net_ops);
 	kmem_cache_destroy(mrt_cachep);
-- 
cgit v1.2.3


From d3052bb5d306b29c1e7d9e5998c5ac4ca1ff0ca9 Mon Sep 17 00:00:00 2001
From: Joe Stringer <joestringer@nicira.com>
Date: Wed, 19 Nov 2014 13:54:49 -0800
Subject: openvswitch: Don't validate IPv6 label masks.

When userspace doesn't provide a mask, OVS datapath generates a fully
unwildcarded mask for the flow by copying the flow and setting all bits
in all fields. For IPv6 label, this creates a mask that matches on the
upper 12 bits, causing the following error:

openvswitch: netlink: Invalid IPv6 flow label value (value=ffffffff, max=fffff)

This patch ignores the label validation check for masks, avoiding this
error.

Signed-off-by: Joe Stringer <joestringer@nicira.com>
Acked-by: Pravin B Shelar <pshelar@nicira.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index fa4ec2e4a78b..089b195c064a 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -690,7 +690,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
 			return -EINVAL;
 		}
 
-		if (ipv6_key->ipv6_label & htonl(0xFFF00000)) {
+		if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) {
 			OVS_NLERR("IPv6 flow label %x is out of range (max=%x).\n",
 				  ntohl(ipv6_key->ipv6_label), (1 << 20) - 1);
 			return -EINVAL;
-- 
cgit v1.2.3


From 01462405f0c093b2f8dfddafcadcda6c9e4c5cdf Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Wed, 19 Nov 2014 23:05:49 +0100
Subject: ipx: fix locking regression in ipx_sendmsg and ipx_recvmsg

This fixes an old regression introduced by commit
b0d0d915 (ipx: remove the BKL).

When a recvmsg syscall blocks waiting for new data, no data can be sent on the
same socket with sendmsg because ipx_recvmsg() sleeps with the socket locked.

This breaks mars-nwe (NetWare emulator):
- the ncpserv process reads the request using recvmsg
- ncpserv forks and spawns nwconn
- ncpserv calls a (blocking) recvmsg and waits for new requests
- nwconn deadlocks in sendmsg on the same socket

Commit b0d0d915 has simply replaced BKL locking with
lock_sock/release_sock. Unlike now, BKL got unlocked while
sleeping, so a blocking recvmsg did not block a concurrent
sendmsg.

Only keep the socket locked while actually working with the socket data and
release it prior to calling skb_recv_datagram().

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipx/af_ipx.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 91729b807c7d..1b095ca37aa4 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1764,6 +1764,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 	struct ipxhdr *ipx = NULL;
 	struct sk_buff *skb;
 	int copied, rc;
+	bool locked = true;
 
 	lock_sock(sk);
 	/* put the autobinding in */
@@ -1790,6 +1791,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (sock_flag(sk, SOCK_ZAPPED))
 		goto out;
 
+	release_sock(sk);
+	locked = false;
 	skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
 				flags & MSG_DONTWAIT, &rc);
 	if (!skb) {
@@ -1826,7 +1829,8 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 out_free:
 	skb_free_datagram(sk, skb);
 out:
-	release_sock(sk);
+	if (locked)
+		release_sock(sk);
 	return rc;
 }
 
-- 
cgit v1.2.3


From e7820e39b7d19b9fe1928fc19de9361b44150ca6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 21 Nov 2014 11:47:16 -0800
Subject: net: Revert "net: avoid one atomic operation in skb_clone()"

Not sure what I was thinking, but doing anything after
releasing a refcount is suicidal or/and embarrassing.

By the time we set skb->fclone to SKB_FCLONE_FREE, another cpu
could have released last reference and freed whole skb.

We potentially corrupt memory or trap if CONFIG_DEBUG_PAGEALLOC is set.

Reported-by: Chris Mason <clm@fb.com>
Fixes: ce1a4ea3f1258 ("net: avoid one atomic operation in skb_clone()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c16615bfb61e..32e31c299631 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -552,20 +552,13 @@ static void kfree_skbmem(struct sk_buff *skb)
 	case SKB_FCLONE_CLONE:
 		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 
-		/* Warning : We must perform the atomic_dec_and_test() before
-		 * setting skb->fclone back to SKB_FCLONE_FREE, otherwise
-		 * skb_clone() could set clone_ref to 2 before our decrement.
-		 * Anyway, if we are going to free the structure, no need to
-		 * rewrite skb->fclone.
+		/* The clone portion is available for
+		 * fast-cloning again.
 		 */
-		if (atomic_dec_and_test(&fclones->fclone_ref)) {
+		skb->fclone = SKB_FCLONE_FREE;
+
+		if (atomic_dec_and_test(&fclones->fclone_ref))
 			kmem_cache_free(skbuff_fclone_cache, fclones);
-		} else {
-			/* The clone portion is available for
-			 * fast-cloning again.
-			 */
-			skb->fclone = SKB_FCLONE_FREE;
-		}
 		break;
 	}
 }
@@ -887,11 +880,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb->fclone == SKB_FCLONE_ORIG &&
 	    n->fclone == SKB_FCLONE_FREE) {
 		n->fclone = SKB_FCLONE_CLONE;
-		/* As our fastclone was free, clone_ref must be 1 at this point.
-		 * We could use atomic_inc() here, but it is faster
-		 * to set the final value.
-		 */
-		atomic_set(&fclones->fclone_ref, 2);
+		atomic_inc(&fclones->fclone_ref);
 	} else {
 		if (skb_pfmemalloc(skb))
 			gfp_mask |= __GFP_MEMALLOC;
-- 
cgit v1.2.3


From 0c228e833c88e3aa029250f5db77d5968c5ce5b5 Mon Sep 17 00:00:00 2001
From: Calvin Owens <calvinowens@fb.com>
Date: Thu, 20 Nov 2014 15:09:53 -0800
Subject: tcp: Restore RFC5961-compliant behavior for SYN packets

Commit c3ae62af8e755 ("tcp: should drop incoming frames without ACK
flag set") was created to mitigate a security vulnerability in which a
local attacker is able to inject data into locally-opened sockets by
using TCP protocol statistics in procfs to quickly find the correct
sequence number.

This broke the RFC5961 requirement to send a challenge ACK in response
to spurious RST packets, which was subsequently fixed by commit
7b514a886ba50 ("tcp: accept RST without ACK flag").

Unfortunately, the RFC5961 requirement that spurious SYN packets be
handled in a similar manner remains broken.

RFC5961 section 4 states that:

   ... the handling of the SYN in the synchronized state SHOULD be
   performed as follows:

   1) If the SYN bit is set, irrespective of the sequence number, TCP
      MUST send an ACK (also referred to as challenge ACK) to the remote
      peer:

      <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>

      After sending the acknowledgment, TCP MUST drop the unacceptable
      segment and stop processing further.

   By sending an ACK, the remote peer is challenged to confirm the loss
   of the previous connection and the request to start a new connection.
   A legitimate peer, after restart, would not have a TCB in the
   synchronized state.  Thus, when the ACK arrives, the peer should send
   a RST segment back with the sequence number derived from the ACK
   field that caused the RST.

   This RST will confirm that the remote peer has indeed closed the
   previous connection.  Upon receipt of a valid RST, the local TCP
   endpoint MUST terminate its connection.  The local TCP endpoint
   should then rely on SYN retransmission from the remote end to
   re-establish the connection.

This patch lets SYN packets through the discard added in c3ae62af8e755,
so that spurious SYN packets are properly dealt with as per the RFC.

The challenge ACK is sent unconditionally and is rate-limited, so the
original vulnerability is not reintroduced by this patch.

Signed-off-by: Calvin Owens <calvinowens@fb.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 88fa2d160685..d107ee246a1d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5231,7 +5231,7 @@ slow_path:
 	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
 		goto csum_error;
 
-	if (!th->ack && !th->rst)
+	if (!th->ack && !th->rst && !th->syn)
 		goto discard;
 
 	/*
@@ -5650,7 +5650,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			goto discard;
 	}
 
-	if (!th->ack && !th->rst)
+	if (!th->ack && !th->rst && !th->syn)
 		goto discard;
 
 	if (!tcp_validate_incoming(sk, skb, th, 0))
-- 
cgit v1.2.3


From b6fef4c6b8c1994ffe050dcdb9391427bf0d9882 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Fri, 21 Nov 2014 19:37:09 -0800
Subject: ipv6: Do not treat a GSO_TCPV4 request from UDP tunnel over IPv6 as
 invalid

This patch adds SKB_GSO_TCPV4 to the list of supported GSO types handled by
the IPv6 GSO offloads.  Without this change VXLAN tunnels running over IPv6
do not currently handle IPv4 TCP TSO requests correctly and end up handing
the non-segmented frame off to the device.

Below is the before and after for a simple netperf TCP_STREAM test between
two endpoints tunneling IPv4 over a VXLAN tunnel running on IPv6 on top of
a 1Gb/s network adapter.

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.29       0.88      Before
 87380  16384  16384    10.03     895.69      After

Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_offload.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index a071563a7e6e..01e12d0d8fcc 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -69,7 +69,8 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	int nhoff;
 
 	if (unlikely(skb_shinfo(skb)->gso_type &
-		     ~(SKB_GSO_UDP |
+		     ~(SKB_GSO_TCPV4 |
+		       SKB_GSO_UDP |
 		       SKB_GSO_DODGY |
 		       SKB_GSO_TCP_ECN |
 		       SKB_GSO_GRE |
-- 
cgit v1.2.3


From 20ea60ca9952bd19d4b0d74719daba305aef5178 Mon Sep 17 00:00:00 2001
From: lucien <lucien.xin@gmail.com>
Date: Sun, 23 Nov 2014 15:04:11 +0800
Subject: ip_tunnel: the lack of vti_link_ops' dellink() cause kernel panic

Now the vti_link_ops do not point the .dellink, for fb tunnel device
(ip_vti0), the net_device will be removed as the default .dellink is
unregister_netdevice_queue,but the tunnel still in the tunnel list,
then if we add a new vti tunnel, in ip_tunnel_find():

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
==>                 type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }

the panic will happen, cause dev of ip_tunnel *t is null:
[ 3835.072977] IP: [<ffffffffa04103fd>] ip_tunnel_find+0x9d/0xc0 [ip_tunnel]
[ 3835.073008] PGD b2c21067 PUD b7277067 PMD 0
[ 3835.073008] Oops: 0000 [#1] SMP
.....
[ 3835.073008] Stack:
[ 3835.073008]  ffff8800b72d77f0 ffffffffa0411924 ffff8800bb956000 ffff8800b72d78e0
[ 3835.073008]  ffff8800b72d78a0 0000000000000000 ffffffffa040d100 ffff8800b72d7858
[ 3835.073008]  ffffffffa040b2e3 0000000000000000 0000000000000000 0000000000000000
[ 3835.073008] Call Trace:
[ 3835.073008]  [<ffffffffa0411924>] ip_tunnel_newlink+0x64/0x160 [ip_tunnel]
[ 3835.073008]  [<ffffffffa040b2e3>] vti_newlink+0x43/0x70 [ip_vti]
[ 3835.073008]  [<ffffffff8150d4da>] rtnl_newlink+0x4fa/0x5f0
[ 3835.073008]  [<ffffffff812f68bb>] ? nla_strlcpy+0x5b/0x70
[ 3835.073008]  [<ffffffff81508fb0>] ? rtnl_link_ops_get+0x40/0x60
[ 3835.073008]  [<ffffffff8150d11f>] ? rtnl_newlink+0x13f/0x5f0
[ 3835.073008]  [<ffffffff81509cf4>] rtnetlink_rcv_msg+0xa4/0x270
[ 3835.073008]  [<ffffffff8126adf5>] ? sock_has_perm+0x75/0x90
[ 3835.073008]  [<ffffffff81509c50>] ? rtnetlink_rcv+0x30/0x30
[ 3835.073008]  [<ffffffff81529e39>] netlink_rcv_skb+0xa9/0xc0
[ 3835.073008]  [<ffffffff81509c48>] rtnetlink_rcv+0x28/0x30
....

modprobe ip_vti
ip link del ip_vti0 type vti
ip link add ip_vti0 type vti
rmmod ip_vti

do that one or more times, kernel will panic.

fix it by assigning ip_tunnel_dellink to vti_link_ops' dellink, in
which we skip the unregister of fb tunnel device. do the same on ip6_vti.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_vti.c  |  1 +
 net/ipv6/ip6_vti.c | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3e861011e4a3..1a7e979e80ba 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -528,6 +528,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
 	.validate	= vti_tunnel_validate,
 	.newlink	= vti_newlink,
 	.changelink	= vti_changelink,
+	.dellink        = ip_tunnel_dellink,
 	.get_size	= vti_get_size,
 	.fill_info	= vti_fill_info,
 };
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 31089d153fd3..bcda14de7f84 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -905,6 +905,15 @@ static int vti6_newlink(struct net *src_net, struct net_device *dev,
 	return vti6_tnl_create2(dev);
 }
 
+static void vti6_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct net *net = dev_net(dev);
+	struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+
+	if (dev != ip6n->fb_tnl_dev)
+		unregister_netdevice_queue(dev, head);
+}
+
 static int vti6_changelink(struct net_device *dev, struct nlattr *tb[],
 			   struct nlattr *data[])
 {
@@ -980,6 +989,7 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
 	.setup		= vti6_dev_setup,
 	.validate	= vti6_validate,
 	.newlink	= vti6_newlink,
+	.dellink	= vti6_dellink,
 	.changelink	= vti6_changelink,
 	.get_size	= vti6_get_size,
 	.fill_info	= vti6_fill_info,
@@ -1020,6 +1030,7 @@ static int __net_init vti6_init_net(struct net *net)
 	if (!ip6n->fb_tnl_dev)
 		goto err_alloc_dev;
 	dev_net_set(ip6n->fb_tnl_dev, net);
+	ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops;
 
 	err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
 	if (err < 0)
-- 
cgit v1.2.3


From be6572fdb1bfbe23b2624d477de50af50b02f5d6 Mon Sep 17 00:00:00 2001
From: Yuri Chislov <yuri.chislov@gmail.com>
Date: Mon, 24 Nov 2014 11:25:15 +0100
Subject: ipv6: gre: fix wrong skb->protocol in WCCP

When using GRE redirection in WCCP, it sets the wrong skb->protocol,
that is, ETH_P_IP instead of ETH_P_IPV6 for the encapuslated traffic.

Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
Cc: Dmitry Kozlov <xeb@mail.ru>
Signed-off-by: Yuri Chislov <yuri.chislov@gmail.com>
Tested-by: Yuri Chislov <yuri.chislov@gmail.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4564e1fca3eb..0e32d2e1bdbf 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -502,11 +502,11 @@ static int ip6gre_rcv(struct sk_buff *skb)
 
 		skb->protocol = gre_proto;
 		/* WCCP version 1 and 2 protocol decoding.
-		 * - Change protocol to IP
+		 * - Change protocol to IPv6
 		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
 		 */
 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
-			skb->protocol = htons(ETH_P_IP);
+			skb->protocol = htons(ETH_P_IPV6);
 			if ((*(h + offset) & 0xF0) != 0x40)
 				offset += 4;
 		}
-- 
cgit v1.2.3


From 6e58040b848162d77243cf499d026c6253278e29 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 24 Nov 2014 13:32:16 +0200
Subject: af_packet: fix sparse warning

af_packet produces lots of these:
	net/packet/af_packet.c:384:39: warning: incorrect type in return expression (different modifiers)
	net/packet/af_packet.c:384:39:    expected struct page [pure] *
	net/packet/af_packet.c:384:39:    got struct page *

this seems to be because sparse does not realize that _pure
refers to function, not the returned pointer.

Tweak code slightly to avoid the warning.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 87d20f48ff06..07c04a841ba0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -378,7 +378,7 @@ static void unregister_prot_hook(struct sock *sk, bool sync)
 		__unregister_prot_hook(sk, sync);
 }
 
-static inline __pure struct page *pgv_to_page(void *addr)
+static inline struct page * __pure pgv_to_page(void *addr)
 {
 	if (is_vmalloc_addr(addr))
 		return vmalloc_to_page(addr);
-- 
cgit v1.2.3


From 91a0b603469069cdcce4d572b7525ffc9fd352a6 Mon Sep 17 00:00:00 2001
From: Jane Zhou <a17711@motorola.com>
Date: Mon, 24 Nov 2014 11:44:08 -0800
Subject: net/ping: handle protocol mismatching scenario

ping_lookup() may return a wrong sock if sk_buff's and sock's protocols
dont' match. For example, sk_buff's protocol is ETH_P_IPV6, but sock's
sk_family is AF_INET, in that case, if sk->sk_bound_dev_if is zero, a wrong
sock will be returned.
the fix is to "continue" the searching, if no matching, return NULL.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: netdev@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Jane Zhou <a17711@motorola.com>
Signed-off-by: Yiwei Zhao <gbjc64@motorola.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 57f7c9804139..85a02a75ad83 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -217,6 +217,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
 					     &ipv6_hdr(skb)->daddr))
 				continue;
 #endif
+		} else {
+			continue;
 		}
 
 		if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-- 
cgit v1.2.3


From f3750817a9034bd8cbb5ba583cd544e7cf14c7d8 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Mon, 24 Nov 2014 20:08:32 -0800
Subject: ip6_udp_tunnel: Fix checksum calculation

The UDP checksum calculation for VXLAN tunnels is currently using the
socket addresses instead of the actual packet source and destination
addresses.  As a result the checksum calculated is incorrect in some
cases.

Also uh->check was being set twice, first it was set to 0, and then it is
set again in udp6_set_csum.  This change removes the redundant assignment
to 0.

Fixes: acbf74a7 ("vxlan: Refactor vxlan driver to make use of the common UDP tunnel functions.")

Cc: Andy Zhou <azhou@nicira.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_udp_tunnel.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index b04ed72c4542..8db6c98fe218 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -79,15 +79,13 @@ int udp_tunnel6_xmit_skb(struct socket *sock, struct dst_entry *dst,
 	uh->source = src_port;
 
 	uh->len = htons(skb->len);
-	uh->check = 0;
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
 			    | IPSKB_REROUTED);
 	skb_dst_set(skb, dst);
 
-	udp6_set_csum(udp_get_no_check6_tx(sk), skb, &inet6_sk(sk)->saddr,
-		      &sk->sk_v6_daddr, skb->len);
+	udp6_set_csum(udp_get_no_check6_tx(sk), skb, saddr, daddr, skb->len);
 
 	__skb_push(skb, sizeof(*ip6h));
 	skb_reset_network_header(skb);
-- 
cgit v1.2.3


From 43612d7c04f1a4f5e60104143918fcdf018b66ee Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Tue, 25 Nov 2014 19:54:47 +0100
Subject: Revert "netfilter: conntrack: fix race in __nf_conntrack_confirm
 against get_next_corpse"

This reverts commit 5195c14c8b27cc0b18220ddbf0e5ad3328a04187.

If the conntrack clashes with an existing one, it is left out of
the unconfirmed list, thus, crashing when dropping the packet and
releasing the conntrack since golden rule is that conntracks are
always placed in any of the existing lists for traceability reasons.

Reported-by: Daniel Borkmann <dborkman@redhat.com>
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=88841
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_core.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 2c699757bccf..5016a6929085 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -611,16 +611,12 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-
-	/* We have to check the DYING flag after unlink to prevent
-	 * a race against nf_ct_get_next_corpse() possibly called from
-	 * user context, else we insert an already 'dead' hash, blocking
-	 * further use of that particular connection -JM.
-	 */
-	nf_ct_del_from_dying_or_unconfirmed_list(ct);
+	/* We have to check the DYING flag inside the lock to prevent
+	   a race against nf_ct_get_next_corpse() possibly called from
+	   user context, else we insert an already 'dead' hash, blocking
+	   further use of that particular connection -JM */
 
 	if (unlikely(nf_ct_is_dying(ct))) {
-		nf_ct_add_to_dying_list(ct);
 		nf_conntrack_double_unlock(hash, reply_hash);
 		local_bh_enable();
 		return NF_ACCEPT;
@@ -640,6 +636,8 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
 			goto out;
 
+	nf_ct_del_from_dying_or_unconfirmed_list(ct);
+
 	/* Timer relative to confirmation time, not original
 	   setting time, otherwise we'd get timer wrap in
 	   weird delay cases. */
-- 
cgit v1.2.3


From c3658e8d0f10147fc86018be7f11668246c156d3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 25 Nov 2014 07:40:04 -0800
Subject: tcp: fix possible NULL dereference in tcp_vX_send_reset()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After commit ca777eff51f7 ("tcp: remove dst refcount false sharing for
prequeue mode") we have to relax check against skb dst in
tcp_v[46]_send_reset() if prequeue dropped the dst.

If a socket is provided, a full lookup was done to find this socket,
so the dst test can be skipped.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=88191
Reported-by: Jaša Bartelj <jasa.bartelj@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Daniel Borkmann <dborkman@redhat.com>
Fixes: ca777eff51f7 ("tcp: remove dst refcount false sharing for prequeue mode")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 5 ++++-
 net/ipv6/tcp_ipv6.c | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9c7d7621466b..147be2024290 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -598,7 +598,10 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 	if (th->rst)
 		return;
 
-	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
+	/* If sk not NULL, it means we did a successful lookup and incoming
+	 * route had to be correct. prequeue might have dropped our dst.
+	 */
+	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
 		return;
 
 	/* Swap the send and the receive. */
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ace29b60813c..dc495ae2ead0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -903,7 +903,10 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
 	if (th->rst)
 		return;
 
-	if (!ipv6_unicast_destination(skb))
+	/* If sk not NULL, it means we did a successful lookup and incoming
+	 * route had to be correct. prequeue might have dropped our dst.
+	 */
+	if (!sk && !ipv6_unicast_destination(skb))
 		return;
 
 #ifdef CONFIG_TCP_MD5SIG
-- 
cgit v1.2.3


From 6e8d1c55454574a22b3e8e263b1a12888909b033 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 26 Nov 2014 13:42:16 +0100
Subject: bridge: Validate IFLA_BRIDGE_FLAGS attribute length

Payload is currently accessed blindly and may exceed valid message
boundaries.

Fixes: 407af3299 ("bridge: Add netlink interface to configure vlans on bridge ports")
Cc: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a6882686ca3a..5a853f8fad6c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2798,6 +2798,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (br_spec) {
 		nla_for_each_nested(attr, br_spec, rem) {
 			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+				if (nla_len(attr) < sizeof(flags))
+					return -EINVAL;
+
 				have_flags = true;
 				flags = nla_get_u16(attr);
 				break;
@@ -2868,6 +2871,9 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (br_spec) {
 		nla_for_each_nested(attr, br_spec, rem) {
 			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+				if (nla_len(attr) < sizeof(flags))
+					return -EINVAL;
+
 				have_flags = true;
 				flags = nla_get_u16(attr);
 				break;
-- 
cgit v1.2.3


From 6f705d8cfc0af3607aed890c53d86255e315c6f4 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 26 Nov 2014 13:42:19 +0100
Subject: bridge: Add missing policy entry for IFLA_BRPORT_FAST_LEAVE

Fixes: c2d3babf ("bridge: implement multicast fast leave")
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 2ff9706647f2..e5ec470b851f 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -280,6 +280,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
 	[IFLA_BRPORT_MODE]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_GUARD]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_PROTECT]	= { .type = NLA_U8 },
+	[IFLA_BRPORT_FAST_LEAVE]= { .type = NLA_U8 },
 	[IFLA_BRPORT_LEARNING]	= { .type = NLA_U8 },
 	[IFLA_BRPORT_UNICAST_FLOOD] = { .type = NLA_U8 },
 };
-- 
cgit v1.2.3


From aa68c20ff32f9a6fb3ca7f93ed9beae01899d00d Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 26 Nov 2014 13:42:20 +0100
Subject: bridge: Sanitize IFLA_EXT_MASK for AF_BRIDGE:RTM_GETLINK

Only search for IFLA_EXT_MASK if the message actually carries a
ifinfomsg header and validate minimal length requirements for
IFLA_EXT_MASK.

Fixes: 6cbdceeb ("bridge: Dump vlan information from a bridge port")
Cc: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5a853f8fad6c..b9b7dfaf202b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2685,13 +2685,20 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 	int idx = 0;
 	u32 portid = NETLINK_CB(cb->skb).portid;
 	u32 seq = cb->nlh->nlmsg_seq;
-	struct nlattr *extfilt;
 	u32 filter_mask = 0;
 
-	extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
-				  IFLA_EXT_MASK);
-	if (extfilt)
-		filter_mask = nla_get_u32(extfilt);
+	if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
+		struct nlattr *extfilt;
+
+		extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
+					  IFLA_EXT_MASK);
+		if (extfilt) {
+			if (nla_len(extfilt) < sizeof(filter_mask))
+				return -EINVAL;
+
+			filter_mask = nla_get_u32(extfilt);
+		}
+	}
 
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
-- 
cgit v1.2.3


From f4713a3dfad045d46afcb9c2a7d0bba288920ed4 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 26 Nov 2014 14:53:02 -0500
Subject: net-timestamp: make tcp_recvmsg call ipv6_recv_error for AF_INET6
 socks

TCP timestamping introduced MSG_ERRQUEUE handling for TCP sockets.
If the socket is of family AF_INET6, call ipv6_recv_error instead
of ip_recv_error.

This change is more complex than a single branch due to the loadable
ipv6 module. It reuses a pre-existing indirect function call from
ping. The ping code is safe to call, because it is part of the core
ipv6 module and always present when AF_INET6 sockets are active.

Fixes: 4ed2d765 (net-timestamp: TCP timestamping)
Signed-off-by: Willem de Bruijn <willemb@google.com>

----

It may also be worthwhile to add WARN_ON_ONCE(sk->family == AF_INET6)
to ip_recv_error.
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 11 +++++++++++
 net/ipv4/ping.c    | 12 ++----------
 net/ipv4/tcp.c     |  2 +-
 3 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8b7fe5b03906..e67da4e6c324 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1386,6 +1386,17 @@ out:
 	return pp;
 }
 
+int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
+{
+	if (sk->sk_family == AF_INET)
+		return ip_recv_error(sk, msg, len, addr_len);
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6)
+		return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
+#endif
+	return -EINVAL;
+}
+
 static int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
 	__be16 newlen = htons(skb->len - nhoff);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 85a02a75ad83..5d740cccf69e 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -855,16 +855,8 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	if (flags & MSG_OOB)
 		goto out;
 
-	if (flags & MSG_ERRQUEUE) {
-		if (family == AF_INET) {
-			return ip_recv_error(sk, msg, len, addr_len);
-#if IS_ENABLED(CONFIG_IPV6)
-		} else if (family == AF_INET6) {
-			return pingv6_ops.ipv6_recv_error(sk, msg, len,
-							  addr_len);
-#endif
-		}
-	}
+	if (flags & MSG_ERRQUEUE)
+		return inet_recv_error(sk, msg, len, addr_len);
 
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 39ec0c379545..38c2bcb8dd5d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1598,7 +1598,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	u32 urg_hole = 0;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
-		return ip_recv_error(sk, msg, len, addr_len);
+		return inet_recv_error(sk, msg, len, addr_len);
 
 	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
 	    (sk->sk_state == TCP_ESTABLISHED))
-- 
cgit v1.2.3