From 7f92083eb58f85ea114d97f65fcbe22be5b0468d Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 30 Sep 2016 11:11:07 +0200
Subject: vti6: flush x-netns xfrm cache when vti interface is removed

This is the same fix than commit a5d0dc810abf ("vti: flush x-netns xfrm
cache when vti interface is removed")

This patch fixes a refcnt problem when a x-netns vti6 interface is removed:
unregister_netdevice: waiting for vti6_test to become free. Usage count = 1

Here is a script to reproduce the problem:

ip link set dev ntfp2 up
ip addr add dev ntfp2 2001::1/64
ip link add vti6_test type vti6 local 2001::1 remote 2001::2 key 1
ip netns add secure
ip link set vti6_test netns secure
ip netns exec secure ip link set vti6_test up
ip netns exec secure ip link s lo up
ip netns exec secure ip addr add dev vti6_test 2003::1/64
ip -6 xfrm policy add dir out tmpl src 2001::1 dst 2001::2 proto esp \
	   mode tunnel mark 1
ip -6 xfrm policy add dir in tmpl src 2001::2 dst 2001::1 proto esp \
	   mode tunnel mark 1
ip xfrm state add src 2001::1 dst 2001::2 proto esp spi 1 mode tunnel \
	   enc des3_ede 0x112233445566778811223344556677881122334455667788 mark 1
ip xfrm state add src 2001::2 dst 2001::1 proto esp spi 1 mode tunnel \
	   enc des3_ede 0x112233445566778811223344556677881122334455667788 mark 1
ip netns exec secure  ping6 -c 4 2003::2
ip netns del secure

CC: Lance Richardson <lrichard@redhat.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Lance Richardson <lrichard@redhat.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ip6_vti.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 8a02ca8a11af..c299c1e2bbf0 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1138,6 +1138,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
 	.priority	=	100,
 };
 
+static bool is_vti6_tunnel(const struct net_device *dev)
+{
+	return dev->netdev_ops == &vti6_netdev_ops;
+}
+
+static int vti6_device_event(struct notifier_block *unused,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct ip6_tnl *t = netdev_priv(dev);
+
+	if (!is_vti6_tunnel(dev))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_DOWN:
+		if (!net_eq(t->net, dev_net(dev)))
+			xfrm_garbage_collect(t->net);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block vti6_notifier_block __read_mostly = {
+	.notifier_call = vti6_device_event,
+};
+
 /**
  * vti6_tunnel_init - register protocol and reserve needed resources
  *
@@ -1148,6 +1175,8 @@ static int __init vti6_tunnel_init(void)
 	const char *msg;
 	int err;
 
+	register_netdevice_notifier(&vti6_notifier_block);
+
 	msg = "tunnel device";
 	err = register_pernet_device(&vti6_net_ops);
 	if (err < 0)
@@ -1180,6 +1209,7 @@ xfrm_proto_ah_failed:
 xfrm_proto_esp_failed:
 	unregister_pernet_device(&vti6_net_ops);
 pernet_dev_failed:
+	unregister_netdevice_notifier(&vti6_notifier_block);
 	pr_err("vti6 init: failed to register %s\n", msg);
 	return err;
 }
@@ -1194,6 +1224,7 @@ static void __exit vti6_tunnel_cleanup(void)
 	xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
 	xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
 	unregister_pernet_device(&vti6_net_ops);
+	unregister_netdevice_notifier(&vti6_notifier_block);
 }
 
 module_init(vti6_tunnel_init);
-- 
cgit v1.2.3


From 68d00f332e0ba7f60f212be74ede290c9f873bc5 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <junk@yandex-team.ru>
Date: Tue, 11 Oct 2016 22:47:20 +0300
Subject: ip6_tunnel: fix ip6_tnl_lookup

The commit ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel
endpoints.") introduces support for wildcards in tunnels endpoints,
but in some rare circumstances ip6_tnl_lookup selects wrong tunnel
interface relying only on source or destination address of the packet
and not checking presence of wildcard in tunnels endpoints. Later in
ip6_tnl_rcv this packets can be dicarded because of difference in
ipproto even if fallback device have proper ipproto configuration.

This patch adds checks of wildcard endpoint in tunnel avoiding such
behavior

Fixes: ea3dc9601bda ("ip6_tunnel: Add support for wildcard tunnel endpoints.")
Signed-off-by: Vadim Fedorenko <junk@yandex-team.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 6a66adba0c22..5692d6b8da95 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -157,6 +157,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
 	hash = HASH(&any, local);
 	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
 		if (ipv6_addr_equal(local, &t->parms.laddr) &&
+		    ipv6_addr_any(&t->parms.raddr) &&
 		    (t->dev->flags & IFF_UP))
 			return t;
 	}
@@ -164,6 +165,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
 	hash = HASH(remote, &any);
 	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
 		if (ipv6_addr_equal(remote, &t->parms.raddr) &&
+		    ipv6_addr_any(&t->parms.laddr) &&
 		    (t->dev->flags & IFF_UP))
 			return t;
 	}
-- 
cgit v1.2.3


From a220445f9f4382c36a53d8ef3e08165fa27f7e2c Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 12 Oct 2016 10:10:40 +0200
Subject: ipv6: correctly add local routes when lo goes up

The goal of the patch is to fix this scenario:
 ip link add dummy1 type dummy
 ip link set dummy1 up
 ip link set lo down ; ip link set lo up

After that sequence, the local route to the link layer address of dummy1 is
not there anymore.

When the loopback is set down, all local routes are deleted by
addrconf_ifdown()/rt6_ifdown(). At this time, the rt6_info entry still
exists, because the corresponding idev has a reference on it. After the rcu
grace period, dst_rcu_free() is called, and thus ___dst_free(), which will
set obsolete to DST_OBSOLETE_DEAD.

In this case, init_loopback() is called before dst_rcu_free(), thus
obsolete is still sets to something <= 0. So, the function doesn't add the
route again. To avoid that race, let's check the rt6 refcnt instead.

Fixes: 25fb6ca4ed9c ("net IPv6 : Fix broken IPv6 routing table after loopback down-up")
Fixes: a881ae1f625c ("ipv6: don't call addrconf_dst_alloc again when enable lo")
Fixes: 33d99113b110 ("ipv6: reallocate addrconf router for ipv6 address when lo device up")
Reported-by: Francesco Santoro <francesco.santoro@6wind.com>
Reported-by: Samuel Gauthier <samuel.gauthier@6wind.com>
CC: Balakumaran Kannan <Balakumaran.Kannan@ap.sony.com>
CC: Maruthi Thotad <Maruthi.Thotad@ap.sony.com>
CC: Sabrina Dubroca <sd@queasysnail.net>
CC: Hannes Frederic Sowa <hannes@stressinduktion.org>
CC: Weilong Chen <chenweilong@huawei.com>
CC: Gao feng <gaofeng@cn.fujitsu.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d8983e15f859..9faafe58516a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3018,7 +3018,7 @@ static void init_loopback(struct net_device *dev)
 				 * lo device down, release this obsolete dst and
 				 * reallocate a new router for ifa.
 				 */
-				if (sp_ifa->rt->dst.obsolete > 0) {
+				if (!atomic_read(&sp_ifa->rt->rt6i_ref)) {
 					ip6_rt_put(sp_ifa->rt);
 					sp_ifa->rt = NULL;
 				} else {
-- 
cgit v1.2.3


From 9d6280da39c04832d6abf5f4ecc8175c9aad91c0 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 13 Oct 2016 18:50:02 +0200
Subject: IPv6: Drop the temporary address regen_timer

The randomized interface identifier (rndid) was periodically updated from
the regen_timer timer. Simplify the code by updating the rndid only when
needed by ipv6_try_regen_rndid().

This makes the follow-up DESYNC_FACTOR fix much simpler.  Also it fixes a
reference counting error in this error path, where an in6_dev_put was
missing:
		err = addrconf_sysctl_register(ndev);
		if (err) {
			ipv6_mc_destroy_dev(ndev);
	-               del_timer(&ndev->regen_timer);
			snmp6_unregister_dev(ndev);
			goto err_release;

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 61 ++++++++---------------------------------------------
 1 file changed, 9 insertions(+), 52 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 9faafe58516a..58c3d73403a5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -147,9 +147,8 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
 }
 #endif
 
-static void __ipv6_regen_rndid(struct inet6_dev *idev);
-static void __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
-static void ipv6_regen_rndid(unsigned long data);
+static void ipv6_regen_rndid(struct inet6_dev *idev);
+static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
 
 static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
 static int ipv6_count_addresses(struct inet6_dev *idev);
@@ -409,9 +408,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 		goto err_release;
 	}
 
-	/* One reference from device.  We must do this before
-	 * we invoke __ipv6_regen_rndid().
-	 */
+	/* One reference from device. */
 	in6_dev_hold(ndev);
 
 	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -425,17 +422,14 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 #endif
 
 	INIT_LIST_HEAD(&ndev->tempaddr_list);
-	setup_timer(&ndev->regen_timer, ipv6_regen_rndid, (unsigned long)ndev);
 	if ((dev->flags&IFF_LOOPBACK) ||
 	    dev->type == ARPHRD_TUNNEL ||
 	    dev->type == ARPHRD_TUNNEL6 ||
 	    dev->type == ARPHRD_SIT ||
 	    dev->type == ARPHRD_NONE) {
 		ndev->cnf.use_tempaddr = -1;
-	} else {
-		in6_dev_hold(ndev);
-		ipv6_regen_rndid((unsigned long) ndev);
-	}
+	} else
+		ipv6_regen_rndid(ndev);
 
 	ndev->token = in6addr_any;
 
@@ -447,7 +441,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	err = addrconf_sysctl_register(ndev);
 	if (err) {
 		ipv6_mc_destroy_dev(ndev);
-		del_timer(&ndev->regen_timer);
 		snmp6_unregister_dev(ndev);
 		goto err_release;
 	}
@@ -1222,7 +1215,7 @@ retry:
 	}
 	in6_ifa_hold(ifp);
 	memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
-	__ipv6_try_regen_rndid(idev, tmpaddr);
+	ipv6_try_regen_rndid(idev, tmpaddr);
 	memcpy(&addr.s6_addr[8], idev->rndid, 8);
 	age = (now - ifp->tstamp) / HZ;
 	tmp_valid_lft = min_t(__u32,
@@ -2150,7 +2143,7 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
 }
 
 /* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
-static void __ipv6_regen_rndid(struct inet6_dev *idev)
+static void ipv6_regen_rndid(struct inet6_dev *idev)
 {
 regen:
 	get_random_bytes(idev->rndid, sizeof(idev->rndid));
@@ -2179,43 +2172,10 @@ regen:
 	}
 }
 
-static void ipv6_regen_rndid(unsigned long data)
-{
-	struct inet6_dev *idev = (struct inet6_dev *) data;
-	unsigned long expires;
-
-	rcu_read_lock_bh();
-	write_lock_bh(&idev->lock);
-
-	if (idev->dead)
-		goto out;
-
-	__ipv6_regen_rndid(idev);
-
-	expires = jiffies +
-		idev->cnf.temp_prefered_lft * HZ -
-		idev->cnf.regen_max_retry * idev->cnf.dad_transmits *
-		NEIGH_VAR(idev->nd_parms, RETRANS_TIME) -
-		idev->cnf.max_desync_factor * HZ;
-	if (time_before(expires, jiffies)) {
-		pr_warn("%s: too short regeneration interval; timer disabled for %s\n",
-			__func__, idev->dev->name);
-		goto out;
-	}
-
-	if (!mod_timer(&idev->regen_timer, expires))
-		in6_dev_hold(idev);
-
-out:
-	write_unlock_bh(&idev->lock);
-	rcu_read_unlock_bh();
-	in6_dev_put(idev);
-}
-
-static void  __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
+static void  ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
 {
 	if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
-		__ipv6_regen_rndid(idev);
+		ipv6_regen_rndid(idev);
 }
 
 /*
@@ -3594,9 +3554,6 @@ restart:
 	if (!how)
 		idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
 
-	if (how && del_timer(&idev->regen_timer))
-		in6_dev_put(idev);
-
 	/* Step 3: clear tempaddr list */
 	while (!list_empty(&idev->tempaddr_list)) {
 		ifa = list_first_entry(&idev->tempaddr_list,
-- 
cgit v1.2.3


From 76506a986dc31394fd1f2741db037d29c7e57843 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 13 Oct 2016 18:52:15 +0200
Subject: IPv6: fix DESYNC_FACTOR

The IPv6 temporary address generation uses a variable called DESYNC_FACTOR
to prevent hosts updating the addresses at the same time. Quoting RFC 4941:

   ... The value DESYNC_FACTOR is a random value (different for each
   client) that ensures that clients don't synchronize with each other and
   generate new addresses at exactly the same time ...

DESYNC_FACTOR is defined as:

   DESYNC_FACTOR -- A random value within the range 0 - MAX_DESYNC_FACTOR.
   It is computed once at system start (rather than each time it is used)
   and must never be greater than (TEMP_VALID_LIFETIME - REGEN_ADVANCE).

First, I believe the RFC has a typo in it and meant to say: "and must
never be greater than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE)"

The reason is that at various places in the RFC, DESYNC_FACTOR is used in
a calculation like (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR) or
(TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE - DESYNC_FACTOR). It needs to be
smaller than (TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE) for the result of
these calculations to be larger than zero. It's never used in a
calculation together with TEMP_VALID_LIFETIME.

I already submitted an errata to the rfc-editor:
https://www.rfc-editor.org/errata_search.php?rfc=4941

The Linux implementation of DESYNC_FACTOR is very wrong:
max_desync_factor is used in places DESYNC_FACTOR should be used.
max_desync_factor is initialized to the RFC-recommended value for
MAX_DESYNC_FACTOR (600) but the whole point is to get a _random_ value.

And nothing ensures that the value used is not greater than
(TEMP_PREFERRED_LIFETIME - REGEN_ADVANCE), which leads to underflows.  The
effect can easily be observed when setting the temp_prefered_lft sysctl
e.g. to 60. The preferred lifetime of the temporary addresses will be
bogus.

TEMP_PREFERRED_LIFETIME and REGEN_ADVANCE are not constants and can be
influenced by these three sysctls: regen_max_retry, dad_transmits and
temp_prefered_lft. Thus, the upper bound for desync_factor needs to be
re-calculated each time a new address is generated and if desync_factor is
larger than the new upper bound, a new random value needs to be
re-generated.

And since we already have max_desync_factor configurable per interface, we
also need to calculate and store desync_factor per interface.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 39 +++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 58c3d73403a5..cc7c26d05f45 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -422,6 +422,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 #endif
 
 	INIT_LIST_HEAD(&ndev->tempaddr_list);
+	ndev->desync_factor = U32_MAX;
 	if ((dev->flags&IFF_LOOPBACK) ||
 	    dev->type == ARPHRD_TUNNEL ||
 	    dev->type == ARPHRD_TUNNEL6 ||
@@ -1183,6 +1184,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
 	int ret = 0;
 	u32 addr_flags;
 	unsigned long now = jiffies;
+	long max_desync_factor;
 
 	write_lock_bh(&idev->lock);
 	if (ift) {
@@ -1218,20 +1220,41 @@ retry:
 	ipv6_try_regen_rndid(idev, tmpaddr);
 	memcpy(&addr.s6_addr[8], idev->rndid, 8);
 	age = (now - ifp->tstamp) / HZ;
+
+	regen_advance = idev->cnf.regen_max_retry *
+			idev->cnf.dad_transmits *
+			NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+
+	/* recalculate max_desync_factor each time and update
+	 * idev->desync_factor if it's larger
+	 */
+	max_desync_factor = min_t(__u32,
+				  idev->cnf.max_desync_factor,
+				  idev->cnf.temp_prefered_lft - regen_advance);
+
+	if (unlikely(idev->desync_factor > max_desync_factor)) {
+		if (max_desync_factor > 0) {
+			get_random_bytes(&idev->desync_factor,
+					 sizeof(idev->desync_factor));
+			idev->desync_factor %= max_desync_factor;
+		} else {
+			idev->desync_factor = 0;
+		}
+	}
+
 	tmp_valid_lft = min_t(__u32,
 			      ifp->valid_lft,
 			      idev->cnf.temp_valid_lft + age);
-	tmp_prefered_lft = min_t(__u32,
-				 ifp->prefered_lft,
-				 idev->cnf.temp_prefered_lft + age -
-				 idev->cnf.max_desync_factor);
+	tmp_prefered_lft = idev->cnf.temp_prefered_lft + age -
+			    idev->desync_factor;
+	/* guard against underflow in case of concurrent updates to cnf */
+	if (unlikely(tmp_prefered_lft < 0))
+		tmp_prefered_lft = 0;
+	tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
 	tmp_plen = ifp->prefix_len;
 	tmp_tstamp = ifp->tstamp;
 	spin_unlock_bh(&ifp->lock);
 
-	regen_advance = idev->cnf.regen_max_retry *
-			idev->cnf.dad_transmits *
-			NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
 	write_unlock_bh(&idev->lock);
 
 	/* A temporary address is created only if this calculated Preferred
@@ -2316,7 +2339,7 @@ static void manage_tempaddrs(struct inet6_dev *idev,
 			max_valid = 0;
 
 		max_prefered = idev->cnf.temp_prefered_lft -
-			       idev->cnf.max_desync_factor - age;
+			       idev->desync_factor - age;
 		if (max_prefered < 0)
 			max_prefered = 0;
 
-- 
cgit v1.2.3


From a04a480d4392ea6efd117be2de564117b2a009c0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Oct 2016 20:02:52 -0700
Subject: net: Require exact match for TCP socket lookups if dif is l3mdev

Currently, socket lookups for l3mdev (vrf) use cases can match a socket
that is bound to a port but not a device (ie., a global socket). If the
sysctl tcp_l3mdev_accept is not set this leads to ack packets going out
based on the main table even though the packet came in from an L3 domain.
The end result is that the connection does not establish creating
confusion for users since the service is running and a socket shows in
ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the
skb came through an interface enslaved to an l3mdev device and the
tcp_l3mdev_accept is not set.

skb's through an l3mdev interface are marked by setting a flag in
inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the
flag for IPv4. Using an skb flag avoids a device lookup on the dif. The
flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the
inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the
match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the
move is done after the socket lookup, so IP6CB is used.

The flags field in inet_skb_parm struct needs to be increased to add
another flag. There is currently a 1-byte hole following the flags,
so it can be expanded to u16 without increasing the size of the struct.

Fixes: 193125dbd8eb ("net: Introduce VRF device driver")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/inet6_hashtables.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 00cf28ad4565..2fd0374a35b1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL(__inet6_lookup_established);
 static inline int compute_score(struct sock *sk, struct net *net,
 				const unsigned short hnum,
 				const struct in6_addr *daddr,
-				const int dif)
+				const int dif, bool exact_dif)
 {
 	int score = -1;
 
@@ -109,7 +109,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
 				return -1;
 			score++;
 		}
-		if (sk->sk_bound_dev_if) {
+		if (sk->sk_bound_dev_if || exact_dif) {
 			if (sk->sk_bound_dev_if != dif)
 				return -1;
 			score++;
@@ -131,11 +131,12 @@ struct sock *inet6_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	int score, hiscore = 0, matches = 0, reuseport = 0;
+	bool exact_dif = inet6_exact_dif_match(net, skb);
 	struct sock *sk, *result = NULL;
 	u32 phash = 0;
 
 	sk_for_each(sk, &ilb->head) {
-		score = compute_score(sk, net, hnum, daddr, dif);
+		score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
 		if (score > hiscore) {
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-- 
cgit v1.2.3


From 7aa8e63f0d0f2e0ae353632bca1ce75a258696c6 Mon Sep 17 00:00:00 2001
From: Jiri Bohac <jbohac@suse.cz>
Date: Thu, 20 Oct 2016 12:29:26 +0200
Subject: ipv6: properly prevent temp_prefered_lft sysctl race

The check for an underflow of tmp_prefered_lft is always false
because tmp_prefered_lft is unsigned. The intention of the check
was to guard against racing with an update of the
temp_prefered_lft sysctl, potentially resulting in an underflow.

As suggested by David Miller, the best way to prevent the race is
by reading the sysctl variable using READ_ONCE.

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Fixes: 76506a986dc3 ("IPv6: fix DESYNC_FACTOR")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index cc7c26d05f45..060dd9922018 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1185,6 +1185,7 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
 	u32 addr_flags;
 	unsigned long now = jiffies;
 	long max_desync_factor;
+	s32 cnf_temp_preferred_lft;
 
 	write_lock_bh(&idev->lock);
 	if (ift) {
@@ -1228,9 +1229,10 @@ retry:
 	/* recalculate max_desync_factor each time and update
 	 * idev->desync_factor if it's larger
 	 */
+	cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft);
 	max_desync_factor = min_t(__u32,
 				  idev->cnf.max_desync_factor,
-				  idev->cnf.temp_prefered_lft - regen_advance);
+				  cnf_temp_preferred_lft - regen_advance);
 
 	if (unlikely(idev->desync_factor > max_desync_factor)) {
 		if (max_desync_factor > 0) {
@@ -1245,11 +1247,8 @@ retry:
 	tmp_valid_lft = min_t(__u32,
 			      ifp->valid_lft,
 			      idev->cnf.temp_valid_lft + age);
-	tmp_prefered_lft = idev->cnf.temp_prefered_lft + age -
+	tmp_prefered_lft = cnf_temp_preferred_lft + age -
 			    idev->desync_factor;
-	/* guard against underflow in case of concurrent updates to cnf */
-	if (unlikely(tmp_prefered_lft < 0))
-		tmp_prefered_lft = 0;
 	tmp_prefered_lft = min_t(__u32, ifp->prefered_lft, tmp_prefered_lft);
 	tmp_plen = ifp->prefix_len;
 	tmp_tstamp = ifp->tstamp;
-- 
cgit v1.2.3


From fcd91dd449867c6bfe56a81cabba76b829fd05cd Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 20 Oct 2016 15:58:02 +0200
Subject: net: add recursion limit to GRO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, GRO can do unlimited recursion through the gro_receive
handlers.  This was fixed for tunneling protocols by limiting tunnel GRO
to one level with encap_mark, but both VLAN and TEB still have this
problem.  Thus, the kernel is vulnerable to a stack overflow, if we
receive a packet composed entirely of VLAN headers.

This patch adds a recursion counter to the GRO layer to prevent stack
overflow.  When a gro_receive function hits the recursion limit, GRO is
aborted for this skb and it is processed normally.  This recursion
counter is put in the GRO CB, but could be turned into a percpu counter
if we run out of space in the CB.

Thanks to Vladimír Beneš <vbenes@redhat.com> for the initial bug report.

Fixes: CVE-2016-7039
Fixes: 9b174d88c257 ("net: Add Transparent Ethernet Bridging GRO support.")
Fixes: 66e5133f19e9 ("vlan: Add GRO support for non hardware accelerated vlan")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index e7bfd55899a3..1fcf61f1cbc3 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -246,7 +246,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head,
 
 	skb_gro_postpull_rcsum(skb, iph, nlen);
 
-	pp = ops->callbacks.gro_receive(head, skb);
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
 
 out_unlock:
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 286c72deabaa240b7eebbd99496ed3324d69f3c0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 20 Oct 2016 09:39:40 -0700
Subject: udp: must lock the socket in udp_disconnect()

Baozeng Ding reported KASAN traces showing uses after free in
udp_lib_get_port() and other related UDP functions.

A CONFIG_DEBUG_PAGEALLOC=y kernel would eventually crash.

I could write a reproducer with two threads doing :

static int sock_fd;
static void *thr1(void *arg)
{
	for (;;) {
		connect(sock_fd, (const struct sockaddr *)arg,
			sizeof(struct sockaddr_in));
	}
}

static void *thr2(void *arg)
{
	struct sockaddr_in unspec;

	for (;;) {
		memset(&unspec, 0, sizeof(unspec));
	        connect(sock_fd, (const struct sockaddr *)&unspec,
			sizeof(unspec));
        }
}

Problem is that udp_disconnect() could run without holding socket lock,
and this was causing list corruptions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Baozeng Ding <sploving1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ping.c | 2 +-
 net/ipv6/raw.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 0e983b694ee8..66e2d9dfc43a 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -180,7 +180,7 @@ struct proto pingv6_prot = {
 	.init =		ping_init_sock,
 	.close =	ping_close,
 	.connect =	ip6_datagram_connect_v6_only,
-	.disconnect =	udp_disconnect,
+	.disconnect =	__udp_disconnect,
 	.setsockopt =	ipv6_setsockopt,
 	.getsockopt =	ipv6_getsockopt,
 	.sendmsg =	ping_v6_sendmsg,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 54404f08efcc..054a1d84fc5e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1241,7 +1241,7 @@ struct proto rawv6_prot = {
 	.close		   = rawv6_close,
 	.destroy	   = raw6_destroy,
 	.connect	   = ip6_datagram_connect_v6_only,
-	.disconnect	   = udp_disconnect,
+	.disconnect	   = __udp_disconnect,
 	.ioctl		   = rawv6_ioctl,
 	.init		   = rawv6_init_sk,
 	.setsockopt	   = rawv6_setsockopt,
-- 
cgit v1.2.3


From 8651be8f14a12d24f203f283601d9b0418c389ff Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Wed, 19 Oct 2016 23:35:12 -0700
Subject: ipv6: fix a potential deadlock in do_ipv6_setsockopt()

Baozeng reported this deadlock case:

       CPU0                    CPU1
       ----                    ----
  lock([  165.136033] sk_lock-AF_INET6);
                               lock([  165.136033] rtnl_mutex);
                               lock([  165.136033] sk_lock-AF_INET6);
  lock([  165.136033] rtnl_mutex);

Similar to commit 87e9f0315952
("ipv4: fix a potential deadlock in mcast getsockopt() path")
this is due to we still have a case, ipv6_sock_mc_close(),
where we acquire sk_lock before rtnl_lock. Close this deadlock
with the similar solution, that is always acquire rtnl lock first.

Fixes: baf606d9c9b1 ("ipv4,ipv6: grab rtnl before locking the socket")
Reported-by: Baozeng Ding <sploving1@gmail.com>
Tested-by: Baozeng Ding <sploving1@gmail.com>
Cc: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ipv6_sockglue.c |  3 ++-
 net/ipv6/mcast.c         | 17 ++++++++++++-----
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 5330262ab673..636ec56f5f50 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -120,6 +120,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 static bool setsockopt_needs_rtnl(int optname)
 {
 	switch (optname) {
+	case IPV6_ADDRFORM:
 	case IPV6_ADD_MEMBERSHIP:
 	case IPV6_DROP_MEMBERSHIP:
 	case IPV6_JOIN_ANYCAST:
@@ -198,7 +199,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			}
 
 			fl6_free_socklist(sk);
-			ipv6_sock_mc_close(sk);
+			__ipv6_sock_mc_close(sk);
 
 			/*
 			 * Sock is moving from IPv6 to IPv4 (sk_prot), so
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 75c1fc54f188..14a3903f1c82 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -276,16 +276,14 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
 	return idev;
 }
 
-void ipv6_sock_mc_close(struct sock *sk)
+void __ipv6_sock_mc_close(struct sock *sk)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct ipv6_mc_socklist *mc_lst;
 	struct net *net = sock_net(sk);
 
-	if (!rcu_access_pointer(np->ipv6_mc_list))
-		return;
+	ASSERT_RTNL();
 
-	rtnl_lock();
 	while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
 		struct net_device *dev;
 
@@ -303,8 +301,17 @@ void ipv6_sock_mc_close(struct sock *sk)
 
 		atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
 		kfree_rcu(mc_lst, rcu);
-
 	}
+}
+
+void ipv6_sock_mc_close(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	if (!rcu_access_pointer(np->ipv6_mc_list))
+		return;
+	rtnl_lock();
+	__ipv6_sock_mc_close(sk);
 	rtnl_unlock();
 }
 
-- 
cgit v1.2.3


From b678aa578c9e400429e027269e8de2783e5e73ce Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 21 Oct 2016 18:28:25 +0900
Subject: ipv6: do not increment mac header when it's unset

Otherwise we'll overflow the integer. This occurs when layer 3 tunneled
packets are handed off to the IPv6 layer.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2160d5d009cb..3815e8505ed2 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -456,7 +456,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	skb_network_header(head)[nhoff] = skb_transport_header(head)[0];
 	memmove(head->head + sizeof(struct frag_hdr), head->head,
 		(head->data - head->head) - sizeof(struct frag_hdr));
-	head->mac_header += sizeof(struct frag_hdr);
+	if (skb_mac_header_was_set(head))
+		head->mac_header += sizeof(struct frag_hdr);
 	head->network_header += sizeof(struct frag_hdr);
 
 	skb_reset_transport_header(head);
-- 
cgit v1.2.3


From 10df8e6152c6c400a563a673e9956320bfce1871 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 23 Oct 2016 18:03:06 -0700
Subject: udp: fix IP_CHECKSUM handling

First bug was added in commit ad6f939ab193 ("ip: Add offset parameter to
ip_cmsg_recv") : Tom missed that ipv4 udp messages could be received on
AF_INET6 socket. ip_cmsg_recv(msg, skb) should have been replaced by
ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));

Then commit e6afc8ace6dd ("udp: remove headers from UDP packets before
queueing") forgot to adjust the offsets now UDP headers are pulled
before skb are put in receive queue.

Fixes: ad6f939ab193 ("ip: Add offset parameter to ip_cmsg_recv")
Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Sam Kumar <samanthakumar@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9aa7c1c7a9ce..b2ef061e6836 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -427,7 +427,8 @@ try_again:
 
 	if (is_udp4) {
 		if (inet->cmsg_flags)
-			ip_cmsg_recv(msg, skb);
+			ip_cmsg_recv_offset(msg, skb,
+					    sizeof(struct udphdr), off);
 	} else {
 		if (np->rxopt.all)
 			ip6_datagram_recv_specific_ctl(sk, msg, skb);
-- 
cgit v1.2.3


From 830218c1add1da16519b71909e5cf21522b7d062 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 24 Oct 2016 10:52:35 -0700
Subject: net: ipv6: Fix processing of RAs in presence of VRF

rt6_add_route_info and rt6_add_dflt_router were updated to pull the FIB
table from the device index, but the corresponding rt6_get_route_info
and rt6_get_dflt_router functions were not leading to the failure to
process RA's:

    ICMPv6: RA: ndisc_router_discovery failed to add default route

Fix the 'get' functions by using the table id associated with the
device when applicable.

Also, now that default routes can be added to tables other than the
default table, rt6_purge_dflt_routers needs to be updated as well to
look at all tables. To handle that efficiently, add a flag to the table
denoting if it is has a default route via RA.

Fixes: ca254490c8dfd ("net: Add VRF support to IPv6 stack")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 68 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 20 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bdbc38e8bf29..3ac19eb81a86 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -102,11 +102,13 @@ static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_add_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
-					   const struct in6_addr *gwaddr, int ifindex,
+					   const struct in6_addr *gwaddr,
+					   struct net_device *dev,
 					   unsigned int pref);
 static struct rt6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
-					   const struct in6_addr *gwaddr, int ifindex);
+					   const struct in6_addr *gwaddr,
+					   struct net_device *dev);
 #endif
 
 struct uncached_list {
@@ -803,7 +805,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 		rt = rt6_get_dflt_router(gwaddr, dev);
 	else
 		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
-					gwaddr, dev->ifindex);
+					gwaddr, dev);
 
 	if (rt && !lifetime) {
 		ip6_del_rt(rt);
@@ -811,8 +813,8 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 	}
 
 	if (!rt && lifetime)
-		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
-					pref);
+		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
+					dev, pref);
 	else if (rt)
 		rt->rt6i_flags = RTF_ROUTEINFO |
 				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
@@ -2325,13 +2327,16 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
 #ifdef CONFIG_IPV6_ROUTE_INFO
 static struct rt6_info *rt6_get_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
-					   const struct in6_addr *gwaddr, int ifindex)
+					   const struct in6_addr *gwaddr,
+					   struct net_device *dev)
 {
+	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
+	int ifindex = dev->ifindex;
 	struct fib6_node *fn;
 	struct rt6_info *rt = NULL;
 	struct fib6_table *table;
 
-	table = fib6_get_table(net, RT6_TABLE_INFO);
+	table = fib6_get_table(net, tb_id);
 	if (!table)
 		return NULL;
 
@@ -2357,12 +2362,13 @@ out:
 
 static struct rt6_info *rt6_add_route_info(struct net *net,
 					   const struct in6_addr *prefix, int prefixlen,
-					   const struct in6_addr *gwaddr, int ifindex,
+					   const struct in6_addr *gwaddr,
+					   struct net_device *dev,
 					   unsigned int pref)
 {
 	struct fib6_config cfg = {
 		.fc_metric	= IP6_RT_PRIO_USER,
-		.fc_ifindex	= ifindex,
+		.fc_ifindex	= dev->ifindex,
 		.fc_dst_len	= prefixlen,
 		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
 				  RTF_UP | RTF_PREF(pref),
@@ -2371,7 +2377,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 		.fc_nlinfo.nl_net = net,
 	};
 
-	cfg.fc_table = l3mdev_fib_table_by_index(net, ifindex) ? : RT6_TABLE_INFO;
+	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
 	cfg.fc_dst = *prefix;
 	cfg.fc_gateway = *gwaddr;
 
@@ -2381,16 +2387,17 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 
 	ip6_route_add(&cfg);
 
-	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
+	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
 }
 #endif
 
 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
 {
+	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
 	struct rt6_info *rt;
 	struct fib6_table *table;
 
-	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
+	table = fib6_get_table(dev_net(dev), tb_id);
 	if (!table)
 		return NULL;
 
@@ -2424,20 +2431,20 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
 
 	cfg.fc_gateway = *gwaddr;
 
-	ip6_route_add(&cfg);
+	if (!ip6_route_add(&cfg)) {
+		struct fib6_table *table;
+
+		table = fib6_get_table(dev_net(dev), cfg.fc_table);
+		if (table)
+			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
+	}
 
 	return rt6_get_dflt_router(gwaddr, dev);
 }
 
-void rt6_purge_dflt_routers(struct net *net)
+static void __rt6_purge_dflt_routers(struct fib6_table *table)
 {
 	struct rt6_info *rt;
-	struct fib6_table *table;
-
-	/* NOTE: Keep consistent with rt6_get_dflt_router */
-	table = fib6_get_table(net, RT6_TABLE_DFLT);
-	if (!table)
-		return;
 
 restart:
 	read_lock_bh(&table->tb6_lock);
@@ -2451,6 +2458,27 @@ restart:
 		}
 	}
 	read_unlock_bh(&table->tb6_lock);
+
+	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
+}
+
+void rt6_purge_dflt_routers(struct net *net)
+{
+	struct fib6_table *table;
+	struct hlist_head *head;
+	unsigned int h;
+
+	rcu_read_lock();
+
+	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+		head = &net->ipv6.fib_table_hash[h];
+		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
+				__rt6_purge_dflt_routers(table);
+		}
+	}
+
+	rcu_read_unlock();
 }
 
 static void rtmsg_to_fib6_config(struct net *net,
-- 
cgit v1.2.3


From d5d32e4b76687f4df9ad3ba8d3702b7347f51fa6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 24 Oct 2016 12:27:23 -0700
Subject: net: ipv6: Do not consider link state for nexthop validation

Similar to IPv4, do not consider link state when validating next hops.

Currently, if the link is down default routes can fail to insert:
 $ ip -6 ro add vrf blue default via 2100:2::64 dev eth2
 RTNETLINK answers: No route to host

With this patch the command succeeds.

Fixes: 8c14586fc320 ("net: ipv6: Use passed in table for nexthop lookups")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3ac19eb81a86..947ed1ded026 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -658,7 +658,8 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 	struct net_device *dev = rt->dst.dev;
 
 	if (dev && !netif_carrier_ok(dev) &&
-	    idev->cnf.ignore_routes_with_linkdown)
+	    idev->cnf.ignore_routes_with_linkdown &&
+	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 		goto out;
 
 	if (rt6_check_expired(rt))
@@ -1052,6 +1053,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 	int strict = 0;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
+	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
 	if (net->ipv6.devconf_all->forwarding == 0)
 		strict |= RT6_LOOKUP_F_REACHABLE;
 
@@ -1791,7 +1793,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 	};
 	struct fib6_table *table;
 	struct rt6_info *rt;
-	int flags = RT6_LOOKUP_F_IFACE;
+	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
 
 	table = fib6_get_table(net, cfg->fc_table);
 	if (!table)
-- 
cgit v1.2.3


From e4cabca54911a6be6f2e80e48fa497c7e19019a5 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Tue, 25 Oct 2016 18:08:49 -0400
Subject: inet: Fix missing return value in inet6_hash

As part of a series to implement faster SO_REUSEPORT lookups,
commit 086c653f5862 ("sock: struct proto hash function may error")
added return values to protocol hash functions and
commit 496611d7b5ea ("inet: create IPv6-equivalent inet_hash function")
implemented a new hash function for IPv6.  However, the latter does
not respect the former's convention.

This properly propagates the hash errors in the IPv6 case.

Fixes: 496611d7b5ea ("inet: create IPv6-equivalent inet_hash function")
Reported-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/inet6_hashtables.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 2fd0374a35b1..02761c9fe43e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -264,13 +264,15 @@ EXPORT_SYMBOL_GPL(inet6_hash_connect);
 
 int inet6_hash(struct sock *sk)
 {
+	int err = 0;
+
 	if (sk->sk_state != TCP_CLOSE) {
 		local_bh_disable();
-		__inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
+		err = __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
 		local_bh_enable();
 	}
 
-	return 0;
+	return err;
 }
 EXPORT_SYMBOL_GPL(inet6_hash);
 
-- 
cgit v1.2.3


From ae148b085876fa771d9ef2c05f85d4b4bf09ce0d Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Wed, 26 Oct 2016 10:11:09 +0800
Subject: ip6_tunnel: Update skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit()

This patch updates skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit() when an
IPv6 header is installed to a socket buffer.

This is not a cosmetic change.  Without updating this value, GSO packets
transmitted through an ipip6 tunnel have the protocol of ETH_P_IP and
skb_mac_gso_segment() will attempt to call gso_segment() for IPv4,
which results in the packets being dropped.

Fixes: b8921ca83eed ("ip4ip6: Support for GSO/GRO")
Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 5692d6b8da95..87784560dc46 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1172,6 +1172,7 @@ route_lookup:
 	if (err)
 		return err;
 
+	skb->protocol = htons(ETH_P_IPV6);
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
-- 
cgit v1.2.3


From b73b8a1ba598236296a46103d81c10d629d9a470 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sat, 29 Oct 2016 22:09:51 +0800
Subject: netfilter: nft_dup: do not use sreg_dev if the user doesn't specify
 it

The NFTA_DUP_SREG_DEV attribute is not a must option, so we should use it
in routing lookup only when the user specify it.

Fixes: d877f07112f1 ("netfilter: nf_tables: add nft_dup expression")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nft_dup_ipv6.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index 8bfd470cbe72..831f86e1ec08 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -26,7 +26,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr,
 {
 	struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
 	struct in6_addr *gw = (struct in6_addr *)&regs->data[priv->sreg_addr];
-	int oif = regs->data[priv->sreg_dev];
+	int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1;
 
 	nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif);
 }
@@ -57,7 +57,9 @@ static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
 
-	if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) ||
+	if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr))
+		goto nla_put_failure;
+	if (priv->sreg_dev &&
 	    nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
 		goto nla_put_failure;
 
-- 
cgit v1.2.3


From f89c56ce710afa65e1b2ead555b52c4807f34ff7 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jkbs@redhat.com>
Date: Wed, 26 Oct 2016 11:21:14 +0200
Subject: ipv6: Don't use ufo handling on later transformed packets

Similar to commit c146066ab802 ("ipv4: Don't use ufo handling on later
transformed packets"), don't perform UFO on packets that will be IPsec
transformed. To detect it we rely on the fact that headerlen in
dst_entry is non-zero only for transformation bundles (xfrm_dst
objects).

Unwanted segmentation can be observed with a NETIF_F_UFO capable device,
such as a dummy device:

  DEV=dum0 LEN=1493

  ip li add $DEV type dummy
  ip addr add fc00::1/64 dev $DEV nodad
  ip link set $DEV up
  ip xfrm policy add dir out src fc00::1 dst fc00::2 \
     tmpl src fc00::1 dst fc00::2 proto esp spi 1
  ip xfrm state add src fc00::1 dst fc00::2 \
     proto esp spi 1 enc 'aes' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b

  tcpdump -n -nn -i $DEV -t &
  socat /dev/zero,readbytes=$LEN udp6:[fc00::2]:$LEN

tcpdump output before:

  IP6 fc00::1 > fc00::2: frag (0|1448) ESP(spi=0x00000001,seq=0x1), length 1448
  IP6 fc00::1 > fc00::2: frag (1448|48)
  IP6 fc00::1 > fc00::2: ESP(spi=0x00000001,seq=0x2), length 88

... and after:

  IP6 fc00::1 > fc00::2: frag (0|1448) ESP(spi=0x00000001,seq=0x1), length 1448
  IP6 fc00::1 > fc00::2: frag (1448|80)

Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach")

Signed-off-by: Jakub Sitnicki <jkbs@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6001e781164e..59eb4ed99ce8 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1366,7 +1366,7 @@ emsgsize:
 	if (((length > mtu) ||
 	     (skb && skb_is_gso(skb))) &&
 	    (sk->sk_protocol == IPPROTO_UDP) &&
-	    (rt->dst.dev->features & NETIF_F_UFO) &&
+	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
 	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
 					  hh_len, fragheaderlen, exthdrlen,
-- 
cgit v1.2.3


From 19bda36c4299ce3d7e5bce10bebe01764a655a6d Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 28 Oct 2016 18:18:01 +0800
Subject: ipv6: add mtu lock check in __ip6_rt_update_pmtu

Prior to this patch, ipv6 didn't do mtu lock check in ip6_update_pmtu.
It leaded to that mtu lock doesn't really work when receiving the pkt
of ICMPV6_PKT_TOOBIG.

This patch is to add mtu lock check in __ip6_rt_update_pmtu just as ipv4
did in __ip_rt_update_pmtu.

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 947ed1ded026..7403d90dcb38 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1364,6 +1364,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 	if (rt6->rt6i_flags & RTF_LOCAL)
 		return;
 
+	if (dst_metric_locked(dst, RTAX_MTU))
+		return;
+
 	dst_confirm(dst);
 	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
 	if (mtu >= dst_mtu(dst))
-- 
cgit v1.2.3


From 4fd19c15decedd06d707e2691c24fce08700e2b1 Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Tue, 1 Nov 2016 23:45:13 +0800
Subject: ip6_udp_tunnel: remove unused IPCB related codes

Some IPCB fields are currently set in udp_tunnel6_xmit_skb(), which are
never used before it reaches ip6tunnel_xmit(), and past that point the
control buffer is no longer interpreted as IPCB.

This clears these unused IPCB related codes. Currently there is no skb
scrubbing in ip6_udp_tunnel, otherwise IPCB(skb)->opt might need to be
cleared for IPv4 packets, as shown in 5146d1f1511
("tunnel: Clear IPCB(skb)->opt before dst_link_failure called").

Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_udp_tunnel.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index a7520528ecd2..b283f293ee4a 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -88,9 +88,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 
 	uh->len = htons(skb->len);
 
-	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
-			    | IPSKB_REROUTED);
 	skb_dst_set(skb, dst);
 
 	udp6_set_csum(nocheck, skb, saddr, daddr, skb->len);
-- 
cgit v1.2.3


From 5d41ce29e3b91ef305f88d23f72b3359de329cec Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 3 Nov 2016 16:17:26 -0700
Subject: net: icmp6_send should use dst dev to determine L3 domain

icmp6_send is called in response to some event. The skb may not have
the device set (skb->dev is NULL), but it is expected to have a dst set.
Update icmp6_send to use the dst on the skb to determine L3 domain.

Fixes: ca254490c8dfd ("net: Add VRF support to IPv6 stack")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index bd59c343d35f..7370ad2e693a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -448,7 +448,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	if (__ipv6_addr_needs_scope_id(addr_type))
 		iif = skb->dev->ifindex;
 	else
-		iif = l3mdev_master_ifindex(skb->dev);
+		iif = l3mdev_master_ifindex(skb_dst(skb)->dev);
 
 	/*
 	 *	Must not send error if the source does not uniquely
-- 
cgit v1.2.3


From fb56be83e43d0bb0cc9e8c35a6a9cac853231ba2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= <maze@google.com>
Date: Fri, 4 Nov 2016 14:51:54 -0700
Subject: net-ipv6: on device mtu change do not add mtu to mtu-less routes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Routes can specify an mtu explicitly or inherit the mtu from
the underlying device - this inheritance is implemented in
dst->ops->mtu handlers ip6_mtu() and ip6_blackhole_mtu().

Currently changing the mtu of a device adds mtu explicitly
to routes using that device.

ie.
  # ip link set dev lo mtu 65536
  # ip -6 route add local 2000::1 dev lo
  # ip -6 route get 2000::1
  local 2000::1 dev lo  table local  src ...  metric 1024  pref medium

  # ip link set dev lo mtu 65535
  # ip -6 route get 2000::1
  local 2000::1 dev lo  table local  src ...  metric 1024  mtu 65535 pref medium

  # ip link set dev lo mtu 65536
  # ip -6 route get 2000::1
  local 2000::1 dev lo  table local  src ...  metric 1024  mtu 65536 pref medium

  # ip -6 route del local 2000::1

After this patch the route entry no longer changes unless it already has an mtu.
There is no need: this inheritance is already done in ip6_mtu()

  # ip link set dev lo mtu 65536
  # ip -6 route add local 2000::1 dev lo
  # ip -6 route add local 2000::2 dev lo mtu 2000
  # ip -6 route get 2000::1; ip -6 route get 2000::2
  local 2000::1 dev lo  table local  src ...  metric 1024  pref medium
  local 2000::2 dev lo  table local  src ...  metric 1024  mtu 2000 pref medium

  # ip link set dev lo mtu 65535
  # ip -6 route get 2000::1; ip -6 route get 2000::2
  local 2000::1 dev lo  table local  src ...  metric 1024  pref medium
  local 2000::2 dev lo  table local  src ...  metric 1024  mtu 2000 pref medium

  # ip link set dev lo mtu 1501
  # ip -6 route get 2000::1; ip -6 route get 2000::2
  local 2000::1 dev lo  table local  src ...  metric 1024  pref medium
  local 2000::2 dev lo  table local  src ...  metric 1024  mtu 1501 pref medium

  # ip link set dev lo mtu 65536
  # ip -6 route get 2000::1; ip -6 route get 2000::2
  local 2000::1 dev lo  table local  src ...  metric 1024  pref medium
  local 2000::2 dev lo  table local  src ...  metric 1024  mtu 65536 pref medium

  # ip -6 route del local 2000::1
  # ip -6 route del local 2000::2

This is desirable because changing device mtu and then resetting it
to the previous value shouldn't change the user visible routing table.

Signed-off-by: Maciej Żenczykowski <maze@google.com>
CC: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7403d90dcb38..1b57e11e6e0d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2761,6 +2761,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
 	   PMTU discouvery.
 	 */
 	if (rt->dst.dev == arg->dev &&
+	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
 	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
 		if (rt->rt6i_flags & RTF_CACHE) {
 			/* For RTF_CACHE with rt6i_pmtu == 0
-- 
cgit v1.2.3


From 9b6c14d51bd2304b92f842e96172a9cc822fc77c Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 9 Nov 2016 09:07:26 -0800
Subject: net: tcp response should set oif only if it is L3 master

Lorenzo noted an Android unit test failed due to e0d56fdd7342:
"The expectation in the test was that the RST replying to a SYN sent to a
closed port should be generated with oif=0. In other words it should not
prefer the interface where the SYN came in on, but instead should follow
whatever the routing table says it should do."

Revert the change to ip_send_unicast_reply and tcp_v6_send_response such
that the oif in the flow is set to the skb_iif only if skb_iif is an L3
master.

Fixes: e0d56fdd7342 ("net: l3mdev: remove redundant calls")
Reported-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Tested-by: Lorenzo Colitti <lorenzo@google.com>
Acked-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5a27ab4eab39..6ca23c2e76f7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -818,8 +818,12 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	fl6.flowi6_proto = IPPROTO_TCP;
 	if (rt6_need_strict(&fl6.daddr) && !oif)
 		fl6.flowi6_oif = tcp_v6_iif(skb);
-	else
-		fl6.flowi6_oif = oif ? : skb->skb_iif;
+	else {
+		if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
+			oif = skb->skb_iif;
+
+		fl6.flowi6_oif = oif;
+	}
 
 	fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
 	fl6.fl6_dport = t1->dest;
-- 
cgit v1.2.3


From ac6e780070e30e4c35bd395acfe9191e6268bdd3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 10 Nov 2016 13:12:35 -0800
Subject: tcp: take care of truncations done by sk_filter()

With syzkaller help, Marco Grassi found a bug in TCP stack,
crashing in tcp_collapse()

Root cause is that sk_filter() can truncate the incoming skb,
but TCP stack was not really expecting this to happen.
It probably was expecting a simple DROP or ACCEPT behavior.

We first need to make sure no part of TCP header could be removed.
Then we need to adjust TCP_SKB_CB(skb)->end_seq

Many thanks to syzkaller team and Marco for giving us a reproducer.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Marco Grassi <marco.gra@gmail.com>
Reported-by: Vladis Dronov <vdronov@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6ca23c2e76f7..b9f1fee9a886 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1229,7 +1229,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_do_rcv(sk, skb);
 
-	if (sk_filter(sk, skb))
+	if (tcp_filter(sk, skb))
 		goto discard;
 
 	/*
@@ -1457,8 +1457,10 @@ process:
 	if (tcp_v6_inbound_md5_hash(sk, skb))
 		goto discard_and_relse;
 
-	if (sk_filter(sk, skb))
+	if (tcp_filter(sk, skb))
 		goto discard_and_relse;
+	th = (const struct tcphdr *)skb->data;
+	hdr = ipv6_hdr(skb);
 
 	skb->dev = NULL;
 
-- 
cgit v1.2.3


From 73e2d5e34b6cdd1080038daf3d6d6d744a9eefe6 Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Mon, 14 Nov 2016 23:40:30 +0100
Subject: udp: restore UDPlite many-cast delivery

Honor udptable parameter that is passed to __udp*_lib_mcast_deliver(),
otherwise udplite broadcast/multicast use the wrong table and it breaks.

Fixes: 2dc41cff7545 ("udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b2ef061e6836..e5056d4873d1 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -706,10 +706,10 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 
 	if (use_hash2) {
 		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
-			    udp_table.mask;
-		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+			    udptable->mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udptable->mask;
 start_lookup:
-		hslot = &udp_table.hash2[hash2];
+		hslot = &udptable->hash2[hash2];
 		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
 	}
 
-- 
cgit v1.2.3


From b5c2d49544e5930c96e2632a7eece3f4325a1888 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 16 Nov 2016 16:26:46 +0100
Subject: ip6_tunnel: disable caching when the traffic class is inherited

If an ip6 tunnel is configured to inherit the traffic class from
the inner header, the dst_cache must be disabled or it will foul
the policy routing.

The issue is apprently there since at leat Linux-2.6.12-rc2.

Reported-by: Liam McBirnie <liam.mcbirnie@boeing.com>
Cc: Liam McBirnie <liam.mcbirnie@boeing.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 87784560dc46..0a4759b89da2 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1034,6 +1034,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 	int mtu;
 	unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
 	unsigned int max_headroom = psh_hlen;
+	bool use_cache = false;
 	u8 hop_limit;
 	int err = -1;
 
@@ -1066,7 +1067,15 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
 
 		memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
 		neigh_release(neigh);
-	} else if (!fl6->flowi6_mark)
+	} else if (!(t->parms.flags &
+		     (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) {
+		/* enable the cache only only if the routing decision does
+		 * not depend on the current inner header value
+		 */
+		use_cache = true;
+	}
+
+	if (use_cache)
 		dst = dst_cache_get(&t->dst_cache);
 
 	if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
@@ -1150,7 +1159,7 @@ route_lookup:
 		if (t->encap.type != TUNNEL_ENCAP_NONE)
 			goto tx_err_dst_release;
 	} else {
-		if (!fl6->flowi6_mark && ndst)
+		if (use_cache && ndst)
 			dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
 	}
 	skb_dst_set(skb, dst);
-- 
cgit v1.2.3


From 00b4422fe363cc7cadc51c50c5a0c3c510f0fa14 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 9 Nov 2016 10:25:05 -0800
Subject: netfilter: Update nf_send_reset6 to consider L3 domain

nf_send_reset6 is not considering the L3 domain and lookups are sent
to the wrong table. For example consider the following output rule:

ip6tables -A OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset

using perf to analyze lookups via the fib6_table_lookup tracepoint shows:

swapper     0 [001]   248.787816: fib6:fib6_table_lookup: table 255 oif 0 iif 1 src 2100:1::3 dst 2100:1:
        ffffffff81439cdc perf_trace_fib6_table_lookup ([kernel.kallsyms])
        ffffffff814c1ce3 trace_fib6_table_lookup ([kernel.kallsyms])
        ffffffff814c3e89 ip6_pol_route ([kernel.kallsyms])
        ffffffff814c40d5 ip6_pol_route_output ([kernel.kallsyms])
        ffffffff814e7b6f fib6_rule_action ([kernel.kallsyms])
        ffffffff81437f60 fib_rules_lookup ([kernel.kallsyms])
        ffffffff814e7c79 fib6_rule_lookup ([kernel.kallsyms])
        ffffffff814c2541 ip6_route_output_flags ([kernel.kallsyms])
                     528 nf_send_reset6 ([nf_reject_ipv6])

The lookup is directed to table 255 rather than the table associated with
the device via the L3 domain. Update nf_send_reset6 to pull the L3 domain
from the dst currently attached to the skb.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_reject_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index a5400223fd74..10090400c72f 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -156,6 +156,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
 	fl6.daddr = oip6h->saddr;
 	fl6.fl6_sport = otcph->dest;
 	fl6.fl6_dport = otcph->source;
+	fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
 	security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
 	dst = ip6_route_output(net, NULL, &fl6);
 	if (dst->error) {
-- 
cgit v1.2.3


From 764d3be6e415b40056834bfd29b994dc3f837606 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 22 Nov 2016 16:57:40 +0100
Subject: ipv6: bump genid when the IFA_F_TENTATIVE flag is clear

When an ipv6 address has the tentative flag set, it can't be
used as source for egress traffic, while the associated route,
if any, can be looked up and even stored into some dst_cache.

In the latter scenario, the source ipv6 address selected and
stored in the cache is most probably wrong (e.g. with
link-local scope) and the entity using the dst_cache will
experience lack of ipv6 connectivity until said cache is
cleared or invalidated.

Overall this may cause lack of connectivity over most IPv6 tunnels
(comprising geneve and vxlan), if the first egress packet reaches
the tunnel before the DaD is completed for the used ipv6
address.

This patch bumps a new genid after that the IFA_F_TENTATIVE flag
is cleared, so that dst_cache will be invalidated on
next lookup and ipv6 connectivity restored.

Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device")
Fixes: 468dfffcd762 ("geneve: add dst caching support")
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 060dd9922018..4bc5ba3ae452 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -183,7 +183,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
 
 static void addrconf_dad_start(struct inet6_ifaddr *ifp);
 static void addrconf_dad_work(struct work_struct *w);
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id);
 static void addrconf_dad_run(struct inet6_dev *idev);
 static void addrconf_rs_timer(unsigned long data);
 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
@@ -2898,6 +2898,7 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		spin_lock_bh(&ifp->lock);
 		ifp->flags &= ~IFA_F_TENTATIVE;
 		spin_unlock_bh(&ifp->lock);
+		rt_genid_bump_ipv6(dev_net(idev->dev));
 		ipv6_ifa_notify(RTM_NEWADDR, ifp);
 		in6_ifa_put(ifp);
 	}
@@ -3740,7 +3741,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 {
 	struct inet6_dev *idev = ifp->idev;
 	struct net_device *dev = idev->dev;
-	bool notify = false;
+	bool bump_id, notify = false;
 
 	addrconf_join_solict(dev, &ifp->addr);
 
@@ -3755,11 +3756,12 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	    idev->cnf.accept_dad < 1 ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
 	    ifp->flags & IFA_F_NODAD) {
+		bump_id = ifp->flags & IFA_F_TENTATIVE;
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
 		spin_unlock(&ifp->lock);
 		read_unlock_bh(&idev->lock);
 
-		addrconf_dad_completed(ifp);
+		addrconf_dad_completed(ifp, bump_id);
 		return;
 	}
 
@@ -3819,8 +3821,8 @@ static void addrconf_dad_work(struct work_struct *w)
 						struct inet6_ifaddr,
 						dad_work);
 	struct inet6_dev *idev = ifp->idev;
+	bool bump_id, disable_ipv6 = false;
 	struct in6_addr mcaddr;
-	bool disable_ipv6 = false;
 
 	enum {
 		DAD_PROCESS,
@@ -3890,11 +3892,12 @@ static void addrconf_dad_work(struct work_struct *w)
 		 * DAD was successful
 		 */
 
+		bump_id = ifp->flags & IFA_F_TENTATIVE;
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
 		spin_unlock(&ifp->lock);
 		write_unlock_bh(&idev->lock);
 
-		addrconf_dad_completed(ifp);
+		addrconf_dad_completed(ifp, bump_id);
 
 		goto out;
 	}
@@ -3931,7 +3934,7 @@ static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
 	return true;
 }
 
-static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id)
 {
 	struct net_device *dev = ifp->idev->dev;
 	struct in6_addr lladdr;
@@ -3983,6 +3986,9 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
 		spin_unlock(&ifp->lock);
 		write_unlock_bh(&ifp->idev->lock);
 	}
+
+	if (bump_id)
+		rt_genid_bump_ipv6(dev_net(dev));
 }
 
 static void addrconf_dad_run(struct inet6_dev *idev)
-- 
cgit v1.2.3


From 30c7be26fd3587abcb69587f781098e3ca2d565b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 22 Nov 2016 09:06:45 -0800
Subject: udplite: call proper backlog handlers

In commits 93821778def10 ("udp: Fix rcv socket locking") and
f7ad74fef3af ("net/ipv6/udp: UDP encapsulation: break backlog_rcv into
__udpv6_queue_rcv_skb") UDP backlog handlers were renamed, but UDPlite
was forgotten.

This leads to crashes if UDPlite header is pulled twice, which happens
starting from commit e6afc8ace6dd ("udp: remove headers from UDP packets
before queueing")

Bug found by syzkaller team, thanks a lot guys !

Note that backlog use in UDP/UDPlite is scheduled to be removed starting
from linux-4.10, so this patch is only needed up to linux-4.9

Fixes: 93821778def1 ("udp: Fix rcv socket locking")
Fixes: f7ad74fef3af ("net/ipv6/udp: UDP encapsulation: break backlog_rcv into __udpv6_queue_rcv_skb")
Fixes: e6afc8ace6dd ("udp: remove headers from UDP packets before queueing")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c      | 2 +-
 net/ipv6/udp_impl.h | 2 +-
 net/ipv6/udplite.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e5056d4873d1..e4a8000d59ad 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -514,7 +514,7 @@ out:
 	return;
 }
 
-static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
 	int rc;
 
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index f6eb1ab34f4b..e78bdc76dcc3 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -26,7 +26,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
 int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
 int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		  int flags, int *addr_len);
-int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 void udpv6_destroy_sock(struct sock *sk);
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 47d0d2b87106..2f5101a12283 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -45,7 +45,7 @@ struct proto udplitev6_prot = {
 	.getsockopt	   = udpv6_getsockopt,
 	.sendmsg	   = udpv6_sendmsg,
 	.recvmsg	   = udpv6_recvmsg,
-	.backlog_rcv	   = udpv6_queue_rcv_skb,
+	.backlog_rcv	   = __udpv6_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
-- 
cgit v1.2.3


From 79dc7e3f1cd323be4c81aa1a94faa1b3ed987fb2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 27 Nov 2016 18:52:53 -0800
Subject: net: handle no dst on skb in icmp6_send

Andrey reported the following while fuzzing the kernel with syzkaller:

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN
Modules linked in:
CPU: 0 PID: 3859 Comm: a.out Not tainted 4.9.0-rc6+ #429
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: ffff8800666d4200 task.stack: ffff880067348000
RIP: 0010:[<ffffffff833617ec>]  [<ffffffff833617ec>]
icmp6_send+0x5fc/0x1e30 net/ipv6/icmp.c:451
RSP: 0018:ffff88006734f2c0  EFLAGS: 00010206
RAX: ffff8800666d4200 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000018
RBP: ffff88006734f630 R08: ffff880064138418 R09: 0000000000000003
R10: dffffc0000000000 R11: 0000000000000005 R12: 0000000000000000
R13: ffffffff84e7e200 R14: ffff880064138484 R15: ffff8800641383c0
FS:  00007fb3887a07c0(0000) GS:ffff88006cc00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000000 CR3: 000000006b040000 CR4: 00000000000006f0
Stack:
 ffff8800666d4200 ffff8800666d49f8 ffff8800666d4200 ffffffff84c02460
 ffff8800666d4a1a 1ffff1000ccdaa2f ffff88006734f498 0000000000000046
 ffff88006734f440 ffffffff832f4269 ffff880064ba7456 0000000000000000
Call Trace:
 [<ffffffff83364ddc>] icmpv6_param_prob+0x2c/0x40 net/ipv6/icmp.c:557
 [<     inline     >] ip6_tlvopt_unknown net/ipv6/exthdrs.c:88
 [<ffffffff83394405>] ip6_parse_tlv+0x555/0x670 net/ipv6/exthdrs.c:157
 [<ffffffff8339a759>] ipv6_parse_hopopts+0x199/0x460 net/ipv6/exthdrs.c:663
 [<ffffffff832ee773>] ipv6_rcv+0xfa3/0x1dc0 net/ipv6/ip6_input.c:191
 ...

icmp6_send / icmpv6_send is invoked for both rx and tx paths. In both
cases the dst->dev should be preferred for determining the L3 domain
if the dst has been set on the skb. Fallback to the skb->dev if it has
not. This covers the case reported here where icmp6_send is invoked on
Rx before the route lookup.

Fixes: 5d41ce29e ("net: icmp6_send should use dst dev to determine L3 domain")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7370ad2e693a..2772004ba5a1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -447,8 +447,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 
 	if (__ipv6_addr_needs_scope_id(addr_type))
 		iif = skb->dev->ifindex;
-	else
-		iif = l3mdev_master_ifindex(skb_dst(skb)->dev);
+	else {
+		dst = skb_dst(skb);
+		iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+	}
 
 	/*
 	 *	Must not send error if the source does not uniquely
-- 
cgit v1.2.3


From 9b57da0630c9fd36ed7a20fc0f98dc82cc0777fa Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 29 Nov 2016 02:17:34 +0100
Subject: netfilter: ipv6: nf_defrag: drop mangled skb on ream error

Dmitry Vyukov reported GPF in network stack that Andrey traced down to
negative nh offset in nf_ct_frag6_queue().

Problem is that all network headers before fragment header are pulled.
Normal ipv6 reassembly will drop the skb when errors occur further down
the line.

netfilter doesn't do this, and instead passed the original fragment
along.  That was also fine back when netfilter ipv6 defrag worked with
cloned fragments, as the original, pristine fragment was passed on.

So we either have to undo the pull op, or discard such fragments.
Since they're malformed after all (e.g. overlapping fragment) it seems
preferrable to just drop them.

Same for temporary errors -- it doesn't make sense to accept (and
perhaps forward!) only some fragments of same datagram.

Fixes: 029f7f3b8701cc7ac ("netfilter: ipv6: nf_defrag: avoid/free clone operations")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Debugged-by: Andrey Konovalov <andreyknvl@google.com>
Diagnosed-by: Eric Dumazet <Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c   | 4 ++--
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index e4347aeb2e65..9948b5ce52da 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -576,11 +576,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	/* Jumbo payload inhibits frag. header */
 	if (ipv6_hdr(skb)->payload_len == 0) {
 		pr_debug("payload len = 0\n");
-		return -EINVAL;
+		return 0;
 	}
 
 	if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
-		return -EINVAL;
+		return 0;
 
 	if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
 		return -ENOMEM;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index f7aab5ab93a5..f06b0471f39f 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -69,7 +69,7 @@ static unsigned int ipv6_defrag(void *priv,
 	if (err == -EINPROGRESS)
 		return NF_STOLEN;
 
-	return NF_ACCEPT;
+	return err == 0 ? NF_ACCEPT : NF_DROP;
 }
 
 static struct nf_hook_ops ipv6_defrag_ops[] = {
-- 
cgit v1.2.3


From a55e23864d381c5a4ef110df94b00b2fe121a70d Mon Sep 17 00:00:00 2001
From: Tobias Brunner <tobias@strongswan.org>
Date: Tue, 29 Nov 2016 17:05:25 +0100
Subject: esp6: Fix integrity verification when ESN are used

When handling inbound packets, the two halves of the sequence number
stored on the skb are already in network order.

Fixes: 000ae7b2690e ("esp6: Switch to new AEAD interface")
Signed-off-by: Tobias Brunner <tobias@strongswan.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 060a60b2f8a6..111ba55fd512 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -418,7 +418,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		esph = (void *)skb_push(skb, 4);
 		*seqhi = esph->spi;
 		esph->spi = esph->seq_no;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
 	}
 
-- 
cgit v1.2.3


From 0382a25af3c771a8e4d5e417d1834cbe28c2aaac Mon Sep 17 00:00:00 2001
From: Guillaume Nault <g.nault@alphalink.fr>
Date: Tue, 29 Nov 2016 13:09:44 +0100
Subject: l2tp: lock socket before checking flags in connect()

Socket flags aren't updated atomically, so the socket must be locked
while reading the SOCK_ZAPPED flag.

This issue exists for both l2tp_ip and l2tp_ip6. For IPv6, this patch
also brings error handling for __ip6_datagram_connect() failures.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/datagram.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 37874e2f30ed..ccf40550c475 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -139,7 +139,8 @@ void ip6_datagram_release_cb(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
 
-static int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
+			   int addr_len)
 {
 	struct sockaddr_in6	*usin = (struct sockaddr_in6 *) uaddr;
 	struct inet_sock	*inet = inet_sk(sk);
@@ -252,6 +253,7 @@ ipv4_connected:
 out:
 	return err;
 }
+EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
 
 int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
-- 
cgit v1.2.3


From b4e479a96fc398ccf83bb1cffb4ffef8631beaf1 Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Thu, 1 Dec 2016 10:05:11 +0800
Subject: ipv6: Set skb->protocol properly for local output

When xfrm is applied to TSO/GSO packets, it follows this path:

    xfrm_output() -> xfrm_output_gso() -> skb_gso_segment()

where skb_gso_segment() relies on skb->protocol to function properly.

This patch sets skb->protocol to ETH_P_IPV6 before dst_output() is called,
fixing a bug where GSO packets sent through an ipip6 tunnel are dropped
when xfrm is involved.

Cc: stable@vger.kernel.org
Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/output_core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 7cca8ac66fe9..cd4252346a32 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -155,6 +155,8 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (unlikely(!skb))
 		return 0;
 
+	skb->protocol = htons(ETH_P_IPV6);
+
 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 		       net, sk, skb, NULL, skb_dst(skb)->dev,
 		       dst_output);
-- 
cgit v1.2.3


From 80d1106aeaf689ab5fdf33020c5fecd269b31c88 Mon Sep 17 00:00:00 2001
From: Eli Cooper <elicooper@gmx.com>
Date: Thu, 1 Dec 2016 10:05:12 +0800
Subject: Revert: "ip6_tunnel: Update skb->protocol to ETH_P_IPV6 in
 ip6_tnl_xmit()"

This reverts commit ae148b085876fa771d9ef2c05f85d4b4bf09ce0d
("ip6_tunnel: Update skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit()").

skb->protocol is now set in __ip_local_out() and __ip6_local_out() before
dst_output() is called. It is no longer necessary to do it for each tunnel.

Cc: stable@vger.kernel.org
Signed-off-by: Eli Cooper <elicooper@gmx.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 0a4759b89da2..d76674efe523 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1181,7 +1181,6 @@ route_lookup:
 	if (err)
 		return err;
 
-	skb->protocol = htons(ETH_P_IPV6);
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
-- 
cgit v1.2.3


From 6b6ebb6b01c873d0cfe3449e8a1219ee6e5fc022 Mon Sep 17 00:00:00 2001
From: Artem Savkov <asavkov@redhat.com>
Date: Thu, 1 Dec 2016 14:06:04 +0100
Subject: ip6_offload: check segs for NULL in ipv6_gso_segment.

segs needs to be checked for being NULL in ipv6_gso_segment() before calling
skb_shinfo(segs), otherwise kernel can run into a NULL-pointer dereference:

[   97.811262] BUG: unable to handle kernel NULL pointer dereference at 00000000000000cc
[   97.819112] IP: [<ffffffff816e52f9>] ipv6_gso_segment+0x119/0x2f0
[   97.825214] PGD 0 [   97.827047]
[   97.828540] Oops: 0000 [#1] SMP
[   97.831678] Modules linked in: vhost_net vhost macvtap macvlan nfsv3 rpcsec_gss_krb5
nfsv4 dns_resolver nfs fscache xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4
iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack
ipt_REJECT nf_reject_ipv4 tun ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter
bridge stp llc snd_hda_codec_realtek snd_hda_codec_hdmi snd_hda_codec_generic snd_hda_intel
snd_hda_codec edac_mce_amd snd_hda_core edac_core snd_hwdep kvm_amd snd_seq kvm snd_seq_device
snd_pcm irqbypass snd_timer ppdev parport_serial snd parport_pc k10temp pcspkr soundcore parport
sp5100_tco shpchp sg wmi i2c_piix4 acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc
ip_tables xfs libcrc32c sr_mod cdrom sd_mod ata_generic pata_acpi amdkfd amd_iommu_v2 radeon
broadcom bcm_phy_lib i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops
ttm ahci serio_raw tg3 firewire_ohci libahci pata_atiixp drm ptp libata firewire_core pps_core
i2c_core crc_itu_t fjes dm_mirror dm_region_hash dm_log dm_mod
[   97.927721] CPU: 1 PID: 3504 Comm: vhost-3495 Not tainted 4.9.0-7.el7.test.x86_64 #1
[   97.935457] Hardware name: AMD Snook/Snook, BIOS ESK0726A 07/26/2010
[   97.941806] task: ffff880129a1c080 task.stack: ffffc90001bcc000
[   97.947720] RIP: 0010:[<ffffffff816e52f9>]  [<ffffffff816e52f9>] ipv6_gso_segment+0x119/0x2f0
[   97.956251] RSP: 0018:ffff88012fc43a10  EFLAGS: 00010207
[   97.961557] RAX: 0000000000000000 RBX: ffff8801292c8700 RCX: 0000000000000594
[   97.968687] RDX: 0000000000000593 RSI: ffff880129a846c0 RDI: 0000000000240000
[   97.975814] RBP: ffff88012fc43a68 R08: ffff880129a8404e R09: 0000000000000000
[   97.982942] R10: 0000000000000000 R11: ffff880129a84076 R12: 00000020002949b3
[   97.990070] R13: ffff88012a580000 R14: 0000000000000000 R15: ffff88012a580000
[   97.997198] FS:  0000000000000000(0000) GS:ffff88012fc40000(0000) knlGS:0000000000000000
[   98.005280] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   98.011021] CR2: 00000000000000cc CR3: 0000000126c5d000 CR4: 00000000000006e0
[   98.018149] Stack:
[   98.020157]  00000000ffffffff ffff88012fc43ac8 ffffffffa017ad0a 000000000000000e
[   98.027584]  0000001300000000 0000000077d59998 ffff8801292c8700 00000020002949b3
[   98.035010]  ffff88012a580000 0000000000000000 ffff88012a580000 ffff88012fc43a98
[   98.042437] Call Trace:
[   98.044879]  <IRQ> [   98.046803]  [<ffffffffa017ad0a>] ? tg3_start_xmit+0x84a/0xd60 [tg3]
[   98.053156]  [<ffffffff815eeee0>] skb_mac_gso_segment+0xb0/0x130
[   98.059158]  [<ffffffff815eefd3>] __skb_gso_segment+0x73/0x110
[   98.064985]  [<ffffffff815ef40d>] validate_xmit_skb+0x12d/0x2b0
[   98.070899]  [<ffffffff815ef5d2>] validate_xmit_skb_list+0x42/0x70
[   98.077073]  [<ffffffff81618560>] sch_direct_xmit+0xd0/0x1b0
[   98.082726]  [<ffffffff815efd86>] __dev_queue_xmit+0x486/0x690
[   98.088554]  [<ffffffff8135c135>] ? cpumask_next_and+0x35/0x50
[   98.094380]  [<ffffffff815effa0>] dev_queue_xmit+0x10/0x20
[   98.099863]  [<ffffffffa09ce057>] br_dev_queue_push_xmit+0xa7/0x170 [bridge]
[   98.106907]  [<ffffffffa09ce161>] br_forward_finish+0x41/0xc0 [bridge]
[   98.113430]  [<ffffffff81627cf2>] ? nf_iterate+0x52/0x60
[   98.118735]  [<ffffffff81627d6b>] ? nf_hook_slow+0x6b/0xc0
[   98.124216]  [<ffffffffa09ce32c>] __br_forward+0x14c/0x1e0 [bridge]
[   98.130480]  [<ffffffffa09ce120>] ? br_dev_queue_push_xmit+0x170/0x170 [bridge]
[   98.137785]  [<ffffffffa09ce4bd>] br_forward+0x9d/0xb0 [bridge]
[   98.143701]  [<ffffffffa09cfbb7>] br_handle_frame_finish+0x267/0x560 [bridge]
[   98.150834]  [<ffffffffa09d0064>] br_handle_frame+0x174/0x2f0 [bridge]
[   98.157355]  [<ffffffff8102fb89>] ? sched_clock+0x9/0x10
[   98.162662]  [<ffffffff810b63b2>] ? sched_clock_cpu+0x72/0xa0
[   98.168403]  [<ffffffff815eccf5>] __netif_receive_skb_core+0x1e5/0xa20
[   98.174926]  [<ffffffff813659f9>] ? timerqueue_add+0x59/0xb0
[   98.180580]  [<ffffffff815ed548>] __netif_receive_skb+0x18/0x60
[   98.186494]  [<ffffffff815ee625>] process_backlog+0x95/0x140
[   98.192145]  [<ffffffff815edccd>] net_rx_action+0x16d/0x380
[   98.197713]  [<ffffffff8170cff1>] __do_softirq+0xd1/0x283
[   98.203106]  [<ffffffff8170b2bc>] do_softirq_own_stack+0x1c/0x30
[   98.209107]  <EOI> [   98.211029]  [<ffffffff8108a5c0>] do_softirq+0x50/0x60
[   98.216166]  [<ffffffff815ec853>] netif_rx_ni+0x33/0x80
[   98.221386]  [<ffffffffa09eeff7>] tun_get_user+0x487/0x7f0 [tun]
[   98.227388]  [<ffffffffa09ef3ab>] tun_sendmsg+0x4b/0x60 [tun]
[   98.233129]  [<ffffffffa0b68932>] handle_tx+0x282/0x540 [vhost_net]
[   98.239392]  [<ffffffffa0b68c25>] handle_tx_kick+0x15/0x20 [vhost_net]
[   98.245916]  [<ffffffffa0abacfe>] vhost_worker+0x9e/0xf0 [vhost]
[   98.251919]  [<ffffffffa0abac60>] ? vhost_umem_alloc+0x40/0x40 [vhost]
[   98.258440]  [<ffffffff81003a47>] ? do_syscall_64+0x67/0x180
[   98.264094]  [<ffffffff810a44d9>] kthread+0xd9/0xf0
[   98.268965]  [<ffffffff810a4400>] ? kthread_park+0x60/0x60
[   98.274444]  [<ffffffff8170a4d5>] ret_from_fork+0x25/0x30
[   98.279836] Code: 8b 93 d8 00 00 00 48 2b 93 d0 00 00 00 4c 89 e6 48 89 df 66 89 93 c2 00 00 00 ff 10 48 3d 00 f0 ff ff 49 89 c2 0f 87 52 01 00 00 <41> 8b 92 cc 00 00 00 48 8b 80 d0 00 00 00 44 0f b7 74 10 06 66
[   98.299425] RIP  [<ffffffff816e52f9>] ipv6_gso_segment+0x119/0x2f0
[   98.305612]  RSP <ffff88012fc43a10>
[   98.309094] CR2: 00000000000000cc
[   98.312406] ---[ end trace 726a2c7a2d2d78d0 ]---

Signed-off-by: Artem Savkov <asavkov@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 1fcf61f1cbc3..89c59e656f44 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -99,7 +99,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		segs = ops->callbacks.gso_segment(skb, features);
 	}
 
-	if (IS_ERR(segs))
+	if (IS_ERR_OR_NULL(segs))
 		goto out;
 
 	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
-- 
cgit v1.2.3