From d93c6258ee4255749c10012c50a31c08f4e9fb16 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 Jan 2016 11:16:43 +0100
Subject: netfilter: conntrack: resched in nf_ct_iterate_cleanup

Ulrich reports soft lockup with following (shortened) callchain:

NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s!
__netif_receive_skb_core+0x6e4/0x774
process_backlog+0x94/0x160
net_rx_action+0x88/0x178
call_do_softirq+0x24/0x3c
do_softirq+0x54/0x6c
__local_bh_enable_ip+0x7c/0xbc
nf_ct_iterate_cleanup+0x11c/0x22c [nf_conntrack]
masq_inet_event+0x20/0x30 [nf_nat_masquerade_ipv6]
atomic_notifier_call_chain+0x1c/0x2c
ipv6_del_addr+0x1bc/0x220 [ipv6]

Problem is that nf_ct_iterate_cleanup can run for a very long time
since it can be interrupted by softirq processing.
Moreover, atomic_notifier_call_chain runs with rcu readlock held.

So lets call cond_resched() in nf_ct_iterate_cleanup and defer
the call to a work queue for the atomic_notifier_call_chain case.

We also need another cond_resched in get_next_corpse, since we
have to deal with iter() always returning false, in that case
get_next_corpse will walk entire conntrack table.

Reported-by: Ulrich Weber <uw@ocedo.com>
Tested-by: Ulrich Weber <uw@ocedo.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c | 74 +++++++++++++++++++++++++++--
 net/netfilter/nf_conntrack_core.c           |  5 ++
 2 files changed, 76 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 31ba7ca19757..051b6a6bfff6 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -21,6 +21,10 @@
 #include <net/ipv6.h>
 #include <net/netfilter/ipv6/nf_nat_masquerade.h>
 
+#define MAX_WORK_COUNT	16
+
+static atomic_t v6_worker_count;
+
 unsigned int
 nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
 		       const struct net_device *out)
@@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = {
 	.notifier_call	= masq_device_event,
 };
 
+struct masq_dev_work {
+	struct work_struct work;
+	struct net *net;
+	int ifindex;
+};
+
+static void iterate_cleanup_work(struct work_struct *work)
+{
+	struct masq_dev_work *w;
+	long index;
+
+	w = container_of(work, struct masq_dev_work, work);
+
+	index = w->ifindex;
+	nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0);
+
+	put_net(w->net);
+	kfree(w);
+	atomic_dec(&v6_worker_count);
+	module_put(THIS_MODULE);
+}
+
+/* ipv6 inet notifier is an atomic notifier, i.e. we cannot
+ * schedule.
+ *
+ * Unfortunately, nf_ct_iterate_cleanup can run for a long
+ * time if there are lots of conntracks and the system
+ * handles high softirq load, so it frequently calls cond_resched
+ * while iterating the conntrack table.
+ *
+ * So we defer nf_ct_iterate_cleanup walk to the system workqueue.
+ *
+ * As we can have 'a lot' of inet_events (depending on amount
+ * of ipv6 addresses being deleted), we also need to add an upper
+ * limit to the number of queued work items.
+ */
 static int masq_inet_event(struct notifier_block *this,
 			   unsigned long event, void *ptr)
 {
 	struct inet6_ifaddr *ifa = ptr;
-	struct netdev_notifier_info info;
+	const struct net_device *dev;
+	struct masq_dev_work *w;
+	struct net *net;
+
+	if (event != NETDEV_DOWN ||
+	    atomic_read(&v6_worker_count) >= MAX_WORK_COUNT)
+		return NOTIFY_DONE;
+
+	dev = ifa->idev->dev;
+	net = maybe_get_net(dev_net(dev));
+	if (!net)
+		return NOTIFY_DONE;
 
-	netdev_notifier_info_init(&info, ifa->idev->dev);
-	return masq_device_event(this, event, &info);
+	if (!try_module_get(THIS_MODULE))
+		goto err_module;
+
+	w = kmalloc(sizeof(*w), GFP_ATOMIC);
+	if (w) {
+		atomic_inc(&v6_worker_count);
+
+		INIT_WORK(&w->work, iterate_cleanup_work);
+		w->ifindex = dev->ifindex;
+		w->net = net;
+		schedule_work(&w->work);
+
+		return NOTIFY_DONE;
+	}
+
+	module_put(THIS_MODULE);
+ err_module:
+	put_net(net);
+	return NOTIFY_DONE;
 }
 
 static struct notifier_block masq_inet_notifier = {
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 58882de06bd7..f60b4fdeeb8c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1412,6 +1412,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 		}
 		spin_unlock(lockp);
 		local_bh_enable();
+		cond_resched();
 	}
 
 	for_each_possible_cpu(cpu) {
@@ -1424,6 +1425,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 				set_bit(IPS_DYING_BIT, &ct->status);
 		}
 		spin_unlock_bh(&pcpu->lock);
+		cond_resched();
 	}
 	return NULL;
 found:
@@ -1440,6 +1442,8 @@ void nf_ct_iterate_cleanup(struct net *net,
 	struct nf_conn *ct;
 	unsigned int bucket = 0;
 
+	might_sleep();
+
 	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
 		if (del_timer(&ct->timeout))
@@ -1448,6 +1452,7 @@ void nf_ct_iterate_cleanup(struct net *net,
 		/* ... else the timer will get him soon. */
 
 		nf_ct_put(ct);
+		cond_resched();
 	}
 }
 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
-- 
cgit v1.2.3


From 7c7bdf35991bb8f7cfaeaf22ea3a2f2d1967c166 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 24 Jan 2016 23:08:39 +0100
Subject: netfilter: nfnetlink: use original skbuff when acking batches

Since bd678e09dc17 ("netfilter: nfnetlink: fix splat due to incorrect
socket memory accounting in skbuff clones"), we don't manually attach
the sk to the skbuff clone anymore, so we have to use the original
skbuff from netlink_ack() which needs to access the sk pointer.

Fixes: bd678e09dc17 ("netfilter: nfnetlink: fix splat due to incorrect socket memory accounting in skbuff clones")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index a7ba23353dab..62e92af2384a 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -311,14 +311,14 @@ replay:
 #endif
 		{
 			nfnl_unlock(subsys_id);
-			netlink_ack(skb, nlh, -EOPNOTSUPP);
+			netlink_ack(oskb, nlh, -EOPNOTSUPP);
 			return kfree_skb(skb);
 		}
 	}
 
 	if (!ss->commit || !ss->abort) {
 		nfnl_unlock(subsys_id);
-		netlink_ack(skb, nlh, -EOPNOTSUPP);
+		netlink_ack(oskb, nlh, -EOPNOTSUPP);
 		return kfree_skb(skb);
 	}
 
@@ -406,7 +406,7 @@ ack:
 				 * pointing to the batch header.
 				 */
 				nfnl_err_reset(&err_list);
-				netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM);
+				netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM);
 				status |= NFNL_BATCH_FAILURE;
 				goto done;
 			}
-- 
cgit v1.2.3


From 53c520c2ab79e9f3765d24116ab54f6d5b3cd563 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 28 Jan 2016 13:16:59 +0100
Subject: netfilter: cttimeout: fix deadlock due to erroneous unlock/lock
 conversion

The spin_unlock call should have been left as-is, revert.

Fixes: b16c29191dc89bd ("netfilter: nf_conntrack: use safer way to lock all buckets")
Reported-by: kernel test robot <fengguang.wu@intel.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_cttimeout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 94837d236ab0..2671b9deb103 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -312,7 +312,7 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
 			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
 				untimeout(h, timeout);
 		}
-		nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
 	}
 	local_bh_enable();
 }
-- 
cgit v1.2.3


From c58d6c93680f28ac58984af61d0a7ebf4319c241 Mon Sep 17 00:00:00 2001
From: Phil Turnbull <phil.turnbull@oracle.com>
Date: Tue, 2 Feb 2016 13:36:45 -0500
Subject: netfilter: nfnetlink: correctly validate length of batch messages

If nlh->nlmsg_len is zero then an infinite loop is triggered because
'skb_pull(skb, msglen);' pulls zero bytes.

The calculation in nlmsg_len() underflows if 'nlh->nlmsg_len <
NLMSG_HDRLEN' which bypasses the length validation and will later
trigger an out-of-bound read.

If the length validation does fail then the malformed batch message is
copied back to userspace. However, we cannot do this because the
nlh->nlmsg_len can be invalid. This leads to an out-of-bounds read in
netlink_ack:

    [   41.455421] ==================================================================
    [   41.456431] BUG: KASAN: slab-out-of-bounds in memcpy+0x1d/0x40 at addr ffff880119e79340
    [   41.456431] Read of size 4294967280 by task a.out/987
    [   41.456431] =============================================================================
    [   41.456431] BUG kmalloc-512 (Not tainted): kasan: bad access detected
    [   41.456431] -----------------------------------------------------------------------------
    ...
    [   41.456431] Bytes b4 ffff880119e79310: 00 00 00 00 d5 03 00 00 b0 fb fe ff 00 00 00 00  ................
    [   41.456431] Object ffff880119e79320: 20 00 00 00 10 00 05 00 00 00 00 00 00 00 00 00   ...............
    [   41.456431] Object ffff880119e79330: 14 00 0a 00 01 03 fc 40 45 56 11 22 33 10 00 05  .......@EV."3...
    [   41.456431] Object ffff880119e79340: f0 ff ff ff 88 99 aa bb 00 14 00 0a 00 06 fe fb  ................
                                            ^^ start of batch nlmsg with
                                               nlmsg_len=4294967280
    ...
    [   41.456431] Memory state around the buggy address:
    [   41.456431]  ffff880119e79400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    [   41.456431]  ffff880119e79480: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    [   41.456431] >ffff880119e79500: 00 00 00 00 fc fc fc fc fc fc fc fc fc fc fc fc
    [   41.456431]                                ^
    [   41.456431]  ffff880119e79580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
    [   41.456431]  ffff880119e79600: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb
    [   41.456431] ==================================================================

Fix this with better validation of nlh->nlmsg_len and by setting
NFNL_BATCH_FAILURE if any batch message fails length validation.

CAP_NET_ADMIN is required to trigger the bugs.

Fixes: 9ea2aa8b7dba ("netfilter: nfnetlink: validate nfnetlink header from batch")
Signed-off-by: Phil Turnbull <phil.turnbull@oracle.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 62e92af2384a..857ae89633af 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -328,10 +328,12 @@ replay:
 		nlh = nlmsg_hdr(skb);
 		err = 0;
 
-		if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
-		    skb->len < nlh->nlmsg_len) {
-			err = -EINVAL;
-			goto ack;
+		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+		    skb->len < nlh->nlmsg_len ||
+		    nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+			nfnl_err_reset(&err_list);
+			status |= NFNL_BATCH_FAILURE;
+			goto done;
 		}
 
 		/* Only requests are handled by the kernel */
-- 
cgit v1.2.3


From 08a7f5d3f5c38ed745c3e99ee91975f20562d272 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 5 Feb 2016 10:20:21 +0100
Subject: netfilter: tee: select NF_DUP_IPV6 unconditionally

The NETFILTER_XT_TARGET_TEE option selects NF_DUP_IPV6 whenever
IP6_NF_IPTABLES is enabled, and it ensures that it cannot be
builtin itself if NF_CONNTRACK is a loadable module, as that
is a dependency for NF_DUP_IPV6.

However, NF_DUP_IPV6 can be enabled even if IP6_NF_IPTABLES is
turned off, and it only really depends on IPV6. With the current
check in tee_tg6, we call nf_dup_ipv6() whenever NF_DUP_IPV6
is enabled. This can however be a loadable module which is
unreachable from a built-in xt_TEE:

net/built-in.o: In function `tee_tg6':
:(.text+0x67728): undefined reference to `nf_dup_ipv6'

The bug was originally introduced in the split of the xt_TEE module
into separate modules for ipv4 and ipv6, and two patches tried
to fix it unsuccessfully afterwards.

This is a revert of the the first incorrect attempt to fix it,
going back to depending on IPV6 as the dependency, and we
adapt the 'select' condition accordingly.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6")
Fixes: 116984a316c3 ("netfilter: xt_TEE: use IS_ENABLED(CONFIG_NF_DUP_IPV6)")
Fixes: 74ec4d55c4d2 ("netfilter: fix xt_TEE and xt_TPROXY dependencies")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig  | 2 +-
 net/netfilter/xt_TEE.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8c067e6663a1..95e757c377f9 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -891,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE
 	depends on IPV6 || IPV6=n
 	depends on !NF_CONNTRACK || NF_CONNTRACK
 	select NF_DUP_IPV4
-	select NF_DUP_IPV6 if IP6_NF_IPTABLES != n
+	select NF_DUP_IPV6 if IPV6
 	---help---
 	This option adds a "TEE" target with which a packet can be cloned and
 	this clone be rerouted to another nexthop.
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 3eff7b67cdf2..6e57a3966dc5 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 static unsigned int
 tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -131,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
 	},
-#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 	{
 		.name       = "TEE",
 		.revision   = 1,
-- 
cgit v1.2.3


From 5cc6ce9ff27565949a1001a2889a8dd9fd09e772 Mon Sep 17 00:00:00 2001
From: Anton Protopopov <a.s.protopopov@gmail.com>
Date: Sat, 6 Feb 2016 23:31:19 -0500
Subject: netfilter: nft_counter: fix erroneous return values

The nft_counter_init() and nft_counter_clone() functions should return
negative error value -ENOMEM instead of positive ENOMEM.

Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index c7808fc19719..c9743f78f219 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -100,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx,
 
 	cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
 	if (cpu_stats == NULL)
-		return ENOMEM;
+		return -ENOMEM;
 
 	preempt_disable();
 	this_cpu = this_cpu_ptr(cpu_stats);
@@ -138,7 +138,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
 	cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
 					      GFP_ATOMIC);
 	if (cpu_stats == NULL)
-		return ENOMEM;
+		return -ENOMEM;
 
 	preempt_disable();
 	this_cpu = this_cpu_ptr(cpu_stats);
-- 
cgit v1.2.3