From 52bd2d62ce6758d811edcbd2256eb9ea7f6a56cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:30:50 -0800
Subject: net: better skb->sender_cpu and skb->napi_id cohabitation

skb->sender_cpu and skb->napi_id share a common storage,
and we had various bugs about this.

We had to call skb_sender_cpu_clear() in some places to
not leave a prior skb->napi_id and fool netdev_pick_tx()

As suggested by Alexei, we could split the space so that
these errors can not happen.

0 value being reserved as the common (not initialized) value,
let's reserve [1 .. NR_CPUS] range for valid sender_cpu,
and [NR_CPUS+1 .. ~0U] for valid napi_id.

This will allow proper busy polling support over tunnels.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index ae00b894e675..2582c24a75c6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(dev_base_lock);
 /* protects napi_hash addition/deletion and napi_gen_id */
 static DEFINE_SPINLOCK(napi_hash_lock);
 
-static unsigned int napi_gen_id;
+static unsigned int napi_gen_id = NR_CPUS;
 static DEFINE_HASHTABLE(napi_hash, 8);
 
 static seqcount_t devnet_rename_seq;
@@ -3021,7 +3021,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
 	int queue_index = 0;
 
 #ifdef CONFIG_XPS
-	if (skb->sender_cpu == 0)
+	u32 sender_cpu = skb->sender_cpu - 1;
+
+	if (sender_cpu >= (u32)NR_CPUS)
 		skb->sender_cpu = raw_smp_processor_id() + 1;
 #endif
 
@@ -4676,25 +4678,22 @@ EXPORT_SYMBOL_GPL(napi_by_id);
 
 void napi_hash_add(struct napi_struct *napi)
 {
-	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+	if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+		return;
 
-		spin_lock(&napi_hash_lock);
+	spin_lock(&napi_hash_lock);
 
-		/* 0 is not a valid id, we also skip an id that is taken
-		 * we expect both events to be extremely rare
-		 */
-		napi->napi_id = 0;
-		while (!napi->napi_id) {
-			napi->napi_id = ++napi_gen_id;
-			if (napi_by_id(napi->napi_id))
-				napi->napi_id = 0;
-		}
+	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
+	do {
+		if (unlikely(++napi_gen_id < NR_CPUS + 1))
+			napi_gen_id = NR_CPUS + 1;
+	} while (napi_by_id(napi_gen_id));
+	napi->napi_id = napi_gen_id;
 
-		hlist_add_head_rcu(&napi->napi_hash_node,
-			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+	hlist_add_head_rcu(&napi->napi_hash_node,
+			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
 
-		spin_unlock(&napi_hash_lock);
-	}
+	spin_unlock(&napi_hash_lock);
 }
 EXPORT_SYMBOL_GPL(napi_hash_add);
 
-- 
cgit v1.2.3


From 02d62e86fe892c59a1259d089d4d16ac76977a37 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:30:52 -0800
Subject: net: un-inline sk_busy_loop()

There is really little gain from inlining this big function.
We'll soon make it even bigger in following patches.

This means we no longer need to export napi_by_id()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2582c24a75c6..74a816b299df 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -96,6 +96,7 @@
 #include <linux/skbuff.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
+#include <net/busy_poll.h>
 #include <linux/rtnetlink.h>
 #include <linux/stat.h>
 #include <net/dst.h>
@@ -4663,7 +4664,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
 EXPORT_SYMBOL(napi_complete_done);
 
 /* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
+static struct napi_struct *napi_by_id(unsigned int napi_id)
 {
 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
 	struct napi_struct *napi;
@@ -4674,7 +4675,52 @@ struct napi_struct *napi_by_id(unsigned int napi_id)
 
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(napi_by_id);
+
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+bool sk_busy_loop(struct sock *sk, int nonblock)
+{
+	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	const struct net_device_ops *ops;
+	struct napi_struct *napi;
+	int rc = false;
+
+	/*
+	 * rcu read lock for napi hash
+	 * bh so we don't race with net_rx_action
+	 */
+	rcu_read_lock_bh();
+
+	napi = napi_by_id(sk->sk_napi_id);
+	if (!napi)
+		goto out;
+
+	ops = napi->dev->netdev_ops;
+	if (!ops->ndo_busy_poll)
+		goto out;
+
+	do {
+		rc = ops->ndo_busy_poll(napi);
+
+		if (rc == LL_FLUSH_FAILED)
+			break; /* permanent failure */
+
+		if (rc > 0)
+			/* local bh are disabled so it is ok to use _BH */
+			NET_ADD_STATS_BH(sock_net(sk),
+					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+		cpu_relax();
+
+	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
+		 !need_resched() && !busy_loop_timeout(end_time));
+
+	rc = !skb_queue_empty(&sk->sk_receive_queue);
+out:
+	rcu_read_unlock_bh();
+	return rc;
+}
+EXPORT_SYMBOL(sk_busy_loop);
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
 
 void napi_hash_add(struct napi_struct *napi)
 {
-- 
cgit v1.2.3


From 2a028ecb76497d05e5cd4e3e8b09d965cac2e3f1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:30:53 -0800
Subject: net: allow BH servicing in sk_busy_loop()

Instead of blocking BH in whole sk_busy_loop(), block them
only around ->ndo_busy_poll() calls.

This has many benefits.

1) allow tunneled traffic to use busy poll as well as native traffic.
   Tunnels handlers usually call netif_rx() and depend on net_rx_action()
   being run (from sofirq handler)

2) allow RFS/RPS being used (sending IPI to other cpus if needed)

3) use the 'lets burn cpu cycles' budget to do useful work
   (like TX completions, timers, RCU callbacks...)

4) reduce BH latencies, making busy poll a better citizen.

Tested:

Tested with SIT tunnel

lpaa5:~# echo 0 >/proc/sys/net/core/busy_read
lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR
MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

16384  87380  1        1       10.00    37373.93
16384  87380

Now enable busy poll on both hosts

lpaa5:~# echo 70 >/proc/sys/net/core/busy_read
lpaa6:~# echo 70 >/proc/sys/net/core/busy_read

lpaa5:~# ./netperf -H 2002:af6:786::1 -t TCP_RR
MIGRATED TCP REQUEST/RESPONSE TEST from ::0 (::) port 0 AF_INET6 to 2002:af6:786::1 () port 0 AF_INET6 : first burst 0
Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

16384  87380  1        1       10.00    58314.77
16384  87380

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 74a816b299df..2002eec2617d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4684,11 +4684,7 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	struct napi_struct *napi;
 	int rc = false;
 
-	/*
-	 * rcu read lock for napi hash
-	 * bh so we don't race with net_rx_action
-	 */
-	rcu_read_lock_bh();
+	rcu_read_lock();
 
 	napi = napi_by_id(sk->sk_napi_id);
 	if (!napi)
@@ -4699,23 +4695,23 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 		goto out;
 
 	do {
+		local_bh_disable();
 		rc = ops->ndo_busy_poll(napi);
+		if (rc > 0)
+			NET_ADD_STATS_BH(sock_net(sk),
+					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+		local_bh_enable();
 
 		if (rc == LL_FLUSH_FAILED)
 			break; /* permanent failure */
 
-		if (rc > 0)
-			/* local bh are disabled so it is ok to use _BH */
-			NET_ADD_STATS_BH(sock_net(sk),
-					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
 		cpu_relax();
-
 	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
 		 !need_resched() && !busy_loop_timeout(end_time));
 
 	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 	return rc;
 }
 EXPORT_SYMBOL(sk_busy_loop);
-- 
cgit v1.2.3


From ce6aea93f7510437dde625b77a7a2f4d20b72660 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:30:54 -0800
Subject: net: network drivers no longer need to implement ndo_busy_poll()

Instead of having to implement complex ndo_busy_poll() method,
drivers can simply rely on NAPI poll logic.

Busy polling gains are mainly coming from polling itself,
not on exact details on how we poll the device.

ndo_busy_poll() if implemented can avoid touching
napi state, but it adds extra synchronization between
normal napi->poll() and busy poll handler, slowing down
the common path (non busy polling) with extra atomic operations.
In practice few drivers ever got busy poll because of the complexity.

We could go one step further, and make busy polling
available for all NAPI drivers, but this would require
that all netif_napi_del() calls are done in process context
so that we can call synchronize_rcu().
Full audit would be required.

Before this is done, a driver still needs to call :

- skb_mark_napi_id() for each skb provided to the stack.
- napi_hash_add() and napi_hash_del() to allocate a napi_id per napi struct.
- Make sure RCU grace period is respected after napi_hash_del() before
  memory containing napi structure is freed.

Followup patch implements busy poll for mlx5 driver as an example.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2002eec2617d..93009610aee8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4677,10 +4677,11 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
 }
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
+#define BUSY_POLL_BUDGET 8
 bool sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
-	const struct net_device_ops *ops;
+	int (*busy_poll)(struct napi_struct *dev);
 	struct napi_struct *napi;
 	int rc = false;
 
@@ -4690,13 +4691,27 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	if (!napi)
 		goto out;
 
-	ops = napi->dev->netdev_ops;
-	if (!ops->ndo_busy_poll)
-		goto out;
+	/* Note: ndo_busy_poll method is optional in linux-4.5 */
+	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
 
 	do {
+		rc = 0;
 		local_bh_disable();
-		rc = ops->ndo_busy_poll(napi);
+		if (busy_poll) {
+			rc = busy_poll(napi);
+		} else if (napi_schedule_prep(napi)) {
+			void *have = netpoll_poll_lock(napi);
+
+			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+				rc = napi->poll(napi, BUSY_POLL_BUDGET);
+				trace_napi_poll(napi);
+				if (rc == BUSY_POLL_BUDGET) {
+					napi_complete_done(napi, rc);
+					napi_schedule(napi);
+				}
+			}
+			netpoll_poll_unlock(have);
+		}
 		if (rc > 0)
 			NET_ADD_STATS_BH(sock_net(sk),
 					 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
-- 
cgit v1.2.3


From 93f93a4404159ecf7e9148f5ad0718ec702ac4cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:30:59 -0800
Subject: net: move skb_mark_napi_id() into core networking stack

We would like to automatically provide busy polling support
to all NAPI drivers, without them having to implement anything.

skb_mark_napi_id() can be called from napi_gro_receive() and
napi_get_frags().

Few drivers are still calling skb_mark_napi_id() because
they use netif_receive_skb(). They should eventually call
napi_gro_receive() instead. I will leave this to drivers
maintainers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 93009610aee8..83b48747928c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4356,6 +4356,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
+	skb_mark_napi_id(skb, napi);
 	trace_napi_gro_receive_entry(skb);
 
 	skb_gro_reset_offset(skb);
@@ -4390,6 +4391,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 	if (!skb) {
 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
 		napi->skb = skb;
+		skb_mark_napi_id(skb, napi);
 	}
 	return skb;
 }
-- 
cgit v1.2.3


From d64b5e85bfe2fe4c790abcbd16d9ae32391ddd7e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:31:00 -0800
Subject: net: add netif_tx_napi_add()

netif_tx_napi_add() is a variant of netif_napi_add()

It should be used by drivers that use a napi structure
to exclusively poll TX.

We do not want to add this kind of napi in napi_hash[] in following
patches, adding generic busy polling to all NAPI drivers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 83b48747928c..ff58a8bc5e3c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4737,7 +4737,8 @@ EXPORT_SYMBOL(sk_busy_loop);
 
 void napi_hash_add(struct napi_struct *napi)
 {
-	if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
+	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
 		return;
 
 	spin_lock(&napi_hash_lock);
-- 
cgit v1.2.3


From 6180d9de61a5c461f9e3efef5417a844701dbbb2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:31:01 -0800
Subject: net: move napi_hash[] into read mostly section

We do not often add/delete a napi context.
Moving napi_hash[] into read mostly section avoids potential false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index ff58a8bc5e3c..02dfbd91a8e4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -184,7 +184,7 @@ EXPORT_SYMBOL(dev_base_lock);
 static DEFINE_SPINLOCK(napi_hash_lock);
 
 static unsigned int napi_gen_id = NR_CPUS;
-static DEFINE_HASHTABLE(napi_hash, 8);
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 
 static seqcount_t devnet_rename_seq;
 
-- 
cgit v1.2.3


From 34cbe27e811c591c854a39c0dee1b461bb796953 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:31:02 -0800
Subject: net: napi_hash_del() returns a boolean status

napi_hash_del() will soon be used from both drivers (if they want)
or core networking stack.

Callers are responsibles to ensure an RCU grace period is respected
before freeing napi structure : napi_hash_del() can signal if
this RCU grace period is needed or not.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 02dfbd91a8e4..59dddac1c2e7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4760,14 +4760,18 @@ EXPORT_SYMBOL_GPL(napi_hash_add);
 /* Warning : caller is responsible to make sure rcu grace period
  * is respected before freeing memory containing @napi
  */
-void napi_hash_del(struct napi_struct *napi)
+bool napi_hash_del(struct napi_struct *napi)
 {
+	bool rcu_sync_needed = false;
+
 	spin_lock(&napi_hash_lock);
 
-	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
+		rcu_sync_needed = true;
 		hlist_del_rcu(&napi->napi_hash_node);
-
+	}
 	spin_unlock(&napi_hash_lock);
+	return rcu_sync_needed;
 }
 EXPORT_SYMBOL_GPL(napi_hash_del);
 
-- 
cgit v1.2.3


From 93d05d4a320cb16712bb3d57a9658f395d8cecb9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 18 Nov 2015 06:31:03 -0800
Subject: net: provide generic busy polling to all NAPI drivers

NAPI drivers no longer need to observe a particular protocol
to benefit from busy polling (CONFIG_NET_RX_BUSY_POLL=y)

napi_hash_add() and napi_hash_del() are automatically called
from core networking stack, respectively from
netif_napi_add() and netif_napi_del()

This patch depends on free_netdev() and netif_napi_del() being
called from process context, which seems to be the norm.

Drivers might still prefer to call napi_hash_del() on their
own, since they might combine all the rcu grace periods into
a single one, knowing their NAPI structures lifetime, while
core networking stack has no idea of a possible combining.

Once this patch proves to not bring serious regressions,
we will cleanup drivers to either remove napi_hash_del()
or provide appropriate rcu grace periods combining.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 59dddac1c2e7..41cef3e3f558 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4807,6 +4807,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
 	napi->poll_owner = -1;
 #endif
 	set_bit(NAPI_STATE_SCHED, &napi->state);
+	napi_hash_add(napi);
 }
 EXPORT_SYMBOL(netif_napi_add);
 
@@ -4826,8 +4827,12 @@ void napi_disable(struct napi_struct *n)
 }
 EXPORT_SYMBOL(napi_disable);
 
+/* Must be called in process context */
 void netif_napi_del(struct napi_struct *napi)
 {
+	might_sleep();
+	if (napi_hash_del(napi))
+		synchronize_net();
 	list_del_init(&napi->dev_list);
 	napi_free_frags(napi);
 
@@ -7227,11 +7232,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
  *	This function does the last stage of destroying an allocated device
  * 	interface. The reference to the device object is released.
  *	If this is the last reference then it will be freed.
+ *	Must be called in process context.
  */
 void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
+	might_sleep();
 	netif_free_tx_queues(dev);
 #ifdef CONFIG_SYSFS
 	kvfree(dev->_rx);
-- 
cgit v1.2.3


From e2f9dc3bd213792ac006e83f50a5453f23b8c354 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 19 Nov 2015 12:11:23 -0800
Subject: net: avoid NULL deref in napi_get_frags()

napi_alloc_skb() can return NULL.
We should not crash should this happen.

Fixes: 93f93a440415 ("net: move skb_mark_napi_id() into core networking stack")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 41cef3e3f558..5df6cbce727c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4390,8 +4390,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 
 	if (!skb) {
 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
-		napi->skb = skb;
-		skb_mark_napi_id(skb, napi);
+		if (skb) {
+			napi->skb = skb;
+			skb_mark_napi_id(skb, napi);
+		}
 	}
 	return skb;
 }
-- 
cgit v1.2.3


From b03804e7c3ad41c265c0ca21ddb306b252b4f99f Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Thu, 3 Dec 2015 12:12:03 +0100
Subject: net: Check CHANGEUPPER notifier return value

switchdev drivers reflect the newly requested topology to hardware when
CHANGEUPPER is received, after software links were already formed.
However, the operation can fail and user will not be notified, as the
return value of the notifier is not checked.

Add this check and rollback software links if necessary.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5df6cbce727c..939cd1b1da15 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5490,8 +5490,12 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 			goto rollback_lower_mesh;
 	}
 
-	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
-				      &changeupper_info.info);
+	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+					    &changeupper_info.info);
+	ret = notifier_to_errno(ret);
+	if (ret)
+		goto rollback_lower_mesh;
+
 	return 0;
 
 rollback_lower_mesh:
-- 
cgit v1.2.3


From 6dffb0447c25476f499d205dfceb1972e8dae919 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Dec 2015 12:12:10 +0100
Subject: net: propagate upper priv via netdev_master_upper_dev_link

Eliminate netdev_master_upper_dev_link_private and pass priv directly as
a parameter of netdev_master_upper_dev_link.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 939cd1b1da15..27d052bb78bc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master,
-				   void *private)
+				   void *upper_priv)
 {
 	struct netdev_notifier_changeupper_info changeupper_info;
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
@@ -5452,7 +5452,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	if (ret)
 		return ret;
 
-	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
 						   master);
 	if (ret)
 		return ret;
@@ -5557,6 +5557,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  * netdev_master_upper_dev_link - Add a master link to the upper device
  * @dev: device
  * @upper_dev: new upper device
+ * @upper_priv: upper device private
  *
  * Adds a link to device which is upper to this one. In this case, only
  * one master upper device can be linked, although other non-master devices
@@ -5565,20 +5566,13 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  * counts are adjusted and the function returns zero.
  */
 int netdev_master_upper_dev_link(struct net_device *dev,
-				 struct net_device *upper_dev)
+				 struct net_device *upper_dev,
+				 void *upper_priv)
 {
-	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
+	return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv);
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_link);
 
-int netdev_master_upper_dev_link_private(struct net_device *dev,
-					 struct net_device *upper_dev,
-					 void *private)
-{
-	return __netdev_upper_dev_link(dev, upper_dev, true, private);
-}
-EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
-
 /**
  * netdev_upper_dev_unlink - Removes a link to upper device
  * @dev: device
-- 
cgit v1.2.3


From 29bf24afb29042f568fa67b1b0eee46796725ed2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Dec 2015 12:12:11 +0100
Subject: net: add possibility to pass information about upper device via
 notifier

Sometimes the drivers and other code would find it handy to know some
internal information about upper device being changed. So allow upper-code
to pass information down to notifier listeners during linking.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 27d052bb78bc..8ed886663c6d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5421,7 +5421,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
 
 static int __netdev_upper_dev_link(struct net_device *dev,
 				   struct net_device *upper_dev, bool master,
-				   void *upper_priv)
+				   void *upper_priv, void *upper_info)
 {
 	struct netdev_notifier_changeupper_info changeupper_info;
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
@@ -5445,6 +5445,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 	changeupper_info.upper_dev = upper_dev;
 	changeupper_info.master = master;
 	changeupper_info.linking = true;
+	changeupper_info.upper_info = upper_info;
 
 	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
 					    &changeupper_info.info);
@@ -5549,7 +5550,7 @@ rollback_mesh:
 int netdev_upper_dev_link(struct net_device *dev,
 			  struct net_device *upper_dev)
 {
-	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
+	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
 }
 EXPORT_SYMBOL(netdev_upper_dev_link);
 
@@ -5558,6 +5559,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  * @dev: device
  * @upper_dev: new upper device
  * @upper_priv: upper device private
+ * @upper_info: upper info to be passed down via notifier
  *
  * Adds a link to device which is upper to this one. In this case, only
  * one master upper device can be linked, although other non-master devices
@@ -5567,9 +5569,10 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
  */
 int netdev_master_upper_dev_link(struct net_device *dev,
 				 struct net_device *upper_dev,
-				 void *upper_priv)
+				 void *upper_priv, void *upper_info)
 {
-	return __netdev_upper_dev_link(dev, upper_dev, true, upper_priv);
+	return __netdev_upper_dev_link(dev, upper_dev, true,
+				       upper_priv, upper_info);
 }
 EXPORT_SYMBOL(netdev_master_upper_dev_link);
 
-- 
cgit v1.2.3


From 04d482660a07039fc4e9a42bb3517db236d98f96 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 3 Dec 2015 12:12:15 +0100
Subject: net: introduce change lower state notifier

When lower device like bonding slave, team/bridge port, etc changes its
state, it is useful for others to notice this change. Currently this is
implemented specificly for bonding as NETDEV_BONDING_INFO notifier. This
patch aims to replace this specific usage and make this more generic to
be used for all upper-lower devices.

Introduce NETDEV_CHANGELOWERSTATE netdev notifier type and
netdev_lower_state_changed() helper.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8ed886663c6d..d1706e88fbeb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5756,6 +5756,26 @@ int dev_get_nest_level(struct net_device *dev,
 }
 EXPORT_SYMBOL(dev_get_nest_level);
 
+/**
+ * netdev_lower_change - Dispatch event about lower device state change
+ * @lower_dev: device
+ * @lower_state_info: state to dispatch
+ *
+ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_lower_state_changed(struct net_device *lower_dev,
+				void *lower_state_info)
+{
+	struct netdev_notifier_changelowerstate_info changelowerstate_info;
+
+	ASSERT_RTNL();
+	changelowerstate_info.lower_state_info = lower_state_info;
+	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
+				      &changelowerstate_info.info);
+}
+EXPORT_SYMBOL(netdev_lower_state_changed);
+
 static void dev_change_rx_flags(struct net_device *dev, int flags)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-- 
cgit v1.2.3


From b618aaa91b5870e7bd139987ac4b7bf0851142d0 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 4 Dec 2015 15:01:31 +0100
Subject: net: constify netif_is_* helpers net_device param

As suggested by Eric, these helpers should have const dev param.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index d1706e88fbeb..e5c395473eba 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5734,7 +5734,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private);
 
 
 int dev_get_nest_level(struct net_device *dev,
-		       bool (*type_check)(struct net_device *dev))
+		       bool (*type_check)(const struct net_device *dev))
 {
 	struct net_device *lower = NULL;
 	struct list_head *iter;
-- 
cgit v1.2.3


From 2a56a1fec290bf0bc4676bbf4efdb3744953a3e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 7 Dec 2015 17:38:52 -0500
Subject: net: wrap sock->sk_cgrp_prioidx and ->sk_classid inside a struct

Introduce sock->sk_cgrp_data which is a struct sock_cgroup_data.
->sk_cgroup_prioidx and ->sk_classid are moved into it.  The struct
and its accessors are defined in cgroup-defs.h.  This is to prepare
for overloading the fields with a cgroup pointer.

This patch mostly performs equivalent conversions but the followings
are noteworthy.

* Equality test before updating classid is removed from
  sock_update_classid().  This shouldn't make any noticeable
  difference and a similar test will be implemented on the helper side
  later.

* sock_update_netprioidx() now takes struct sock_cgroup_data and can
  be moved to netprio_cgroup.h without causing include dependency
  loop.  Moved.

* The dummy version of sock_update_netprioidx() converted to a static
  inline function while at it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index e5c395473eba..8f705fcedb94 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2929,7 +2929,8 @@ static void skb_update_prio(struct sk_buff *skb)
 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
 
 	if (!skb->priority && skb->sk && map) {
-		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+		unsigned int prioidx =
+			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
 
 		if (prioidx < map->priomap_len)
 			skb->priority = map->priomap[prioidx];
-- 
cgit v1.2.3


From a188222b6ed29404ac2d4232d35d1fe0e77af370 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 14 Dec 2015 11:19:43 -0800
Subject: net: Rename NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK

The name NETIF_F_ALL_CSUM is a misnomer. This does not correspond to the
set of features for offloading all checksums. This is a mask of the
checksum offload related features bits. It is incorrect to set both
NETIF_F_HW_CSUM and NETIF_F_IP_CSUM or NETIF_F_IPV6 at the same time for
features of a device.

This patch:
  - Changes instances of NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK (where
    NETIF_F_ALL_CSUM is being used as a mask).
  - Changes bonding, sfc/efx, ipvlan, macvlan, vlan, and team drivers to
    use NEITF_F_HW_CSUM in features list instead of NETIF_F_ALL_CSUM.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 8f705fcedb94..5a3b5a404642 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2645,7 +2645,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
 
 	if (skb->ip_summed != CHECKSUM_NONE &&
 	    !can_checksum_protocol(features, type)) {
-		features &= ~NETIF_F_ALL_CSUM;
+		features &= ~NETIF_F_CSUM_MASK;
 	} else if (illegal_highdma(skb->dev, skb)) {
 		features &= ~NETIF_F_SG;
 	}
@@ -2792,7 +2792,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 			else
 				skb_set_transport_header(skb,
 							 skb_checksum_start_offset(skb));
-			if (!(features & NETIF_F_ALL_CSUM) &&
+			if (!(features & NETIF_F_CSUM_MASK) &&
 			    skb_checksum_help(skb))
 				goto out_kfree_skb;
 		}
@@ -7572,15 +7572,15 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
 	netdev_features_t one, netdev_features_t mask)
 {
 	if (mask & NETIF_F_GEN_CSUM)
-		mask |= NETIF_F_ALL_CSUM;
+		mask |= NETIF_F_CSUM_MASK;
 	mask |= NETIF_F_VLAN_CHALLENGED;
 
-	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
 	all &= one | ~NETIF_F_ALL_FOR_ALL;
 
 	/* If one device supports hw checksumming, set for all. */
 	if (all & NETIF_F_GEN_CSUM)
-		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
+		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM);
 
 	return all;
 }
-- 
cgit v1.2.3


From c8cd0989bd151fda87bbf10887b3df18021284bc Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 14 Dec 2015 11:19:44 -0800
Subject: net: Eliminate NETIF_F_GEN_CSUM and NETIF_F_V[46]_CSUM

These netif flags are unnecessary convolutions. It is more
straightforward to just use NETIF_F_HW_CSUM, NETIF_F_IP_CSUM,
and NETIF_F_IPV6_CSUM directly.

This patch also:
    - Cleans up can_checksum_protocol
    - Simplifies netdev_intersect_features

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5a3b5a404642..45b013f27625 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6467,9 +6467,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 	/* UFO needs SG and checksumming */
 	if (features & NETIF_F_UFO) {
 		/* maybe split UFO into V4 and V6? */
-		if (!((features & NETIF_F_GEN_CSUM) ||
-		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
-			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+		if (!(features & NETIF_F_HW_CSUM) &&
+		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
+		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
 			netdev_dbg(dev,
 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
 			features &= ~NETIF_F_UFO;
@@ -7571,7 +7571,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 netdev_features_t netdev_increment_features(netdev_features_t all,
 	netdev_features_t one, netdev_features_t mask)
 {
-	if (mask & NETIF_F_GEN_CSUM)
+	if (mask & NETIF_F_HW_CSUM)
 		mask |= NETIF_F_CSUM_MASK;
 	mask |= NETIF_F_VLAN_CHALLENGED;
 
@@ -7579,8 +7579,8 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
 	all &= one | ~NETIF_F_ALL_FOR_ALL;
 
 	/* If one device supports hw checksumming, set for all. */
-	if (all & NETIF_F_GEN_CSUM)
-		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_GEN_CSUM);
+	if (all & NETIF_F_HW_CSUM)
+		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
 
 	return all;
 }
-- 
cgit v1.2.3


From 6ae23ad36253a8033c5714c52b691b84456487c5 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 14 Dec 2015 11:19:46 -0800
Subject: net: Add driver helper functions to determine checksum offloadability

Add skb_csum_offload_chk driver helper function to determine if a
device with limited checksum offload capabilities is able to offload the
checksum for a given packet.

This patch includes:
  - The skb_csum_offload_chk function. Returns true if checksum is
    offloadable, else false. Optionally, in the case that the checksum
    is not offloable, the function can call skb_checksum_help to resolve
    the checksum. skb_csum_offload_chk also returns whether the checksum
    refers to an encapsulated checksum.
  - Definition of skb_csum_offl_spec structure that caller uses to
    indicate rules about what it can offload (e.g. IPv4/v6, TCP/UDP only,
    whether encapsulated checksums can be offloaded, whether checksum with
    IPv6 extension headers can be offloaded).
  - Ancilary functions called skb_csum_offload_chk_help,
    skb_csum_off_chk_help_cmn, skb_csum_off_chk_help_cmn_v4_only.

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 45b013f27625..914b4a24c654 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -138,6 +138,7 @@
 #include <linux/errqueue.h>
 #include <linux/hrtimer.h>
 #include <linux/netfilter_ingress.h>
+#include <linux/sctp.h>
 
 #include "net-sysfs.h"
 
@@ -2471,6 +2472,141 @@ out:
 }
 EXPORT_SYMBOL(skb_checksum_help);
 
+/* skb_csum_offload_check - Driver helper function to determine if a device
+ * with limited checksum offload capabilities is able to offload the checksum
+ * for a given packet.
+ *
+ * Arguments:
+ *   skb - sk_buff for the packet in question
+ *   spec - contains the description of what device can offload
+ *   csum_encapped - returns true if the checksum being offloaded is
+ *	      encpasulated. That is it is checksum for the transport header
+ *	      in the inner headers.
+ *   checksum_help - when set indicates that helper function should
+ *	      call skb_checksum_help if offload checks fail
+ *
+ * Returns:
+ *   true: Packet has passed the checksum checks and should be offloadable to
+ *	   the device (a driver may still need to check for additional
+ *	   restrictions of its device)
+ *   false: Checksum is not offloadable. If checksum_help was set then
+ *	   skb_checksum_help was called to resolve checksum for non-GSO
+ *	   packets and when IP protocol is not SCTP
+ */
+bool __skb_csum_offload_chk(struct sk_buff *skb,
+			    const struct skb_csum_offl_spec *spec,
+			    bool *csum_encapped,
+			    bool csum_help)
+{
+	struct iphdr *iph;
+	struct ipv6hdr *ipv6;
+	void *nhdr;
+	int protocol;
+	u8 ip_proto;
+
+	if (skb->protocol == htons(ETH_P_8021Q) ||
+	    skb->protocol == htons(ETH_P_8021AD)) {
+		if (!spec->vlan_okay)
+			goto need_help;
+	}
+
+	/* We check whether the checksum refers to a transport layer checksum in
+	 * the outermost header or an encapsulated transport layer checksum that
+	 * corresponds to the inner headers of the skb. If the checksum is for
+	 * something else in the packet we need help.
+	 */
+	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
+		/* Non-encapsulated checksum */
+		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
+		nhdr = skb_network_header(skb);
+		*csum_encapped = false;
+		if (spec->no_not_encapped)
+			goto need_help;
+	} else if (skb->encapsulation && spec->encap_okay &&
+		   skb_checksum_start_offset(skb) ==
+		   skb_inner_transport_offset(skb)) {
+		/* Encapsulated checksum */
+		*csum_encapped = true;
+		switch (skb->inner_protocol_type) {
+		case ENCAP_TYPE_ETHER:
+			protocol = eproto_to_ipproto(skb->inner_protocol);
+			break;
+		case ENCAP_TYPE_IPPROTO:
+			protocol = skb->inner_protocol;
+			break;
+		}
+		nhdr = skb_inner_network_header(skb);
+	} else {
+		goto need_help;
+	}
+
+	switch (protocol) {
+	case IPPROTO_IP:
+		if (!spec->ipv4_okay)
+			goto need_help;
+		iph = nhdr;
+		ip_proto = iph->protocol;
+		if (iph->ihl != 5 && !spec->ip_options_okay)
+			goto need_help;
+		break;
+	case IPPROTO_IPV6:
+		if (!spec->ipv6_okay)
+			goto need_help;
+		if (spec->no_encapped_ipv6 && *csum_encapped)
+			goto need_help;
+		ipv6 = nhdr;
+		nhdr += sizeof(*ipv6);
+		ip_proto = ipv6->nexthdr;
+		break;
+	default:
+		goto need_help;
+	}
+
+ip_proto_again:
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+		if (!spec->tcp_okay ||
+		    skb->csum_offset != offsetof(struct tcphdr, check))
+			goto need_help;
+		break;
+	case IPPROTO_UDP:
+		if (!spec->udp_okay ||
+		    skb->csum_offset != offsetof(struct udphdr, check))
+			goto need_help;
+		break;
+	case IPPROTO_SCTP:
+		if (!spec->sctp_okay ||
+		    skb->csum_offset != offsetof(struct sctphdr, checksum))
+			goto cant_help;
+		break;
+	case NEXTHDR_HOP:
+	case NEXTHDR_ROUTING:
+	case NEXTHDR_DEST: {
+		u8 *opthdr = nhdr;
+
+		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
+			goto need_help;
+
+		ip_proto = opthdr[0];
+		nhdr += (opthdr[1] + 1) << 3;
+
+		goto ip_proto_again;
+	}
+	default:
+		goto need_help;
+	}
+
+	/* Passed the tests for offloading checksum */
+	return true;
+
+need_help:
+	if (csum_help && !skb_shinfo(skb)->gso_size)
+		skb_checksum_help(skb);
+cant_help:
+	return false;
+}
+EXPORT_SYMBOL(__skb_csum_offload_chk);
+
 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
 {
 	__be16 type = skb->protocol;
-- 
cgit v1.2.3


From 1f211a1b929c804100e138c5d3d656992cfd5622 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 7 Jan 2016 22:29:47 +0100
Subject: net, sched: add clsact qdisc

This work adds a generalization of the ingress qdisc as a qdisc holding
only classifiers. The clsact qdisc works on ingress, but also on egress.
In both cases, it's execution happens without taking the qdisc lock, and
the main difference for the egress part compared to prior version of [1]
is that this can be applied with _any_ underlying real egress qdisc (also
classless ones).

Besides solving the use-case of [1], that is, allowing for more programmability
on assigning skb->priority for the mqprio case that is supported by most
popular 10G+ NICs, it also opens up a lot more flexibility for other tc
applications. The main work on classification can already be done at clsact
egress time if the use-case allows and state stored for later retrieval
f.e. again in skb->priority with major/minors (which is checked by most
classful qdiscs before consulting tc_classify()) and/or in other skb fields
like skb->tc_index for some light-weight post-processing to get to the
eventual classid in case of a classful qdisc. Another use case is that
the clsact egress part allows to have a central egress counterpart to
the ingress classifiers, so that classifiers can easily share state (e.g.
in cls_bpf via eBPF maps) for ingress and egress.

Currently, default setups like mq + pfifo_fast would require for this to
use, for example, prio qdisc instead (to get a tc_classify() run) and to
duplicate the egress classifier for each queue. With clsact, it allows
for leaving the setup as is, it can additionally assign skb->priority to
put the skb in one of pfifo_fast's bands and it can share state with maps.
Moreover, we can access the skb's dst entry (f.e. to retrieve tclassid)
w/o the need to perform a skb_dst_force() to hold on to it any longer. In
lwt case, we can also use this facility to setup dst metadata via cls_bpf
(bpf_skb_set_tunnel_key()) without needing a real egress qdisc just for
that (case of IFF_NO_QUEUE devices, for example).

The realization can be done without any changes to the scheduler core
framework. All it takes is that we have two a-priori defined minors/child
classes, where we can mux between ingress and egress classifier list
(dev->ingress_cl_list and dev->egress_cl_list, latter stored close to
dev->_tx to avoid extra cacheline miss for moderate loads). The egress
part is a bit similar modelled to handle_ing() and patched to a noop in
case the functionality is not used. Both handlers are now called
sch_handle_ingress() and sch_handle_egress(), code sharing among the two
doesn't seem practical as there are various minor differences in both
paths, so that making them conditional in a single handler would rather
slow things down.

Full compatibility to ingress qdisc is provided as well. Since both
piggyback on TC_H_CLSACT, only one of them (ingress/clsact) can exist
per netdevice, and thus ingress qdisc specific behaviour can be retained
for user space. This means, either a user does 'tc qdisc add dev foo ingress'
and configures ingress qdisc as usual, or the 'tc qdisc add dev foo clsact'
alternative, where both, ingress and egress classifier can be configured
as in the below example. ingress qdisc supports attaching classifier to any
minor number whereas clsact has two fixed minors for muxing between the
lists, therefore to not break user space setups, they are better done as
two separate qdiscs.

I decided to extend the sch_ingress module with clsact functionality so
that commonly used code can be reused, the module is being aliased with
sch_clsact so that it can be auto-loaded properly. Alternative would have been
to add a flag when initializing ingress to alter its behaviour plus aliasing
to a different name (as it's more than just ingress). However, the first would
end up, based on the flag, choosing the new/old behaviour by calling different
function implementations to handle each anyway, the latter would require to
register ingress qdisc once again under different alias. So, this really begs
to provide a minimal, cleaner approach to have Qdisc_ops and Qdisc_class_ops
by its own that share callbacks used by both.

Example, adding qdisc:

   # tc qdisc add dev foo clsact
   # tc qdisc show dev foo
   qdisc mq 0: root
   qdisc pfifo_fast 0: parent :1 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
   qdisc pfifo_fast 0: parent :2 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
   qdisc pfifo_fast 0: parent :3 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
   qdisc pfifo_fast 0: parent :4 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
   qdisc clsact ffff: parent ffff:fff1

Adding filters (deleting, etc works analogous by specifying ingress/egress):

   # tc filter add dev foo ingress bpf da obj bar.o sec ingress
   # tc filter add dev foo egress  bpf da obj bar.o sec egress
   # tc filter show dev foo ingress
   filter protocol all pref 49152 bpf
   filter protocol all pref 49152 bpf handle 0x1 bar.o:[ingress] direct-action
   # tc filter show dev foo egress
   filter protocol all pref 49152 bpf
   filter protocol all pref 49152 bpf handle 0x1 bar.o:[egress] direct-action

A 'tc filter show dev foo' or 'tc filter show dev foo parent ffff:' will
show an empty list for clsact. Either using the parent names (ingress/egress)
or specifying the full major/minor will then show the related filter lists.

Prior work on a mqprio prequeue() facility [1] was done mainly by John Fastabend.

  [1] http://patchwork.ozlabs.org/patch/512949/

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 74 insertions(+), 8 deletions(-)

(limited to 'net/core/dev.c')

diff --git a/net/core/dev.c b/net/core/dev.c
index 914b4a24c654..0ca95d5d7af0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1676,6 +1676,22 @@ void net_dec_ingress_queue(void)
 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
 #endif
 
+#ifdef CONFIG_NET_EGRESS
+static struct static_key egress_needed __read_mostly;
+
+void net_inc_egress_queue(void)
+{
+	static_key_slow_inc(&egress_needed);
+}
+EXPORT_SYMBOL_GPL(net_inc_egress_queue);
+
+void net_dec_egress_queue(void)
+{
+	static_key_slow_dec(&egress_needed);
+}
+EXPORT_SYMBOL_GPL(net_dec_egress_queue);
+#endif
+
 static struct static_key netstamp_needed __read_mostly;
 #ifdef HAVE_JUMP_LABEL
 /* We are not allowed to call static_key_slow_dec() from irq context
@@ -3007,7 +3023,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 	bool contended;
 	int rc;
 
-	qdisc_pkt_len_init(skb);
 	qdisc_calculate_pkt_len(skb, q);
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
@@ -3100,6 +3115,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dev_loopback_xmit);
 
+#ifdef CONFIG_NET_EGRESS
+static struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
+	struct tcf_result cl_res;
+
+	if (!cl)
+		return skb;
+
+	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
+	 * earlier by the caller.
+	 */
+	qdisc_bstats_cpu_update(cl->q, skb);
+
+	switch (tc_classify(skb, cl, &cl_res, false)) {
+	case TC_ACT_OK:
+	case TC_ACT_RECLASSIFY:
+		skb->tc_index = TC_H_MIN(cl_res.classid);
+		break;
+	case TC_ACT_SHOT:
+		qdisc_qstats_cpu_drop(cl->q);
+		*ret = NET_XMIT_DROP;
+		goto drop;
+	case TC_ACT_STOLEN:
+	case TC_ACT_QUEUED:
+		*ret = NET_XMIT_SUCCESS;
+drop:
+		kfree_skb(skb);
+		return NULL;
+	case TC_ACT_REDIRECT:
+		/* No need to push/pop skb's mac_header here on egress! */
+		skb_do_redirect(skb);
+		*ret = NET_XMIT_SUCCESS;
+		return NULL;
+	default:
+		break;
+	}
+
+	return skb;
+}
+#endif /* CONFIG_NET_EGRESS */
+
 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
 {
 #ifdef CONFIG_XPS
@@ -3226,6 +3284,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 
 	skb_update_prio(skb);
 
+	qdisc_pkt_len_init(skb);
+#ifdef CONFIG_NET_CLS_ACT
+	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+# ifdef CONFIG_NET_EGRESS
+	if (static_key_false(&egress_needed)) {
+		skb = sch_handle_egress(skb, &rc, dev);
+		if (!skb)
+			goto out;
+	}
+# endif
+#endif
 	/* If device/qdisc don't need skb->dst, release it right now while
 	 * its hot in this cpu cache.
 	 */
@@ -3247,9 +3316,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
 	txq = netdev_pick_tx(dev, skb, accel_priv);
 	q = rcu_dereference_bh(txq->qdisc);
 
-#ifdef CONFIG_NET_CLS_ACT
-	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
-#endif
 	trace_net_dev_queue(skb);
 	if (q->enqueue) {
 		rc = __dev_xmit_skb(skb, q, dev, txq);
@@ -3806,9 +3872,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 #endif
 
-static inline struct sk_buff *handle_ing(struct sk_buff *skb,
-					 struct packet_type **pt_prev,
-					 int *ret, struct net_device *orig_dev)
+static inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+		   struct net_device *orig_dev)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
@@ -4002,7 +4068,7 @@ another_round:
 skip_taps:
 #ifdef CONFIG_NET_INGRESS
 	if (static_key_false(&ingress_needed)) {
-		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
 		if (!skb)
 			goto out;
 
-- 
cgit v1.2.3