From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 19 Sep 2017 05:14:24 -0700
Subject: net: sk_buff rbnode reorg

skb->rbnode shares space with skb->next, skb->prev and skb->tstamp

Current uses (TCP receive ofo queue and netem) need to save/restore
tstamp, while skb->dev is either NULL (TCP) or a constant for a given
queue (netem).

Since we plan using an RB tree for TCP retransmit queue to speedup SACK
processing with large BDP, this patch exchanges skb->dev and
skb->tstamp.

This saves some overhead in both TCP and netem.

v2: removes the swtstamp field from struct tcp_skb_cb

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Wei Wang <weiwan@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 72299ef00061..492828801acb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -661,8 +661,12 @@ struct sk_buff {
 			struct sk_buff		*prev;
 
 			union {
-				ktime_t		tstamp;
-				u64		skb_mstamp;
+				struct net_device	*dev;
+				/* Some protocols might use this space to store information,
+				 * while device pointer would be NULL.
+				 * UDP receive path is one user.
+				 */
+				unsigned long		dev_scratch;
 			};
 		};
 		struct rb_node	rbnode; /* used in netem & tcp stack */
@@ -670,12 +674,8 @@ struct sk_buff {
 	struct sock		*sk;
 
 	union {
-		struct net_device	*dev;
-		/* Some protocols might use this space to store information,
-		 * while device pointer would be NULL.
-		 * UDP receive path is one user.
-		 */
-		unsigned long		dev_scratch;
+		ktime_t		tstamp;
+		u64		skb_mstamp;
 	};
 	/*
 	 * This is the control buffer. It is free to use for every
-- 
cgit v1.2.3


From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Sep 2017 18:26:53 +0200
Subject: net: avoid a full fib lookup when rp_filter is disabled.

Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour
in fib_validate_source()") a full fib lookup is needed even if
the rp_filter is disabled, if accept_local is false - which is
the default.

What we really need in the above scenario is just checking
that the source IP address is not local, and in most case we
can do that is a cheaper way looking up the ifaddr hash table.

This commit adds a helper for such lookup, and uses it to
validate the src address when rp_filter is disabled and no
'local' routes are created by the user space in the relevant
namespace.

A new ipv4 netns flag is added to account for such routes.
We need that to preserve the same behavior we had before this
patch.

It also drops the checks to bail early from __fib_validate_source,
added by the commit 1dced6a85482 ("ipv4: Restore accept_local
behaviour in fib_validate_source()") they do not give any
measurable performance improvement: if we do the lookup with are
on a slower path.

This improves UDP performances for unconnected sockets
when rp_filter is disabled by 5% and also gives small but
measurable performance improvement for TCP flood scenarios.

v1 -> v2:
 - use the ifaddr lookup helper in __ip_dev_find(), as suggested
   by Eric
 - fall-back to full lookup if custom local routes are present

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index fb3f809e34e4..751d051f0bc7 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -179,6 +179,7 @@ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
 			 __be32 local, int scope);
 struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
 				    __be32 mask);
+struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
 static __inline__ bool inet_ifa_match(__be32 addr, struct in_ifaddr *ifa)
 {
 	return !((addr^ifa->ifa_address)&ifa->ifa_mask);
-- 
cgit v1.2.3


From 242c1a28eb61cb34974e8aa05235d84355940a8a Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Fri, 22 Sep 2017 10:25:22 +0800
Subject: net: Remove useless function skb_header_release

There is no one which would invokes the function skb_header_release.
So just remove it now.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 492828801acb..f9db5539a6fb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1456,28 +1456,9 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
 	return 0;
 }
 
-/**
- *	skb_header_release - release reference to header
- *	@skb: buffer to operate on
- *
- *	Drop a reference to the header part of the buffer.  This is done
- *	by acquiring a payload reference.  You must not read from the header
- *	part of skb->data after this.
- *	Note : Check if you can use __skb_header_release() instead.
- */
-static inline void skb_header_release(struct sk_buff *skb)
-{
-	BUG_ON(skb->nohdr);
-	skb->nohdr = 1;
-	atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref);
-}
-
 /**
  *	__skb_header_release - release reference to header
  *	@skb: buffer to operate on
- *
- *	Variant of skb_header_release() assuming skb is private to caller.
- *	We can avoid one atomic operation.
  */
 static inline void __skb_header_release(struct sk_buff *skb)
 {
-- 
cgit v1.2.3


From 1e99c497012cd8647972876f1bd18545bc907aea Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Sun, 24 Sep 2017 12:09:45 +0300
Subject: qed: iWARP - Add check for errors on a SYN packet

A SYN packet which arrives with errors from FW should be dropped.
This required adding an additional field to the ll2
rx completion data.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_ll2_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index dd7a3b86bb9e..89fa0bbd54f3 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -101,6 +101,7 @@ struct qed_ll2_comp_rx_data {
 	void *cookie;
 	dma_addr_t rx_buf_addr;
 	u16 parse_flags;
+	u16 err_flags;
 	u16 vlan;
 	bool b_last_packet;
 	u8 connection_handle;
-- 
cgit v1.2.3


From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:50 +0200
Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers

Just do the rename into bpf_compute_data_pointers() as we'll add
one more pointer here to recompute.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d29e58fde364..052bab3d62e7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,10 +496,13 @@ struct xdp_buff {
 	void *data_hard_start;
 };
 
-/* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf, act_bpf and lwt programs
+/* Compute the linear packet data range [data, data_end) which
+ * will be accessed by various program types (cls_bpf, act_bpf,
+ * lwt, ...). Subsystems allowing direct data access must (!)
+ * ensure that cb[] area can be written to when BPF program is
+ * invoked (otherwise cb[] save/restore is necessary).
  */
-static inline void bpf_compute_data_end(struct sk_buff *skb)
+static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 {
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
-- 
cgit v1.2.3


From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out that we first point
to data_hard_start, then data_meta directly prepended to data followed
by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
account whether we have meta data already prepended and if so, memmove()s
this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members). The facility
also works together with GRO aggregation. The scratch space at the head
of the packet can be multiple of 4 byte up to 32 byte large. Drivers not
yet supporting xdp->data_meta can simply be set up with xdp->data_meta
as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out,
such that the subsequent match against xdp->data for later access is
guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from it's
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h    |  1 +
 include/linux/filter.h | 21 ++++++++++++++--
 include/linux/skbuff.h | 68 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..2b672c50f160 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -137,6 +137,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 052bab3d62e7..911d454af107 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -487,12 +487,14 @@ struct sk_filter {
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f9db5539a6fb..19e64bfb1a66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient varaiant than plain call to memcmp(). */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case  8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case  4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
-- 
cgit v1.2.3


From 310ebbba3b7396b00bce08a33f1d2de2c74fa257 Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Wed, 27 Sep 2017 08:23:12 +0200
Subject: ipmr: Add reference count to MFC entries

Next commits will introduce MFC notifications through the atomic
fib_notification chain, thus allowing modules to be aware of MFC entries.

Due to the fact that modules may need to hold a reference to an MFC entry,
add reference count to MFC entries to prevent them from being freed while
these modules use them.

The reference counting is done only on resolved MFC entries currently.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index d7f63339ef0b..10028f208efb 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -109,6 +109,7 @@ struct mfc_cache_cmp_arg {
  * @wrong_if: number of wrong source interface hits
  * @lastuse: time of last use of the group (traffic or update)
  * @ttls: OIF TTL threshold array
+ * @refcount: reference count for this entry
  * @list: global entry list
  * @rcu: used for entry destruction
  */
@@ -138,6 +139,7 @@ struct mfc_cache {
 			unsigned long wrong_if;
 			unsigned long lastuse;
 			unsigned char ttls[MAXVIFS];
+			refcount_t refcount;
 		} res;
 	} mfc_un;
 	struct list_head list;
@@ -148,4 +150,23 @@ struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
 		   struct rtmsg *rtm, u32 portid);
+
+#ifdef CONFIG_IP_MROUTE
+void ipmr_cache_free(struct mfc_cache *mfc_cache);
+#else
+static inline void ipmr_cache_free(struct mfc_cache *mfc_cache)
+{
+}
+#endif
+
+static inline void ipmr_cache_put(struct mfc_cache *c)
+{
+	if (refcount_dec_and_test(&c->mfc_un.res.refcount))
+		ipmr_cache_free(c);
+}
+static inline void ipmr_cache_hold(struct mfc_cache *c)
+{
+	refcount_inc(&c->mfc_un.res.refcount);
+}
+
 #endif
-- 
cgit v1.2.3


From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Wed, 27 Sep 2017 08:23:13 +0200
Subject: ipmr: Add FIB notification access functions

Make the ipmr module register as a FIB notifier. To do that, implement both
the ipmr_seq_read and ipmr_dump ops.

The ipmr_seq_read op returns a sequence counter that is incremented on
every notification related operation done by the ipmr. To implement that,
add a sequence counter in the netns_ipv4 struct and increment it whenever a
new MFC route or VIF are added or deleted. The sequence operations are
protected by the RTNL lock.

The ipmr_dump iterates the list of MFC routes and the list of VIF entries
and sends notifications about them. The entries dump is done under RCU
where the VIF dump uses the mrt_lock too, as the vif->dev field can change
under RCU.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 10028f208efb..54c5cb82ddcb 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -5,6 +5,7 @@
 #include <linux/pim.h>
 #include <linux/rhashtable.h>
 #include <net/sock.h>
+#include <net/fib_notifier.h>
 #include <uapi/linux/mroute.h>
 
 #ifdef CONFIG_IP_MROUTE
@@ -58,6 +59,14 @@ struct vif_device {
 	int		link;			/* Physical interface index	*/
 };
 
+struct vif_entry_notifier_info {
+	struct fib_notifier_info info;
+	struct net_device *dev;
+	vifi_t vif_index;
+	unsigned short vif_flags;
+	u32 tb_id;
+};
+
 #define VIFF_STATIC 0x8000
 
 #define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
@@ -146,6 +155,12 @@ struct mfc_cache {
 	struct rcu_head	rcu;
 };
 
+struct mfc_entry_notifier_info {
+	struct fib_notifier_info info;
+	struct mfc_cache *mfc;
+	u32 tb_id;
+};
+
 struct rtmsg;
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
-- 
cgit v1.2.3


From c7c0bbeae9501a7e42f2fd306d6a6399aca688b6 Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Wed, 27 Sep 2017 08:23:15 +0200
Subject: net: ipmr: Add MFC offload indication

Allow drivers, registered to the fib notification chain indicate whether a
multicast MFC route is offloaded or not, similarly to unicast routes. The
indication of whether a route is offloaded is done using the mfc_flags
field on an mfc_cache struct, and the information is sent to the userspace
via the RTNetlink interface only.

Currently, MFC routes are either offloaded or not, thus there is no need to
add per-VIF offload indication.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 54c5cb82ddcb..5566580811ce 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -90,9 +90,11 @@ struct mr_table {
 
 /* mfc_flags:
  * MFC_STATIC - the entry was added statically (not by a routing daemon)
+ * MFC_OFFLOAD - the entry was offloaded to the hardware
  */
 enum {
 	MFC_STATIC = BIT(0),
+	MFC_OFFLOAD = BIT(1),
 };
 
 struct mfc_cache_cmp_arg {
-- 
cgit v1.2.3


From 478e4c2f0067d57d7c17059caafab026ca32084a Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Wed, 27 Sep 2017 08:23:16 +0200
Subject: net: mroute: Check if rule is a default rule

When the ipmr starts, it adds one default FIB rule that matches all packets
and sends them to the DEFAULT (multicast) FIB table. A more complex rule
can be added by user to specify that for a specific interface, a packet
should be look up at either an arbitrary table or according to the l3mdev
of the interface.

For drivers willing to offload the ipmr logic into a hardware but don't
want to offload all the FIB rules functionality, provide a function that
can indicate whether the FIB rule is the default multicast rule, thus only
one routing table is needed.

This way, a driver can register to the FIB notification chain, get
notifications about FIB rules added and trigger some kind of an internal
abort mechanism when a non default rule is added by the user.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index 5566580811ce..b072a84fbe1c 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -5,6 +5,7 @@
 #include <linux/pim.h>
 #include <linux/rhashtable.h>
 #include <net/sock.h>
+#include <net/fib_rules.h>
 #include <net/fib_notifier.h>
 #include <uapi/linux/mroute.h>
 
@@ -19,6 +20,7 @@ int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *);
 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
 int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
 int ip_mr_init(void);
+bool ipmr_rule_default(const struct fib_rule *rule);
 #else
 static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
 				       char __user *optval, unsigned int optlen)
@@ -46,6 +48,11 @@ static inline int ip_mroute_opt(int opt)
 {
 	return 0;
 }
+
+static inline bool ipmr_rule_default(const struct fib_rule *rule)
+{
+	return true;
+}
 #endif
 
 struct vif_device {
-- 
cgit v1.2.3


From 6ade97da601f8af793f6c7a861af754d0f0b6767 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 26 Sep 2017 23:12:28 +0300
Subject: arp: make arp_hdr_len() return unsigned int

Negative ARP header length are not a thing.

Constify arguments while I'm at it.

Space savings:

	add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3)
	function                        old     new   delta
	arpt_do_table                  1163    1160      -3

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_arp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index 3355efc89781..6756fea18b69 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -31,7 +31,7 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb)
 	return (struct arphdr *)skb_network_header(skb);
 }
 
-static inline int arp_hdr_len(struct net_device *dev)
+static inline unsigned int arp_hdr_len(const struct net_device *dev)
 {
 	switch (dev->type) {
 #if IS_ENABLED(CONFIG_FIREWIRE_NET)
-- 
cgit v1.2.3


From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:52 -0700
Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info

The patch adds name and load_time to struct bpf_prog_aux.  They
are also exported to bpf_prog_info.

The bpf_prog's name is passed by userspace during BPF_PROG_LOAD.
The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes
and the name stored in the kernel is always \0 terminated.

The kernel will reject name that contains characters other than
isalnum() and '_'.  It will also reject name that is not null
terminated.

The existing 'user->uid' of the bpf_prog_aux is also exported to
the bpf_prog_info as created_by_uid.

The existing 'used_maps' of the bpf_prog_aux is exported to
the newly added members 'nr_map_ids' and 'map_ids' of
the bpf_prog_info.  On the input, nr_map_ids tells how
big the userspace's map_ids buffer is.  On the output,
nr_map_ids tells the exact user_map_cnt and it will only
copy up to the userspace's map_ids buffer is allowed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b672c50f160..33ccc474fb04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,8 @@ struct bpf_prog_aux {
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
+	u64 load_time; /* ns since boottime */
+	u8 name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
-- 
cgit v1.2.3


From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:53 -0700
Subject: bpf: Add map_name to bpf_map_info

This patch allows userspace to specify a name for a map
during BPF_MAP_CREATE.

The map's name can later be exported to user space
via BPF_OBJ_GET_INFO_BY_FD.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33ccc474fb04..252f4bc9eb25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
+	u8 name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
-- 
cgit v1.2.3


From a3f5aa907340b5d7b54223ddbaa90410f168864d Mon Sep 17 00:00:00 2001
From: Alan Brady <alan.brady@intel.com>
Date: Fri, 14 Jul 2017 09:27:08 -0400
Subject: i40e: Enable VF to negotiate number of allocated queues

Currently the PF allocates a default number of queues for each VF and
cannot be changed.  This patch enables the VF to request a different
number of queues allocated to it.  This patch also adds a new virtchnl
op and capability flag to facilitate this negotiation.

After the PF receives a request message, it will set a requested number
of queues for that VF.  Then when the VF resets, its VSI will get a new
number of queues allocated to it.

This is a best effort request and since we only allocate a guaranteed
default number, if the VF tries to ask for more than the guaranteed
number, there may not be enough in HW to accommodate it unless other
queues for other VFs are freed. It should also be noted decreasing the
number queues allocated to a VF to below the default will NOT enable the
allocation of more than 32 VFs per PF and will not free queues guaranteed
to each VF by default.

Signed-off-by: Alan Brady <alan.brady@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/avf/virtchnl.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 2b038442c352..60e5d90cb18a 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -135,6 +135,7 @@ enum virtchnl_ops {
 	VIRTCHNL_OP_SET_RSS_HENA = 26,
 	VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27,
 	VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28,
+	VIRTCHNL_OP_REQUEST_QUEUES = 29,
 };
 
 /* This macro is used to generate a compilation error if a structure
@@ -235,6 +236,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_RSS_AQ		0x00000008
 #define VIRTCHNL_VF_OFFLOAD_RSS_REG		0x00000010
 #define VIRTCHNL_VF_OFFLOAD_WB_ON_ITR		0x00000020
+#define VIRTCHNL_VF_OFFLOAD_REQ_QUEUES		0x00000040
 #define VIRTCHNL_VF_OFFLOAD_VLAN		0x00010000
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING		0x00020000
 #define VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2	0x00040000
@@ -325,6 +327,21 @@ struct virtchnl_vsi_queue_config_info {
 	struct virtchnl_queue_pair_info qpair[1];
 };
 
+/* VIRTCHNL_OP_REQUEST_QUEUES
+ * VF sends this message to request the PF to allocate additional queues to
+ * this VF.  Each VF gets a guaranteed number of queues on init but asking for
+ * additional queues must be negotiated.  This is a best effort request as it
+ * is possible the PF does not have enough queues left to support the request.
+ * If the PF cannot support the number requested it will respond with the
+ * maximum number it is able to support; otherwise it will respond with the
+ * number requested.
+ */
+
+/* VF resource request */
+struct virtchnl_vf_res_request {
+	u16 num_queue_pairs;
+};
+
 VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_vsi_queue_config_info);
 
 /* VIRTCHNL_OP_CONFIG_IRQ_MAP
@@ -691,6 +708,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode,
 	case VIRTCHNL_OP_ENABLE_VLAN_STRIPPING:
 	case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING:
 		break;
+	case VIRTCHNL_OP_REQUEST_QUEUES:
+		valid_len = sizeof(struct virtchnl_vf_res_request);
+		break;
 	/* These are always errors coming from the VF. */
 	case VIRTCHNL_OP_EVENT:
 	case VIRTCHNL_OP_UNKNOWN:
-- 
cgit v1.2.3


From 2f657a600409f1961d67642fe384a9d4be71d36a Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 29 Sep 2017 17:19:20 -0400
Subject: net: dsa: change dsa_ptr for a dsa_port

With DSA, a master net device (CPU facing interface) has a dsa_ptr
pointer to which hangs a dsa_switch_tree. This is not correct because a
master interface is wired to a dedicated switch port, and because we can
theoretically have several master interfaces pointing to several CPU
ports of the same switch fabric.

Change the master interface's dsa_ptr for the CPU dsa_port pointer.
This is a step towards supporting multiple CPU ports.

Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f535779d9dc1..e1d6ef130611 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -55,7 +55,7 @@
 struct netpoll_info;
 struct device;
 struct phy_device;
-struct dsa_switch_tree;
+struct dsa_port;
 
 /* 802.11 specific */
 struct wireless_dev;
@@ -1752,7 +1752,7 @@ struct net_device {
 	struct vlan_info __rcu	*vlan_info;
 #endif
 #if IS_ENABLED(CONFIG_NET_DSA)
-	struct dsa_switch_tree	*dsa_ptr;
+	struct dsa_port		*dsa_ptr;
 #endif
 #if IS_ENABLED(CONFIG_TIPC)
 	struct tipc_bearer __rcu *tipc_ptr;
-- 
cgit v1.2.3


From 66b1bedf662518e9b6367990a87e9601b35a94c1 Mon Sep 17 00:00:00 2001
From: Avraham Stern <avraham.stern@intel.com>
Date: Fri, 29 Sep 2017 14:21:14 +0300
Subject: ieee80211: Add WFA TPC report element OUI type

Add Transmit Power Control OUI type definition for WLAN_OUI_MICROSOFT.

Signed-off-by: Avraham Stern <avraham.stern@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 55a604ad459f..ee6657a0ed69 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2445,6 +2445,7 @@ enum ieee80211_sa_query_action {
 #define WLAN_OUI_TYPE_MICROSOFT_WPA	1
 #define WLAN_OUI_TYPE_MICROSOFT_WMM	2
 #define WLAN_OUI_TYPE_MICROSOFT_WPS	4
+#define WLAN_OUI_TYPE_MICROSOFT_TPC	8
 
 /*
  * WMM/802.11e Tspec Element
-- 
cgit v1.2.3


From f2f2efb807d339513199b1bb771806c90cce83ae Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:28 +0300
Subject: byteorder: Move {cpu_to_be32, be32_to_cpu}_array() from Thunderbolt
 to core

We will be using these when communicating XDomain discovery protocol
over Thunderbolt link but they might be useful for other drivers as
well.

Make them available through byteorder/generic.h.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/byteorder/generic.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 89f67c1c3160..805d16654459 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -170,4 +170,20 @@ static inline void be64_add_cpu(__be64 *var, u64 val)
 	*var = cpu_to_be64(be64_to_cpu(*var) + val);
 }
 
+static inline void cpu_to_be32_array(__be32 *dst, const u32 *src, size_t len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = cpu_to_be32(src[i]);
+}
+
+static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = be32_to_cpu(src[i]);
+}
+
 #endif /* _LINUX_BYTEORDER_GENERIC_H */
-- 
cgit v1.2.3


From cdae7c07e3e3509eaabc18c1640a55dc5b99c179 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:30 +0300
Subject: thunderbolt: Add support for XDomain properties

Thunderbolt XDomain discovery protocol uses directories which contain
properties and other directories to exchange information about what
capabilities the remote host supports. This also includes identification
information like device ID and name.

This adds support for parsing and formatting these properties and
establishes an API drivers can use in addition to the core Thunderbolt
driver. This API is exposed in a new header: include/linux/thunderbolt.h.

This code is based on the work done by Amir Levy and Michael Jamet.

Signed-off-by: Michael Jamet <michael.jamet@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 89 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 include/linux/thunderbolt.h

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
new file mode 100644
index 000000000000..96561c1265ae
--- /dev/null
+++ b/include/linux/thunderbolt.h
@@ -0,0 +1,89 @@
+/*
+ * Thunderbolt service API
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Authors: Michael Jamet <michael.jamet@intel.com>
+ *          Mika Westerberg <mika.westerberg@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef THUNDERBOLT_H_
+#define THUNDERBOLT_H_
+
+#include <linux/list.h>
+#include <linux/uuid.h>
+
+/**
+ * struct tb_property_dir - XDomain property directory
+ * @uuid: Directory UUID or %NULL if root directory
+ * @properties: List of properties in this directory
+ *
+ * User needs to provide serialization if needed.
+ */
+struct tb_property_dir {
+	const uuid_t *uuid;
+	struct list_head properties;
+};
+
+enum tb_property_type {
+	TB_PROPERTY_TYPE_UNKNOWN = 0x00,
+	TB_PROPERTY_TYPE_DIRECTORY = 0x44,
+	TB_PROPERTY_TYPE_DATA = 0x64,
+	TB_PROPERTY_TYPE_TEXT = 0x74,
+	TB_PROPERTY_TYPE_VALUE = 0x76,
+};
+
+#define TB_PROPERTY_KEY_SIZE	8
+
+/**
+ * struct tb_property - XDomain property
+ * @list: Used to link properties together in a directory
+ * @key: Key for the property (always terminated).
+ * @type: Type of the property
+ * @length: Length of the property data in dwords
+ * @value: Property value
+ *
+ * Users use @type to determine which field in @value is filled.
+ */
+struct tb_property {
+	struct list_head list;
+	char key[TB_PROPERTY_KEY_SIZE + 1];
+	enum tb_property_type type;
+	size_t length;
+	union {
+		struct tb_property_dir *dir;
+		u8 *data;
+		char *text;
+		u32 immediate;
+	} value;
+};
+
+struct tb_property_dir *tb_property_parse_dir(const u32 *block,
+					      size_t block_len);
+ssize_t tb_property_format_dir(const struct tb_property_dir *dir, u32 *block,
+			       size_t block_len);
+struct tb_property_dir *tb_property_create_dir(const uuid_t *uuid);
+void tb_property_free_dir(struct tb_property_dir *dir);
+int tb_property_add_immediate(struct tb_property_dir *parent, const char *key,
+			      u32 value);
+int tb_property_add_data(struct tb_property_dir *parent, const char *key,
+			 const void *buf, size_t buflen);
+int tb_property_add_text(struct tb_property_dir *parent, const char *key,
+			 const char *text);
+int tb_property_add_dir(struct tb_property_dir *parent, const char *key,
+			struct tb_property_dir *dir);
+void tb_property_remove(struct tb_property *tb_property);
+struct tb_property *tb_property_find(struct tb_property_dir *dir,
+			const char *key, enum tb_property_type type);
+struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
+					 struct tb_property *prev);
+
+#define tb_property_for_each(dir, property)			\
+	for (property = tb_property_get_next(dir, NULL);	\
+	     property;						\
+	     property = tb_property_get_next(dir, property))
+
+#endif /* THUNDERBOLT_H_ */
-- 
cgit v1.2.3


From eaf8ff35a345449207ad116e2574c19780ec9a98 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:31 +0300
Subject: thunderbolt: Move enum tb_cfg_pkg_type to thunderbolt.h

These will be needed by Thunderbolt services when sending and receiving
XDomain control messages. While there change TB_CFG_PKG_PREPARE_TO_SLEEP
value to be decimal in order to be consistent with other members.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 96561c1265ae..b512b1e2b4f2 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -1,6 +1,7 @@
 /*
  * Thunderbolt service API
  *
+ * Copyright (C) 2014 Andreas Noever <andreas.noever@gmail.com>
  * Copyright (C) 2017, Intel Corporation
  * Authors: Michael Jamet <michael.jamet@intel.com>
  *          Mika Westerberg <mika.westerberg@linux.intel.com>
@@ -16,6 +17,22 @@
 #include <linux/list.h>
 #include <linux/uuid.h>
 
+enum tb_cfg_pkg_type {
+	TB_CFG_PKG_READ = 1,
+	TB_CFG_PKG_WRITE = 2,
+	TB_CFG_PKG_ERROR = 3,
+	TB_CFG_PKG_NOTIFY_ACK = 4,
+	TB_CFG_PKG_EVENT = 5,
+	TB_CFG_PKG_XDOMAIN_REQ = 6,
+	TB_CFG_PKG_XDOMAIN_RESP = 7,
+	TB_CFG_PKG_OVERRIDE = 8,
+	TB_CFG_PKG_RESET = 9,
+	TB_CFG_PKG_ICM_EVENT = 10,
+	TB_CFG_PKG_ICM_CMD = 11,
+	TB_CFG_PKG_ICM_RESP = 12,
+	TB_CFG_PKG_PREPARE_TO_SLEEP = 13,
+};
+
 /**
  * struct tb_property_dir - XDomain property directory
  * @uuid: Directory UUID or %NULL if root directory
-- 
cgit v1.2.3


From 9e99b9f4d5c36340dabda6d14053195b2a43796b Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:32 +0300
Subject: thunderbolt: Move thunderbolt domain structure to thunderbolt.h

These are needed by Thunderbolt services so move them to thunderbolt.h
to make sure they are available outside of drivers/thunderbolt.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index b512b1e2b4f2..910b1bf92112 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -14,7 +14,9 @@
 #ifndef THUNDERBOLT_H_
 #define THUNDERBOLT_H_
 
+#include <linux/device.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/uuid.h>
 
 enum tb_cfg_pkg_type {
@@ -33,6 +35,49 @@ enum tb_cfg_pkg_type {
 	TB_CFG_PKG_PREPARE_TO_SLEEP = 13,
 };
 
+/**
+ * enum tb_security_level - Thunderbolt security level
+ * @TB_SECURITY_NONE: No security, legacy mode
+ * @TB_SECURITY_USER: User approval required at minimum
+ * @TB_SECURITY_SECURE: One time saved key required at minimum
+ * @TB_SECURITY_DPONLY: Only tunnel Display port (and USB)
+ */
+enum tb_security_level {
+	TB_SECURITY_NONE,
+	TB_SECURITY_USER,
+	TB_SECURITY_SECURE,
+	TB_SECURITY_DPONLY,
+};
+
+/**
+ * struct tb - main thunderbolt bus structure
+ * @dev: Domain device
+ * @lock: Big lock. Must be held when accessing any struct
+ *	  tb_switch / struct tb_port.
+ * @nhi: Pointer to the NHI structure
+ * @ctl: Control channel for this domain
+ * @wq: Ordered workqueue for all domain specific work
+ * @root_switch: Root switch of this domain
+ * @cm_ops: Connection manager specific operations vector
+ * @index: Linux assigned domain number
+ * @security_level: Current security level
+ * @privdata: Private connection manager specific data
+ */
+struct tb {
+	struct device dev;
+	struct mutex lock;
+	struct tb_nhi *nhi;
+	struct tb_ctl *ctl;
+	struct workqueue_struct *wq;
+	struct tb_switch *root_switch;
+	const struct tb_cm_ops *cm_ops;
+	int index;
+	enum tb_security_level security_level;
+	unsigned long privdata[0];
+};
+
+extern struct bus_type tb_bus_type;
+
 /**
  * struct tb_property_dir - XDomain property directory
  * @uuid: Directory UUID or %NULL if root directory
-- 
cgit v1.2.3


From e69b71f8458b78a2ef44e3d07374a8f46e45123d Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:33 +0300
Subject: thunderbolt: Move tb_switch_phy_port_from_link() to thunderbolt.h

A Thunderbolt service might need to find the physical port from a link
the cable is connected to. For instance networking driver uses this
information to generate MAC address according the Apple ThunderboltIP
protocol.

Move this function to thunderbolt.h and rename it to
tb_phy_port_from_link() to reflect the fact that it does not take switch
as parameter.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 910b1bf92112..43b8d1e09341 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -78,6 +78,13 @@ struct tb {
 
 extern struct bus_type tb_bus_type;
 
+#define TB_LINKS_PER_PHY_PORT	2
+
+static inline unsigned int tb_phy_port_from_link(unsigned int link)
+{
+	return (link - 1) / TB_LINKS_PER_PHY_PORT;
+}
+
 /**
  * struct tb_property_dir - XDomain property directory
  * @uuid: Directory UUID or %NULL if root directory
-- 
cgit v1.2.3


From d1ff70241a275133e1a0258b7c23588b122276c8 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:34 +0300
Subject: thunderbolt: Add support for XDomain discovery protocol

When two hosts are connected over a Thunderbolt cable, there is a
protocol they can use to communicate capabilities supported by the host.
The discovery protocol uses automatically configured control channel
(ring 0) and is build on top of request/response transactions using
special XDomain primitives provided by the Thunderbolt base protocol.

The capabilities consists of a root directory block of basic properties
used for identification of the host, and then there can be zero or more
directories each describing a Thunderbolt service and its capabilities.

Once both sides have discovered what is supported the two hosts can
setup high-speed DMA paths and transfer data to the other side using
whatever protocol was agreed based on the properties. The software
protocol used to communicate which DMA paths to enable is service
specific.

This patch adds support for the XDomain discovery protocol to the
Thunderbolt bus. We model each remote host connection as a Linux XDomain
device. For each Thunderbolt service found supported on the XDomain
device, we create Linux Thunderbolt service device which Thunderbolt
service drivers can then bind to based on the protocol identification
information retrieved from the property directory describing the
service.

This code is based on the work done by Amir Levy and Michael Jamet.

Signed-off-by: Michael Jamet <michael.jamet@intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mod_devicetable.h |  26 +++++
 include/linux/thunderbolt.h     | 242 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 268 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 694cebb50f72..7625c3b81f84 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -683,5 +683,31 @@ struct fsl_mc_device_id {
 	const char obj_type[16];
 };
 
+/**
+ * struct tb_service_id - Thunderbolt service identifiers
+ * @match_flags: Flags used to match the structure
+ * @protocol_key: Protocol key the service supports
+ * @protocol_id: Protocol id the service supports
+ * @protocol_version: Version of the protocol
+ * @protocol_revision: Revision of the protocol software
+ * @driver_data: Driver specific data
+ *
+ * Thunderbolt XDomain services are exposed as devices where each device
+ * carries the protocol information the service supports. Thunderbolt
+ * XDomain service drivers match against that information.
+ */
+struct tb_service_id {
+	__u32 match_flags;
+	char protocol_key[8 + 1];
+	__u32 protocol_id;
+	__u32 protocol_version;
+	__u32 protocol_revision;
+	kernel_ulong_t driver_data;
+};
+
+#define TBSVC_MATCH_PROTOCOL_KEY	0x0001
+#define TBSVC_MATCH_PROTOCOL_ID		0x0002
+#define TBSVC_MATCH_PROTOCOL_VERSION	0x0004
+#define TBSVC_MATCH_PROTOCOL_REVISION	0x0008
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 43b8d1e09341..18c0e3d5e85c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -17,6 +17,7 @@
 #include <linux/device.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
+#include <linux/mod_devicetable.h>
 #include <linux/uuid.h>
 
 enum tb_cfg_pkg_type {
@@ -77,6 +78,8 @@ struct tb {
 };
 
 extern struct bus_type tb_bus_type;
+extern struct device_type tb_service_type;
+extern struct device_type tb_xdomain_type;
 
 #define TB_LINKS_PER_PHY_PORT	2
 
@@ -155,4 +158,243 @@ struct tb_property *tb_property_get_next(struct tb_property_dir *dir,
 	     property;						\
 	     property = tb_property_get_next(dir, property))
 
+int tb_register_property_dir(const char *key, struct tb_property_dir *dir);
+void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir);
+
+/**
+ * struct tb_xdomain - Cross-domain (XDomain) connection
+ * @dev: XDomain device
+ * @tb: Pointer to the domain
+ * @remote_uuid: UUID of the remote domain (host)
+ * @local_uuid: Cached local UUID
+ * @route: Route string the other domain can be reached
+ * @vendor: Vendor ID of the remote domain
+ * @device: Device ID of the demote domain
+ * @lock: Lock to serialize access to the following fields of this structure
+ * @vendor_name: Name of the vendor (or %NULL if not known)
+ * @device_name: Name of the device (or %NULL if not known)
+ * @is_unplugged: The XDomain is unplugged
+ * @resume: The XDomain is being resumed
+ * @transmit_path: HopID which the remote end expects us to transmit
+ * @transmit_ring: Local ring (hop) where outgoing packets are pushed
+ * @receive_path: HopID which we expect the remote end to transmit
+ * @receive_ring: Local ring (hop) where incoming packets arrive
+ * @service_ids: Used to generate IDs for the services
+ * @properties: Properties exported by the remote domain
+ * @property_block_gen: Generation of @properties
+ * @properties_lock: Lock protecting @properties.
+ * @get_properties_work: Work used to get remote domain properties
+ * @properties_retries: Number of times left to read properties
+ * @properties_changed_work: Work used to notify the remote domain that
+ *			     our properties have changed
+ * @properties_changed_retries: Number of times left to send properties
+ *				changed notification
+ * @link: Root switch link the remote domain is connected (ICM only)
+ * @depth: Depth in the chain the remote domain is connected (ICM only)
+ *
+ * This structure represents connection across two domains (hosts).
+ * Each XDomain contains zero or more services which are exposed as
+ * &struct tb_service objects.
+ *
+ * Service drivers may access this structure if they need to enumerate
+ * non-standard properties but they need hold @lock when doing so
+ * because properties can be changed asynchronously in response to
+ * changes in the remote domain.
+ */
+struct tb_xdomain {
+	struct device dev;
+	struct tb *tb;
+	uuid_t *remote_uuid;
+	const uuid_t *local_uuid;
+	u64 route;
+	u16 vendor;
+	u16 device;
+	struct mutex lock;
+	const char *vendor_name;
+	const char *device_name;
+	bool is_unplugged;
+	bool resume;
+	u16 transmit_path;
+	u16 transmit_ring;
+	u16 receive_path;
+	u16 receive_ring;
+	struct ida service_ids;
+	struct tb_property_dir *properties;
+	u32 property_block_gen;
+	struct delayed_work get_properties_work;
+	int properties_retries;
+	struct delayed_work properties_changed_work;
+	int properties_changed_retries;
+	u8 link;
+	u8 depth;
+};
+
+int tb_xdomain_enable_paths(struct tb_xdomain *xd, u16 transmit_path,
+			    u16 transmit_ring, u16 receive_path,
+			    u16 receive_ring);
+int tb_xdomain_disable_paths(struct tb_xdomain *xd);
+struct tb_xdomain *tb_xdomain_find_by_uuid(struct tb *tb, const uuid_t *uuid);
+
+static inline struct tb_xdomain *
+tb_xdomain_find_by_uuid_locked(struct tb *tb, const uuid_t *uuid)
+{
+	struct tb_xdomain *xd;
+
+	mutex_lock(&tb->lock);
+	xd = tb_xdomain_find_by_uuid(tb, uuid);
+	mutex_unlock(&tb->lock);
+
+	return xd;
+}
+
+static inline struct tb_xdomain *tb_xdomain_get(struct tb_xdomain *xd)
+{
+	if (xd)
+		get_device(&xd->dev);
+	return xd;
+}
+
+static inline void tb_xdomain_put(struct tb_xdomain *xd)
+{
+	if (xd)
+		put_device(&xd->dev);
+}
+
+static inline bool tb_is_xdomain(const struct device *dev)
+{
+	return dev->type == &tb_xdomain_type;
+}
+
+static inline struct tb_xdomain *tb_to_xdomain(struct device *dev)
+{
+	if (tb_is_xdomain(dev))
+		return container_of(dev, struct tb_xdomain, dev);
+	return NULL;
+}
+
+int tb_xdomain_response(struct tb_xdomain *xd, const void *response,
+			size_t size, enum tb_cfg_pkg_type type);
+int tb_xdomain_request(struct tb_xdomain *xd, const void *request,
+		       size_t request_size, enum tb_cfg_pkg_type request_type,
+		       void *response, size_t response_size,
+		       enum tb_cfg_pkg_type response_type,
+		       unsigned int timeout_msec);
+
+/**
+ * tb_protocol_handler - Protocol specific handler
+ * @uuid: XDomain messages with this UUID are dispatched to this handler
+ * @callback: Callback called with the XDomain message. Returning %1
+ *	      here tells the XDomain core that the message was handled
+ *	      by this handler and should not be forwared to other
+ *	      handlers.
+ * @data: Data passed with the callback
+ * @list: Handlers are linked using this
+ *
+ * Thunderbolt services can hook into incoming XDomain requests by
+ * registering protocol handler. Only limitation is that the XDomain
+ * discovery protocol UUID cannot be registered since it is handled by
+ * the core XDomain code.
+ *
+ * The @callback must check that the message is really directed to the
+ * service the driver implements.
+ */
+struct tb_protocol_handler {
+	const uuid_t *uuid;
+	int (*callback)(const void *buf, size_t size, void *data);
+	void *data;
+	struct list_head list;
+};
+
+int tb_register_protocol_handler(struct tb_protocol_handler *handler);
+void tb_unregister_protocol_handler(struct tb_protocol_handler *handler);
+
+/**
+ * struct tb_service - Thunderbolt service
+ * @dev: XDomain device
+ * @id: ID of the service (shown in sysfs)
+ * @key: Protocol key from the properties directory
+ * @prtcid: Protocol ID from the properties directory
+ * @prtcvers: Protocol version from the properties directory
+ * @prtcrevs: Protocol software revision from the properties directory
+ * @prtcstns: Protocol settings mask from the properties directory
+ *
+ * Each domain exposes set of services it supports as collection of
+ * properties. For each service there will be one corresponding
+ * &struct tb_service. Service drivers are bound to these.
+ */
+struct tb_service {
+	struct device dev;
+	int id;
+	const char *key;
+	u32 prtcid;
+	u32 prtcvers;
+	u32 prtcrevs;
+	u32 prtcstns;
+};
+
+static inline struct tb_service *tb_service_get(struct tb_service *svc)
+{
+	if (svc)
+		get_device(&svc->dev);
+	return svc;
+}
+
+static inline void tb_service_put(struct tb_service *svc)
+{
+	if (svc)
+		put_device(&svc->dev);
+}
+
+static inline bool tb_is_service(const struct device *dev)
+{
+	return dev->type == &tb_service_type;
+}
+
+static inline struct tb_service *tb_to_service(struct device *dev)
+{
+	if (tb_is_service(dev))
+		return container_of(dev, struct tb_service, dev);
+	return NULL;
+}
+
+/**
+ * tb_service_driver - Thunderbolt service driver
+ * @driver: Driver structure
+ * @probe: Called when the driver is probed
+ * @remove: Called when the driver is removed (optional)
+ * @shutdown: Called at shutdown time to stop the service (optional)
+ * @id_table: Table of service identifiers the driver supports
+ */
+struct tb_service_driver {
+	struct device_driver driver;
+	int (*probe)(struct tb_service *svc, const struct tb_service_id *id);
+	void (*remove)(struct tb_service *svc);
+	void (*shutdown)(struct tb_service *svc);
+	const struct tb_service_id *id_table;
+};
+
+#define TB_SERVICE(key, id)				\
+	.match_flags = TBSVC_MATCH_PROTOCOL_KEY |	\
+		       TBSVC_MATCH_PROTOCOL_ID,		\
+	.protocol_key = (key),				\
+	.protocol_id = (id)
+
+int tb_register_service_driver(struct tb_service_driver *drv);
+void tb_unregister_service_driver(struct tb_service_driver *drv);
+
+static inline void *tb_service_get_drvdata(const struct tb_service *svc)
+{
+	return dev_get_drvdata(&svc->dev);
+}
+
+static inline void tb_service_set_drvdata(struct tb_service *svc, void *data)
+{
+	dev_set_drvdata(&svc->dev, data);
+}
+
+static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc)
+{
+	return tb_to_xdomain(svc->dev.parent);
+}
+
 #endif /* THUNDERBOLT_H_ */
-- 
cgit v1.2.3


From 3b3d9f4da96493e4f68d0a80ab210763a24f8b33 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:37 +0300
Subject: thunderbolt: Export ring handling functions to modules

These are used by Thunderbolt services to send and receive frames over
the high-speed DMA rings.

We also put the functions to tb_ namespace to make sure we do not
collide with others and add missing kernel-doc comments for the exported
functions.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 158 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 18c0e3d5e85c..9ddb83ad890f 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -15,10 +15,12 @@
 #define THUNDERBOLT_H_
 
 #include <linux/device.h>
+#include <linux/idr.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/mod_devicetable.h>
 #include <linux/uuid.h>
+#include <linux/workqueue.h>
 
 enum tb_cfg_pkg_type {
 	TB_CFG_PKG_READ = 1,
@@ -397,4 +399,160 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc)
 	return tb_to_xdomain(svc->dev.parent);
 }
 
+/**
+ * struct tb_nhi - thunderbolt native host interface
+ * @lock: Must be held during ring creation/destruction. Is acquired by
+ *	  interrupt_work when dispatching interrupts to individual rings.
+ * @pdev: Pointer to the PCI device
+ * @iobase: MMIO space of the NHI
+ * @tx_rings: All Tx rings available on this host controller
+ * @rx_rings: All Rx rings available on this host controller
+ * @msix_ida: Used to allocate MSI-X vectors for rings
+ * @going_away: The host controller device is about to disappear so when
+ *		this flag is set, avoid touching the hardware anymore.
+ * @interrupt_work: Work scheduled to handle ring interrupt when no
+ *		    MSI-X is used.
+ * @hop_count: Number of rings (end point hops) supported by NHI.
+ */
+struct tb_nhi {
+	struct mutex lock;
+	struct pci_dev *pdev;
+	void __iomem *iobase;
+	struct tb_ring **tx_rings;
+	struct tb_ring **rx_rings;
+	struct ida msix_ida;
+	bool going_away;
+	struct work_struct interrupt_work;
+	u32 hop_count;
+};
+
+/**
+ * struct tb_ring - thunderbolt TX or RX ring associated with a NHI
+ * @lock: Lock serializing actions to this ring. Must be acquired after
+ *	  nhi->lock.
+ * @nhi: Pointer to the native host controller interface
+ * @size: Size of the ring
+ * @hop: Hop (DMA channel) associated with this ring
+ * @head: Head of the ring (write next descriptor here)
+ * @tail: Tail of the ring (complete next descriptor here)
+ * @descriptors: Allocated descriptors for this ring
+ * @queue: Queue holding frames to be transferred over this ring
+ * @in_flight: Queue holding frames that are currently in flight
+ * @work: Interrupt work structure
+ * @is_tx: Is the ring Tx or Rx
+ * @running: Is the ring running
+ * @irq: MSI-X irq number if the ring uses MSI-X. %0 otherwise.
+ * @vector: MSI-X vector number the ring uses (only set if @irq is > 0)
+ * @flags: Ring specific flags
+ * @sof_mask: Bit mask used to detect start of frame PDF
+ * @eof_mask: Bit mask used to detect end of frame PDF
+ */
+struct tb_ring {
+	struct mutex lock;
+	struct tb_nhi *nhi;
+	int size;
+	int hop;
+	int head;
+	int tail;
+	struct ring_desc *descriptors;
+	dma_addr_t descriptors_dma;
+	struct list_head queue;
+	struct list_head in_flight;
+	struct work_struct work;
+	bool is_tx:1;
+	bool running:1;
+	int irq;
+	u8 vector;
+	unsigned int flags;
+	u16 sof_mask;
+	u16 eof_mask;
+};
+
+/* Leave ring interrupt enabled on suspend */
+#define RING_FLAG_NO_SUSPEND	BIT(0)
+/* Configure the ring to be in frame mode */
+#define RING_FLAG_FRAME		BIT(1)
+/* Enable end-to-end flow control */
+#define RING_FLAG_E2E		BIT(2)
+
+struct ring_frame;
+typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled);
+
+/**
+ * struct ring_frame - For use with ring_rx/ring_tx
+ * @buffer_phy: DMA mapped address of the frame
+ * @callback: Callback called when the frame is finished
+ * @list: Frame is linked to a queue using this
+ * @size: Size of the frame in bytes (%0 means %4096)
+ * @flags: Flags for the frame (see &enum ring_desc_flags)
+ * @eof: End of frame protocol defined field
+ * @sof: Start of frame protocol defined field
+ */
+struct ring_frame {
+	dma_addr_t buffer_phy;
+	ring_cb callback;
+	struct list_head list;
+	u32 size:12;
+	u32 flags:12;
+	u32 eof:4;
+	u32 sof:4;
+};
+
+/* Minimum size for ring_rx */
+#define TB_FRAME_SIZE		0x100
+
+struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags);
+struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
+				 unsigned int flags, u16 sof_mask,
+				 u16 eof_mask);
+void tb_ring_start(struct tb_ring *ring);
+void tb_ring_stop(struct tb_ring *ring);
+void tb_ring_free(struct tb_ring *ring);
+
+int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame);
+
+/**
+ * tb_ring_rx() - enqueue a frame on an RX ring
+ * @ring: Ring to enqueue the frame
+ * @frame: Frame to enqueue
+ *
+ * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The
+ * buffer must contain at least %TB_FRAME_SIZE bytes.
+ *
+ * @frame->callback will be invoked with @frame->size, @frame->flags,
+ * @frame->eof, @frame->sof set once the frame has been received.
+ *
+ * If ring_stop() is called after the packet has been enqueued
+ * @frame->callback will be called with canceled set to true.
+ *
+ * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise.
+ */
+static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame)
+{
+	WARN_ON(ring->is_tx);
+	return __tb_ring_enqueue(ring, frame);
+}
+
+/**
+ * tb_ring_tx() - enqueue a frame on an TX ring
+ * @ring: Ring the enqueue the frame
+ * @frame: Frame to enqueue
+ *
+ * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size,
+ * @frame->eof and @frame->sof have to be set.
+ *
+ * @frame->callback will be invoked with once the frame has been transmitted.
+ *
+ * If ring_stop() is called after the packet has been enqueued @frame->callback
+ * will be called with canceled set to true.
+ *
+ * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise.
+ */
+static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
+{
+	WARN_ON(!ring->is_tx);
+	return __tb_ring_enqueue(ring, frame);
+}
+
 #endif /* THUNDERBOLT_H_ */
-- 
cgit v1.2.3


From 2a91ec63f8a11e70d4b958dd4df867fec0247179 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:38 +0300
Subject: thunderbolt: Move ring descriptor flags to thunderbolt.h

A Thunderbolt service driver might need to check if there was an error
with the descriptor when in frame mode. We also add two Rx specific
error flags RING_DESC_CRC_ERROR and RING_DESC_BUFFER_OVERRUN.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 9ddb83ad890f..e3b9af7be0ad 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -478,6 +478,24 @@ struct tb_ring {
 struct ring_frame;
 typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled);
 
+/**
+ * enum ring_desc_flags - Flags for DMA ring descriptor
+ * %RING_DESC_ISOCH: Enable isonchronous DMA (Tx only)
+ * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only)
+ * %RING_DESC_COMPLETED: Descriptor completed (set by NHI)
+ * %RING_DESC_POSTED: Always set this
+ * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun
+ * %RING_DESC_INTERRUPT: Request an interrupt on completion
+ */
+enum ring_desc_flags {
+	RING_DESC_ISOCH = 0x1,
+	RING_DESC_CRC_ERROR = 0x1,
+	RING_DESC_COMPLETED = 0x2,
+	RING_DESC_POSTED = 0x4,
+	RING_DESC_BUFFER_OVERRUN = 0x04,
+	RING_DESC_INTERRUPT = 0x8,
+};
+
 /**
  * struct ring_frame - For use with ring_rx/ring_tx
  * @buffer_phy: DMA mapped address of the frame
-- 
cgit v1.2.3


From 22b7de1000e66d739c431d6be4e7e97c69fa7c98 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:39 +0300
Subject: thunderbolt: Use spinlock in ring serialization

This makes it possible to enqueue frames also from atomic context which
is needed for example, when networking packets are sent over a
Thunderbolt cable.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index e3b9af7be0ad..cf9e42db780f 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -448,7 +448,7 @@ struct tb_nhi {
  * @eof_mask: Bit mask used to detect end of frame PDF
  */
 struct tb_ring {
-	struct mutex lock;
+	spinlock_t lock;
 	struct tb_nhi *nhi;
 	int size;
 	int hop;
-- 
cgit v1.2.3


From 59120e06101db72442acf4c8b364a0c76d8faa68 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:40 +0300
Subject: thunderbolt: Use spinlock in NHI serialization

This is needed because ring polling functionality can be called from
atomic contexts when networking and other high-speed traffic is
transferred over a Thunderbolt cable.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index cf9e42db780f..d59e3f9a35c4 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -415,7 +415,7 @@ static inline struct tb_xdomain *tb_service_parent(struct tb_service *svc)
  * @hop_count: Number of rings (end point hops) supported by NHI.
  */
 struct tb_nhi {
-	struct mutex lock;
+	spinlock_t lock;
 	struct pci_dev *pdev;
 	void __iomem *iobase;
 	struct tb_ring **tx_rings;
-- 
cgit v1.2.3


From 4ffe722eefcb07c76701f03e0d759fbaecedf79f Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:41 +0300
Subject: thunderbolt: Add polling mode for rings

In order to support things like networking over Thunderbolt cable, there
needs to be a way to switch the ring to a mode where it can be polled
with the interrupt masked. We implement such mode so that the caller can
allocate a ring by passing pointer to a function that is then called
when an interrupt is triggered. Completed frames can be fetched using
tb_ring_poll() and the interrupt can be re-enabled when the caller is
finished with polling by using tb_ring_poll_complete().

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index d59e3f9a35c4..36925e3aec7c 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -446,6 +446,9 @@ struct tb_nhi {
  * @flags: Ring specific flags
  * @sof_mask: Bit mask used to detect start of frame PDF
  * @eof_mask: Bit mask used to detect end of frame PDF
+ * @start_poll: Called when ring interrupt is triggered to start
+ *		polling. Passing %NULL keeps the ring in interrupt mode.
+ * @poll_data: Data passed to @start_poll
  */
 struct tb_ring {
 	spinlock_t lock;
@@ -466,6 +469,8 @@ struct tb_ring {
 	unsigned int flags;
 	u16 sof_mask;
 	u16 eof_mask;
+	void (*start_poll)(void *data);
+	void *poll_data;
 };
 
 /* Leave ring interrupt enabled on suspend */
@@ -499,7 +504,7 @@ enum ring_desc_flags {
 /**
  * struct ring_frame - For use with ring_rx/ring_tx
  * @buffer_phy: DMA mapped address of the frame
- * @callback: Callback called when the frame is finished
+ * @callback: Callback called when the frame is finished (optional)
  * @list: Frame is linked to a queue using this
  * @size: Size of the frame in bytes (%0 means %4096)
  * @flags: Flags for the frame (see &enum ring_desc_flags)
@@ -522,8 +527,8 @@ struct ring_frame {
 struct tb_ring *tb_ring_alloc_tx(struct tb_nhi *nhi, int hop, int size,
 				 unsigned int flags);
 struct tb_ring *tb_ring_alloc_rx(struct tb_nhi *nhi, int hop, int size,
-				 unsigned int flags, u16 sof_mask,
-				 u16 eof_mask);
+				 unsigned int flags, u16 sof_mask, u16 eof_mask,
+				 void (*start_poll)(void *), void *poll_data);
 void tb_ring_start(struct tb_ring *ring);
 void tb_ring_stop(struct tb_ring *ring);
 void tb_ring_free(struct tb_ring *ring);
@@ -535,8 +540,8 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame);
  * @ring: Ring to enqueue the frame
  * @frame: Frame to enqueue
  *
- * @frame->buffer, @frame->buffer_phy and @frame->callback have to be set. The
- * buffer must contain at least %TB_FRAME_SIZE bytes.
+ * @frame->buffer, @frame->buffer_phy have to be set. The buffer must
+ * contain at least %TB_FRAME_SIZE bytes.
  *
  * @frame->callback will be invoked with @frame->size, @frame->flags,
  * @frame->eof, @frame->sof set once the frame has been received.
@@ -557,8 +562,8 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame)
  * @ring: Ring the enqueue the frame
  * @frame: Frame to enqueue
  *
- * @frame->buffer, @frame->buffer_phy, @frame->callback, @frame->size,
- * @frame->eof and @frame->sof have to be set.
+ * @frame->buffer, @frame->buffer_phy, @frame->size, @frame->eof and
+ * @frame->sof have to be set.
  *
  * @frame->callback will be invoked with once the frame has been transmitted.
  *
@@ -573,4 +578,8 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
 	return __tb_ring_enqueue(ring, frame);
 }
 
+/* Used only when the ring is in polling mode */
+struct ring_frame *tb_ring_poll(struct tb_ring *ring);
+void tb_ring_poll_complete(struct tb_ring *ring);
+
 #endif /* THUNDERBOLT_H_ */
-- 
cgit v1.2.3


From 3304559e353f098d7e0ed5ca981e26c406513e12 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 2 Oct 2017 13:38:42 +0300
Subject: thunderbolt: Add function to retrieve DMA device for the ring

This is needed when Thunderbolt service drivers need to DMA map memory
before it is passed down to the ring.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Michael Jamet <michael.jamet@intel.com>
Reviewed-by: Yehezkel Bernat <yehezkel.bernat@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/thunderbolt.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 36925e3aec7c..7b69853188b1 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -19,6 +19,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/mod_devicetable.h>
+#include <linux/pci.h>
 #include <linux/uuid.h>
 #include <linux/workqueue.h>
 
@@ -582,4 +583,16 @@ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame)
 struct ring_frame *tb_ring_poll(struct tb_ring *ring);
 void tb_ring_poll_complete(struct tb_ring *ring);
 
+/**
+ * tb_ring_dma_device() - Return device used for DMA mapping
+ * @ring: Ring whose DMA device is retrieved
+ *
+ * Use this function when you are mapping DMA for buffers that are
+ * passed to the ring for sending/receiving.
+ */
+static inline struct device *tb_ring_dma_device(struct tb_ring *ring)
+{
+	return &ring->nhi->pdev->dev;
+}
+
 #endif /* THUNDERBOLT_H_ */
-- 
cgit v1.2.3


From abf4bb6b63d0a54266f8e7eff3720c1974063971 Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Tue, 3 Oct 2017 09:58:06 +0200
Subject: skbuff: Add the offload_mr_fwd_mark field

Similarly to the offload_fwd_mark field, the offload_mr_fwd_mark field is
used to allow partial offloading of MFC multicast routes.

Switchdev drivers can offload MFC multicast routes to the hardware by
registering to the FIB notification chain. When one of the route output
interfaces is not offload-able, i.e. has different parent ID, the route
cannot be fully offloaded by the hardware. Examples to non-offload-able
devices are a management NIC, dummy device, pimreg device, etc.

Similar problem exists in the bridge module, as one bridge can hold
interfaces with different parent IDs. At the bridge, the problem is solved
by the offload_fwd_mark skb field.

Currently, when a route cannot go through full offload, the only solution
for a switchdev driver is not to offload it at all and let the packet go
through slow path.

Using the offload_mr_fwd_mark field, a driver can indicate that a packet
was already forwarded by hardware to all the devices with the same parent
ID as the input device. Further patches in this patch-set are going to
enhance ipmr to skip multicast forwarding to devices with the same parent
ID if a packets is marked with that field.

The reason why the already existing "offload_fwd_mark" bit cannot be used
is that a switchdev driver would want to make the distinction between a
packet that has already gone through L2 forwarding but did not go through
multicast forwarding, and a packet that has already gone through both L2
and multicast forwarding.

For example: when a packet is ingressing from a switchport enslaved to a
bridge, which is configured with multicast forwarding, the following
scenarios are possible:
 - The packet can be trapped to the CPU due to exception while multicast
   forwarding (for example, MTU error). In that case, it had already gone
   through L2 forwarding in the hardware, thus A switchdev driver would
   want to set the skb->offload_fwd_mark and not the
   skb->offload_mr_fwd_mark.
 - The packet can also be trapped due to a pimreg/dummy device used as one
   of the output interfaces. In that case, it can go through both L2 and
   (partial) multicast forwarding inside the hardware, thus a switchdev
   driver would want to set both the skb->offload_fwd_mark and
   skb->offload_mr_fwd_mark.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellaox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 19e64bfb1a66..ada821466e88 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -772,6 +772,7 @@ struct sk_buff {
 	__u8			remcsum_offload:1;
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
+	__u8			offload_mr_fwd_mark:1;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	__u8			tc_skip_classify:1;
-- 
cgit v1.2.3


From 5d8b3e69fc5e5ccafc9db1251bb7c78a8622fddd Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Tue, 3 Oct 2017 09:58:07 +0200
Subject: ipv4: ipmr: Add the parent ID field to VIF struct

In order to allow the ipmr module to do partial multicast forwarding
according to the device parent ID, add the device parent ID field to the
VIF struct. This way, the forwarding path can use the parent ID field
without invoking switchdev calls, which requires the RTNL lock.

When a new VIF is added, set the device parent ID field in it by invoking
the switchdev_port_attr_get call.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index b072a84fbe1c..8242d05df35e 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -57,6 +57,7 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule)
 
 struct vif_device {
 	struct net_device 	*dev;			/* Device we are using */
+	struct netdev_phys_item_id dev_parent_id;	/* Device parent ID    */
 	unsigned long	bytes_in,bytes_out;
 	unsigned long	pkt_in,pkt_out;		/* Statistics 			*/
 	unsigned long	rate_limit;		/* Traffic shaping (NI) 	*/
-- 
cgit v1.2.3


From 6c5570016b972d9b1f0f6c2dca9cc0422b1f92bf Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 2 Oct 2017 23:50:05 +0200
Subject: net: core: decouple ifalias get/set from rtnl lock

Device alias can be set by either rtnetlink (rtnl is held) or sysfs.

rtnetlink hold the rtnl mutex, sysfs acquires it for this purpose.
Add an extra mutex for it and use rcu to protect concurrent accesses.

This allows the sysfs path to not take rtnl and would later allow
to not hold it when dumping ifalias.

Based on suggestion from Eric Dumazet.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e1d6ef130611..d04424cfffba 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -826,6 +826,11 @@ struct xfrmdev_ops {
 };
 #endif
 
+struct dev_ifalias {
+	struct rcu_head rcuhead;
+	char ifalias[];
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1632,7 +1637,7 @@ enum netdev_priv_flags {
 struct net_device {
 	char			name[IFNAMSIZ];
 	struct hlist_node	name_hlist;
-	char 			*ifalias;
+	struct dev_ifalias	__rcu *ifalias;
 	/*
 	 *	I/O specific fields
 	 *	FIXME: Merge these and struct ifmap into one
@@ -3275,6 +3280,7 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags,
 			unsigned int gchanges);
 int dev_change_name(struct net_device *, const char *);
 int dev_set_alias(struct net_device *, const char *, size_t);
+int dev_get_alias(const struct net_device *, char *, size_t);
 int dev_change_net_namespace(struct net_device *, struct net *, const char *);
 int __dev_set_mtu(struct net_device *, int);
 int dev_set_mtu(struct net_device *, int);
-- 
cgit v1.2.3


From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:21 -0700
Subject: bpf: multi program support for cgroup+bpf

introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple
bpf programs to a cgroup.

The difference between three possible flags for BPF_PROG_ATTACH command:
- NONE(default): No further bpf programs allowed in the subtree.
- BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
  the program in this cgroup yields to sub-cgroup program.
- BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
  that cgroup program gets run in addition to the program in this cgroup.

NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't
change their behavior. It only clarifies the semantics in relation
to new flag.

Only one program is allowed to be attached to a cgroup with
NONE or BPF_F_ALLOW_OVERRIDE flag.
Multiple programs are allowed to be attached to a cgroup with
BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
(those that were attached first, run first)
The programs of sub-cgroup are executed first, then programs of
this cgroup and then programs of parent cgroup.
All eligible programs are executed regardless of return code from
earlier programs.

To allow efficient execution of multiple programs attached to a cgroup
and to avoid penalizing cgroups without any programs attached
introduce 'struct bpf_prog_array' which is RCU protected array
of pointers to bpf programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 46 ++++++++++++++++++++++++++++++----------------
 include/linux/bpf.h        | 32 ++++++++++++++++++++++++++++++++
 include/linux/filter.h     |  2 +-
 3 files changed, 63 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d41d40ac3efd..102e56fbb6de 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -14,27 +14,42 @@ struct bpf_sock_ops_kern;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_prog_list {
+	struct list_head node;
+	struct bpf_prog *prog;
+};
+
+struct bpf_prog_array;
+
 struct cgroup_bpf {
-	/*
-	 * Store two sets of bpf_prog pointers, one for programs that are
-	 * pinned directly to this cgroup, and one for those that are effective
-	 * when this cgroup is accessed.
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
-	bool disallow_override[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_BPF_ATTACH_TYPE];
+	u32 flags[MAX_BPF_ATTACH_TYPE];
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array __rcu *inactive;
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+int cgroup_bpf_inherit(struct cgroup *cgrp);
 
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool overridable);
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
 
-/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable);
+/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 
 struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
-static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
-				      struct cgroup *parent) {}
+static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 252f4bc9eb25..a6964b75f070 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
 
+/* an array of programs to be executed under rcu_lock.
+ *
+ * Typical usage:
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ *
+ * the structure returned by bpf_prog_array_alloc() should be populated
+ * with program pointers and the last pointer must be NULL.
+ * The user has to keep refcnt on the program and make sure the program
+ * is removed from the array before bpf_prog_put().
+ * The 'struct bpf_prog_array *' should only be replaced with xchg()
+ * since other cpus are walking the array of pointers in parallel.
+ */
+struct bpf_prog_array {
+	struct rcu_head rcu;
+	struct bpf_prog *progs[0];
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog **_prog;		\
+		u32 _ret = 1;				\
+		rcu_read_lock();			\
+		_prog = rcu_dereference(array)->progs;	\
+		for (; *_prog; _prog++)			\
+			_ret &= func(*_prog, ctx);	\
+		rcu_read_unlock();			\
+		_ret;					\
+	 })
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 911d454af107..2d2db394b0ca 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -481,7 +481,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
-- 
cgit v1.2.3


From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:22 -0700
Subject: bpf: introduce BPF_PROG_QUERY command

introduce BPF_PROG_QUERY command to retrieve a set of either
attached programs to given cgroup or a set of effective programs
that will execute for events within a cgroup

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 4 ++++
 include/linux/bpf.h        | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 102e56fbb6de..359b6f5d3d90 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 
 /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
 int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a6964b75f070..a67daea731ab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -260,6 +260,9 @@ struct bpf_prog_array {
 
 struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt);
 
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	({						\
-- 
cgit v1.2.3


From 6621dd29eb9b5e6774ec7a9a75161352fdea47fc Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Tue, 3 Oct 2017 13:53:23 +0200
Subject: dev: advertise the new nsid when the netns iface changes

x-netns interfaces are bound to two netns: the link netns and the upper
netns. Usually, this kind of interfaces is created in the link netns and
then moved to the upper netns. At the end, the interface is visible only
in the upper netns. The link nsid is advertised via netlink in the upper
netns, thus the user always knows where is the link part.

There is no such mechanism in the link netns. When the interface is moved
to another netns, the user cannot "follow" it.
This patch adds a new netlink attribute which helps to follow an interface
which moves to another netns. When the interface is unregistered, the new
nsid is advertised. If the interface is a x-netns interface (ie
rtnl_link_ops->get_link_net is defined), the nsid is allocated if needed.

CC: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index dea59c8eec54..1251638e60d3 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -17,9 +17,11 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 			      u32 id, long expires, u32 error);
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
+void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
+			 gfp_t flags, int *new_nsid);
 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 				       unsigned change, u32 event,
-				       gfp_t flags);
+				       gfp_t flags, int *new_nsid);
 void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
 		       gfp_t flags);
 
-- 
cgit v1.2.3


From 51d0c04795a4b5d9a188336884887a9d394a94b0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 4 Oct 2017 17:48:45 -0700
Subject: net: Add extack to netdev_notifier_info

Add netlink_ext_ack to netdev_notifier_info to allow notifier
handlers to return errors to userspace.

Clean up the initialization in dev.c such that extack is easily
added in subsequent patches where relevant. Specifically, remove
the init call in call_netdevice_notifiers_info and have callers
initalize on stack when info is declared.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d04424cfffba..05fcaba4b0d9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2309,7 +2309,8 @@ int register_netdevice_notifier(struct notifier_block *nb);
 int unregister_netdevice_notifier(struct notifier_block *nb);
 
 struct netdev_notifier_info {
-	struct net_device *dev;
+	struct net_device	*dev;
+	struct netlink_ext_ack	*extack;
 };
 
 struct netdev_notifier_change_info {
@@ -2334,6 +2335,7 @@ static inline void netdev_notifier_info_init(struct netdev_notifier_info *info,
 					     struct net_device *dev)
 {
 	info->dev = dev;
+	info->extack = NULL;
 }
 
 static inline struct net_device *
@@ -2342,6 +2344,12 @@ netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
 	return info->dev;
 }
 
+static inline struct netlink_ext_ack *
+netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
+{
+	return info->extack;
+}
+
 int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
 
 
-- 
cgit v1.2.3


From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 4 Oct 2017 17:48:46 -0700
Subject: net: Add extack to ndo_add_slave

Pass extack to do_set_master and down to ndo_add_slave

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 05fcaba4b0d9..368a5064a487 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1246,7 +1246,8 @@ struct net_device_ops {
 						     u32 flow_id);
 #endif
 	int			(*ndo_add_slave)(struct net_device *dev,
-						 struct net_device *slave_dev);
+						 struct net_device *slave_dev,
+						 struct netlink_ext_ack *extack);
 	int			(*ndo_del_slave)(struct net_device *dev,
 						 struct net_device *slave_dev);
 	netdev_features_t	(*ndo_fix_features)(struct net_device *dev,
-- 
cgit v1.2.3


From 42ab19ee90292993370a30ad242599d75a3b749e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 4 Oct 2017 17:48:47 -0700
Subject: net: Add extack to upper device linking

Add extack arg to netdev_upper_dev_link and netdev_master_upper_dev_link

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_macvlan.h | 3 ++-
 include/linux/netdevice.h  | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index c9ec1343d187..10e319f41fb1 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -72,7 +72,8 @@ static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
 extern void macvlan_common_setup(struct net_device *dev);
 
 extern int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
-				  struct nlattr *tb[], struct nlattr *data[]);
+				  struct nlattr *tb[], struct nlattr *data[],
+				  struct netlink_ext_ack *extack);
 
 extern void macvlan_count_rx(const struct macvlan_dev *vlan,
 			     unsigned int len, bool success,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 368a5064a487..31bb3010c69b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3919,10 +3919,12 @@ void *netdev_adjacent_get_private(struct list_head *adj_list);
 void *netdev_lower_get_first_private_rcu(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
-int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev);
+int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
+			  struct netlink_ext_ack *extack);
 int netdev_master_upper_dev_link(struct net_device *dev,
 				 struct net_device *upper_dev,
-				 void *upper_priv, void *upper_info);
+				 void *upper_priv, void *upper_info,
+				 struct netlink_ext_ack *extack);
 void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev);
 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
-- 
cgit v1.2.3


From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 4 Oct 2017 12:59:58 -0700
Subject: tcp: new list for sent but unacked skbs for RACK recovery

This patch adds a new queue (list) that tracks the sent but not yet
acked or SACKed skbs for a TCP connection. The list is chronologically
ordered by skb->skb_mstamp (the head is the oldest sent skb).

This list will be used to optimize TCP Rack recovery, which checks
an skb's timestamp to judge if it has been lost and needs to be
retransmitted. Since TCP write queue is ordered by sequence instead
of sent time, RACK has to scan over the write queue to catch all
eligible packets to detect lost retransmission, and iterates through
SACKed skbs repeatedly.

Special cares for rare events:
1. TCP repair fakes skb transmission so the send queue needs adjusted
2. SACK reneging would require re-inserting SACKed skbs into the
   send queue. For now I believe it's not worth the complexity to
   make RACK work perfectly on SACK reneging, so we do nothing here.
3. Fast Open: currently for non-TFO, send-queue correctly queues
   the pure SYN packet. For TFO which queues a pure SYN and
   then a data packet, send-queue only queues the data packet but
   not the pure SYN due to the structure of TFO code. This is okay
   because the SYN receiver would never respond with a SACK on a
   missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK).

In order to not grow sk_buff, we use an union for the new list and
_skb_refdst/destructor fields. This is a bit complicated because
we need to make sure _skb_refdst and destructor are properly zeroed
before skb is cloned/copied at transmit, and before being freed.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 11 +++++++++--
 include/linux/tcp.h    |  1 +
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ada821466e88..01a985937867 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -617,6 +617,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@nf_trace: netfilter packet trace flag
  *	@protocol: Packet protocol from driver
  *	@destructor: Destruct function
+ *	@tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
  *	@_nfct: Associated connection, if any (with nfctinfo bits)
  *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
  *	@skb_iif: ifindex of device we arrived on
@@ -686,8 +687,14 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_refdst;
-	void			(*destructor)(struct sk_buff *skb);
+	union {
+		struct {
+			unsigned long	_skb_refdst;
+			void		(*destructor)(struct sk_buff *skb);
+		};
+		struct list_head	tcp_tsorted_anchor;
+	};
+
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4aa40ef02d32..1d2c44e09e31 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -191,6 +191,7 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
+	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 
 	u32	snd_wl1;	/* Sequence for window update		*/
 	u32	snd_wnd;	/* The window we expect to receive	*/
-- 
cgit v1.2.3


From 18a4c0eab2623cc95be98a1e6af1ad18e7695977 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 5 Oct 2017 22:21:21 -0700
Subject: net: add rb_to_skb() and other rb tree helpers

Geeralize private netem_rb_to_skb()

TCP rtx queue will soon be converted to rb-tree,
so we will need skb_rbtree_walk() helpers.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 01a985937867..03634ec2f918 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 	return __skb_grow(skb, len);
 }
 
+#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
+#define skb_rb_first(root) rb_to_skb(rb_first(root))
+#define skb_rb_last(root)  rb_to_skb(rb_last(root))
+#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
+#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;					\
 		     skb != (struct sk_buff *)(queue);				\
@@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
 		for (; skb != (struct sk_buff *)(queue);			\
 		     skb = skb->next)
 
+#define skb_rbtree_walk(skb, root)						\
+		for (skb = skb_rb_first(root); skb != NULL;			\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from(skb)						\
+		for (; skb != NULL;						\
+		     skb = skb_rb_next(skb))
+
+#define skb_rbtree_walk_from_safe(skb, tmp)					\
+		for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);	\
+		     skb = tmp)
+
 #define skb_queue_walk_from_safe(queue, skb, tmp)				\
 		for (tmp = skb->next;						\
 		     skb != (struct sk_buff *)(queue);				\
-- 
cgit v1.2.3


From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:19 -0700
Subject: bpf: perf event change needed for subsequent bpf helpers

This patch does not impact existing functionalities.
It contains the changes in perf event area needed for
subsequent bpf_perf_event_read_value and
bpf_perf_prog_read_value helpers.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..79b18a20cf5d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
 	struct pt_regs *regs;
 	struct perf_sample_data *data;
+	struct perf_event *event;
 };
 
 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+					u64 *enabled, u64 *running)
 {
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From 64237470ddf97b63155fbd272c9e743e01d5f514 Mon Sep 17 00:00:00 2001
From: Lin Zhang <xiaolou4617@gmail.com>
Date: Fri, 6 Oct 2017 01:37:29 +0800
Subject: net: phonet: mark header_ops as const

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_phonet.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/if_phonet.h b/include/linux/if_phonet.h
index bbcdb0a767d8..a118ee4a8428 100644
--- a/include/linux/if_phonet.h
+++ b/include/linux/if_phonet.h
@@ -10,5 +10,5 @@
 
 #include <uapi/linux/if_phonet.h>
 
-extern struct header_ops phonet_header_ops;
+extern const struct header_ops phonet_header_ops;
 #endif
-- 
cgit v1.2.3


From 067cae47771c864604969fd902efe10916e0d79c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:12 -0700
Subject: bpf: Use char in prog and map name

Instead of u8, use char for prog and map name.  It can avoid the
userspace tool getting compiler's signess warning.  The
bpf_prog_aux, bpf_map, bpf_attr, bpf_prog_info and
bpf_map_info are changed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a67daea731ab..bc7da2ddfcaf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
-	u8 name[BPF_OBJ_NAME_LEN];
+	char name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
@@ -189,7 +189,7 @@ struct bpf_prog_aux {
 	struct bpf_prog *prog;
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
-	u8 name[BPF_OBJ_NAME_LEN];
+	char name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
-- 
cgit v1.2.3


From 821f1b21cabb46827ce39ddf82e2789680b5042a Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Fri, 6 Oct 2017 22:12:37 -0700
Subject: bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd
 flood

This patch adds a new bridge port flag BR_NEIGH_SUPPRESS to
suppress arp and nd flood on bridge ports. It implements
rfc7432, section 10.
https://tools.ietf.org/html/rfc7432#section-10
for ethernet VPN deployments. It is similar to the existing
BR_PROXYARP* flags but has a few semantic differences to conform
to EVPN standard. Unlike the existing flags, this new flag suppresses
flood of all neigh discovery packets (arp and nd) to tunnel ports.
Supports both vlan filtering and non-vlan filtering bridges.

In case of EVPN, it is mainly used to avoid flooding
of arp and nd packets to tunnel ports like vxlan.

This patch adds netlink and sysfs support to set this bridge port
flag.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 3cd18ac0697f..316ee113a220 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -49,6 +49,7 @@ struct br_ip_list {
 #define BR_MULTICAST_TO_UNICAST	BIT(12)
 #define BR_VLAN_TUNNEL		BIT(13)
 #define BR_BCAST_FLOOD		BIT(14)
+#define BR_NEIGH_SUPPRESS	BIT(15)
 
 #define BR_DEFAULT_AGEING_TIME	(300 * HZ)
 
-- 
cgit v1.2.3


From 0912bda436388a02c72164b4b490b578e64c012e Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Mon, 9 Oct 2017 11:15:32 +0200
Subject: net: bridge: Export bridge multicast router state

Add an access function that, given a bridge netdevice, returns whether the
bridge device is currently an mrouter or not. The function uses the already
existing br_multicast_is_router function to check that.

This function is needed in order to allow ports that join an already
existing bridge to know the current mrouter state of the bridge device.
Together with the bridge device mrouter ports switchdev notifications, it
is possible to have full offloading of the semantics of the bridge device
mcast router state.

Due to the fact that the bridge multicast router status can change in
packet RX path, take the multicast_router bridge spinlock to protect the
read.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_bridge.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 316ee113a220..02639ebea2f0 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -64,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
 bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto);
 bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto);
 bool br_multicast_enabled(const struct net_device *dev);
+bool br_multicast_router(const struct net_device *dev);
 #else
 static inline int br_multicast_list_adjacent(struct net_device *dev,
 					     struct list_head *br_ip_list)
@@ -84,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev)
 {
 	return false;
 }
+static inline bool br_multicast_router(const struct net_device *dev)
+{
+	return false;
+}
 #endif
 
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
-- 
cgit v1.2.3


From ed468ebee04ffba0231a8f50616bdb250752a891 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Mon, 9 Oct 2017 12:37:44 +0300
Subject: qed: Add ll2 ability of opening a secondary queue

When more than one ll2 queue is opened ( that is not an OOO queue )
ll2 code does not have enough information to determine whether
the queue is the main one or not, so a new field is added to the
acquire input data to expose the control of determining whether
the queue is the main queue or a secondary queue.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_ll2_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 89fa0bbd54f3..d7cca590b743 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -171,6 +171,7 @@ struct qed_ll2_acquire_data_inputs {
 	enum qed_ll2_tx_dest tx_dest;
 	enum qed_ll2_error_handle ai_err_packet_too_big;
 	enum qed_ll2_error_handle ai_err_no_buf;
+	bool secondary_queue;
 	u8 gsi_enable;
 };
 
-- 
cgit v1.2.3


From 77caa792f5d8e4ecc88eb1cf4b9c478c07e0ec57 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Mon, 9 Oct 2017 12:37:45 +0300
Subject: qed: Add ll2 option for dropping a tx packet

The option of sending a packet on the ll2 and dropping it exists in
hardware and was not used until now, thus not exposed.
The iWARP unaligned MPA flow requires this functionality for
flushing the tx queue.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_ll2_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index d7cca590b743..95fdf02a3bbe 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type {
 enum qed_ll2_tx_dest {
 	QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */
 	QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */
+	QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */
 	QED_LL2_TX_DEST_MAX
 };
 
-- 
cgit v1.2.3


From 6f34a284f36399501fcc034dc4522a2d8d9fa6c9 Mon Sep 17 00:00:00 2001
From: Michal Kalderon <Michal.Kalderon@cavium.com>
Date: Mon, 9 Oct 2017 12:37:48 +0300
Subject: qed: Add LL2 slowpath handling

For iWARP unaligned MPA flow, a slowpath event of flushing an
MPA connection that entered an unaligned state is required.
The flush ramrod is received on the ll2 queue, and a pre-registered
callback function is called to handle the flush event.

Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
Signed-off-by: Ariel Elior <Ariel.Elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_ll2_if.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h
index 95fdf02a3bbe..e755954d85fd 100644
--- a/include/linux/qed/qed_ll2_if.h
+++ b/include/linux/qed/qed_ll2_if.h
@@ -151,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt,
 				     dma_addr_t first_frag_addr,
 				     bool b_last_fragment, bool b_last_packet);
 
+typedef
+void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle,
+			    u32 opaque_data_0, u32 opaque_data_1);
+
 struct qed_ll2_cbs {
 	qed_ll2_complete_rx_packet_cb rx_comp_cb;
 	qed_ll2_release_rx_packet_cb rx_release_cb;
 	qed_ll2_complete_tx_packet_cb tx_comp_cb;
 	qed_ll2_release_tx_packet_cb tx_release_cb;
+	qed_ll2_slowpath_cb slowpath_cb;
 	void *cookie;
 };
 
-- 
cgit v1.2.3


From cf4c950b87ee2f547ad3abd3aca6ae3f3eb3443f Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 9 Oct 2017 14:30:52 -0700
Subject: once: switch to new jump label API

Switch the DO_ONCE() macro from the deprecated jump label API to the new
one.  The new one is more readable, and for DO_ONCE() it also makes the
generated code more icache-friendly: now the one-time initialization
code is placed out-of-line at the jump target, rather than at the inline
fallthrough case.

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/once.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/once.h b/include/linux/once.h
index 9c98aaa87cbc..724724918e8b 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -5,7 +5,7 @@
 #include <linux/jump_label.h>
 
 bool __do_once_start(bool *done, unsigned long *flags);
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
 		    unsigned long *flags);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
@@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key,
 	({								     \
 		bool ___ret = false;					     \
 		static bool ___done = false;				     \
-		static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \
-		if (static_key_true(&___once_key)) {			     \
+		static DEFINE_STATIC_KEY_TRUE(___once_key);		     \
+		if (static_branch_unlikely(&___once_key)) {		     \
 			unsigned long ___flags;				     \
 			___ret = __do_once_start(&___done, &___flags);	     \
 			if (unlikely(___ret)) {				     \
-- 
cgit v1.2.3


From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:10 -0700
Subject: bpf: encapsulate verifier log state into a structure

Put the loose log_* variables into a structure.  This will make
it simpler to remove the global verifier state in following patches.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b8d200f60a40..163541ba70d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,6 +115,19 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+struct bpf_verifer_log {
+	u32 level;
+	char *kbuf;
+	char __user *ubuf;
+	u32 len_used;
+	u32 len_total;
+};
+
+static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+{
+	return log->len_used >= log->len_total - 1;
+}
+
 struct bpf_verifier_env;
 struct bpf_ext_analyzer_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
-- 
cgit v1.2.3


From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:11 -0700
Subject: bpf: move global verifier log into verifier environment

The biggest piece of global state protected by the verifier lock
is the verifier_log.  Move that log to struct bpf_verifier_env.
struct bpf_verifier_env has to be passed now to all invocations
of verbose().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 163541ba70d9..5ddb9a626a51 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -152,6 +152,8 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+
+	struct bpf_verifer_log log;
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-- 
cgit v1.2.3


From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:15 -0700
Subject: bpf: write back the verifier log buffer as it gets filled

Verifier log buffer can be quite large (up to 16MB currently).
As Eric Dumazet points out if we allow multiple verification
requests to proceed simultaneously, malicious user may use the
verifier as a way of allocating large amounts of unswappable
memory to OOM the host.

Switch to a strategy of allocating a smaller buffer (1024B)
and writing it out into the user buffer after every print.

While at it remove the old BUG_ON().

This is in preparation of the global verifier lock removal.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5ddb9a626a51..f00ef751c1c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,9 +115,11 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+#define BPF_VERIFIER_TMP_LOG_SIZE	1024
+
 struct bpf_verifer_log {
 	u32 level;
-	char *kbuf;
+	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 	char __user *ubuf;
 	u32 len_used;
 	u32 len_total;
-- 
cgit v1.2.3


From 2355a6546a053b1c16ebefd6ce1f0cccc00e1da5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Thu, 12 Oct 2017 10:21:25 +0200
Subject: net: phy: broadcom: support new device flag for setting master mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some of Broadcom's PHYs run by default in slave mode with Automatic
Slave/Master configuration disabled. It stops them from working properly
with some devices.

So far it has been verified for BCM54210E and BCM50212E which don't
work well with Intel's I217-LM and I218-LM:
http://ark.intel.com/products/60019/Intel-Ethernet-Connection-I217-LM
http://ark.intel.com/products/71307/Intel-Ethernet-Connection-I218-LM
I was told there is massive ping loss.

This commit adds support for a new flag which can be set by an ethernet
driver to fixup PHY setup.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index abcda9b458ab..9ac9e3e3d1e5 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -63,6 +63,7 @@
 #define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
 #define PHY_BRCM_CLEAR_RGMII_MODE	0x00004000
 #define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00008000
+#define PHY_BRCM_EN_MASTER_MODE		0x00010000
 
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
-- 
cgit v1.2.3


From 17a9422de78c3a59b490b400f555635c477f1476 Mon Sep 17 00:00:00 2001
From: Alan Brady <alan.brady@intel.com>
Date: Wed, 11 Oct 2017 14:49:43 -0700
Subject: i40e/i40evf: don't trust VF to reset itself

When using 'ethtool -L' on a VF to change number of requested queues
from PF, we shouldn't trust the VF to reset itself after making the
request.  Doing it that way opens the door for a potentially malicious
VF to do nasty things to the PF which should never be the case.

This makes it such that after VF makes a successful request, PF will
then reset the VF to institute required changes.  Only if the request
fails will PF send a message back to VF letting it know the request was
unsuccessful.

Testing-hints:
There should be no real functional changes.  This is simply hardening
against a potentially malicious VF.

Signed-off-by: Alan Brady <alan.brady@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/avf/virtchnl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h
index 60e5d90cb18a..3ce61342fa31 100644
--- a/include/linux/avf/virtchnl.h
+++ b/include/linux/avf/virtchnl.h
@@ -333,8 +333,8 @@ struct virtchnl_vsi_queue_config_info {
  * additional queues must be negotiated.  This is a best effort request as it
  * is possible the PF does not have enough queues left to support the request.
  * If the PF cannot support the number requested it will respond with the
- * maximum number it is able to support; otherwise it will respond with the
- * number requested.
+ * maximum number it is able to support.  If the request is successful, PF will
+ * then reset the VF to institute required changes.
  */
 
 /* VF resource request */
-- 
cgit v1.2.3


From 7c39afb394c79e72c3795b4a42d55155b34ee073 Mon Sep 17 00:00:00 2001
From: Feras Daoud <ferasda@mellanox.com>
Date: Tue, 15 Aug 2017 13:46:04 +0300
Subject: net/mlx5: PTP code migration to driver core section

PTP code is moved to core section of mlx5 driver in order to share
it between ethernet and infiniband. This movement involves the following
changes:
- Change mlx5e_ prefix to be mlx5_
- Add clock structs to Core
- Add clock object to mlx5_core_dev
- Call Init/Uninit clock from core init/cleanup
- Rename mlx5e_tstamp to be mlx5_clock

Signed-off-by: Feras Daoud <ferasda@mellanox.com>
Signed-off-by: Eitan Rabin <rabin@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 401c8972cc3a..08c77b7e59cb 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -49,6 +49,8 @@
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
 #include <linux/mlx5/srq.h>
+#include <linux/timecounter.h>
+#include <linux/ptp_clock_kernel.h>
 
 enum {
 	MLX5_BOARD_ID_LEN = 64,
@@ -760,6 +762,27 @@ struct mlx5_rsvd_gids {
 	struct ida ida;
 };
 
+#define MAX_PIN_NUM	8
+struct mlx5_pps {
+	u8                         pin_caps[MAX_PIN_NUM];
+	struct work_struct         out_work;
+	u64                        start[MAX_PIN_NUM];
+	u8                         enabled;
+};
+
+struct mlx5_clock {
+	rwlock_t                   lock;
+	struct cyclecounter        cycles;
+	struct timecounter         tc;
+	struct hwtstamp_config     hwtstamp_config;
+	u32                        nominal_c_mult;
+	unsigned long              overflow_period;
+	struct delayed_work        overflow_work;
+	struct ptp_clock          *ptp;
+	struct ptp_clock_info      ptp_info;
+	struct mlx5_pps            pps_info;
+};
+
 struct mlx5_core_dev {
 	struct pci_dev	       *pdev;
 	/* sync pci state */
@@ -800,6 +823,7 @@ struct mlx5_core_dev {
 #ifdef CONFIG_RFS_ACCEL
 	struct cpu_rmap         *rmap;
 #endif
+	struct mlx5_clock        clock;
 };
 
 struct mlx5_db {
-- 
cgit v1.2.3


From 5a6cd6de76ae78b651e7c36eba8b1da465d65f06 Mon Sep 17 00:00:00 2001
From: Alan Brady <alan.brady@intel.com>
Date: Thu, 5 Oct 2017 14:53:40 -0700
Subject: ethtool: add ethtool_intersect_link_masks

This function provides a way to intersect two link masks together to
find the common ground between them.  For example in i40e, the driver
first generates link masks for what is supported by the PHY type.  The
driver then gets the link masks for what the NVM supports.  The
resulting intersection between them yields what can truly be supported.

Signed-off-by: Alan Brady <alan.brady@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/ethtool.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 4587a4c36923..c77fa3529e15 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -163,6 +163,16 @@ extern int
 __ethtool_get_link_ksettings(struct net_device *dev,
 			     struct ethtool_link_ksettings *link_ksettings);
 
+/**
+ * ethtool_intersect_link_masks - Given two link masks, AND them together
+ * @dst: first mask and where result is stored
+ * @src: second mask to intersect with
+ *
+ * Given two link mode masks, AND them together and save the result in dst.
+ */
+void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst,
+				  struct ethtool_link_ksettings *src);
+
 void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
 					     u32 legacy_u32);
 
-- 
cgit v1.2.3


From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:28 +0200
Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP

The 'cpumap' is primarily used as a backend map for XDP BPF helper
call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.

This patch implement the main part of the map.  It is not connected to
the XDP redirect system yet, and no SKB allocation are done yet.

The main concern in this patch is to ensure the datapath can run
without any locking.  This adds complexity to the setup and tear-down
procedure, which assumptions are extra carefully documented in the
code comments.

V2:
 - make sure array isn't larger than NR_CPUS
 - make sure CPUs added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all notice by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry, kthread started too soon
 - Make sure packets are flushed during tear-down, involved use of
   rcu_barrier() and kthread_run only exit after queue is empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and gramma by Edward Cree
 - Fix missing semi-colon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6f1a567667b8..814c1081a4a9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #ifdef CONFIG_STREAM_PARSER
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #endif
-- 
cgit v1.2.3


From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:34 +0200
Subject: bpf: XDP_REDIRECT enable use of cpumap

This patch connects cpumap to the xdp_do_redirect_map infrastructure.

Still no SKB allocation are done yet.  The XDP frames are transferred
to the other CPU, but they are simply refcnt decremented on the remote
CPU.  This served as a good benchmark for measuring the overhead of
remote refcnt decrement.  If driver page recycle cache is not
efficient then this, exposes a bottleneck in the page allocator.

A shout-out to MST's ptr_ring, which is the secret behind is being so
efficient to transfer memory pointers between CPUs, without constantly
bouncing cache-lines between CPUs.

V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot.

V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet,
 as implementation require a separate upstream discussion.

V5:
 - Fix a maybe-uninitialized pointed out by kbuild test robot.
 - Restrict bpf-prog side access to cpumap, open when use-cases appear
 - Implement cpu_map_enqueue() as a more simple void pointer enqueue

V6:
 - Allow cpumap type for usage in helper bpf_redirect_map,
   general bpf-prog side restriction moved to earlier patch.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4373125de1f3..6d4dd844828a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,13 @@ struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
+void __cpu_map_flush(struct bpf_map *map);
+struct xdp_buff;
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
+
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 {
@@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }
 
-#else
+#else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
+
+static inline
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	return NULL;
+}
+
+static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
+{
+}
+
+static inline void __cpu_map_flush(struct bpf_map *map)
+{
+}
+
+struct xdp_buff;
+static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_buff *xdp,
+				  struct net_device *dev_rx)
+{
+	return 0;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
-- 
cgit v1.2.3


From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:39 +0200
Subject: bpf: cpumap xdp_buff to skb conversion and allocation

This patch makes cpumap functional, by adding SKB allocation and
invoking the network stack on the dequeuing CPU.

For constructing the SKB on the remote CPU, the xdp_buff in converted
into a struct xdp_pkt, and it mapped into the top headroom of the
packet, to avoid allocating separate mem.  For now, struct xdp_pkt is
just a cpumap internal data structure, with info carried between
enqueue to dequeue.

If a driver doesn't have enough headroom it is simply dropped, with
return code -EOVERFLOW.  This will be picked up the xdp tracepoint
infrastructure, to allow users to catch this.

V2: take into account xdp->data_meta

V4:
 - Drop busypoll tricks, keeping it more simple.
 - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei

V5: correct RCU read protection around __netif_receive_skb_core.

V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31bb3010c69b..bf014afcb914 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
-- 
cgit v1.2.3


From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:53 -0700
Subject: bpf: split verifier and program ops

struct bpf_verifier_ops contains both verifier ops and operations
used later during program's lifetime (test_run).  Split the runtime
ops into a different structure.

BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops
to the names.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       | 15 ++++++++++-----
 include/linux/bpf_types.h | 28 ++++++++++++++--------------
 2 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6d4dd844828a..e1fba5504ca5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -157,6 +157,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+struct bpf_prog_ops {
+	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
+			union bpf_attr __user *uattr);
+};
+
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
 	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -172,8 +177,6 @@ struct bpf_verifier_ops {
 				  const struct bpf_insn *src,
 				  struct bpf_insn *dst,
 				  struct bpf_prog *prog, u32 *target_size);
-	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
-			union bpf_attr __user *uattr);
 };
 
 struct bpf_prog_aux {
@@ -184,7 +187,8 @@ struct bpf_prog_aux {
 	u32 id;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
-	const struct bpf_verifier_ops *ops;
+	const struct bpf_prog_ops *ops;
+	const struct bpf_verifier_ops *vops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
@@ -279,8 +283,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
-#define BPF_PROG_TYPE(_id, _ops) \
-	extern const struct bpf_verifier_ops _ops;
+#define BPF_PROG_TYPE(_id, _name) \
+	extern const struct bpf_prog_ops _name ## _prog_ops; \
+	extern const struct bpf_verifier_ops _name ## _verifier_ops;
 #define BPF_MAP_TYPE(_id, _ops) \
 	extern const struct bpf_map_ops _ops;
 #include <linux/bpf_types.h>
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 814c1081a4a9..36418ad43245 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -1,22 +1,22 @@
 /* internal file - do not include directly */
 
 #ifdef CONFIG_NET
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
+BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 #endif
 #ifdef CONFIG_BPF_EVENTS
-BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
+BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
+BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
-- 
cgit v1.2.3


From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:54 -0700
Subject: bpf: remove the verifier ops from program structure

Since the verifier ops don't have to be associated with
the program for its entire lifetime we can move it to
verifier's struct bpf_verifier_env.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          | 1 -
 include/linux/bpf_verifier.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1fba5504ca5..cf91977e8719 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -188,7 +188,6 @@ struct bpf_prog_aux {
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_prog_ops *ops;
-	const struct bpf_verifier_ops *vops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f00ef751c1c5..feeaea93d959 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -141,6 +141,7 @@ struct bpf_ext_analyzer_ops {
  */
 struct bpf_verifier_env {
 	struct bpf_prog *prog;		/* eBPF program being verified */
+	const struct bpf_verifier_ops *ops;
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
-- 
cgit v1.2.3


From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:55 -0700
Subject: bpf: move knowledge about post-translation offsets out of verifier

Use the fact that verifier ops are now separate from program
ops to define a separate set of callbacks for verification of
already translated programs.

Since we expect the analyzer ops to be defined only for
a small subset of all program types initialize their array
by hand (don't use linux/bpf_types.h).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cf91977e8719..d67ccdc0099f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -291,6 +291,9 @@ DECLARE_PER_CPU(int, bpf_prog_active);
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
+extern const struct bpf_verifier_ops xdp_analyzer_ops;
+
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
-- 
cgit v1.2.3


From 7a0947e755084b918e33242fd558e55cb443408e Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 17 Oct 2017 17:16:52 -0700
Subject: dql: make dql_init return void

dql_init always returned 0, and the only place that uses it
in network core code didn't care about the return value anyway.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Acked-by: Hiroaki SHIMODA <shimoda.hiroaki@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dynamic_queue_limits.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index a4be70398ce1..f69f98541953 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -98,7 +98,7 @@ void dql_completed(struct dql *dql, unsigned int count);
 void dql_reset(struct dql *dql);
 
 /* Initialize dql state */
-int dql_init(struct dql *dql, unsigned hold_time);
+void dql_init(struct dql *dql, unsigned int hold_time);
 
 #endif /* _KERNEL_ */
 
-- 
cgit v1.2.3


From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 18 Oct 2017 09:56:54 -0700
Subject: net: Add extack to validator_info structs used for address notifier

Add extack to in_validator_info and in6_validator_info. Update the one
user of each, ipvlan, to return an error message for failures.

Only manual configuration of an address is plumbed in the IPv6 code path.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 751d051f0bc7..681dff30940b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -154,6 +154,7 @@ struct in_ifaddr {
 struct in_validator_info {
 	__be32			ivi_addr;
 	struct in_device	*ivi_dev;
+	struct netlink_ext_ack	*extack;
 };
 
 int register_inetaddr_notifier(struct notifier_block *nb);
-- 
cgit v1.2.3


From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:22 -0700
Subject: bpf: Add file mode configuration into bpf maps

Introduce the map read/write flags to the eBPF syscalls that returns the
map fd. The flags is used to set up the file mode when construct a new
file descriptor for bpf maps. To not break the backward capability, the
f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise
it should be O_RDONLY or O_WRONLY. When the userspace want to modify or
read the map content, it will check the file mode to see if it is
allowed to make the change.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67ccdc0099f..3e5508f2fa87 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -315,11 +315,11 @@ void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
+int bpf_get_file_flag(int flags);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only.  No barriers here, since it _will_ race with concurrent
@@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
-static inline int bpf_obj_get_user(const char __user *pathname)
+static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:24 -0700
Subject: security: bpf: Add LSM hooks for bpf object related syscall

Introduce several LSM hooks for the syscalls that will allow the
userspace to access to eBPF object such as eBPF programs and eBPF maps.
The security check is aimed to enforce a per object security protection
for eBPF object so only processes with the right priviliges can
read/write to a specific map or use a specific eBPF program. Besides
that, a general security hook is added before the multiplexer of bpf
syscall to check the cmd and the attribute used for the command. The
actual security module can decide which command need to be checked and
how the cmd should be checked.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       |  6 ++++++
 include/linux/lsm_hooks.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/security.h  | 45 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3e5508f2fa87..84c192da3e0b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -57,6 +57,9 @@ struct bpf_map {
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
 	char name[BPF_OBJ_NAME_LEN];
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 /* function argument constraints */
@@ -193,6 +196,9 @@ struct bpf_prog_aux {
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
 	char name[BPF_OBJ_NAME_LEN];
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index c9258124e417..7161d8e7ee79 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1351,6 +1351,40 @@
  *	@inode we wish to get the security context of.
  *	@ctx is a pointer in which to place the allocated security context.
  *	@ctxlen points to the place to put the length of @ctx.
+ *
+ * Security hooks for using the eBPF maps and programs functionalities through
+ * eBPF syscalls.
+ *
+ * @bpf:
+ *	Do a initial check for all bpf syscalls after the attribute is copied
+ *	into the kernel. The actual security module can implement their own
+ *	rules to check the specific cmd they need.
+ *
+ * @bpf_map:
+ *	Do a check when the kernel generate and return a file descriptor for
+ *	eBPF maps.
+ *
+ *	@map: bpf map that we want to access
+ *	@mask: the access flags
+ *
+ * @bpf_prog:
+ *	Do a check when the kernel generate and return a file descriptor for
+ *	eBPF programs.
+ *
+ *	@prog: bpf prog that userspace want to use.
+ *
+ * @bpf_map_alloc_security:
+ *	Initialize the security field inside bpf map.
+ *
+ * @bpf_map_free_security:
+ *	Clean up the security information stored inside bpf map.
+ *
+ * @bpf_prog_alloc_security:
+ *	Initialize the security field inside bpf program.
+ *
+ * @bpf_prog_free_security:
+ *	Clean up the security information stored inside bpf prog.
+ *
  */
 union security_list_options {
 	int (*binder_set_context_mgr)(struct task_struct *mgr);
@@ -1682,6 +1716,17 @@ union security_list_options {
 				struct audit_context *actx);
 	void (*audit_rule_free)(void *lsmrule);
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+	int (*bpf)(int cmd, union bpf_attr *attr,
+				 unsigned int size);
+	int (*bpf_map)(struct bpf_map *map, fmode_t fmode);
+	int (*bpf_prog)(struct bpf_prog *prog);
+	int (*bpf_map_alloc_security)(struct bpf_map *map);
+	void (*bpf_map_free_security)(struct bpf_map *map);
+	int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux);
+	void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
+#endif /* CONFIG_BPF_SYSCALL */
 };
 
 struct security_hook_heads {
@@ -1901,6 +1946,15 @@ struct security_hook_heads {
 	struct list_head audit_rule_match;
 	struct list_head audit_rule_free;
 #endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+	struct list_head bpf;
+	struct list_head bpf_map;
+	struct list_head bpf_prog;
+	struct list_head bpf_map_alloc_security;
+	struct list_head bpf_map_free_security;
+	struct list_head bpf_prog_alloc_security;
+	struct list_head bpf_prog_free_security;
+#endif /* CONFIG_BPF_SYSCALL */
 } __randomize_layout;
 
 /*
diff --git a/include/linux/security.h b/include/linux/security.h
index ce6265960d6c..18800b0911e5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,6 +31,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -1730,6 +1731,50 @@ static inline void securityfs_remove(struct dentry *dentry)
 
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+#ifdef CONFIG_SECURITY
+extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
+extern int security_bpf_prog(struct bpf_prog *prog);
+extern int security_bpf_map_alloc(struct bpf_map *map);
+extern void security_bpf_map_free(struct bpf_map *map);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+#else
+static inline int security_bpf(int cmd, union bpf_attr *attr,
+					     unsigned int size)
+{
+	return 0;
+}
+
+static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+	return 0;
+}
+
+static inline int security_bpf_prog(struct bpf_prog *prog)
+{
+	return 0;
+}
+
+static inline int security_bpf_map_alloc(struct bpf_map *map)
+{
+	return 0;
+}
+
+static inline void security_bpf_map_free(struct bpf_map *map)
+{ }
+
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+	return 0;
+}
+
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{ }
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_BPF_SYSCALL */
+
 #ifdef CONFIG_SECURITY
 
 static inline char *alloc_secdata(void)
-- 
cgit v1.2.3


From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:26 -0700
Subject: selinux: bpf: Add addtional check for bpf object file receive

Introduce a bpf object related check when sending and receiving files
through unix domain socket as well as binder. It checks if the receiving
process have privilege to read/write the bpf map or use the bpf program.
This check is necessary because the bpf maps and programs are using a
anonymous inode as their shared inode so the normal way of checking the
files and sockets when passing between processes cannot work properly on
eBPF object. This check only works when the BPF_SYSCALL is configured.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Reviewed-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 84c192da3e0b..1e334b248ff6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -288,6 +288,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
+extern const struct file_operations bpf_map_fops;
+extern const struct file_operations bpf_prog_fops;
+
 #define BPF_PROG_TYPE(_id, _name) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
 	extern const struct bpf_verifier_ops _name ## _verifier_ops;
-- 
cgit v1.2.3


From 8c4083b30e56fc71b0e94c26374b32d95d5ea461 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 19 Oct 2017 15:50:29 +0200
Subject: net: sched: add block bind/unbind notif. and extended block_get/put

Introduce new type of ndo_setup_tc message to propage binding/unbinding
of a block to driver. Call this ndo whenever qdisc gets/puts a block.
Alongside with this, there's need to propagate binder type from qdisc
code down to the notifier. So introduce extended variants of
block_get/put in order to pass this info.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf014afcb914..4de5b08ee0fb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -775,6 +775,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSFLOWER,
 	TC_SETUP_CLSMATCHALL,
 	TC_SETUP_CLSBPF,
+	TC_SETUP_BLOCK,
 };
 
 /* These structures hold the attributes of xdp state that are being passed
-- 
cgit v1.2.3


From ff61b5e3f041c2f1aa8d7c700af3007889973889 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 10:23:37 +0300
Subject: drivers, net, mlx4: convert mlx4_cq.refcount from atomic_t to
 refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable mlx4_cq.refcount is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b0a57e043fa3..daac2e3a1a58 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -40,7 +40,7 @@
 #include <linux/cpu_rmap.h>
 #include <linux/crash_dump.h>
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 
 #include <linux/timecounter.h>
 
@@ -751,7 +751,7 @@ struct mlx4_cq {
 	int			cqn;
 	unsigned		vector;
 
-	atomic_t		refcount;
+	refcount_t		refcount;
 	struct completion	free;
 	struct {
 		struct list_head list;
-- 
cgit v1.2.3


From 0068895ff845c38e9e2b65c002c53c623379e436 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 10:23:38 +0300
Subject: drivers, net, mlx4: convert mlx4_qp.refcount from atomic_t to
 refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable mlx4_qp.refcount is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index daac2e3a1a58..b8e19c4d6caa 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -768,7 +768,7 @@ struct mlx4_qp {
 
 	int			qpn;
 
-	atomic_t		refcount;
+	refcount_t		refcount;
 	struct completion	free;
 	u8			usage;
 };
-- 
cgit v1.2.3


From 17ac99b2b8d08ed40f4525491d6eff330329a6d2 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 10:23:39 +0300
Subject: drivers, net, mlx4: convert mlx4_srq.refcount from atomic_t to
 refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable mlx4_srq.refcount is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b8e19c4d6caa..a9b5fed8f7c6 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -781,7 +781,7 @@ struct mlx4_srq {
 	int			max_gs;
 	int			wqe_shift;
 
-	atomic_t		refcount;
+	refcount_t		refcount;
 	struct completion	free;
 };
 
-- 
cgit v1.2.3


From a4b51a9f83c6d359ff8fc0c66009283b6fdeeaf8 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 10:23:40 +0300
Subject: drivers, net, mlx5: convert mlx5_cq.refcount from atomic_t to
 refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable mlx5_cq.refcount is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/cq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 95898847c7d4..6a57ec2f1ef7 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -35,7 +35,7 @@
 
 #include <rdma/ib_verbs.h>
 #include <linux/mlx5/driver.h>
-
+#include <linux/refcount.h>
 
 struct mlx5_core_cq {
 	u32			cqn;
@@ -43,7 +43,7 @@ struct mlx5_core_cq {
 	__be32		       *set_ci_db;
 	__be32		       *arm_db;
 	struct mlx5_uars_page  *uar;
-	atomic_t		refcount;
+	refcount_t		refcount;
 	struct completion	free;
 	unsigned		vector;
 	unsigned int		irqn;
-- 
cgit v1.2.3


From e65f7ee39b4d7604a78b03ed35d723e1001fc241 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 10:23:49 +0300
Subject: drivers, connector: convert cn_callback_entry.refcnt from atomic_t to
 refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable cn_callback_entry.refcnt is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/connector.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/connector.h b/include/linux/connector.h
index f8fe8637d771..032102b19645 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -22,7 +22,7 @@
 #define __CONNECTOR_H
 
 
-#include <linux/atomic.h>
+#include <linux/refcount.h>
 
 #include <linux/list.h>
 #include <linux/workqueue.h>
@@ -49,7 +49,7 @@ struct cn_callback_id {
 
 struct cn_callback_entry {
 	struct list_head callback_entry;
-	atomic_t refcnt;
+	refcount_t refcnt;
 	struct cn_queue_dev *pdev;
 
 	struct cn_callback_id id;
-- 
cgit v1.2.3


From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001
From: Christoph Paasch <cpaasch@apple.com>
Date: Mon, 23 Oct 2017 13:22:23 -0700
Subject: tcp: Configure TFO without cookie per socket and/or per route

We already allow to enable TFO without a cookie by using the
fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or
TFO_CLIENT_NO_COOKIE).
This is safe to do in certain environments where we know that there
isn't a malicous host (aka., data-centers) or when the
application-protocol already provides an authentication mechanism in the
first flight of data.

A server however might be providing multiple services or talking to both
sides (public Internet and data-center). So, this server would want to
enable cookie-less TFO for certain services and/or for connections that
go to the data-center.

This patch exposes a socket-option and a per-route attribute to enable such
fine-grained configurations.

Signed-off-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 1d2c44e09e31..173a7c2f9636 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -215,7 +215,8 @@ struct tcp_sock {
 	u8	chrono_type:2,	/* current chronograph type */
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
-		unused:4;
+		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
+		unused:3;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1	    : 1,
-- 
cgit v1.2.3


From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 23 Oct 2017 23:53:08 -0700
Subject: bpf: permit multiple bpf attachments for a single perf event

This patch enables multiple bpf attachments for a
kprobe/uprobe/tracepoint single trace event.
Each trace_event keeps a list of attached perf events.
When an event happens, all attached bpf programs will
be executed based on the order of attachment.

A global bpf_event_mutex lock is introduced to protect
prog_array attaching and detaching. An alternative will
be introduce a mutex lock in every trace_event_call
structure, but it takes a lot of extra memory.
So a global bpf_event_mutex lock is a good compromise.

The bpf prog detachment involves allocation of memory.
If the allocation fails, a dummy do-nothing program
will replace to-be-detached program in-place.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          | 30 +++++++++++++++++++++++++-----
 include/linux/trace_events.h | 43 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 64 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e334b248ff6..172be7faf7ba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -273,18 +273,38 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
-#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+				struct bpf_prog *old_prog);
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+			struct bpf_prog *exclude_prog,
+			struct bpf_prog *include_prog,
+			struct bpf_prog_array **new_array);
+
+#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
 	({						\
-		struct bpf_prog **_prog;		\
+		struct bpf_prog **_prog, *__prog;	\
+		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
 		rcu_read_lock();			\
-		_prog = rcu_dereference(array)->progs;	\
-		for (; *_prog; _prog++)			\
-			_ret &= func(*_prog, ctx);	\
+		_array = rcu_dereference(array);	\
+		if (unlikely(check_non_null && !_array))\
+			goto _out;			\
+		_prog = _array->progs;			\
+		while ((__prog = READ_ONCE(*_prog))) {	\
+			_ret &= func(__prog, ctx);	\
+			_prog++;			\
+		}					\
+_out:							\
 		rcu_read_unlock();			\
 		_ret;					\
 	 })
 
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+
+#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)	\
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2e0f22298fe9..fc6aeca945db 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -271,14 +271,37 @@ struct trace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
-	struct bpf_prog			*prog;
-	struct perf_event		*bpf_prog_owner;
+	struct bpf_prog_array __rcu	*prog_array;
 
 	int	(*perf_perm)(struct trace_event_call *,
 			     struct perf_event *);
 #endif
 };
 
+#ifdef CONFIG_PERF_EVENTS
+static inline bool bpf_prog_array_valid(struct trace_event_call *call)
+{
+	/*
+	 * This inline function checks whether call->prog_array
+	 * is valid or not. The function is called in various places,
+	 * outside rcu_read_lock/unlock, as a heuristic to speed up execution.
+	 *
+	 * If this function returns true, and later call->prog_array
+	 * becomes false inside rcu_read_lock/unlock region,
+	 * we bail out then. If this function return false,
+	 * there is a risk that we might miss a few events if the checking
+	 * were delayed until inside rcu_read_lock/unlock region and
+	 * call->prog_array happened to become non-NULL then.
+	 *
+	 * Here, READ_ONCE() is used instead of rcu_access_pointer().
+	 * rcu_access_pointer() requires the actual definition of
+	 * "struct bpf_prog_array" while READ_ONCE() only needs
+	 * a declaration of the same type.
+	 */
+	return !!READ_ONCE(call->prog_array);
+}
+#endif
+
 static inline const char *
 trace_event_name(struct trace_event_call *call)
 {
@@ -435,12 +458,23 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 }
 
 #ifdef CONFIG_BPF_EVENTS
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+void perf_event_detach_bpf_prog(struct perf_event *event);
 #else
-static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	return 1;
 }
+
+static inline int
+perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void perf_event_detach_bpf_prog(struct perf_event *event) { }
+
 #endif
 
 enum {
@@ -511,6 +545,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 {
 	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event);
 }
+
 #endif
 
 #endif /* _LINUX_TRACE_EVENT_H */
-- 
cgit v1.2.3


From 88ca59d1aaf28c25b47a9f933090e480ba6dc92a Mon Sep 17 00:00:00 2001
From: Girish Moodalbail <girish.moodalbail@oracle.com>
Date: Wed, 25 Oct 2017 12:26:43 -0700
Subject: macvlan: remove unused fields in struct macvlan_dev

commit 635b8c8ecdd2 ("tap: Renaming tap related APIs, data structures,
macros") captured all the tap related fields into a new struct tap_dev.
However, it failed to remove those fields from struct macvlan_dev.
Those fields are currently unused and must be removed. While there
I moved the comment for MAX_TAP_QUEUES to the right place.

Fixes: 635b8c8ecdd27142 (tap: Renaming tap related APIs, data structures, macros)
Signed-off-by: Girish Moodalbail <girish.moodalbail@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_macvlan.h | 15 ---------------
 include/linux/if_tap.h     |  4 ++++
 2 files changed, 4 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index 10e319f41fb1..e13b369df02b 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -10,13 +10,6 @@
 #include <linux/u64_stats_sync.h>
 
 struct macvlan_port;
-struct macvtap_queue;
-
-/*
- * Maximum times a macvtap device can be opened. This can be used to
- * configure the number of receive queue, e.g. for multiqueue virtio.
- */
-#define MAX_TAP_QUEUES	256
 
 #define MACVLAN_MC_FILTER_BITS	8
 #define MACVLAN_MC_FILTER_SZ	(1 << MACVLAN_MC_FILTER_BITS)
@@ -35,14 +28,6 @@ struct macvlan_dev {
 	netdev_features_t	set_features;
 	enum macvlan_mode	mode;
 	u16			flags;
-	/* This array tracks active taps. */
-	struct tap_queue	__rcu *taps[MAX_TAP_QUEUES];
-	/* This list tracks all taps (both enabled and disabled) */
-	struct list_head	queue_list;
-	int			numvtaps;
-	int			numqueues;
-	netdev_features_t	tap_features;
-	int			minor;
 	int			nest_level;
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll		*netpoll;
diff --git a/include/linux/if_tap.h b/include/linux/if_tap.h
index 4837157da0dc..d1b5173ad8f0 100644
--- a/include/linux/if_tap.h
+++ b/include/linux/if_tap.h
@@ -22,6 +22,10 @@ static inline struct skb_array *tap_get_skb_array(struct file *f)
 #include <net/sock.h>
 #include <linux/skb_array.h>
 
+/*
+ * Maximum times a tap device can be opened. This can be used to
+ * configure the number of receive queue, e.g. for multiqueue virtio.
+ */
 #define MAX_TAP_QUEUES 256
 
 struct tap_queue;
-- 
cgit v1.2.3


From 60e2a7780793bae0debc275a9ccd57f7da0cf195 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 25 Oct 2017 11:01:45 +0200
Subject: tcp: TCP experimental option for SMC

The SMC protocol [1] relies on the use of a new TCP experimental
option [2, 3]. With this option, SMC capabilities are exchanged
between peers during the TCP three way handshake. This patch adds
support for this experimental option to TCP.

References:
[1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609
[2] Shared Use of TCP Experimental Options RFC 6994:
    https://tools.ietf.org/rfc/rfc6994.txt
[3] IANA ExID SMCR:
http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 173a7c2f9636..8c431385b272 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -98,7 +98,8 @@ struct tcp_options_received {
 		tstamp_ok : 1,	/* TIMESTAMP seen on SYN packet		*/
 		dsack : 1,	/* D-SACK is scheduled			*/
 		wscale_ok : 1,	/* Wscale seen on SYN packet		*/
-		sack_ok : 4,	/* SACK seen on SYN packet		*/
+		sack_ok : 3,	/* SACK seen on SYN packet		*/
+		smc_ok : 1,	/* SMC seen on SYN packet		*/
 		snd_wscale : 4,	/* Window scaling received from sender	*/
 		rcv_wscale : 4;	/* Window scaling to send to receiver	*/
 	u8	num_sacks;	/* Number of SACK blocks		*/
@@ -110,6 +111,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
 	rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
 	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+#if IS_ENABLED(CONFIG_SMC)
+	rx_opt->smc_ok = 0;
+#endif
 }
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
@@ -229,7 +233,8 @@ struct tcp_sock {
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
 		save_syn:1,	/* Save headers of SYN packet */
-		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
+		is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
+		syn_smc:1;	/* SYN includes SMC */
 	u32	tlp_high_seq;	/* snd_nxt at the time of TLP retransmit. */
 
 /* RTT measurement */
-- 
cgit v1.2.3


From 032cfd66afcc2dd2c7be89c71b020fcb15bcc37d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 25 Oct 2017 03:53:59 -0700
Subject: drivers/net: wan/sdla: Convert timers to use timer_setup()

In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new timer_setup() and from_timer()
to pass the timer pointer explicitly.

Cc: Allen Pais <allen.lkml@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Tobias Klauser <tklauser@distanz.ch>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_frad.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/if_frad.h b/include/linux/if_frad.h
index 46df7e565d6f..82a1b4e93570 100644
--- a/include/linux/if_frad.h
+++ b/include/linux/if_frad.h
@@ -83,6 +83,7 @@ struct frad_local
 
    /* fields that are used by the Sangoma SDLA cards */
    struct timer_list timer;
+   struct net_device *dev;
    int               type;		/* adapter type */
    int               state;		/* state of the S502/8 control latch */
    int               buffer;		/* current buffer for S508 firmware */
-- 
cgit v1.2.3


From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Thu, 26 Oct 2017 01:47:42 +0000
Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h

commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".

>From bpf.h:

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

Whereas in bpf_helpers.h they are:

static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);

Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 172be7faf7ba..520aeebe0d93 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -231,9 +231,6 @@ struct bpf_event_entry {
 	struct rcu_head rcu;
 };
 
-u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 int bpf_prog_calc_tag(struct bpf_prog *fp);
 
-- 
cgit v1.2.3


From 356c3e9afac0cc19c3d3b0cbc67106ce8efa0743 Mon Sep 17 00:00:00 2001
From: Egil Hjelmeland <privat@egil-hjelmeland.no>
Date: Thu, 26 Oct 2017 11:00:48 +0200
Subject: net: dsa: lan9303: Move struct lan9303 to include/linux/dsa/lan9303.h

The next patch require net/dsa/tag_lan9303.c to access struct lan9303.
Therefore move struct lan9303 definitions from drivers/net/dsa/lan9303.h
to new file include/linux/dsa/lan9303.h.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/lan9303.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 include/linux/dsa/lan9303.h

(limited to 'include/linux')

diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h
new file mode 100644
index 000000000000..05d8d136baab
--- /dev/null
+++ b/include/linux/dsa/lan9303.h
@@ -0,0 +1,36 @@
+/* Included by drivers/net/dsa/lan9303.h and net/dsa/tag_lan9303.c */
+#include <linux/if_ether.h>
+
+struct lan9303;
+
+struct lan9303_phy_ops {
+	/* PHY 1 and 2 access*/
+	int	(*phy_read)(struct lan9303 *chip, int port, int regnum);
+	int	(*phy_write)(struct lan9303 *chip, int port,
+			     int regnum, u16 val);
+};
+
+#define LAN9303_NUM_ALR_RECORDS 512
+struct lan9303_alr_cache_entry {
+	u8  mac_addr[ETH_ALEN];
+	u8  port_map;           /* Bitmap of ports. Zero if unused entry */
+	u8  stp_override;       /* non zero if set ALR_DAT1_AGE_OVERRID */
+};
+
+struct lan9303 {
+	struct device *dev;
+	struct regmap *regmap;
+	struct regmap_irq_chip_data *irq_data;
+	struct gpio_desc *reset_gpio;
+	u32 reset_duration; /* in [ms] */
+	bool phy_addr_sel_strap;
+	struct dsa_switch *ds;
+	struct mutex indirect_mutex; /* protect indexed register access */
+	const struct lan9303_phy_ops *ops;
+	bool is_bridged; /* true if port 1 and 2 are bridged */
+	u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */
+	/* LAN9303 do not offer reading specific ALR entry. Cache all
+	 * static entries in a flat table
+	 **/
+	struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS];
+};
-- 
cgit v1.2.3


From 3d0bd028ffb4a4915cb64cfa0d2cee1578cc0321 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 16 Oct 2017 18:01:27 -0700
Subject: net/sched: Add support for HW offloading for CBS

This adds support for offloading the CBS algorithm to the controller,
if supported. Drivers wanting to support CBS offload must implement
the .ndo_setup_tc callback and handle the TC_SETUP_CBS (introduced
here) type.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Tested-by: Henrik Austad <henrik@austad.us>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6c7960c8338a..5e02f79b2110 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -776,6 +776,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSMATCHALL,
 	TC_SETUP_CLSBPF,
 	TC_SETUP_BLOCK,
+	TC_SETUP_CBS,
 };
 
 /* These structures hold the attributes of xdp state that are being passed
-- 
cgit v1.2.3


From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 31 Oct 2017 18:16:05 -0700
Subject: bpf: reduce verifier memory consumption

the verifier got progressively smarter over time and size of its internal
state grew as well. Time to reduce the memory consumption.

Before:
sizeof(struct bpf_verifier_state) = 6520
After:
sizeof(struct bpf_verifier_state) = 896

It's done by observing that majority of BPF programs use little to
no stack whereas verifier kept all of 512 stack slots ready always.
Instead dynamically reallocate struct verifier state when stack
access is detected.
Runtime difference before vs after is within a noise.
The number of processed instructions stays the same.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index feeaea93d959..3b0976aaac75 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -88,14 +88,19 @@ enum bpf_stack_slot_type {
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
+struct bpf_stack_state {
+	struct bpf_reg_state spilled_ptr;
+	u8 slot_type[BPF_REG_SIZE];
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
 struct bpf_verifier_state {
 	struct bpf_reg_state regs[MAX_BPF_REG];
-	u8 stack_slot_type[MAX_BPF_STACK];
-	struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
 	struct bpf_verifier_state *parent;
+	int allocated_stack;
+	struct bpf_stack_state *stack;
 };
 
 /* linked list of verifier states used to prune search */
@@ -145,7 +150,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
-	struct bpf_verifier_state cur_state; /* current verifier state */
+	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
@@ -159,6 +164,11 @@ struct bpf_verifier_env {
 	struct bpf_verifer_log log;
 };
 
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+	return env->cur_state->regs;
+}
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
-- 
cgit v1.2.3


From e9292f2c03851ef81bef38579a0ee9c42140e586 Mon Sep 17 00:00:00 2001
From: Egil Hjelmeland <privat@egil-hjelmeland.no>
Date: Tue, 31 Oct 2017 15:48:01 +0100
Subject: net: dsa: lan9303: Add STP ALR entry on port 0

STP BPDUs arriving on user ports must sent to CPU port only,
for processing by the SW bridge.

Add an ALR entry with STP state override to fix that.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/lan9303.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h
index 05d8d136baab..b2110e69630f 100644
--- a/include/linux/dsa/lan9303.h
+++ b/include/linux/dsa/lan9303.h
@@ -34,3 +34,5 @@ struct lan9303 {
 	 **/
 	struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS];
 };
+
+#define eth_stp_addr eth_reserved_addr_base
-- 
cgit v1.2.3


From 1495dc9f0a711a54f8fec849ce7f3a8f585a11e5 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 1 Nov 2017 11:48:00 -0700
Subject: security: bpf: replace include of linux/bpf.h with forward
 declarations

Touching linux/bpf.h makes us rebuild a surprisingly large
portion of the kernel.  Remove the unnecessary dependency
from security.h, it only needs forward declarations.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/security.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 18800b0911e5..73f1ef625d40 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,7 +31,6 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
-#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -1732,6 +1731,10 @@ static inline void securityfs_remove(struct dentry *dentry)
 #endif
 
 #ifdef CONFIG_BPF_SYSCALL
+union bpf_attr;
+struct bpf_map;
+struct bpf_prog;
+struct bpf_prog_aux;
 #ifdef CONFIG_SECURITY
 extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
 extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
-- 
cgit v1.2.3


From 054287295b1132c8742ea55f8e3af9cbd630c932 Mon Sep 17 00:00:00 2001
From: Egil Hjelmeland <privat@egil-hjelmeland.no>
Date: Thu, 2 Nov 2017 10:36:48 +0100
Subject: net: Define eth_stp_addr in linux/etherdevice.h

The lan9303 driver defines eth_stp_addr as a synonym to
eth_reserved_addr_base to get the STP ethernet address 01:80:c2:00:00:00.

eth_reserved_addr_base is also used to define the start of Bridge Reserved
ethernet address range, which happen to be the STP address.

br_dev_setup refer to eth_reserved_addr_base as a definition of STP
address.

Clean up by:
 - Move the eth_stp_addr definition to linux/etherdevice.h
 - Use eth_stp_addr instead of eth_reserved_addr_base in br_dev_setup.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/lan9303.h | 2 --
 include/linux/etherdevice.h | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h
index b2110e69630f..05d8d136baab 100644
--- a/include/linux/dsa/lan9303.h
+++ b/include/linux/dsa/lan9303.h
@@ -34,5 +34,3 @@ struct lan9303 {
 	 **/
 	struct lan9303_alr_cache_entry alr_cache[LAN9303_NUM_ALR_RECORDS];
 };
-
-#define eth_stp_addr eth_reserved_addr_base
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 2d9f80848d4b..263dbcad22fc 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -66,6 +66,7 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff);
 /* Reserved Ethernet Addresses per IEEE 802.1Q */
 static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
 { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
+#define eth_stp_addr eth_reserved_addr_base
 
 /**
  * is_link_local_ether_addr - Determine if given Ethernet address is link-local
-- 
cgit v1.2.3


From 46209401f8f6116bd0b2c2d14a63958e83ffca0b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 3 Nov 2017 11:46:25 +0100
Subject: net: core: introduce mini_Qdisc and eliminate usage of tp->q for
 clsact fastpath

In sch_handle_egress and sch_handle_ingress tp->q is used only in order
to update stats. So stats and filter list are the only things that are
needed in clsact qdisc fastpath processing. Introduce new mini_Qdisc
struct to hold those items. Also, introduce a helper to swap the
mini_Qdisc structures in case filter list head changes.

This removes need for tp->q usage without added overhead.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5e02f79b2110..7de7656550c2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1559,6 +1559,8 @@ enum netdev_priv_flags {
  *
  *	@rx_handler:		handler for received packets
  *	@rx_handler_data: 	XXX: need comments on this one
+ *	@miniq_ingress:		ingress/clsact qdisc specific data for
+ *				ingress processing
  *	@ingress_queue:		XXX: need comments on this one
  *	@broadcast:		hw bcast address
  *
@@ -1576,7 +1578,8 @@ enum netdev_priv_flags {
  *	@tx_global_lock: 	XXX: need comments on this one
  *
  *	@xps_maps:	XXX: need comments on this one
- *
+ *	@miniq_egress:		clsact qdisc specific data for
+ *				egress processing
  *	@watchdog_timeo:	Represents the timeout that is used by
  *				the watchdog (see dev_watchdog())
  *	@watchdog_timer:	List of timers
@@ -1795,7 +1798,7 @@ struct net_device {
 	void __rcu		*rx_handler_data;
 
 #ifdef CONFIG_NET_CLS_ACT
-	struct tcf_proto __rcu  *ingress_cl_list;
+	struct mini_Qdisc __rcu	*miniq_ingress;
 #endif
 	struct netdev_queue __rcu *ingress_queue;
 #ifdef CONFIG_NETFILTER_INGRESS
@@ -1826,7 +1829,7 @@ struct net_device {
 	struct xps_dev_maps __rcu *xps_maps;
 #endif
 #ifdef CONFIG_NET_CLS_ACT
-	struct tcf_proto __rcu  *egress_cl_list;
+	struct mini_Qdisc __rcu	*miniq_egress;
 #endif
 
 	/* These may be needed for future network-power-down code. */
-- 
cgit v1.2.3


From c02762eb20cb57ec5b7c037b056c37d5838c803f Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Tue, 18 Jul 2017 16:03:17 -0500
Subject: net/mlx5: QCAM register firmware command support

The QCAM register provides capability bit for all the QoS registers
using ACCESS_REG command.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   | 14 ++++++++++++++
 include/linux/mlx5/driver.h   |  2 ++
 include/linux/mlx5/mlx5_ifc.h | 40 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 55 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index e32dbc4934db..6d79b3f79458 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1000,6 +1000,14 @@ enum mlx5_mcam_feature_groups {
 	MLX5_MCAM_FEATURE_ENHANCED_FEATURES         = 0x0,
 };
 
+enum mlx5_qcam_reg_groups {
+	MLX5_QCAM_REGS_FIRST_128                    = 0x0,
+};
+
+enum mlx5_qcam_feature_groups {
+	MLX5_QCAM_FEATURE_ENHANCED_FEATURES         = 0x0,
+};
+
 /* GET Dev Caps macros */
 #define MLX5_CAP_GEN(mdev, cap) \
 	MLX5_GET(cmd_hca_cap, mdev->caps.hca_cur[MLX5_CAP_GENERAL], cap)
@@ -1108,6 +1116,12 @@ enum mlx5_mcam_feature_groups {
 #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \
 	MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld)
 
+#define MLX5_CAP_QCAM_REG(mdev, fld) \
+	MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_access_reg_cap_mask.reg_cap.fld)
+
+#define MLX5_CAP_QCAM_FEATURE(mdev, fld) \
+	MLX5_GET(qcam_reg, (mdev)->caps.qcam, qos_feature_cap_mask.feature_cap.fld)
+
 #define MLX5_CAP_FPGA(mdev, cap) \
 	MLX5_GET(fpga_cap, (mdev)->caps.fpga, cap)
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 08c77b7e59cb..ed5be52282ea 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -109,6 +109,7 @@ enum {
 enum {
 	MLX5_REG_QETCR		 = 0x4005,
 	MLX5_REG_QTCT		 = 0x400a,
+	MLX5_REG_QCAM            = 0x4019,
 	MLX5_REG_DCBX_PARAM      = 0x4020,
 	MLX5_REG_DCBX_APP        = 0x4021,
 	MLX5_REG_FPGA_CAP	 = 0x4022,
@@ -798,6 +799,7 @@ struct mlx5_core_dev {
 		u32 pcam[MLX5_ST_SZ_DW(pcam_reg)];
 		u32 mcam[MLX5_ST_SZ_DW(mcam_reg)];
 		u32 fpga[MLX5_ST_SZ_DW(fpga_cap)];
+		u32 qcam[MLX5_ST_SZ_DW(qcam_reg)];
 	} caps;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 69772347f866..f127c5b310c5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -838,7 +838,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cc_modify_allowed[0x1];
 	u8         start_pad[0x1];
 	u8         cache_line_128byte[0x1];
-	u8         reserved_at_165[0xb];
+	u8         reserved_at_165[0xa];
+	u8         qcam_reg[0x1];
 	u8         gid_table_size[0x10];
 
 	u8         out_of_seq_cnt[0x1];
@@ -7890,6 +7891,43 @@ struct mlx5_ifc_mcam_reg_bits {
 	u8         reserved_at_1c0[0x80];
 };
 
+struct mlx5_ifc_qcam_access_reg_cap_mask {
+	u8         qcam_access_reg_cap_mask_127_to_20[0x6C];
+	u8         qpdpm[0x1];
+	u8         qcam_access_reg_cap_mask_18_to_4[0x0F];
+	u8         qdpm[0x1];
+	u8         qpts[0x1];
+	u8         qcap[0x1];
+	u8         qcam_access_reg_cap_mask_0[0x1];
+};
+
+struct mlx5_ifc_qcam_qos_feature_cap_mask {
+	u8         qcam_qos_feature_cap_mask_127_to_1[0x7F];
+	u8         qpts_trust_both[0x1];
+};
+
+struct mlx5_ifc_qcam_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         feature_group[0x8];
+	u8         reserved_at_10[0x8];
+	u8         access_reg_group[0x8];
+	u8         reserved_at_20[0x20];
+
+	union {
+		struct mlx5_ifc_qcam_access_reg_cap_mask reg_cap;
+		u8  reserved_at_0[0x80];
+	} qos_access_reg_cap_mask;
+
+	u8         reserved_at_c0[0x80];
+
+	union {
+		struct mlx5_ifc_qcam_qos_feature_cap_mask feature_cap;
+		u8  reserved_at_0[0x80];
+	} qos_feature_cap_mask;
+
+	u8         reserved_at_1c0[0x80];
+};
+
 struct mlx5_ifc_pcap_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         local_port[0x8];
-- 
cgit v1.2.3


From 71c70eb21c33c60433b95e72a59d40bb128db649 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Wed, 2 Aug 2017 21:36:23 -0500
Subject: net/mlx5: Add MLX5_SET16 and MLX5_GET16

Add MLX5_SET16 and MLX5_GET16 for 16bit structure field in firmware
command.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 6d79b3f79458..409ffb14298a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -49,11 +49,15 @@
 #define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)0)
 #define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld)
 #define __mlx5_bit_off(typ, fld) (offsetof(struct mlx5_ifc_##typ##_bits, fld))
+#define __mlx5_16_off(typ, fld) (__mlx5_bit_off(typ, fld) / 16)
 #define __mlx5_dw_off(typ, fld) (__mlx5_bit_off(typ, fld) / 32)
 #define __mlx5_64_off(typ, fld) (__mlx5_bit_off(typ, fld) / 64)
+#define __mlx5_16_bit_off(typ, fld) (16 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0xf))
 #define __mlx5_dw_bit_off(typ, fld) (32 - __mlx5_bit_sz(typ, fld) - (__mlx5_bit_off(typ, fld) & 0x1f))
 #define __mlx5_mask(typ, fld) ((u32)((1ull << __mlx5_bit_sz(typ, fld)) - 1))
 #define __mlx5_dw_mask(typ, fld) (__mlx5_mask(typ, fld) << __mlx5_dw_bit_off(typ, fld))
+#define __mlx5_mask16(typ, fld) ((u16)((1ull << __mlx5_bit_sz(typ, fld)) - 1))
+#define __mlx5_16_mask(typ, fld) (__mlx5_mask16(typ, fld) << __mlx5_16_bit_off(typ, fld))
 #define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits)
 
 #define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8)
@@ -116,6 +120,19 @@ __mlx5_mask(typ, fld))
 	___t; \
 })
 
+#define MLX5_GET16(typ, p, fld) ((be16_to_cpu(*((__be16 *)(p) +\
+__mlx5_16_off(typ, fld))) >> __mlx5_16_bit_off(typ, fld)) & \
+__mlx5_mask16(typ, fld))
+
+#define MLX5_SET16(typ, p, fld, v) do { \
+	u16 _v = v; \
+	BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 16);             \
+	*((__be16 *)(p) + __mlx5_16_off(typ, fld)) = \
+	cpu_to_be16((be16_to_cpu(*((__be16 *)(p) + __mlx5_16_off(typ, fld))) & \
+		     (~__mlx5_16_mask(typ, fld))) | (((_v) & __mlx5_mask16(typ, fld)) \
+		     << __mlx5_16_bit_off(typ, fld))); \
+} while (0)
+
 /* Big endian getters */
 #define MLX5_GET64_BE(typ, p, fld) (*((__be64 *)(p) +\
 	__mlx5_64_off(typ, fld)))
-- 
cgit v1.2.3


From 415a64aa8dc6b4fc478609c549ca652d95a12f13 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Tue, 18 Jul 2017 16:08:46 -0500
Subject: net/mlx5: QPTS and QPDPM register firmware command support

The QPTS register allows changing the priority trust state between pcp and
dscp. Add support to get/set trust state from device. When the port is
in pcp/dscp trust state, packet is routed by hardware to matching priority
based on its pcp/dscp value respectively.

The QPDPM register allow channing the dscp to priority mapping. Add support
to get/set dscp to priority mapping from device.
Note that to change a dscp mapping, the "e" bit of this dscp structure
must be set in the QPDPM firmware command.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  7 +++++++
 include/linux/mlx5/mlx5_ifc.h | 20 ++++++++++++++++++++
 include/linux/mlx5/port.h     |  5 +++++
 3 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ed5be52282ea..a886b51511ab 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -107,8 +107,10 @@ enum {
 };
 
 enum {
+	MLX5_REG_QPTS            = 0x4002,
 	MLX5_REG_QETCR		 = 0x4005,
 	MLX5_REG_QTCT		 = 0x400a,
+	MLX5_REG_QPDPM           = 0x4013,
 	MLX5_REG_QCAM            = 0x4019,
 	MLX5_REG_DCBX_PARAM      = 0x4020,
 	MLX5_REG_DCBX_APP        = 0x4021,
@@ -142,6 +144,11 @@ enum {
 	MLX5_REG_MCAM		 = 0x907f,
 };
 
+enum mlx5_qpts_trust_state {
+	MLX5_QPTS_TRUST_PCP  = 1,
+	MLX5_QPTS_TRUST_DSCP = 2,
+};
+
 enum mlx5_dcbx_oper_mode {
 	MLX5E_DCBX_PARAM_VER_OPER_HOST  = 0x0,
 	MLX5E_DCBX_PARAM_VER_OPER_AUTO  = 0x3,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f127c5b310c5..3e5363f760dd 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8578,6 +8578,26 @@ struct mlx5_ifc_qetc_reg_bits {
 	struct mlx5_ifc_ets_global_config_reg_bits global_configuration;
 };
 
+struct mlx5_ifc_qpdpm_dscp_reg_bits {
+	u8         e[0x1];
+	u8         reserved_at_01[0x0b];
+	u8         prio[0x04];
+};
+
+struct mlx5_ifc_qpdpm_reg_bits {
+	u8                                     reserved_at_0[0x8];
+	u8                                     local_port[0x8];
+	u8                                     reserved_at_10[0x10];
+	struct mlx5_ifc_qpdpm_dscp_reg_bits    dscp[64];
+};
+
+struct mlx5_ifc_qpts_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         local_port[0x8];
+	u8         reserved_at_10[0x2d];
+	u8         trust_state[0x3];
+};
+
 struct mlx5_ifc_qtct_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         port_number[0x8];
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index c59af8ab753a..035f0d4dc9fe 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -179,4 +179,9 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
 
 int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out);
 int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in);
+
+int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state);
+int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state);
+int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio);
+int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio);
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From f4e63525ee35f9c02e9f51f90571718363e9a9a9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:16 -0700
Subject: net: bpf: rename ndo_xdp to ndo_bpf

ndo_xdp is a control path callback for setting up XDP in the
driver.  We can reuse it for other forms of communication
between the eBPF stack and the drivers.  Rename the callback
and associated structures and definitions.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7de7656550c2..9af9feaaeb64 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -779,10 +779,10 @@ enum tc_setup_type {
 	TC_SETUP_CBS,
 };
 
-/* These structures hold the attributes of xdp state that are being passed
- * to the netdevice through the xdp op.
+/* These structures hold the attributes of bpf state that are being passed
+ * to the netdevice through the bpf op.
  */
-enum xdp_netdev_command {
+enum bpf_netdev_command {
 	/* Set or clear a bpf program used in the earliest stages of packet
 	 * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
 	 * is responsible for calling bpf_prog_put on any old progs that are
@@ -801,8 +801,8 @@ enum xdp_netdev_command {
 
 struct netlink_ext_ack;
 
-struct netdev_xdp {
-	enum xdp_netdev_command command;
+struct netdev_bpf {
+	enum bpf_netdev_command command;
 	union {
 		/* XDP_SETUP_PROG */
 		struct {
@@ -1124,9 +1124,10 @@ struct dev_ifalias {
  *	appropriate rx headroom value allows avoiding skb head copy on
  *	forward. Setting a negative value resets the rx headroom to the
  *	default value.
- * int (*ndo_xdp)(struct net_device *dev, struct netdev_xdp *xdp);
+ * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
  *	This function is used to set or query state related to XDP on the
- *	netdevice. See definition of enum xdp_netdev_command for details.
+ *	netdevice and manage BPF offload. See definition of
+ *	enum bpf_netdev_command for details.
  * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
  *	This function is used to submit a XDP packet for transmit on a
  *	netdevice.
@@ -1315,8 +1316,8 @@ struct net_device_ops {
 						       struct sk_buff *skb);
 	void			(*ndo_set_rx_headroom)(struct net_device *dev,
 						       int needed_headroom);
-	int			(*ndo_xdp)(struct net_device *dev,
-					   struct netdev_xdp *xdp);
+	int			(*ndo_bpf)(struct net_device *dev,
+					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev,
 						struct xdp_buff *xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
@@ -3311,10 +3312,10 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
 
-typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp);
+typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags);
-u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id);
+u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t xdp_op, u32 *prog_id);
 
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
-- 
cgit v1.2.3


From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:17 -0700
Subject: bpf: offload: add infrastructure for loading programs for a specific
 netdev

The fact that we don't know which device the program is going
to be used on is quite limiting in current eBPF infrastructure.
We have to reverse or limit the changes which kernel makes to
the loaded bytecode if we want it to be offloaded to a networking
device.  We also have to invent new APIs for debugging and
troubleshooting support.

Make it possible to load programs for a specific netdev.  This
helps us to bring the debug information closer to the core
eBPF infrastructure (e.g. we will be able to reuse the verifer
log in device JIT).  It allows device JITs to perform translation
on the original bytecode.

__bpf_prog_get() when called to get a reference for an attachment
point will now refuse to give it if program has a device assigned.
Following patches will add a version of that function which passes
the expected netdev in. @type argument in __bpf_prog_get() is
renamed to attach_type to make it clearer that it's only set on
attachment.

All calls to ndo_bpf are protected by rtnl, only verifier callbacks
are not.  We need a wait queue to make sure netdev doesn't get
destroyed while verifier is still running and calling its driver.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/bpf_verifier.h | 10 ++++++++++
 include/linux/netdevice.h    | 14 ++++++++++++++
 3 files changed, 60 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520aeebe0d93..e45d43f9ec92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
+#include <linux/wait.h>
 
 struct perf_event;
 struct bpf_prog;
@@ -182,6 +183,16 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 };
 
+struct bpf_dev_offload {
+	struct bpf_prog		*prog;
+	struct net_device	*netdev;
+	void			*dev_priv;
+	struct list_head	offloads;
+	bool			dev_state;
+	bool			verifier_running;
+	wait_queue_head_t	verifier_done;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -199,6 +210,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_dev_offload *offload;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops;
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_prog_ops bpf_offload_prog_ops;
 extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
@@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+int bpf_prog_offload_compile(struct bpf_prog *prog);
+void bpf_prog_offload_destroy(struct bpf_prog *prog);
+
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return aux->offload;
+}
+#else
+static inline int bpf_prog_offload_init(struct bpf_prog *prog,
+					union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return false;
+}
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
+
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3b0976aaac75..e45011dbc02d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -153,6 +153,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
+	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
@@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 	return env->cur_state->regs;
 }
 
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
+#else
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9af9feaaeb64..fda527ccb263 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -797,8 +797,13 @@ enum bpf_netdev_command {
 	 * is equivalent to XDP_ATTACHED_DRV.
 	 */
 	XDP_QUERY_PROG,
+	/* BPF program for offload callbacks, invoked at program load time. */
+	BPF_OFFLOAD_VERIFIER_PREP,
+	BPF_OFFLOAD_TRANSLATE,
+	BPF_OFFLOAD_DESTROY,
 };
 
+struct bpf_ext_analyzer_ops;
 struct netlink_ext_ack;
 
 struct netdev_bpf {
@@ -815,6 +820,15 @@ struct netdev_bpf {
 			u8 prog_attached;
 			u32 prog_id;
 		};
+		/* BPF_OFFLOAD_VERIFIER_PREP */
+		struct {
+			struct bpf_prog *prog;
+			const struct bpf_ext_analyzer_ops *ops; /* callee set */
+		} verifier;
+		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
+		struct {
+			struct bpf_prog *prog;
+		} offload;
 	};
 };
 
-- 
cgit v1.2.3


From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:18 -0700
Subject: bpf: report offload info to user space

Extend struct bpf_prog_info to contain information about program
being bound to a device.  Since the netdev may get destroyed while
program still exists we need a flag to indicate the program is
loaded for a device, even if the device is gone.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e45d43f9ec92..98bacd0fa5cc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
-- 
cgit v1.2.3


From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:20 -0700
Subject: xdp: allow attaching programs loaded for specific device

Pass the netdev pointer to bpf_prog_get_type().  This way
BPF code can decide whether the device matches what the
code was loaded/translated for.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 98bacd0fa5cc..c397934f91dd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -335,6 +335,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+				       struct net_device *netdev);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
@@ -428,6 +430,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
+						     enum bpf_prog_type type,
+						     struct net_device *netdev)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
 							  int i)
 {
-- 
cgit v1.2.3


From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:30 -0700
Subject: bpf: remove old offload/analyzer

Thanks to the ability to load a program for a specific device,
running verifier twice is no longer needed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e45011dbc02d..07b96aaca256 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -152,9 +152,7 @@ struct bpf_verifier_env {
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
-	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
 	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
-	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
@@ -179,7 +177,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 }
 #endif
 
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-		 void *priv);
-
 #endif /* _LINUX_BPF_VERIFIER_H */
-- 
cgit v1.2.3


From 1f2556916d974cfb62b6af51660186b5f58bd869 Mon Sep 17 00:00:00 2001
From: Priyaranjan Jha <priyarjha@google.com>
Date: Fri, 3 Nov 2017 16:38:48 -0700
Subject: tcp: higher throughput under reordering with adaptive RACK reordering
 wnd

Currently TCP RACK loss detection does not work well if packets are
being reordered beyond its static reordering window (min_rtt/4).Under
such reordering it may falsely trigger loss recoveries and reduce TCP
throughput significantly.

This patch improves that by increasing and reducing the reordering
window based on DSACK, which is now supported in major TCP implementations.
It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries.

- If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
  by srtt), since there is possibility that spurious retransmission was
  due to reordering delay longer than reo_wnd.

- Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
  no. of successful recoveries (accounts for full DSACK-based loss
  recovery undo). After that, reset it to default (min_rtt/4).

- At max, reo_wnd is incremented only once per rtt. So that the new
  DSACK on which we are reacting, is due to the spurious retx (approx)
  after the reo_wnd has been updated last time.

- reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
  absolute value to account for change in rtt.

In our internal testing, we observed significant increase in throughput,
in scenarios where reordering exceeds min_rtt/4 (previous static value).

Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 8c431385b272..22f40c96a15b 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -210,8 +210,13 @@ struct tcp_sock {
 		u64 mstamp; /* (Re)sent time of the skb */
 		u32 rtt_us;  /* Associated RTT */
 		u32 end_seq; /* Ending TCP sequence of the skb */
-		u8 advanced; /* mstamp advanced since last lost marking */
-		u8 reord;    /* reordering detected */
+		u32 last_delivered; /* tp->delivered at last reo_wnd adj */
+		u8 reo_wnd_steps;   /* Allowed reordering window */
+#define TCP_RACK_RECOVERY_THRESH 16
+		u8 reo_wnd_persist:5, /* No. of recovery since last adj */
+		   dsack_seen:1, /* Whether DSACK seen after last adj */
+		   advanced:1,	 /* mstamp advanced since last lost marking */
+		   reord:1;	 /* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
-- 
cgit v1.2.3


From ecf8fecb7828648cba0e42de7464a7e600c93459 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sun, 5 Nov 2017 08:15:31 -0500
Subject: device_cgroup: prepare code for bpf-based device controller

This is non-functional change to prepare the device cgroup code
for adding eBPF-based controller for cgroups v2.

The patch performs the following changes:
1) __devcgroup_inode_permission() and devcgroup_inode_mknod()
   are moving to the device-cgroup.h and converting into static inline.
2) __devcgroup_check_permission() is exported.
3) devcgroup_check_permission() wrapper is introduced to be used
   by both existing and new bpf-based implementations.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/device_cgroup.h | 61 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index cdbc344a92e4..2d93d7ecd479 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -1,17 +1,70 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/fs.h>
 
+#define DEVCG_ACC_MKNOD 1
+#define DEVCG_ACC_READ  2
+#define DEVCG_ACC_WRITE 4
+#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE)
+
+#define DEVCG_DEV_BLOCK 1
+#define DEVCG_DEV_CHAR  2
+#define DEVCG_DEV_ALL   4  /* this represents all devices */
+
+#ifdef CONFIG_CGROUP_DEVICE
+extern int __devcgroup_check_permission(short type, u32 major, u32 minor,
+					short access);
+#else
+static inline int __devcgroup_check_permission(short type, u32 major, u32 minor,
+					       short access)
+{ return 0; }
+#endif
+
 #ifdef CONFIG_CGROUP_DEVICE
-extern int __devcgroup_inode_permission(struct inode *inode, int mask);
-extern int devcgroup_inode_mknod(int mode, dev_t dev);
+static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
+					     short access)
+{
+	return __devcgroup_check_permission(type, major, minor, access);
+}
+
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 {
+	short type, access = 0;
+
 	if (likely(!inode->i_rdev))
 		return 0;
-	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
+
+	if (S_ISBLK(inode->i_mode))
+		type = DEVCG_DEV_BLOCK;
+	else if (S_ISCHR(inode->i_mode))
+		type = DEVCG_DEV_CHAR;
+	else
 		return 0;
-	return __devcgroup_inode_permission(inode, mask);
+
+	if (mask & MAY_WRITE)
+		access |= DEVCG_ACC_WRITE;
+	if (mask & MAY_READ)
+		access |= DEVCG_ACC_READ;
+
+	return devcgroup_check_permission(type, imajor(inode), iminor(inode),
+					  access);
 }
+
+static inline int devcgroup_inode_mknod(int mode, dev_t dev)
+{
+	short type;
+
+	if (!S_ISBLK(mode) && !S_ISCHR(mode))
+		return 0;
+
+	if (S_ISBLK(mode))
+		type = DEVCG_DEV_BLOCK;
+	else
+		type = DEVCG_DEV_CHAR;
+
+	return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev),
+					  DEVCG_ACC_MKNOD);
+}
+
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 { return 0; }
-- 
cgit v1.2.3


From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sun, 5 Nov 2017 08:15:32 -0500
Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2

Cgroup v2 lacks the device controller, provided by cgroup v1.
This patch adds a new eBPF program type, which in combination
of previously added ability to attach multiple eBPF programs
to a cgroup, will provide a similar functionality, but with some
additional flexibility.

This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type.
A program takes major and minor device numbers, device type
(block/character) and access type (mknod/read/write) as parameters
and returns an integer which defines if the operation should be
allowed or terminated with -EPERM.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h    | 15 +++++++++++++++
 include/linux/bpf_types.h     |  3 +++
 include/linux/device_cgroup.h |  8 +++++++-
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 87a7db9feb38..a7f16e0f8d68 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
 				     enum bpf_attach_type type);
 
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	}								       \
 	__ret;								       \
 })
+
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)	      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
+							  access,	      \
+							  BPF_CGROUP_DEVICE); \
+									      \
+	__ret;								      \
+})
 #else
 
 struct cgroup_bpf {};
@@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 53c5b9ad7220..978c1d9c9383 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index 2d93d7ecd479..8557efe096dc 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/fs.h>
+#include <linux/bpf-cgroup.h>
 
 #define DEVCG_ACC_MKNOD 1
 #define DEVCG_ACC_READ  2
@@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor,
 { return 0; }
 #endif
 
-#ifdef CONFIG_CGROUP_DEVICE
+#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
 static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
 					     short access)
 {
+	int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);
+
+	if (rc)
+		return -EPERM;
+
 	return __devcgroup_check_permission(type, major, minor, access);
 }
 
-- 
cgit v1.2.3


From 3928ee6485a316c8abde7e24c7f82033a1c8d3ae Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <mail@maciej.szmigiero.name>
Date: Thu, 2 Nov 2017 00:49:18 +0100
Subject: net: phy: leds: Add support for "link" trigger

Currently, we create a LED trigger for any link speed known to a PHY.
These triggers only fire when their exact link speed had been negotiated
(they aren't cumulative, that is, they don't fire for "their or any higher"
link speed).

What we are missing, however, is a trigger which will fire on any link
speed known to the PHY. Such trigger can then be used for implementing a
poor man's substitute of the "link" LED on boards that lack it.
Let's add it.

Signed-off-by: Maciej S. Szmigiero <mail@maciej.szmigiero.name>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index d78cd01ea513..dc82a07cb4fd 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -451,6 +451,8 @@ struct phy_device {
 	struct phy_led_trigger *phy_led_triggers;
 	unsigned int phy_num_led_triggers;
 	struct phy_led_trigger *last_triggered;
+
+	struct phy_led_trigger *led_link_trigger;
 #endif
 
 	/*
-- 
cgit v1.2.3


From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 6 Nov 2017 07:23:41 +0100
Subject: net_sch: red: Add offload ability to RED qdisc

Add the ability to offload RED qdisc by using ndo_setup_tc.
There are four commands for RED offloading:
* TC_RED_SET: handles set and change.
* TC_RED_DESTROY: handle qdisc destroy.
* TC_RED_STATS: update the qdiscs counters (given as reference)
* TC_RED_XSTAT: returns red xstats.

Whether RED is being offloaded is being determined every time dump action
is being called because parent change of this qdisc could change its
offload state but doesn't require any RED function to be called.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fda527ccb263..71968a2ca9f3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -777,6 +777,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSBPF,
 	TC_SETUP_BLOCK,
 	TC_SETUP_CBS,
+	TC_SETUP_QDISC_RED,
 };
 
 /* These structures hold the attributes of bpf state that are being passed
-- 
cgit v1.2.3


From 575ed7d39e2fbe602a3894bc766a8cb49af83bd3 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 6 Nov 2017 07:23:42 +0100
Subject: net_sch: mqprio: Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO

Change TC_SETUP_MQPRIO to TC_SETUP_QDISC_MQPRIO to match the new
convention.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 71968a2ca9f3..703885aed856 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -770,7 +770,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 				       struct sk_buff *skb);
 
 enum tc_setup_type {
-	TC_SETUP_MQPRIO,
+	TC_SETUP_QDISC_MQPRIO,
 	TC_SETUP_CLSU32,
 	TC_SETUP_CLSFLOWER,
 	TC_SETUP_CLSMATCHALL,
-- 
cgit v1.2.3


From 8521db4c7e155d12fb280686c0552e47f77e9110 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Mon, 6 Nov 2017 07:23:43 +0100
Subject: net_sch: cbs: Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS

Change TC_SETUP_CBS to TC_SETUP_QDISC_CBS to match the new convention..

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 703885aed856..30f0f2928808 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -776,7 +776,7 @@ enum tc_setup_type {
 	TC_SETUP_CLSMATCHALL,
 	TC_SETUP_CLSBPF,
 	TC_SETUP_BLOCK,
-	TC_SETUP_CBS,
+	TC_SETUP_QDISC_CBS,
 	TC_SETUP_QDISC_RED,
 };
 
-- 
cgit v1.2.3


From 620a5c860b774a81ce3f193eefb52bf4d128cca5 Mon Sep 17 00:00:00 2001
From: Egil Hjelmeland <privat@egil-hjelmeland.no>
Date: Mon, 6 Nov 2017 12:42:01 +0100
Subject: net: dsa: lan9303: Correct register names in comments

Two comments refer to registers, but lack the LAN9303_ prefix.
Fix that.

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/lan9303.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dsa/lan9303.h b/include/linux/dsa/lan9303.h
index 05d8d136baab..f48a85c377de 100644
--- a/include/linux/dsa/lan9303.h
+++ b/include/linux/dsa/lan9303.h
@@ -13,8 +13,8 @@ struct lan9303_phy_ops {
 #define LAN9303_NUM_ALR_RECORDS 512
 struct lan9303_alr_cache_entry {
 	u8  mac_addr[ETH_ALEN];
-	u8  port_map;           /* Bitmap of ports. Zero if unused entry */
-	u8  stp_override;       /* non zero if set ALR_DAT1_AGE_OVERRID */
+	u8  port_map;         /* Bitmap of ports. Zero if unused entry */
+	u8  stp_override;     /* non zero if set LAN9303_ALR_DAT1_AGE_OVERRID */
 };
 
 struct lan9303 {
@@ -28,7 +28,9 @@ struct lan9303 {
 	struct mutex indirect_mutex; /* protect indexed register access */
 	const struct lan9303_phy_ops *ops;
 	bool is_bridged; /* true if port 1 and 2 are bridged */
-	u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */
+
+	/* remember LAN9303_SWE_PORT_STATE while not bridged */
+	u32 swe_port_state;
 	/* LAN9303 do not offer reading specific ALR entry. Cache all
 	 * static entries in a flat table
 	 **/
-- 
cgit v1.2.3


From 96c623e51f1c40bf524decc48c6fac7ce5dd41f7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 6 Nov 2017 14:26:10 +0100
Subject: of: add of_property_read_variable_* dummy helpers

Commit a67e9472da42 ("of: Add array read functions with min/max size
limits") added a new interface for reading variable-length arrays from
DT properties. One user was added in dsa recently and this causes a
build error because that code can be built with CONFIG_OF disabled:

net/dsa/dsa2.c: In function 'dsa_switch_parse_member_of':
net/dsa/dsa2.c:678:7: error: implicit declaration of function 'of_property_read_variable_u32_array'; did you mean 'of_property_read_u32_array'? [-Werror=implicit-function-declaration]

This adds a dummy functions for of_property_read_variable_u32_array()
and a few others that had been missing here. I decided to move
of_property_read_string() and of_property_read_string_helper() in the
process to make it easier to compare the two sets of function prototypes
to make sure they match.

Fixes: 975e6e32215e ("net: dsa: rework switch parsing")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/of.h | 62 +++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index b240ed69dc96..b32d418d011a 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -675,12 +675,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np,
 	return -ENOSYS;
 }
 
-static inline int of_property_read_u32_index(const struct device_node *np,
-			const char *propname, u32 index, u32 *out_value)
-{
-	return -ENOSYS;
-}
-
 static inline int of_property_read_u8_array(const struct device_node *np,
 			const char *propname, u8 *out_values, size_t sz)
 {
@@ -707,16 +701,14 @@ static inline int of_property_read_u64_array(const struct device_node *np,
 	return -ENOSYS;
 }
 
-static inline int of_property_read_string(const struct device_node *np,
-					  const char *propname,
-					  const char **out_string)
+static inline int of_property_read_u32_index(const struct device_node *np,
+			const char *propname, u32 index, u32 *out_value)
 {
 	return -ENOSYS;
 }
 
-static inline int of_property_read_string_helper(const struct device_node *np,
-						 const char *propname,
-						 const char **out_strs, size_t sz, int index)
+static inline int of_property_read_u64_index(const struct device_node *np,
+			const char *propname, u32 index, u64 *out_value)
 {
 	return -ENOSYS;
 }
@@ -744,12 +736,51 @@ static inline int of_n_size_cells(struct device_node *np)
 	return 0;
 }
 
+static inline int of_property_read_variable_u8_array(const struct device_node *np,
+					const char *propname, u8 *out_values,
+					size_t sz_min, size_t sz_max)
+{
+	return -ENOSYS;
+}
+
+static inline int of_property_read_variable_u16_array(const struct device_node *np,
+					const char *propname, u16 *out_values,
+					size_t sz_min, size_t sz_max)
+{
+	return -ENOSYS;
+}
+
+static inline int of_property_read_variable_u32_array(const struct device_node *np,
+					const char *propname,
+					u32 *out_values,
+					size_t sz_min,
+					size_t sz_max)
+{
+	return -ENOSYS;
+}
+
 static inline int of_property_read_u64(const struct device_node *np,
 				       const char *propname, u64 *out_value)
 {
 	return -ENOSYS;
 }
 
+static inline int of_property_read_variable_u64_array(const struct device_node *np,
+					const char *propname,
+					u64 *out_values,
+					size_t sz_min,
+					size_t sz_max)
+{
+	return -ENOSYS;
+}
+
+static inline int of_property_read_string(const struct device_node *np,
+					  const char *propname,
+					  const char **out_string)
+{
+	return -ENOSYS;
+}
+
 static inline int of_property_match_string(const struct device_node *np,
 					   const char *propname,
 					   const char *string)
@@ -757,6 +788,13 @@ static inline int of_property_match_string(const struct device_node *np,
 	return -ENOSYS;
 }
 
+static inline int of_property_read_string_helper(const struct device_node *np,
+						 const char *propname,
+						 const char **out_strs, size_t sz, int index)
+{
+	return -ENOSYS;
+}
+
 static inline struct device_node *of_parse_phandle(const struct device_node *np,
 						   const char *phandle_name,
 						   int index)
-- 
cgit v1.2.3


From 375ef2b1f0d0b43b0d36ffdd521637ff59b0c13c Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 17 Sep 2017 13:43:58 +0300
Subject: net: Introduce netdev_*_once functions

Extend the net device error logging with netdev_*_once macros.
netdev_*_once are the equivalents of the dev_*_once macros which are
useful for messages that should only be logged once.

Also add netdev_WARN_ONCE, which is the "once" extension for the already
existing netdev_WARN macro.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 30f0f2928808..79518ede3170 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4336,6 +4336,31 @@ void netdev_notice(const struct net_device *dev, const char *format, ...);
 __printf(2, 3)
 void netdev_info(const struct net_device *dev, const char *format, ...);
 
+#define netdev_level_once(level, dev, fmt, ...)			\
+do {								\
+	static bool __print_once __read_mostly;			\
+								\
+	if (!__print_once) {					\
+		__print_once = true;				\
+		netdev_printk(level, dev, fmt, ##__VA_ARGS__);	\
+	}							\
+} while (0)
+
+#define netdev_emerg_once(dev, fmt, ...) \
+	netdev_level_once(KERN_EMERG, dev, fmt, ##__VA_ARGS__)
+#define netdev_alert_once(dev, fmt, ...) \
+	netdev_level_once(KERN_ALERT, dev, fmt, ##__VA_ARGS__)
+#define netdev_crit_once(dev, fmt, ...) \
+	netdev_level_once(KERN_CRIT, dev, fmt, ##__VA_ARGS__)
+#define netdev_err_once(dev, fmt, ...) \
+	netdev_level_once(KERN_ERR, dev, fmt, ##__VA_ARGS__)
+#define netdev_warn_once(dev, fmt, ...) \
+	netdev_level_once(KERN_WARNING, dev, fmt, ##__VA_ARGS__)
+#define netdev_notice_once(dev, fmt, ...) \
+	netdev_level_once(KERN_NOTICE, dev, fmt, ##__VA_ARGS__)
+#define netdev_info_once(dev, fmt, ...) \
+	netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
+
 #define MODULE_ALIAS_NETDEV(device) \
 	MODULE_ALIAS("netdev-" device)
 
@@ -4376,6 +4401,10 @@ do {								\
 	WARN(1, "netdevice: %s%s\n" format, netdev_name(dev),	\
 	     netdev_reg_state(dev), ##args)
 
+#define netdev_WARN_ONCE(dev, condition, format, arg...)		\
+	WARN_ONCE(1, "netdevice: %s%s\n" format, netdev_name(dev)	\
+		  netdev_reg_state(dev), ##args)
+
 /* netif printk helpers, similar to netdev_printk */
 
 #define netif_printk(priv, type, level, dev, fmt, args...)	\
-- 
cgit v1.2.3


From 4382c7b92a1db397874ca62c73aa8b023af6dba8 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 10 Sep 2017 13:22:51 +0300
Subject: net/mlx5e: Add 802.1ad VLAN insertion support

Report VLAN insertion support for S-tagged packets and add support by
choosing the correct VLAN type in the WQE.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/qp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 66d19b611fe4..62af7512dabb 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -221,6 +221,7 @@ enum {
 };
 
 enum {
+	MLX5_ETH_WQE_SVLAN              = 1 << 0,
 	MLX5_ETH_WQE_INSERT_VLAN        = 1 << 15,
 };
 
-- 
cgit v1.2.3


From 54985120a1c461b74f9510e5d730971f2a2383b1 Mon Sep 17 00:00:00 2001
From: Girish Moodalbail <girish.moodalbail@oracle.com>
Date: Tue, 7 Nov 2017 11:32:11 -0800
Subject: net: fix incorrect comment with regard to VLAN packet handling

The commit bcc6d4790361 ("net: vlan: make non-hw-accel rx path similar
to hw-accel") unified accel and non-accel path for VLAN RX. With that
fix we do not register any packet_type handler for VLANs anymore, so fix
the incorrect comment.

Signed-off-by: Girish Moodalbail <girish.moodalbail@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 79518ede3170..6b274bfe489f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4479,15 +4479,7 @@ do {								\
  *	Why 16. Because with 16 the only overlap we get on a hash of the
  *	low nibble of the protocol value is RARP/SNAP/X.25.
  *
- *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
- *             sure which should go first, but I bet it won't make much
- *             difference if we are running VLANs.  The good news is that
- *             this protocol won't be in the list unless compiled in, so
- *             the average user (w/out VLANs) will not be adversely affected.
- *             --BLG
- *
  *		0800	IP
- *		8100    802.1Q VLAN
  *		0001	802.3
  *		0002	AX.25
  *		0004	802.2
-- 
cgit v1.2.3


From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 7 Nov 2017 15:28:42 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h       | 3 ++-
 include/linux/trace_events.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0cd02ff4ae30..eaec066f99e8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,7 +459,8 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				kprobe_override:1; /* Do we override a kprobe? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 84014ecfa67f..17e5e820a84c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,6 +523,7 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
-- 
cgit v1.2.3


From 2210d6b2f287d738eddf6b75f432126ce05450f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= <maze@google.com>
Date: Tue, 7 Nov 2017 21:52:09 -0800
Subject: net: ipv6: sysctl to specify IPv6 ND traffic class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a per-device sysctl to specify the default traffic class to use for
kernel originated IPv6 Neighbour Discovery packets.

Currently this includes:

  - Router Solicitation (ICMPv6 type 133)
    ndisc_send_rs() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Neighbour Solicitation (ICMPv6 type 135)
    ndisc_send_ns() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Neighbour Advertisement (ICMPv6 type 136)
    ndisc_send_na() -> ndisc_send_skb() -> ip6_nd_hdr()

  - Redirect (ICMPv6 type 137)
    ndisc_send_redirect() -> ndisc_send_skb() -> ip6_nd_hdr()

and if the kernel ever gets around to generating RA's,
it would presumably also include:

  - Router Advertisement (ICMPv6 type 134)
    (radvd daemon could pick up on the kernel setting and use it)

Interface drivers may examine the Traffic Class value and translate
the DiffServ Code Point into a link-layer appropriate traffic
prioritization scheme.  An example of mapping IETF DSCP values to
IEEE 802.11 User Priority values can be found here:

    https://tools.ietf.org/html/draft-ietf-tsvwg-ieee-802-11

The expected primary use case is to properly prioritize ND over wifi.

Testing:
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  0
  jzem22:~# echo -1 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  -bash: echo: write error: Invalid argument
  jzem22:~# echo 256 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  -bash: echo: write error: Invalid argument
  jzem22:~# echo 0 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# echo 255 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  255
  jzem22:~# echo 34 > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# cat /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  34

  jzem22:~# echo $[0xDC] > /proc/sys/net/ipv6/conf/eth0/ndisc_tclass
  jzem22:~# tcpdump -v -i eth0 icmp6 and src host jzem22.pgc and dst host fe80::1
  tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 262144 bytes
  IP6 (class 0xdc, hlim 255, next-header ICMPv6 (58) payload length: 24)
  jzem22.pgc > fe80::1: [icmp6 sum ok] ICMP6, neighbor advertisement,
  length 24, tgt is jzem22.pgc, Flags [solicited]

(based on original change written by Erik Kline, with minor changes)

v2: fix 'suspicious rcu_dereference_check() usage'
    by explicitly grabbing the rcu_read_lock.

Cc: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: Erik Kline <ek@google.com>
Signed-off-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ea04ca024f0d..cb18c6290ca8 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -73,6 +73,7 @@ struct ipv6_devconf {
 	__u32		enhanced_dad;
 	__u32		addr_gen_mode;
 	__s32		disable_policy;
+	__s32           ndisc_tclass;
 
 	struct ctl_table_header *sysctl_header;
 };
-- 
cgit v1.2.3


From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Nov 2017 18:24:55 +0900
Subject: bpf: Revert bpf_overrid_function() helper changes.

NACK'd by x86 maintainer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h       | 3 +--
 include/linux/trace_events.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index eaec066f99e8..0cd02ff4ae30 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,8 +459,7 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1,	/* Do we need dst entry? */
-				kprobe_override:1; /* Do we override a kprobe? */
+				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 17e5e820a84c..84014ecfa67f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,7 +523,6 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
-DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
-- 
cgit v1.2.3


From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 8 Nov 2017 13:01:26 -0800
Subject: tcp: retire FACK loss detection

FACK loss detection has been disabled by default and the
successor RACK subsumed FACK and can handle reordering better.
This patch removes FACK to simplify TCP loss recovery.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 22f40c96a15b..9574936fe041 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -85,7 +85,6 @@ struct tcp_sack_block {
 
 /*These are used to set the sack_ok field in struct tcp_options_received */
 #define TCP_SACK_SEEN     (1 << 0)   /*1 = peer is SACK capable, */
-#define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/
 #define TCP_DSACK_SEEN    (1 << 2)   /*1 = DSACK was received from peer*/
 
 struct tcp_options_received {
-- 
cgit v1.2.3


From 737ff314563ca27f044f9a3a041e9d42491ef7ce Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Wed, 8 Nov 2017 13:01:27 -0800
Subject: tcp: use sequence distance to detect reordering

Replace the reordering distance measurement in packet unit with
sequence based approach. Previously it trackes the number of "packets"
toward the forward ACK (i.e.  highest sacked sequence)in a state
variable "fackets_out".

Precisely measuring reordering degree on packet distance has not much
benefit, as the degree constantly changes by factors like path, load,
and congestion window. It is also complicated and prone to arcane bugs.
This patch replaces with sequence-based approach that's much simpler.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 9574936fe041..df5d97a85e1a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -293,7 +293,6 @@ struct tcp_sock {
 	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
 	u32	lost_out;	/* Lost packets			*/
 	u32	sacked_out;	/* SACK'd packets			*/
-	u32	fackets_out;	/* FACK'd packets			*/
 
 	struct hrtimer	pacing_timer;
 
-- 
cgit v1.2.3


From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001
From: Mat Martineau <mathew.j.martineau@linux.intel.com>
Date: Fri, 10 Nov 2017 14:03:51 -0800
Subject: net: Remove unused skb_shared_info member

ip6_frag_id was only used by UFO, which has been removed.
ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no
in-tree callers.

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 57d712671081..54fe91183a8e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -500,7 +500,6 @@ struct skb_shared_info {
 	struct skb_shared_hwtstamps hwtstamps;
 	unsigned int	gso_type;
 	u32		tskey;
-	__be32          ip6_frag_id;
 
 	/*
 	 * Warning : all fields before dataref are cleared in __alloc_skb()
-- 
cgit v1.2.3


From 096d1dd0f03211fb42d6c2457f248827604b7f0e Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 13 Nov 2017 16:19:46 +0100
Subject: netlink: remove unused NETLINK SKB flags

These flags are unused, remove them to be less confusing.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 6ddb4a5da371..49b4257ce1ea 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -17,9 +17,6 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb)
 }
 
 enum netlink_skb_flags {
-	NETLINK_SKB_MMAPED	= 0x1,	/* Packet data is mmaped */
-	NETLINK_SKB_TX		= 0x2,	/* Packet was sent by userspace */
-	NETLINK_SKB_DELIVERED	= 0x4,	/* Packet was delivered */
 	NETLINK_SKB_DST		= 0x8,	/* Dst set in sendto or sendmsg */
 };
 
-- 
cgit v1.2.3