From 02ec6cafd78c2052283516afc74c309745d20271 Mon Sep 17 00:00:00 2001
From: Hoang Le <hoang.h.le@dektech.com.au>
Date: Tue, 19 Mar 2019 18:49:48 +0700
Subject: tipc: support broadcast/replicast configurable for bc-link

Currently, a multicast stream uses either broadcast or replicast as its
transmission method, based on the ratio between the number of actual
destination nodes and the cluster size.

However, when an L2 interface (e.g., VXLAN) provides pseudo
broadcast support, this becomes very inefficient, as it blindly
replicates multicast packets to all cluster/subnet nodes,
irrespective of whether they host actual target sockets or not.

The TIPC multicast algorithm is able to distinguish real destination
nodes from other nodes, and hence provides a smarter and more
efficient method for transferring multicast messages than
pseudo broadcast can do.

Because of this, we now make it possible for users to force
the broadcast link to permanently switch to using replicast,
irrespective of which capabilities the bearer provides,
or pretends to provide.
Conversely, we also make it possible to force the broadcast link
to always use true broadcast. While maybe less useful in
deployed systems, this may at least be useful for testing the
broadcast algorithm in small clusters.

We retain the current AUTOSELECT ability, i.e., to let the broadcast link
automatically select which algorithm to use, and to switch back and forth
between broadcast and replicast as the ratio between destination
node number and cluster size changes. This remains the default method.

Furthermore, we make it possible to configure the threshold ratio for
such switches. The default ratio is now set to 10%, down from 25% in the
earlier implementation.

Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Hoang Le <hoang.h.le@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc_netlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h
index 0ebe02ef1a86..efb958fd167d 100644
--- a/include/uapi/linux/tipc_netlink.h
+++ b/include/uapi/linux/tipc_netlink.h
@@ -281,6 +281,8 @@ enum {
 	TIPC_NLA_PROP_TOL,		/* u32 */
 	TIPC_NLA_PROP_WIN,		/* u32 */
 	TIPC_NLA_PROP_MTU,		/* u32 */
+	TIPC_NLA_PROP_BROADCAST,	/* u32 */
+	TIPC_NLA_PROP_BROADCAST_RATIO,	/* u32 */
 
 	__TIPC_NLA_PROP_MAX,
 	TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1
-- 
cgit v1.2.3


From 9403cf2302588022d06f1878b072d3f6933021f0 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Tue, 19 Mar 2019 16:05:44 +0100
Subject: tcp: free request sock directly upon TFO or syncookies error

Since the request socket is created locally, it'd make more sense to
use reqsk_free() instead of reqsk_put() in TFO and syncookies' error
path.

However, tcp_get_cookie_sock() may set ->rsk_refcnt before freeing the
socket; tcp_conn_request() may also have non-null ->rsk_refcnt because
of tcp_try_fastopen(). In both cases 'req' hasn't been exposed
to the outside world and is safe to free immediately, but that'd
trigger the WARN_ON_ONCE in reqsk_free().

Define __reqsk_free() for these situations where we know nobody's
referencing the socket, even though ->rsk_refcnt might be non-null.
Now we can consolidate the error path of tcp_get_cookie_sock() and
tcp_conn_request().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/request_sock.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 21a5243fecd1..9dfd7960d90a 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -106,10 +106,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 	return req;
 }
 
-static inline void reqsk_free(struct request_sock *req)
+static inline void __reqsk_free(struct request_sock *req)
 {
-	WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0);
-
 	req->rsk_ops->destructor(req);
 	if (req->rsk_listener)
 		sock_put(req->rsk_listener);
@@ -117,6 +115,12 @@ static inline void reqsk_free(struct request_sock *req)
 	kmem_cache_free(req->rsk_ops->slab, req);
 }
 
+static inline void reqsk_free(struct request_sock *req)
+{
+	WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0);
+	__reqsk_free(req);
+}
+
 static inline void reqsk_put(struct request_sock *req)
 {
 	if (refcount_dec_and_test(&req->rsk_refcnt))
-- 
cgit v1.2.3


From 03f1eccc7a69c965351e6bee41c62afa2844752f Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Tue, 19 Mar 2019 12:37:12 -0400
Subject: ipv6: Add icmp_echo_ignore_multicast support for ICMPv6

IPv4 has icmp_echo_ignore_broadcasts to prevent responding to broadcast pings.
IPv6 needs a similar mechanism.

v1->v2:
- Remove NET_IPV6_ICMP_ECHO_IGNORE_MULTICAST.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index b028a1dc150d..e29aff15acc9 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -33,6 +33,7 @@ struct netns_sysctl_ipv6 {
 	int auto_flowlabels;
 	int icmpv6_time;
 	int icmpv6_echo_ignore_all;
+	int icmpv6_echo_ignore_multicast;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
-- 
cgit v1.2.3


From f295b3ae9f5927e084bd5decdff82390e3471801 Mon Sep 17 00:00:00 2001
From: Vakul Garg <vakul.garg@nxp.com>
Date: Wed, 20 Mar 2019 02:03:36 +0000
Subject: net/tls: Add support of AES128-CCM based ciphers

Added support for AES128-CCM based record encryption. AES128-CCM is
similar to AES128-GCM. Both of them have same salt/iv/mac size. The
notable difference between the two is that while invoking AES128-CCM
operation, the salt||nonce (which is passed as IV) has to be prefixed
with a hardcoded value '2'. Further, CCM implementation in kernel
requires IV passed in crypto_aead_request() to be full '16' bytes.
Therefore, the record structure 'struct tls_rec' has been modified to
reserve '16' bytes for IV. This works for both GCM and CCM based cipher.

Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h        | 15 +++++++++++++--
 include/uapi/linux/tls.h | 15 +++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index a5a938583295..3ce71d78414c 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -60,6 +60,17 @@
 #define TLS_AAD_SPACE_SIZE		13
 #define TLS_DEVICE_NAME_MAX		32
 
+#define MAX_IV_SIZE			16
+
+/* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes.
+ *
+ * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3]
+ *
+ * The field 'length' is encoded in field 'b0' as '(length width - 1)'.
+ * Hence b0 contains (3 - 1) = 2.
+ */
+#define TLS_AES_CCM_IV_B0_BYTE		2
+
 /*
  * This structure defines the routines for Inline TLS driver.
  * The following routines are optional and filled with a
@@ -123,8 +134,7 @@ struct tls_rec {
 	struct scatterlist sg_content_type;
 
 	char aad_space[TLS_AAD_SPACE_SIZE];
-	u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE +
-		   TLS_CIPHER_AES_GCM_128_SALT_SIZE];
+	u8 iv_data[MAX_IV_SIZE];
 	struct aead_request aead_req;
 	u8 aead_req_ctx[];
 };
@@ -219,6 +229,7 @@ struct tls_prot_info {
 	u16 tag_size;
 	u16 overhead_size;
 	u16 iv_size;
+	u16 salt_size;
 	u16 rec_seq_size;
 	u16 aad_size;
 	u16 tail_size;
diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h
index 401d6f01de6a..5b9c26753e46 100644
--- a/include/uapi/linux/tls.h
+++ b/include/uapi/linux/tls.h
@@ -70,6 +70,13 @@
 #define TLS_CIPHER_AES_GCM_256_TAG_SIZE		16
 #define TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE		8
 
+#define TLS_CIPHER_AES_CCM_128				53
+#define TLS_CIPHER_AES_CCM_128_IV_SIZE			8
+#define TLS_CIPHER_AES_CCM_128_KEY_SIZE		16
+#define TLS_CIPHER_AES_CCM_128_SALT_SIZE		4
+#define TLS_CIPHER_AES_CCM_128_TAG_SIZE		16
+#define TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE		8
+
 #define TLS_SET_RECORD_TYPE	1
 #define TLS_GET_RECORD_TYPE	2
 
@@ -94,4 +101,12 @@ struct tls12_crypto_info_aes_gcm_256 {
 	unsigned char rec_seq[TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE];
 };
 
+struct tls12_crypto_info_aes_ccm_128 {
+	struct tls_crypto_info info;
+	unsigned char iv[TLS_CIPHER_AES_CCM_128_IV_SIZE];
+	unsigned char key[TLS_CIPHER_AES_CCM_128_KEY_SIZE];
+	unsigned char salt[TLS_CIPHER_AES_CCM_128_SALT_SIZE];
+	unsigned char rec_seq[TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE];
+};
+
 #endif /* _UAPI_LINUX_TLS_H */
-- 
cgit v1.2.3


From 4bd97d51a5e602ea1fbdab8c2d653513dea17115 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Mar 2019 11:02:04 +0100
Subject: net: dev: rename queue selection helpers.

With the following patches, we are going to use __netdev_pick_tx() in
many modules. Rename it to netdev_pick_tx(), to make it clear that it is
a public API.

Also rename the existing netdev_pick_tx() to netdev_core_pick_tx(),
to avoid name clashes.

Suggested-by: Eric Dumazet <edumazet@google.com>
Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 26f69cf763f4..57cd2bdd9f78 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2152,9 +2152,9 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 				  &qdisc_xmit_lock_key);	\
 }
 
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
-				    struct sk_buff *skb,
-				    struct net_device *sb_dev);
+struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
+					 struct sk_buff *skb,
+					 struct net_device *sb_dev);
 
 /* returns the headroom that the master device needs to take in account
  * when forwarding to this dev
-- 
cgit v1.2.3


From b71b5837f8711dbc4bc0424cb5c75e5921be055c Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Mar 2019 11:02:05 +0100
Subject: packet: rework packet_pick_tx_queue() to use common code selection

Currently packet_pick_tx_queue() is the only caller of
ndo_select_queue() using a fallback argument other than
netdev_pick_tx.

Leveraging rx queue, we can obtain a similar queue selection
behavior using core helpers. After this change, ndo_select_queue()
is always invoked with netdev_pick_tx() as fallback.
We can change ndo_select_queue() signature in a followup patch,
dropping an indirect call per transmitted packet in some scenarios
(e.g. TCP syn and XDP generic xmit)

This slightly changes how af_packet queue selection happens when
PACKET_QDISC_BYPASS is set. It's now more similar to plain dev_queue_xmit(),
taking into account both XPS and TC mapping.

 v1  -> v2:
  - rebased after helper name change
 RFC -> v1:
  - initialize sender_cpu to the expected value

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 57cd2bdd9f78..0ff28db4239f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2152,6 +2152,8 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev,
 				  &qdisc_xmit_lock_key);	\
 }
 
+u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+		     struct net_device *sb_dev);
 struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
 					 struct sk_buff *skb,
 					 struct net_device *sb_dev);
-- 
cgit v1.2.3


From a350eccee5830d9a1f29e393a88dc05a15326d44 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 20 Mar 2019 11:02:06 +0100
Subject: net: remove 'fallback' argument from dev->ndo_select_queue()

After the previous patch, all the callers of ndo_select_queue()
provide as a 'fallback' argument netdev_pick_tx.
The only exceptions are nested calls to ndo_select_queue(),
which pass down the 'fallback' available in the current scope
- still netdev_pick_tx.

We can drop such argument and replace fallback() invocation with
netdev_pick_tx(). This avoids an indirect call per xmit packet
in some scenarios (TCP syn, UDP unconnected, XDP generic, pktgen)
with device drivers implementing such ndo. It also cleans up the code
a bit.

Tested with ixgbe and CONFIG_FCOE=m

With pktgen using queue xmit:
threads		vanilla 	patched
		(kpps)		(kpps)
1		2334		2428
2		4166		4278
4		7895		8100

 v1 -> v2:
 - rebased after helper's name change

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0ff28db4239f..823762291ebf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -986,8 +986,7 @@ struct devlink;
  *	those the driver believes to be appropriate.
  *
  * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
- *                         struct net_device *sb_dev,
- *                         select_queue_fallback_t fallback);
+ *                         struct net_device *sb_dev);
  *	Called to decide which queue to use when device supports multiple
  *	transmit queues.
  *
@@ -1268,8 +1267,7 @@ struct net_device_ops {
 						      netdev_features_t features);
 	u16			(*ndo_select_queue)(struct net_device *dev,
 						    struct sk_buff *skb,
-						    struct net_device *sb_dev,
-						    select_queue_fallback_t fallback);
+						    struct net_device *sb_dev);
 	void			(*ndo_change_rx_flags)(struct net_device *dev,
 						       int flags);
 	void			(*ndo_set_rx_mode)(struct net_device *dev);
@@ -2641,11 +2639,9 @@ void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
-		     struct net_device *sb_dev,
-		     select_queue_fallback_t fallback);
+		     struct net_device *sb_dev);
 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
-		       struct net_device *sb_dev,
-		       select_queue_fallback_t fallback);
+		       struct net_device *sb_dev);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
-- 
cgit v1.2.3


From 0b03a5ca8b14321366eec4a903922d2b46d585ff Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Wed, 20 Mar 2019 10:29:27 -0400
Subject: ipv6: Add icmp_echo_ignore_anycast for ICMPv6

In addition to icmp_echo_ignore_multicast, there is a need to also
prevent responding to pings to anycast addresses for security.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index e29aff15acc9..64e29b58bb5e 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 {
 	int icmpv6_time;
 	int icmpv6_echo_ignore_all;
 	int icmpv6_echo_ignore_multicast;
+	int icmpv6_echo_ignore_anycast;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
-- 
cgit v1.2.3


From c7a1ce397adacaf5d4bb2eab0a738b5f80dc3e43 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Thu, 21 Mar 2019 05:21:35 -0700
Subject: ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create

Change addrconf_f6i_alloc to generate a fib6_config and call
ip6_route_info_create. addrconf_f6i_alloc is the last caller to
fib6_info_alloc besides ip6_route_info_create, and there is no
reason for it to do its own initialization on a fib6_info.

Host routes need to be created even if the device is down, so add a
new flag, fc_ignore_dev_down, to fib6_config and update fib6_nh_init
to not error out if device is not up.

Notes on the conversion:
- ip_fib_metrics_init behaves the same, as fib6_config has fc_mx set to
  NULL and fc_mx_len set to 0
- dst_nocount is handled by the RTF_ADDRCONF flag
- dst_host is handled by fc_dst_len = 128

nh_gw does not get set after the conversion to ip6_route_info_create
but it should not be set in addrconf_f6i_alloc since this is a host
route not a gateway route.

Everything else is a straight forward map between fib6_info and
fib6_config.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 84097010237c..2acb78a762ee 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -50,7 +50,8 @@ struct fib6_config {
 	u32		fc_protocol;
 	u16		fc_type;        /* only 8 bits are used */
 	u16		fc_delete_all_nh : 1,
-			__unused : 15;
+			fc_ignore_dev_down:1,
+			__unused : 14;
 
 	struct in6_addr	fc_dst;
 	struct in6_addr	fc_src;
-- 
cgit v1.2.3


From 0c3e0e3bb623c3735b8c9ab8aa8332f944f83a9f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 20 Mar 2019 12:16:42 +0300
Subject: tun: Add ioctl() TUNGETDEVNETNS cmd to allow obtaining real net ns of
 tun device

In commit f2780d6d7475 "tun: Add ioctl() SIOCGSKNS cmd to allow
obtaining net ns of tun device" it was missed that tun may change
its net ns, while net ns of socket remains the same as it was
created initially. SIOCGSKNS returns net ns of socket, so it is
not suitable for obtaining net ns of device.

We may have two tun devices with the same name in two net ns,
and in this case it's not possible to determine which of them
the fd refers to (TUNGETIFF will return the same name).

This patch adds new ioctl() cmd for obtaining net ns of a device.

Reported-by: Harald Albrecht <harald.albrecht@gmx.net>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_tun.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 23a6753b37df..454ae31b93c7 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@
 #define TUNSETSTEERINGEBPF _IOR('T', 224, int)
 #define TUNSETFILTEREBPF _IOR('T', 225, int)
 #define TUNSETCARRIER _IOW('T', 226, int)
+#define TUNGETDEVNETNS _IO('T', 227)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
cgit v1.2.3


From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 20 Mar 2019 09:18:59 -0700
Subject: ipv4: Allow amount of dirty memory from fib resizing to be
 controllable

fib_trie implementation calls synchronize_rcu when a certain amount of
pages are dirty from freed entries. The number of pages was determined
experimentally in 2009 (commit c3059477fce2d).

At the current setting, synchronize_rcu is called often -- 51 times in a
second in one test with an average of an 8 msec delay adding a fib entry.
The total impact is a lot of slow down modifying the fib. This is seen
in the output of 'time' - the difference between real time and sys+user.
For example, using 720,022 single path routes and 'ip -batch'[1]:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m14.214s
    user    0m2.513s
    sys     0m6.783s

So roughly 35% of the actual time to install the routes is from the ip
command getting scheduled out, most notably due to synchronize_rcu (this
is observed using 'perf sched timehist').

This patch makes the amount of dirty memory configurable between 64k where
the synchronize_rcu is called often (small, low end systems that are memory
sensitive) to 64M where synchronize_rcu is called rarely during a large
FIB change (for high end systems with lots of memory). The default is 512kB
which corresponds to the current setting of 128 pages with a 4kB page size.

As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu
in a second blocking for up to 30 msec in a single instance, and a total
of almost 100 msec across the 4 calls in the second. The trade off is
allowing FIB entries to consume more memory in a given time window,
but with much better fib insertion rates (~30% increase in prefixes/sec).
With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch
file runs in:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m9.692s
    user    0m2.491s
    sys     0m6.769s

So the dead time is reduced to about 1/2 second or <5% of the real time.

[1] 'ip' modified to not request ACK messages which improves route
    insertion times by about 20%

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index be3cad9c2e4c..aa09ae5f01a5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -38,6 +38,10 @@
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 #define IPV4_MIN_MTU		68			/* RFC 791 */
 
+extern unsigned int sysctl_fib_sync_mem;
+extern unsigned int sysctl_fib_sync_mem_min;
+extern unsigned int sysctl_fib_sync_mem_max;
+
 struct sock;
 
 struct inet_skb_parm {
-- 
cgit v1.2.3


From 02afc7ad45bd6cfc9fd51fdbc132455371b63469 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 20 Mar 2019 20:02:56 +0100
Subject: net: dst: remove gc leftovers

Get rid of some obsolete gc-related documentation and macros that were
missed in commit 5b7c9a8ff828 ("net: remove dst gc related code").

CC: Wei Wang <weiwan@google.com>
Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Acked-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/net/dst.h b/include/net/dst.h
index 6cf0870414c7..12b31c602cb0 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -19,17 +19,6 @@
 #include <net/neighbour.h>
 #include <asm/processor.h>
 
-#define DST_GC_MIN	(HZ/10)
-#define DST_GC_INC	(HZ/2)
-#define DST_GC_MAX	(120*HZ)
-
-/* Each dst_entry has reference count and sits in some parent list(s).
- * When it is removed from parent list, it is "freed" (dst_free).
- * After this it enters dead state (dst->obsolete > 0) and if its refcnt
- * is zero, it can be destroyed immediately, otherwise it is added
- * to gc list and garbage collector periodically checks the refcnt.
- */
-
 struct sk_buff;
 
 struct dst_entry {
-- 
cgit v1.2.3


From 4feb7c7a4fbb8f63371be31cda79433c7cf3da86 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 21 Mar 2019 14:42:40 +1100
Subject: rhashtable: don't hold lock on first table throughout insertion.

rhashtable_try_insert() currently holds a lock on the bucket in
the first table, while also locking buckets in subsequent tables.
This is unnecessary and looks like a hold-over from some earlier
version of the implementation.

As insert and remove always lock a bucket in each table in turn, and
as insert only inserts in the final table, there cannot be any races
that are not covered by simply locking a bucket in each table in turn.

When an insert call reaches that last table it can be sure that there
is no matching entry in any other table as it has searched them all, and
insertion never happens anywhere but in the last table.  The fact that
code tests for the existence of future_tbl while holding a lock on
the relevant bucket ensures that two threads inserting the same key
will make compatible decisions about which is the "last" table.

This simplifies the code and allows the ->rehash field to be
discarded.

We still need a way to ensure that a dead bucket_table is never
re-linked by rhashtable_walk_stop().  This can be achieved by calling
call_rcu() inside the locked region, and checking with
rcu_head_after_call_rcu() in rhashtable_walk_stop() to see if the
bucket table is empty and dead.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index ae9c0f71f311..3864193d5e2e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -63,7 +63,6 @@
 struct bucket_table {
 	unsigned int		size;
 	unsigned int		nest;
-	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
 	spinlock_t		*locks;
@@ -776,12 +775,6 @@ static inline int rhltable_insert(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * This lookup function may only be used for fixed key hash table (key_len
  * parameter set). It will BUG() if used inappropriately.
  *
@@ -837,12 +830,6 @@ static inline void *rhashtable_lookup_get_insert_fast(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  *
  * Will trigger an automatic deferred table resizing if residency in the
-- 
cgit v1.2.3


From f7ad68bf98506f48129267438ada1255fc4edfa2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 21 Mar 2019 14:42:40 +1100
Subject: rhashtable: rename rht_for_each*continue as *from.

The pattern set by list.h is that for_each..continue()
iterators start at the next entry after the given one,
while for_each..from() iterators start at the given
entry.

The rht_for_each*continue() iterators are documented as though they
start at the 'next' entry, but actually start at the given entry,
and they are used expecting that behaviour.
So fix the documentation and change the names to *from for consistency
with list.h.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Acked-by: Miguel Ojeda <miguel.ojeda.sandonis@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 3864193d5e2e..86dfa417848d 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -306,13 +306,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
 }
 
 /**
- * rht_for_each_continue - continue iterating over hash chain
+ * rht_for_each_from - iterate over hash chain from given head
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  */
-#define rht_for_each_continue(pos, head, tbl, hash) \
+#define rht_for_each_from(pos, head, tbl, hash) \
 	for (pos = rht_dereference_bucket(head, tbl, hash); \
 	     !rht_is_a_nulls(pos); \
 	     pos = rht_dereference_bucket((pos)->next, tbl, hash))
@@ -324,18 +324,18 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+	rht_for_each_from(pos, *rht_bucket(tbl, hash), tbl, hash)
 
 /**
- * rht_for_each_entry_continue - continue iterating over hash chain
+ * rht_for_each_entry_from - iterate over hash chain from given head
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
-#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member)	\
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)	\
 	for (pos = rht_dereference_bucket(head, tbl, hash);		\
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	\
 	     pos = rht_dereference_bucket((pos)->next, tbl, hash))
@@ -349,7 +349,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash),	\
+	rht_for_each_entry_from(tpos, pos, *rht_bucket(tbl, hash),	\
 				    tbl, hash, member)
 
 /**
@@ -374,9 +374,9 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL)
 
 /**
- * rht_for_each_rcu_continue - continue iterating over rcu hash chain
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  *
@@ -384,7 +384,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_rcu_continue(pos, head, tbl, hash)			\
+#define rht_for_each_rcu_from(pos, head, tbl, hash)			\
 	for (({barrier(); }),						\
 	     pos = rht_dereference_bucket_rcu(head, tbl, hash);		\
 	     !rht_is_a_nulls(pos);					\
@@ -401,13 +401,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_rcu(pos, tbl, hash)				\
-	rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+	rht_for_each_rcu_from(pos, *rht_bucket(tbl, hash), tbl, hash)
 
 /**
- * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
+ * rht_for_each_entry_rcu_from - iterate over rcu hash chain from given head
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  * @member:	name of the &struct rhash_head within the hashable struct.
@@ -416,7 +416,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
 	for (({barrier(); }),						    \
 	     pos = rht_dereference_bucket_rcu(head, tbl, hash);		    \
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	    \
@@ -435,7 +435,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
-	rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \
+	rht_for_each_entry_rcu_from(tpos, pos, *rht_bucket(tbl, hash), \
 					tbl, hash, member)
 
 /**
@@ -491,7 +491,7 @@ restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	head = rht_bucket(tbl, hash);
 	do {
-		rht_for_each_rcu_continue(he, *head, tbl, hash) {
+		rht_for_each_rcu_from(he, *head, tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
@@ -625,7 +625,7 @@ slow_path:
 	if (!pprev)
 		goto out;
 
-	rht_for_each_continue(head, *pprev, tbl, hash) {
+	rht_for_each_from(head, *pprev, tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -890,7 +890,7 @@ static inline int __rhashtable_remove_fast_one(
 	spin_lock_bh(lock);
 
 	pprev = rht_bucket_var(tbl, hash);
-	rht_for_each_continue(he, *pprev, tbl, hash) {
+	rht_for_each_from(he, *pprev, tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -1042,7 +1042,7 @@ static inline int __rhashtable_replace_fast(
 	spin_lock_bh(lock);
 
 	pprev = rht_bucket_var(tbl, hash);
-	rht_for_each_continue(he, *pprev, tbl, hash) {
+	rht_for_each_from(he, *pprev, tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
-- 
cgit v1.2.3


From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Fri, 22 Mar 2019 09:54:00 +0800
Subject: bpf: allow helpers to return PTR_TO_SOCK_COMMON

It's currently not possible to access timewait or request sockets
from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON
from a helper. Introduce RET_PTR_TO_SOCK_COMMON_OR_NULL to enable
this behaviour.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f02367faa58d..f62897198844 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -205,6 +205,7 @@ enum bpf_return_type {
 	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
 	RET_PTR_TO_SOCKET_OR_NULL,	/* returns a pointer to a socket or NULL */
 	RET_PTR_TO_TCP_SOCK_OR_NULL,	/* returns a pointer to a tcp_sock or NULL */
+	RET_PTR_TO_SOCK_COMMON_OR_NULL,	/* returns a pointer to a sock_common or NULL */
 };
 
 /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
-- 
cgit v1.2.3


From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Fri, 22 Mar 2019 09:54:01 +0800
Subject: bpf: add skc_lookup_tcp helper

Allow looking up a sock_common. This gives eBPF programs
access to timewait and request sockets.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 929c8e537a14..fab05317f5e7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2431,6 +2431,23 @@ union bpf_attr {
  *	Return
  *		A **struct bpf_sock** pointer on success, or **NULL** in
  *		case of failure.
+ *
+ * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ *	Description
+ *		Look for TCP socket matching *tuple*, optionally in a child
+ *		network namespace *netns*. The return value must be checked,
+ *		and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ *		This function is identical to bpf_sk_lookup_tcp, except that it
+ *		also returns timewait or request sockets. Use bpf_sk_fullsock
+ *		or bpf_tcp_socket to access the full structure.
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		**CONFIG_NET** configuration option.
+ *	Return
+ *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ *		For sockets with reuseport option, the **struct bpf_sock**
+ *		result is from **reuse->socks**\ [] using the hash of the tuple.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2531,7 +2548,8 @@ union bpf_attr {
 	FN(sk_fullsock),		\
 	FN(tcp_sock),			\
 	FN(skb_ecn_set_ce),		\
-	FN(get_listener_sock),
+	FN(get_listener_sock),		\
+	FN(skc_lookup_tcp),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 399040847084a69f345e0a52fd62f04654e0fce3 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Fri, 22 Mar 2019 09:54:02 +0800
Subject: bpf: add helper to check for a valid SYN cookie

Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet
belongs to a known connection. However, there is one corner case: no
sockets are created if SYN cookies are active. This means that the final
ACK in the 3WHS is misclassified.

Using the helper, we can look up the listening socket via
bpf_skc_lookup_tcp and then check whether a packet is a valid SYN
cookie ACK.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fab05317f5e7..3c04410137d9 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2448,6 +2448,21 @@ union bpf_attr {
  *		Pointer to **struct bpf_sock**, or **NULL** in case of failure.
  *		For sockets with reuseport option, the **struct bpf_sock**
  *		result is from **reuse->socks**\ [] using the hash of the tuple.
+ *
+ * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+ * 	Description
+ * 		Check whether iph and th contain a valid SYN cookie ACK for
+ * 		the listening socket in sk.
+ *
+ * 		iph points to the start of the IPv4 or IPv6 header, while
+ * 		iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr).
+ *
+ * 		th points to the start of the TCP header, while th_len contains
+ * 		sizeof(struct tcphdr).
+ *
+ * 	Return
+ * 		0 if iph and th are a valid SYN cookie ACK, or a negative error
+ * 		otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2549,7 +2564,8 @@ union bpf_attr {
 	FN(tcp_sock),			\
 	FN(skb_ecn_set_ce),		\
 	FN(get_listener_sock),		\
-	FN(skc_lookup_tcp),
+	FN(skc_lookup_tcp),		\
+	FN(tcp_check_syncookie),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 21 Mar 2019 22:51:02 +0100
Subject: genetlink: make policy common to family

Since maxattr is common, the policy can't really differ sanely,
so make it common as well.

The only user that did in fact manage to make a non-common policy
is taskstats, which has to be really careful about it (since it's
still using a common maxattr!). This is no longer supported, but
we can fake it using pre_doit.

This reduces the size of e.g. nl80211.o (which has lots of commands):

   text	   data	    bss	    dec	    hex	filename
 398745	  14323	   2240	 415308	  6564c	net/wireless/nl80211.o (before)
 397913	  14331	   2240	 414484	  65314	net/wireless/nl80211.o (after)
--------------------------------
   -832      +8       0    -824

Which is obviously just 8 bytes for each command, and an added 8
bytes for the new policy pointer. I'm not sure why the ops list is
counted as .text though.

Most of the code transformations were done using the following spatch:
    @ops@
    identifier OPS;
    expression POLICY;
    @@
    struct genl_ops OPS[] = {
    ...,
     {
    -	.policy = POLICY,
     },
    ...
    };

    @@
    identifier ops.OPS;
    expression ops.POLICY;
    identifier fam;
    expression M;
    @@
    struct genl_family fam = {
            .ops = OPS,
            .maxattr = M,
    +       .policy = POLICY,
            ...
    };

This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing
the cb->data as ops, which we want to change in a later genl patch.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/genl_magic_func.h | 4 ++--
 include/net/genetlink.h         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 83f81ac53282..6cb82301d8e9 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -233,7 +233,6 @@ const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd)
 {								\
 	handler							\
 	.cmd = op_name,						\
-	.policy	= CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy),	\
 },
 
 #define ZZZ_genl_ops		CONCAT_(GENL_MAGIC_FAMILY, _genl_ops)
@@ -290,7 +289,8 @@ static struct genl_family ZZZ_genl_family __ro_after_init = {
 #ifdef GENL_MAGIC_FAMILY_HDRSZ
 	.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
 #endif
-	.maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
+	.maxattr = ARRAY_SIZE(CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy))-1,
+	.policy	= CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy),
 	.ops = ZZZ_genl_ops,
 	.n_ops = ARRAY_SIZE(ZZZ_genl_ops),
 	.mcgrps = ZZZ_genl_mcgrps,
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index aa2e5888f18d..6850c7b1a3a6 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -26,6 +26,7 @@ struct genl_info;
  * @name: name of family
  * @version: protocol version
  * @maxattr: maximum number of attributes supported
+ * @policy: netlink policy
  * @netnsok: set to true if the family can handle network
  *	namespaces and should be presented in all of them
  * @parallel_ops: operations can be called in parallel and aren't
@@ -56,6 +57,7 @@ struct genl_family {
 	unsigned int		maxattr;
 	bool			netnsok;
 	bool			parallel_ops;
+	const struct nla_policy *policy;
 	int			(*pre_doit)(const struct genl_ops *ops,
 					    struct sk_buff *skb,
 					    struct genl_info *info);
@@ -124,14 +126,12 @@ static inline int genl_err_attr(struct genl_info *info, int err,
  * @cmd: command identifier
  * @internal_flags: flags used by the family
  * @flags: flags
- * @policy: attribute validation policy
  * @doit: standard command callback
  * @start: start callback for dumps
  * @dumpit: callback for dumpers
  * @done: completion callback for dumps
  */
 struct genl_ops {
-	const struct nla_policy	*policy;
 	int		       (*doit)(struct sk_buff *skb,
 				       struct genl_info *info);
 	int		       (*start)(struct netlink_callback *cb);
-- 
cgit v1.2.3


From 974eff2b5793eeaa2eb433bca7eba9640d890c4a Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Thu, 21 Mar 2019 15:51:36 -0700
Subject: net: Move the definition of the default Geneve udp port to public
 header file

Move the definition of the default Geneve udp port from the geneve
source to the header file, so we can re-use it from drivers.
Modify existing drivers to use it.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Cc: John Hurley <john.hurley@netronome.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/geneve.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/geneve.h b/include/net/geneve.h
index fc6a7e0a874a..bced0b1d9fe4 100644
--- a/include/net/geneve.h
+++ b/include/net/geneve.h
@@ -4,6 +4,8 @@
 
 #include <net/udp_tunnel.h>
 
+#define GENEVE_UDP_PORT		6081
+
 /* Geneve Header:
  *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  *  |Ver|  Opt Len  |O|C|    Rsvd.  |          Protocol Type        |
-- 
cgit v1.2.3


From bea964107fa78ffe484ef8659ecc26f9ae2bcd2f Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Thu, 21 Mar 2019 15:51:39 -0700
Subject: net: Add IANA_VXLAN_UDP_PORT definition to vxlan header file

Added IANA_VXLAN_UDP_PORT (4789) definition to vxlan header file so it
can be used by drivers instead of local definition.
Updated drivers which locally defined it as 4789 to use it.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Cc: John Hurley <john.hurley@netronome.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Cc: Yunsheng Lin <linyunsheng@huawei.com>
Cc: Peng Li <lipeng321@huawei.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/net/vxlan.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 00254a58824b..83b5999a2587 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -8,6 +8,8 @@
 #include <net/rtnetlink.h>
 #include <net/switchdev.h>
 
+#define IANA_VXLAN_UDP_PORT     4789
+
 /* VXLAN protocol (RFC 7348) header:
  * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
  * |R|R|R|R|I|R|R|R|               Reserved                        |
-- 
cgit v1.2.3


From 0eb69bb9962973f4852bb35b8151332c98741770 Mon Sep 17 00:00:00 2001
From: Eli Britstein <elibr@mellanox.com>
Date: Thu, 21 Mar 2019 15:51:40 -0700
Subject: net/mlx5e: Add VLAN ID rewrite fields

Add VLAN ID rewrite fields as a pre-step to support this rewrite.

Signed-off-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 3b83288749c6..b0e17c94566c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5110,6 +5110,7 @@ enum {
 	MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0    = 0x14,
 	MLX5_ACTION_IN_FIELD_OUT_SIPV4         = 0x15,
 	MLX5_ACTION_IN_FIELD_OUT_DIPV4         = 0x16,
+	MLX5_ACTION_IN_FIELD_OUT_FIRST_VID     = 0x17,
 	MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47,
 };
 
-- 
cgit v1.2.3


From 14aa31929b724b70fb63a9b0e7877da325b25cfe Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Mar 2019 14:32:54 -0400
Subject: bpf: add bpf_skb_adjust_room mode BPF_ADJ_ROOM_MAC

bpf_skb_adjust_room allows inserting room in an skb.

Existing mode BPF_ADJ_ROOM_NET inserts room after the network header
by pulling the skb, moving the network header forward and zeroing the
new space.

Add new mode BPF_ADJ_ROOM_MAC that inserts room after the mac
header. This allows inserting tunnel headers in front of the network
header without having to recreate the network header in the original
space, avoiding two copies.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3c04410137d9..7c8fd0647070 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1478,7 +1478,10 @@ union bpf_attr {
  * 		Grow or shrink the room for data in the packet associated to
  * 		*skb* by *len_diff*, and according to the selected *mode*.
  *
- * 		There is a single supported mode at this time:
+ *		There are two supported modes at this time:
+ *
+ *		* **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
+ *		  (room space is added or removed below the layer 2 header).
  *
  * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
  * 		  (room space is added or removed below the layer 3 header).
@@ -2627,6 +2630,7 @@ enum bpf_func_id {
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
+	BPF_ADJ_ROOM_MAC,
 };
 
 /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
-- 
cgit v1.2.3


From 2278f6cc151a8bef6ba0b3fe3009d14dc3c51c4a Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Mar 2019 14:32:55 -0400
Subject: bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_FIXED_GSO

bpf_skb_adjust_room adjusts gso_size of gso packets to account for the
pushed or popped header room.

This is not allowed with UDP, where gso_size delineates datagrams. Add
an option to avoid these updates and allow this call for datagrams.

It can also be used with TCP, when MSS is known to allow headroom,
e.g., through MSS clamping or route MTU.

Changes v1->v2:
  - document flag BPF_F_ADJ_ROOM_FIXED_GSO
  - do not expose BPF_F_ADJ_ROOM_MASK through uapi, as it may change.

Link: https://patchwork.ozlabs.org/patch/1052497/
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7c8fd0647070..4f157d0ec571 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1486,8 +1486,10 @@ union bpf_attr {
  * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
  * 		  (room space is added or removed below the layer 3 header).
  *
- * 		All values for *flags* are reserved for future usage, and must
- * 		be left at zero.
+ *		There is one supported flag at this time:
+ *
+ *		* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
+ *		  Adjusting mss in this way is not allowed for datagrams.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2627,6 +2629,9 @@ enum bpf_func_id {
 /* Current network namespace */
 #define BPF_F_CURRENT_NETNS		(-1L)
 
+/* BPF_FUNC_skb_adjust_room flags. */
+#define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
-- 
cgit v1.2.3


From 868d523535c2d00b696753ece606e641a816e91e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 22 Mar 2019 14:32:56 -0400
Subject: bpf: add bpf_skb_adjust_room encap flags

When pushing tunnel headers, annotate skbs in the same way as tunnel
devices.

For GSO packets, the network stack requires certain fields set to
segment packets with tunnel headers. gre_gso_segment depends on
transport and inner mac header, for instance.

Add an option to pass this information.

Remove the restriction on len_diff to network header length, which
is too short, e.g., for GRE protocols.

Changes
  v1->v2:
  - document new flags
  - BPF_F_ADJ_ROOM_MASK moved
  v2->v3:
  - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4f157d0ec571..837024512baf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1486,11 +1486,20 @@ union bpf_attr {
  * 		* **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
  * 		  (room space is added or removed below the layer 3 header).
  *
- *		There is one supported flag at this time:
+ *		The following flags are supported at this time:
  *
  *		* **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
  *		  Adjusting mss in this way is not allowed for datagrams.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **:
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **:
+ *		  Any new space is reserved to hold a tunnel header.
+ *		  Configure skb offsets and other fields accordingly.
+ *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **:
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
+ *		  Use with ENCAP_L3 flags to further specify the tunnel type.
+ *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -2632,6 +2641,11 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_adjust_room flags. */
 #define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
 
+#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
+#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
+#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
+#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
-- 
cgit v1.2.3


From 576fd2f7cac3daa36025f0039f9e7cb75b4b4ae0 Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Fri, 22 Mar 2019 10:59:47 -0400
Subject: tcp: add documentation for tcp_ca_state

Add documentation to the tcp_ca_state enum, since this enum is
exposed in uapi.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Sowmini Varadhan <sowmini05@gmail.com>
Acked-by: Sowmini Varadhan <sowmini05@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8bb6cc5f3235..b521464ea962 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -160,15 +160,42 @@ enum {
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
 
+/*
+ * Sender's congestion state indicating normal or abnormal situations
+ * in the last round of packets sent. The state is driven by the ACK
+ * information and timer events.
+ */
 enum tcp_ca_state {
+	/*
+	 * Nothing bad has been observed recently.
+	 * No apparent reordering, packet loss, or ECN marks.
+	 */
 	TCP_CA_Open = 0,
 #define TCPF_CA_Open	(1<<TCP_CA_Open)
+	/*
+	 * The sender enters disordered state when it has received DUPACKs or
+	 * SACKs in the last round of packets sent. This could be due to packet
+	 * loss or reordering but needs further information to confirm packets
+	 * have been lost.
+	 */
 	TCP_CA_Disorder = 1,
 #define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
+	/*
+	 * The sender enters Congestion Window Reduction (CWR) state when it
+	 * has received ACKs with ECN-ECE marks, or has experienced congestion
+	 * or packet discard on the sender host (e.g. qdisc).
+	 */
 	TCP_CA_CWR = 2,
 #define TCPF_CA_CWR	(1<<TCP_CA_CWR)
+	/*
+	 * The sender is in fast recovery and retransmitting lost packets,
+	 * typically triggered by ACK events.
+	 */
 	TCP_CA_Recovery = 3,
 #define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
+	/*
+	 * The sender is in loss recovery triggered by retransmission timeout.
+	 */
 	TCP_CA_Loss = 4
 #define TCPF_CA_Loss	(1<<TCP_CA_Loss)
 };
-- 
cgit v1.2.3


From 28cff537ef2eed9307bc7e4e40745075637bec56 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 22 Mar 2019 16:01:55 +0100
Subject: net: sched: add empty status flag for NOLOCK qdisc

The queue is marked not empty after acquiring the seqlock,
and it's up to the NOLOCK qdisc to clear that status on dequeue.
Since the empty status lies on the same cache line as the
seqlock, it's always hot in cache during the updates.

This makes the empty flag update a little bit loose. Given
the lack of synchronization between enqueue and dequeue, this
is unavoidable.

v2 -> v3:
 - qdisc_is_empty() has a const argument (Eric)

v1 -> v2:
 - use really an 'empty' flag instead of 'not_empty', as
   suggested by Eric

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 31284c078d06..e227475e78ca 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -113,6 +113,9 @@ struct Qdisc {
 
 	spinlock_t		busylock ____cacheline_aligned_in_smp;
 	spinlock_t		seqlock;
+
+	/* for NOLOCK qdisc, true if there are no enqueued skbs */
+	bool			empty;
 	struct rcu_head		rcu;
 };
 
@@ -143,11 +146,19 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
+static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
+{
+	if (qdisc->flags & TCQ_F_NOLOCK)
+		return qdisc->empty;
+	return !qdisc->q.qlen;
+}
+
 static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 {
 	if (qdisc->flags & TCQ_F_NOLOCK) {
 		if (!spin_trylock(&qdisc->seqlock))
 			return false;
+		qdisc->empty = false;
 	} else if (qdisc_is_running(qdisc)) {
 		return false;
 	}
-- 
cgit v1.2.3


From dc05360fee660a9dbe59824b3f7896534210432b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:38 -0700
Subject: net: convert rps_needed and rfs_needed to new static branch api

We prefer static_branch_unlikely() over static_key_false() these days.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 include/net/sock.h        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 823762291ebf..166fdc0a78b4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -194,8 +194,8 @@ struct net_device_stats {
 
 #ifdef CONFIG_RPS
 #include <linux/static_key.h>
-extern struct static_key rps_needed;
-extern struct static_key rfs_needed;
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
 #endif
 
 struct neighbour;
diff --git a/include/net/sock.h b/include/net/sock.h
index 8de5ee258b93..fecdf639225c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -966,7 +966,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	if (static_key_false(&rfs_needed)) {
+	if (static_branch_unlikely(&rfs_needed)) {
 		/* Reading sk->sk_rxhash might incur an expensive cache line
 		 * miss.
 		 *
-- 
cgit v1.2.3


From 472c2e07eef045145bc1493cc94a01c87140780a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:39 -0700
Subject: tcp: add one skb cache for tx

On hosts with a lot of cores, RPC workloads suffer from heavy contention on slab spinlocks.

    20.69%  [kernel]       [k] queued_spin_lock_slowpath
     5.64%  [kernel]       [k] _raw_spin_lock
     3.83%  [kernel]       [k] syscall_return_via_sysret
     3.48%  [kernel]       [k] __entry_text_start
     1.76%  [kernel]       [k] __netif_receive_skb_core
     1.64%  [kernel]       [k] __fget

For each sendmsg(), we allocate one skb, and free it when the ACK packet arrives.

In many cases, ACK packets are handled by other cpus, and this unfortunately
incurs heavy costs for the slab layer.

This patch uses an extra pointer in socket structure, so that we try to reuse
the same skb and avoid these expensive costs.

We cache at most one skb per socket so this should be safe as far as
memory pressure is concerned.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index fecdf639225c..314c47a8f5d1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -414,6 +414,7 @@ struct sock {
 		struct sk_buff	*sk_send_head;
 		struct rb_root	tcp_rtx_queue;
 	};
+	struct sk_buff		*sk_tx_skb_cache;
 	struct sk_buff_head	sk_write_queue;
 	__s32			sk_peek_off;
 	int			sk_write_pending;
@@ -1463,6 +1464,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	if (!sk->sk_tx_skb_cache) {
+		sk->sk_tx_skb_cache = skb;
+		return;
+	}
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
-- 
cgit v1.2.3


From 8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 22 Mar 2019 08:56:40 -0700
Subject: tcp: add one skb cache for rx

Often times, recvmsg() system calls and BH handling for a particular
TCP socket are done on different cpus.

This means the incoming skb had to be allocated on a cpu,
but freed on another.

This incurs high spinlock contention in the slab layer for small RPCs,
but also a high number of cache line ping pongs for larger packets.

A full size GRO packet might use 45 page fragments, meaning
that up to 45 put_page() can be involved.

Moreover, performing the __kfree_skb() in the recvmsg() context
adds latency for user applications, and increases the probability
of trapping them in backlog processing, since the BH handler
might find the socket owned by the user.

This patch, combined with the prior one increases the rpc
performance by about 10 % on servers with large number of cores.

(tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps
 instead of 8 Mpps)

This also increases single bulk flow performance on 40Gbit+ links,
since in this case there are often two cpus working in tandem :

 - CPU handling the NIC rx interrupts, feeding the receive queue,
  and (after this patch) freeing the skbs that were consumed.

 - CPU in recvmsg() system call, essentially 100 % busy copying out
  data to user space.

Having at most one skb in a per-socket cache has very little risk
of memory exhaustion, and since it is protected by socket lock,
its management is essentially free.

Note that if rps/rfs is used, we do not enable this feature, because
there is high chance that the same cpu is handling both the recvmsg()
system call and the TCP rx path, but that another cpu did the skb
allocations in the device driver right before the RPS/RFS logic.

To properly handle this case, it seems we would need to record
on which cpu skb was allocated, and use a different channel
to give skbs back to this cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 314c47a8f5d1..577d91fb5626 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -368,6 +368,7 @@ struct sock {
 	atomic_t		sk_drops;
 	int			sk_rcvlowat;
 	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
 static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
 	__kfree_skb(skb);
 }
 
-- 
cgit v1.2.3


From b8f975545cdbcc316cf20e827e7966d4410b5c5a Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 24 Mar 2019 11:14:37 +0100
Subject: net: devlink: add port type spinlock

Add spinlock to protect port type and type_dev pointer consistency.
Without that, userspace may see inconsistent type and type_dev
combinations.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
v1->v2:
- rebased
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 63de99e09f04..cb9b060033e1 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -16,6 +16,7 @@
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/netdevice.h>
+#include <linux/spinlock.h>
 #include <net/net_namespace.h>
 #include <uapi/linux/devlink.h>
 
@@ -53,6 +54,9 @@ struct devlink_port {
 	struct devlink *devlink;
 	unsigned index;
 	bool registered;
+	spinlock_t type_lock; /* Protects type and type_dev
+			       * pointer consistency.
+			       */
 	enum devlink_port_type type;
 	enum devlink_port_type desired_type;
 	void *type_dev;
-- 
cgit v1.2.3


From f6b19b354d50c5ae46ad66b5273f92e563fbc847 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 24 Mar 2019 11:14:38 +0100
Subject: net: devlink: select NET_DEVLINK from drivers

Some drivers are becoming more dependent on NET_DEVLINK being selected
in configuration. With upcoming compat functions, the behavior would be
wrong in case devlink was not compiled in. So make the drivers select
NET_DEVLINK and rely on the functions being there, not just stubs.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 495 +-------------------------------------------------
 1 file changed, 3 insertions(+), 492 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index cb9b060033e1..03fb16f4fb6c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -549,17 +549,13 @@ static inline struct devlink *priv_to_devlink(void *priv)
 
 static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 {
-#if IS_ENABLED(CONFIG_NET_DEVLINK)
 	if (dev->netdev_ops->ndo_get_devlink)
 		return dev->netdev_ops->ndo_get_devlink(dev);
-#endif
 	return NULL;
 }
 
 struct ib_device;
 
-#if IS_ENABLED(CONFIG_NET_DEVLINK)
-
 struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
 int devlink_register(struct devlink *devlink, struct device *dev);
 void devlink_unregister(struct devlink *devlink);
@@ -728,500 +724,14 @@ void
 devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 				     enum devlink_health_reporter_state state);
 
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
 void devlink_compat_running_version(struct net_device *dev,
 				    char *buf, size_t len);
 int devlink_compat_flash_update(struct net_device *dev, const char *file_name);
 
 #else
 
-static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
-					    size_t priv_size)
-{
-	return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL);
-}
-
-static inline int devlink_register(struct devlink *devlink, struct device *dev)
-{
-	return 0;
-}
-
-static inline void devlink_unregister(struct devlink *devlink)
-{
-}
-
-static inline void devlink_params_publish(struct devlink *devlink)
-{
-}
-
-static inline void devlink_params_unpublish(struct devlink *devlink)
-{
-}
-
-static inline void devlink_free(struct devlink *devlink)
-{
-	kfree(devlink);
-}
-
-static inline int devlink_port_register(struct devlink *devlink,
-					struct devlink_port *devlink_port,
-					unsigned int port_index)
-{
-	return 0;
-}
-
-static inline void devlink_port_unregister(struct devlink_port *devlink_port)
-{
-}
-
-static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port,
-					     struct net_device *netdev)
-{
-}
-
-static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port,
-					    struct ib_device *ibdev)
-{
-}
-
-static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
-{
-}
-
-static inline void devlink_port_attrs_set(struct devlink_port *devlink_port,
-					  enum devlink_port_flavour flavour,
-					  u32 port_number, bool split,
-					  u32 split_subport_number)
-{
-}
-
-static inline int
-devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
-				char *name, size_t len)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int devlink_sb_register(struct devlink *devlink,
-				      unsigned int sb_index, u32 size,
-				      u16 ingress_pools_count,
-				      u16 egress_pools_count,
-				      u16 ingress_tc_count,
-				      u16 egress_tc_count)
-{
-	return 0;
-}
-
-static inline void devlink_sb_unregister(struct devlink *devlink,
-					 unsigned int sb_index)
-{
-}
-
-static inline int
-devlink_dpipe_table_register(struct devlink *devlink,
-			     const char *table_name,
-			     struct devlink_dpipe_table_ops *table_ops,
-			     void *priv, bool counter_control_extern)
-{
-	return 0;
-}
-
-static inline void devlink_dpipe_table_unregister(struct devlink *devlink,
-						  const char *table_name)
-{
-}
-
-static inline int devlink_dpipe_headers_register(struct devlink *devlink,
-						 struct devlink_dpipe_headers *
-						 dpipe_headers)
-{
-	return 0;
-}
-
-static inline void devlink_dpipe_headers_unregister(struct devlink *devlink)
-{
-}
-
-static inline bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
-						       const char *table_name)
-{
-	return false;
-}
-
-static inline int
-devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
-{
-	return 0;
-}
-
-static inline int
-devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
-			       struct devlink_dpipe_entry *entry)
-{
-	return 0;
-}
-
-static inline int
-devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
-{
-	return 0;
-}
-
-static inline void
-devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
-{
-}
-
-static inline int
-devlink_dpipe_action_put(struct sk_buff *skb,
-			 struct devlink_dpipe_action *action)
-{
-	return 0;
-}
-
-static inline int
-devlink_dpipe_match_put(struct sk_buff *skb,
-			struct devlink_dpipe_match *match)
-{
-	return 0;
-}
-
-static inline int
-devlink_resource_register(struct devlink *devlink,
-			  const char *resource_name,
-			  u64 resource_size,
-			  u64 resource_id,
-			  u64 parent_resource_id,
-			  const struct devlink_resource_size_params *size_params)
-{
-	return 0;
-}
-
-static inline void
-devlink_resources_unregister(struct devlink *devlink,
-			     struct devlink_resource *resource)
-{
-}
-
-static inline int
-devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
-			  u64 *p_resource_size)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_dpipe_table_resource_set(struct devlink *devlink,
-				 const char *table_name, u64 resource_id,
-				 u64 resource_units)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void
-devlink_resource_occ_get_register(struct devlink *devlink,
-				  u64 resource_id,
-				  devlink_resource_occ_get_t *occ_get,
-				  void *occ_get_priv)
-{
-}
-
-static inline void
-devlink_resource_occ_get_unregister(struct devlink *devlink,
-				    u64 resource_id)
-{
-}
-
-static inline int
-devlink_params_register(struct devlink *devlink,
-			const struct devlink_param *params,
-			size_t params_count)
-{
-	return 0;
-}
-
-static inline void
-devlink_params_unregister(struct devlink *devlink,
-			  const struct devlink_param *params,
-			  size_t params_count)
-{
-
-}
-
-static inline int
-devlink_port_params_register(struct devlink_port *devlink_port,
-			     const struct devlink_param *params,
-			     size_t params_count)
-{
-	return 0;
-}
-
-static inline void
-devlink_port_params_unregister(struct devlink_port *devlink_port,
-			       const struct devlink_param *params,
-			       size_t params_count)
-{
-}
-
-static inline int
-devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
-				   union devlink_param_value *init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
-				   union devlink_param_value init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port,
-					u32 param_id,
-					union devlink_param_value *init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline int
-devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port,
-					u32 param_id,
-					union devlink_param_value init_val)
-{
-	return -EOPNOTSUPP;
-}
-
-static inline void
-devlink_param_value_changed(struct devlink *devlink, u32 param_id)
-{
-}
-
-static inline void
-devlink_port_param_value_changed(struct devlink_port *devlink_port,
-				 u32 param_id)
-{
-}
-
-static inline void
-devlink_param_value_str_fill(union devlink_param_value *dst_val,
-			     const char *src)
-{
-}
-
-static inline struct devlink_region *
-devlink_region_create(struct devlink *devlink,
-		      const char *region_name,
-		      u32 region_max_snapshots,
-		      u64 region_size)
-{
-	return NULL;
-}
-
-static inline void
-devlink_region_destroy(struct devlink_region *region)
-{
-}
-
-static inline u32
-devlink_region_shapshot_id_get(struct devlink *devlink)
-{
-	return 0;
-}
-
-static inline int
-devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
-			       u8 *data, u32 snapshot_id,
-			       devlink_snapshot_data_dest_t *data_destructor)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_driver_name_put(struct devlink_info_req *req, const char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_version_fixed_put(struct devlink_info_req *req,
-			       const char *version_name,
-			       const char *version_value)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_version_stored_put(struct devlink_info_req *req,
-				const char *version_name,
-				const char *version_value)
-{
-	return 0;
-}
-
-static inline int
-devlink_info_version_running_put(struct devlink_info_req *req,
-				 const char *version_name,
-				 const char *version_value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
-				 const char *name)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
-			u16 value_len)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			   bool value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			 u8 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			  u32 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			  u64 value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			     const char *value)
-{
-	return 0;
-}
-
-static inline int
-devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
-			     const void *value, u16 value_len)
-{
-	return 0;
-}
-
-static inline struct devlink_health_reporter *
-devlink_health_reporter_create(struct devlink *devlink,
-			       const struct devlink_health_reporter_ops *ops,
-			       u64 graceful_period, bool auto_recover,
-			       void *priv)
-{
-	return NULL;
-}
-
-static inline void
-devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
-{
-}
-
-static inline void *
-devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
-{
-	return NULL;
-}
-
-static inline int
-devlink_health_report(struct devlink_health_reporter *reporter,
-		      const char *msg, void *priv_ctx)
-{
-	return 0;
-}
-
-static inline void
-devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
-				     enum devlink_health_reporter_state state)
-{
-}
-
 static inline void
 devlink_compat_running_version(struct net_device *dev, char *buf, size_t len)
 {
@@ -1232,6 +742,7 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name)
 {
 	return -EOPNOTSUPP;
 }
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
-- 
cgit v1.2.3


From 0d5f20c42b24adffa1505ec3d4930d11dfaea82f Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Sun, 3 Mar 2019 15:52:07 +0100
Subject: batman-adv: Drop license boilerplate

All files got a SPDX-License-Identifier with commit 7db7d9f369a4
("batman-adv: Add SPDX license identifier above copyright header"). All the
required information about the license conditions can be found in
LICENSES/.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batadv_packet.h | 12 ------------
 include/uapi/linux/batman_adv.h    | 18 ------------------
 2 files changed, 30 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h
index c99336f4eefe..4ebc2135e950 100644
--- a/include/uapi/linux/batadv_packet.h
+++ b/include/uapi/linux/batadv_packet.h
@@ -2,18 +2,6 @@
 /* Copyright (C) 2007-2019  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _UAPI_LINUX_BATADV_PACKET_H_
diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index 305bf316dd03..e53f2b5e7ee7 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -2,24 +2,6 @@
 /* Copyright (C) 2016-2019  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
  */
 
 #ifndef _UAPI_LINUX_BATMAN_ADV_H_
-- 
cgit v1.2.3


From 32e727449c792b689c2a06a8b4cc9fef6270c5a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@c0d3.blue>
Date: Sat, 23 Mar 2019 05:47:41 +0100
Subject: batman-adv: Add multicast-to-unicast support for multiple targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this patch multicast packets with a limited number of destinations
(current default: 16) will be split and transmitted by the originator as
individual unicast transmissions.

Wifi broadcasts with their low bitrate are still a costly undertaking.
In a mesh network this cost multiplies with the overall size of the mesh
network. Therefore using multiple unicast transmissions instead of
broadcast flooding is almost always less burdensome for the mesh
network.

The maximum amount of unicast packets can be configured via the newly
introduced multicast_fanout parameter. If this limit is exceeded
distribution will fall back to classic broadcast flooding.

The multicast-to-unicast conversion is performed on the initial
multicast sender node and counts on a final destination node, mesh-wide
basis (and not next hop, neighbor node basis).

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 include/uapi/linux/batman_adv.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h
index e53f2b5e7ee7..67f4636758af 100644
--- a/include/uapi/linux/batman_adv.h
+++ b/include/uapi/linux/batman_adv.h
@@ -473,6 +473,13 @@ enum batadv_nl_attrs {
 	 */
 	BATADV_ATTR_THROUGHPUT_OVERRIDE,
 
+	/**
+	 * @BATADV_ATTR_MULTICAST_FANOUT: defines the maximum number of packet
+	 * copies that may be generated for a multicast-to-unicast conversion.
+	 * Once this limit is exceeded distribution will fall back to broadcast.
+	 */
+	BATADV_ATTR_MULTICAST_FANOUT,
+
 	/* add attributes above here, update the policy in netlink.c */
 
 	/**
-- 
cgit v1.2.3


From 1713cb37bf671e5d98919536941a8b56337874fd Mon Sep 17 00:00:00 2001
From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Wed, 27 Mar 2019 11:16:03 +0100
Subject: fou: Support binding FoU socket
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An FoU socket is currently bound to the wildcard-address. While this
works fine, there are several use-cases where the use of the
wildcard-address is not desirable. For example, I use FoU on some
multi-homed servers and would like to use FoU on only one of the
interfaces.

This commit adds support for binding FoU sockets to a given source
address/interface, as well as connecting the socket to a given
destination address/port. udp_tunnel already provides the required
infrastructure, so most of the code added is for exposing and setting
the different attributes (local address, peer address, etc.).

The lookups performed when we add, delete or get an FoU-socket have also
been updated to compare all the attributes a user can set. Since the
comparison now involves several elements, I have added a separate
comparison-function instead of open-coding.

In order to test the code and ensure that the new comparison code works
correctly, I started by creating a wildcard socket bound to port 1234 on
my machine. I then tried to create a non-wildcarded socket bound to the
same port, as well as fetching and deleting the socket (including source
address, peer address or interface index in the netlink request).  The
create, fetch and delete requests all failed. Deleting/fetching the
socket was only successful when my netlink request attributes matched
those used to create the socket.

I then repeated the tests, but with a socket bound to a local ip
address, a socket bound to a local address + interface, and a bound
socket that was also «connected» to a peer. Add only worked when no
socket with the matching source address/interface (or wildcard) existed,
while fetch/delete was only successful when all attributes matched.

In addition to testing that the new code works, I also checked that the
current behavior is kept. If none of the new attributes are provided,
then an FoU-socket is configured as before (i.e., wildcarded).  If any
of the new attributes are provided, the FoU-socket is configured as
expected.

v1->v2:
* Fixed building with IPv6 disabled (kbuild).
* Fixed a return type warning and make the ugly comparison function more
readable (kbuild).
* Describe more in detail what has been tested (thanks David Miller).
* Make peer port required if peer address is specified.

Signed-off-by: Kristian Evensen <kristian.evensen@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/fou.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h
index f2ea833a2812..87c2c9f08803 100644
--- a/include/uapi/linux/fou.h
+++ b/include/uapi/linux/fou.h
@@ -16,6 +16,12 @@ enum {
 	FOU_ATTR_IPPROTO,			/* u8 */
 	FOU_ATTR_TYPE,				/* u8 */
 	FOU_ATTR_REMCSUM_NOPARTIAL,		/* flag */
+	FOU_ATTR_LOCAL_V4,			/* u32 */
+	FOU_ATTR_LOCAL_V6,			/* in6_addr */
+	FOU_ATTR_PEER_V4,			/* u32 */
+	FOU_ATTR_PEER_V6,			/* in6_addr */
+	FOU_ATTR_PEER_PORT,			/* u16 */
+	FOU_ATTR_IFINDEX,			/* s32 */
 
 	__FOU_ATTR_MAX,
 };
-- 
cgit v1.2.3


From 3aeb0803f7ea11ff2fc478f7d58f2b8e713af380 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 25 Mar 2019 19:34:58 +0100
Subject: ethtool: add PHY Fast Link Down support

This adds support for Fast Link Down as new PHY tunable.
Fast Link Down reduces the time until a link down event is reported
for 1000BaseT. According to the standard it's 750ms, which is too long
for several use cases.

v2:
- add comment describing the constants

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 3652b239dad1..50c76f4fa402 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -252,9 +252,17 @@ struct ethtool_tunable {
 #define DOWNSHIFT_DEV_DEFAULT_COUNT	0xff
 #define DOWNSHIFT_DEV_DISABLE		0
 
+/* Time in msecs after which link is reported as down
+ * 0 = lowest time supported by the PHY
+ * 0xff = off, link down detection according to standard
+ */
+#define ETHTOOL_PHY_FAST_LINK_DOWN_ON	0
+#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF	0xff
+
 enum phy_tunable_id {
 	ETHTOOL_PHY_ID_UNSPEC,
 	ETHTOOL_PHY_DOWNSHIFT,
+	ETHTOOL_PHY_FAST_LINK_DOWN,
 	/*
 	 * Add your fresh new phy tunable attribute above and remember to update
 	 * phy_tunable_strings[] in net/core/ethtool.c
-- 
cgit v1.2.3


From 4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 Mon Sep 17 00:00:00 2001
From: Numan Siddique <nusiddiq@redhat.com>
Date: Tue, 26 Mar 2019 06:13:46 +0530
Subject: net: openvswitch: Add a new action check_pkt_len

This patch adds a new action - 'check_pkt_len' which checks the
packet length and executes a set of actions if the packet
length is greater than the specified length or executes
another set of actions if the packet length is less than or equal to it.

This action takes the following nlattrs:
  * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for

  * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER - Nested actions
    to apply if the packet length is greater than the specified 'pkt_len'

  * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested
    actions to apply if the packet length is less than or equal to the
    specified 'pkt_len'.

The main use case for adding this action is to solve the packet
drops because of MTU mismatch in OVN virtual networking solution.
When a VM (which belongs to a logical switch of OVN) sends a packet
destined to go via the gateway router and the NIC which provides
external connectivity has a smaller MTU, OVS drops the packet
if the packet length is greater than this MTU.

With the help of this action, OVN will check the packet length
and if it is greater than the MTU size, it will generate an
ICMP packet (type 3, code 4) and include the next hop mtu in it
so that the sender can fragment the packets.

Reported-at:
https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html
Suggested-by: Ben Pfaff <blp@ovn.org>
Signed-off-by: Numan Siddique <nusiddiq@redhat.com>
CC: Gregory Rose <gvrose8192@gmail.com>
CC: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Tested-by: Greg Rose <gvrose8192@gmail.com>
Reviewed-by: Greg Rose <gvrose8192@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 42 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index dbe0cbe4f1b7..dfabacee6903 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -798,6 +798,44 @@ struct ovs_action_push_eth {
 	struct ovs_key_ethernet addresses;
 };
 
+/*
+ * enum ovs_check_pkt_len_attr - Attributes for %OVS_ACTION_ATTR_CHECK_PKT_LEN.
+ *
+ * @OVS_CHECK_PKT_LEN_ATTR_PKT_LEN: u16 Packet length to check for.
+ * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: Nested OVS_ACTION_ATTR_*
+ * actions to apply if the packet length is greater than the specified
+ * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN.
+ * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested OVS_ACTION_ATTR_*
+ * actions to apply if the packet length is lesser or equal to the specified
+ * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN.
+ */
+enum ovs_check_pkt_len_attr {
+	OVS_CHECK_PKT_LEN_ATTR_UNSPEC,
+	OVS_CHECK_PKT_LEN_ATTR_PKT_LEN,
+	OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER,
+	OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL,
+	__OVS_CHECK_PKT_LEN_ATTR_MAX,
+
+#ifdef __KERNEL__
+	OVS_CHECK_PKT_LEN_ATTR_ARG          /* struct check_pkt_len_arg  */
+#endif
+};
+
+#define OVS_CHECK_PKT_LEN_ATTR_MAX (__OVS_CHECK_PKT_LEN_ATTR_MAX - 1)
+
+#ifdef __KERNEL__
+struct check_pkt_len_arg {
+	u16 pkt_len;	/* Same value as OVS_CHECK_PKT_LEN_ATTR_PKT_LEN'. */
+	bool exec_for_greater;	/* When true, actions in IF_GREATER will
+				 * not change flow keys. False otherwise.
+				 */
+	bool exec_for_lesser_equal; /* When true, actions in IF_LESS_EQUAL
+				     * will not change flow keys. False
+				     * otherwise.
+				     */
+};
+#endif
+
 /**
  * enum ovs_action_attr - Action types.
  *
@@ -842,6 +880,9 @@ struct ovs_action_push_eth {
  * packet, or modify the packet (e.g., change the DSCP field).
  * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of
  * actions without affecting the original packet and key.
+ * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set
+ * of actions if greater than the specified packet length, else execute
+ * another set of actions.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -876,6 +917,7 @@ enum ovs_action_attr {
 	OVS_ACTION_ATTR_POP_NSH,      /* No argument. */
 	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
 	OVS_ACTION_ATTR_CLONE,        /* Nested OVS_CLONE_ATTR_*.  */
+	OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
-- 
cgit v1.2.3


From 4f661542a40217713f2cee0bb6678fbb30d9d367 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 26 Mar 2019 08:34:55 -0700
Subject: tcp: fix zerocopy and notsent_lowat issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

My recent patch had at least three problems :

1) TX zerocopy wants notification when skb is acknowledged,
   thus we need to call skb_zcopy_clear() if the skb is
   cached into sk->sk_tx_skb_cache

2) Some applications might expect precise EPOLLOUT
   notifications, so we need to update sk->sk_wmem_queued
   and call sk_mem_uncharge() from sk_wmem_free_skb()
   in all cases. The SOCK_QUEUE_SHRUNK flag must also be set.

3) Reuse of saved skb should have used skb_cloned() instead
  of simply checking if the fast clone has been freed.

Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 577d91fb5626..7fa223278522 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1465,13 +1465,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
 
 static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+	sk->sk_wmem_queued -= skb->truesize;
+	sk_mem_uncharge(sk, skb->truesize);
 	if (!sk->sk_tx_skb_cache) {
+		skb_zcopy_clear(skb, true);
 		sk->sk_tx_skb_cache = skb;
 		return;
 	}
-	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
-	sk->sk_wmem_queued -= skb->truesize;
-	sk_mem_uncharge(sk, skb->truesize);
 	__kfree_skb(skb);
 }
 
-- 
cgit v1.2.3


From df453700e8d81b1bdafdf684365ee2b9431fb702 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 27 Mar 2019 12:40:33 -0700
Subject: inet: switch IP ID generator to siphash

According to Amit Klein and Benny Pinkas, IP ID generation is too weak
and might be used by attackers.

Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix())
having 64bit key and Jenkins hash is risky.

It is time to switch to siphash and its 128bit keys.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Amit Klein <aksecurity@gmail.com>
Reported-by: Benny Pinkas <benny@pinkas.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/siphash.h  | 5 +++++
 include/net/netns/ipv4.h | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index fa7a6b9cedbf..bf21591a9e5e 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -21,6 +21,11 @@ typedef struct {
 	u64 key[2];
 } siphash_key_t;
 
+static inline bool siphash_key_is_zero(const siphash_key_t *key)
+{
+	return !(key->key[0] | key->key[1]);
+}
+
 u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
 #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 104a6669e344..7698460a3dd1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -9,6 +9,7 @@
 #include <linux/uidgid.h>
 #include <net/inet_frag.h>
 #include <linux/rcupdate.h>
+#include <linux/siphash.h>
 
 struct tcpm_hash_bucket;
 struct ctl_table_header;
@@ -217,5 +218,6 @@ struct netns_ipv4 {
 	unsigned int	ipmr_seq;	/* protected by rtnl_mutex */
 
 	atomic_t	rt_genid;
+	siphash_key_t	ip_id_key;
 };
 #endif
-- 
cgit v1.2.3


From 5dc37bb9b03586e8fdeb47d25e8d2a0399984936 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Mar 2019 13:56:36 +0100
Subject: net: replace ndo_get_devlink with ndo_get_devlink_port

Follow-up patch is going to need a devlink port instance according to
a netdev. Devlink port instance should be always available when devlink
is used. So change the recently introduced ndo_get_devlink to
ndo_get_devlink_port. With that, adjust the wrapper for the only
user to get devlink pointer.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  6 +++---
 include/net/devlink.h     | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 166fdc0a78b4..78f5ec4ebf64 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1250,8 +1250,8 @@ struct devlink;
  *	that got dropped are freed/returned via xdp_return_frame().
  *	Returns negative number, means general error invoking ndo, meaning
  *	no frames were xmit'ed and core-caller will free all frames.
- * struct devlink *(*ndo_get_devlink)(struct net_device *dev);
- *	Get devlink instance associated with a given netdev.
+ * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev);
+ *	Get devlink port instance associated with a given netdev.
  *	Called with a reference on the netdevice and devlink locks only,
  *	rtnl_lock is not held.
  */
@@ -1451,7 +1451,7 @@ struct net_device_ops {
 						u32 flags);
 	int			(*ndo_xsk_async_xmit)(struct net_device *dev,
 						      u32 queue_id);
-	struct devlink *	(*ndo_get_devlink)(struct net_device *dev);
+	struct devlink_port *	(*ndo_get_devlink_port)(struct net_device *dev);
 };
 
 /**
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 03fb16f4fb6c..81b5ed04a341 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -547,10 +547,20 @@ static inline struct devlink *priv_to_devlink(void *priv)
 	return container_of(priv, struct devlink, priv);
 }
 
+static inline struct devlink_port *
+netdev_to_devlink_port(struct net_device *dev)
+{
+	if (dev->netdev_ops->ndo_get_devlink_port)
+		return dev->netdev_ops->ndo_get_devlink_port(dev);
+	return NULL;
+}
+
 static inline struct devlink *netdev_to_devlink(struct net_device *dev)
 {
-	if (dev->netdev_ops->ndo_get_devlink)
-		return dev->netdev_ops->ndo_get_devlink(dev);
+	struct devlink_port *devlink_port = netdev_to_devlink_port(dev);
+
+	if (devlink_port)
+		return devlink_port->devlink;
 	return NULL;
 }
 
-- 
cgit v1.2.3


From af3836df9a59e7339d60c9c46729a7d9094d0582 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Mar 2019 13:56:37 +0100
Subject: net: devlink: introduce devlink_compat_phys_port_name_get()

Introduce devlink_compat_phys_port_name_get() helper that
gets the physical port name for specified netdevice
according to devlink port attributes.
Call this helper from dev_get_phys_port_name()
in case ndo_get_phys_port_name is not defined.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 81b5ed04a341..85e577d6ec3b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -739,6 +739,8 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
 void devlink_compat_running_version(struct net_device *dev,
 				    char *buf, size_t len);
 int devlink_compat_flash_update(struct net_device *dev, const char *file_name);
+int devlink_compat_phys_port_name_get(struct net_device *dev,
+				      char *name, size_t len);
 
 #else
 
@@ -753,6 +755,13 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name)
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_compat_phys_port_name_get(struct net_device *dev,
+				  char *name, size_t len)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
-- 
cgit v1.2.3


From 14c03ac4c100e4b81ec4747f5ec861701ff52de2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Thu, 28 Mar 2019 13:56:40 +0100
Subject: net: devlink: remove unused devlink_port_get_phys_port_name()
 function

Now it is unused, remove it.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 85e577d6ec3b..31d5cec4d06b 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -583,8 +583,6 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
 			    u32 split_subport_number);
-int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
-				    char *name, size_t len);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From 717700d183d65bd2e6511566aa6d32395419d158 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Tue, 26 Mar 2019 11:31:13 -0700
Subject: netfilter: Export nf_ct_{set,destroy}_timeout()

This patch exports nf_ct_set_timeout() and nf_ct_destroy_timeout().
The two functions are derived from xt_ct_destroy_timeout() and
xt_ct_set_timeout() in xt_CT.c, and moved to nf_conntrack_timeout.c
without any functional change.
It would be useful for other users (e.g. OVS) that utilize the
finer-grain conntrack timeout feature.

CC: Pablo Neira Ayuso <pablo@netfilter.org>
CC: Pravin Shelar <pshelar@ovn.org>
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netfilter/nf_conntrack_timeout.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 3394d75e1c80..00a8fbb2d735 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -88,6 +88,9 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct)
 int nf_conntrack_timeout_init(void);
 void nf_conntrack_timeout_fini(void);
 void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout);
+int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num,
+		      const char *timeout_name);
+void nf_ct_destroy_timeout(struct nf_conn *ct);
 #else
 static inline int nf_conntrack_timeout_init(void)
 {
@@ -98,6 +101,18 @@ static inline void nf_conntrack_timeout_fini(void)
 {
         return;
 }
+
+static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct,
+				    u8 l3num, u8 l4num,
+				    const char *timeout_name)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void nf_ct_destroy_timeout(struct nf_conn *ct)
+{
+	return;
+}
 #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-- 
cgit v1.2.3


From 06bd2bdf19d2f3d22731625e1a47fa1dff5ac407 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Tue, 26 Mar 2019 11:31:14 -0700
Subject: openvswitch: Add timeout support to ct action

Add fine-grain timeout support to the conntrack action.
The new OVS_CT_ATTR_TIMEOUT attribute of the conntrack action
specifies a timeout to be associated with this connection.
If no timeout is specified, it acts as is, that is the default
timeout for the connection will be automatically applied.

Example usage:
$ nfct timeout add timeout_1 inet tcp syn_sent 100 established 200
$ ovs-ofctl add-flow br0 in_port=1,ip,tcp,action=ct(commit,timeout=timeout_1)

CC: Pravin Shelar <pshelar@ovn.org>
CC: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index dfabacee6903..0cac5d802c6a 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -734,6 +734,7 @@ struct ovs_action_hash {
  * be received on NFNLGRP_CONNTRACK_NEW and NFNLGRP_CONNTRACK_DESTROY groups,
  * respectively.  Remaining bits control the changes for which an event is
  * delivered on the NFNLGRP_CONNTRACK_UPDATE group.
+ * @OVS_CT_ATTR_TIMEOUT: Variable length string defining conntrack timeout.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -746,6 +747,8 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	OVS_CT_ATTR_FORCE_COMMIT,  /* No argument */
 	OVS_CT_ATTR_EVENTMASK,  /* u32 mask of IPCT_* events. */
+	OVS_CT_ATTR_TIMEOUT,	/* Associate timeout with this connection for
+				 * fine-grain timeout tuning. */
 	__OVS_CT_ATTR_MAX
 };
 
-- 
cgit v1.2.3


From 331c7a402358de6206232f6aab7aa48ec6c1088a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:47 -0700
Subject: ipv4: Move IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN to helper

in_dev lookup followed by IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN check
is called in several places, some with the rcu lock and others with the
rtnl held.

Move the check to a helper similar to what IPv6 has. Since the helper
can be invoked from either context use rcu_dereference_rtnl to
dereference ip_ptr.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index a64f21a97369..367dc2a0f84a 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -237,6 +237,20 @@ static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
 	return rtnl_dereference(dev->ip_ptr);
 }
 
+/* called with rcu_read_lock or rtnl held */
+static inline bool ip_ignore_linkdown(const struct net_device *dev)
+{
+	struct in_device *in_dev;
+	bool rc = false;
+
+	in_dev = rcu_dereference_rtnl(dev->ip_ptr);
+	if (in_dev &&
+	    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
+		rc = true;
+
+	return rc;
+}
+
 static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev)
 {
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
-- 
cgit v1.2.3


From e4516ef65490ef29d48a98ad4d7c90dccf39068f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:48 -0700
Subject: ipv4: Create init helper for fib_nh

Consolidate the fib_nh initialization which is duplicated between
fib_create_info for single path and fib_get_nhs for multipath.
Export the helper to allow for use with nexthop objects in the
future.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 9c8214d2116d..1af1f552644a 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -416,6 +416,10 @@ void fib_select_multipath(struct fib_result *res, int hash);
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb);
 
+int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
+		struct fib_config *cfg, int nh_weight,
+		struct netlink_ext_ack *extack);
+
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
 struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
-- 
cgit v1.2.3


From faa041a40b9fa64913789fcc0161c7c73161f57e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:49 -0700
Subject: ipv4: Create cleanup helper for fib_nh

Move the fib_nh cleanup code from free_fib_info_rcu into a new helper,
fib_nh_release. Move classid accounting into fib_nh_release which is
called per fib_nh to make accounting symmetrical with fib_nh_init.
Export the helper to allow for use with nexthop objects in the
future.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1af1f552644a..5a4df0ba175e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -419,6 +419,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
 int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
 		struct fib_config *cfg, int nh_weight,
 		struct netlink_ext_ack *extack);
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
-- 
cgit v1.2.3


From 83c442515917812d4ff643e90cd456c630b7e762 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:50 -0700
Subject: ipv6: Create init helper for fib6_nh

Similar to IPv4, consolidate the fib6_nh initialization into a helper.
As a new standalone function, add a cleanup path to put lwtstate on
error.

To avoid modifying fib6_config flags, move the reject check to a helper
that is invoked once by fib6_nh_init to reset the device and then
again in ip6_route_info_create to set the fib6_flags.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2acb78a762ee..ce1f81345c8e 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -444,6 +444,10 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 	return f6i->fib6_nh.nh_dev;
 }
 
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+		 struct fib6_config *cfg, gfp_t gfp_flags,
+		 struct netlink_ext_ack *extack);
+
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
 {
-- 
cgit v1.2.3


From dac7d0f27075ce54017a7efdd6ae0a55352a0f80 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:51 -0700
Subject: ipv6: Create cleanup helper for fib6_nh

Move the fib6_nh cleanup code to a new helper, fib6_nh_release.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index ce1f81345c8e..2d2a468b3d6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -447,6 +447,7 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct fib6_config *cfg, gfp_t gfp_flags,
 		 struct netlink_ext_ack *extack);
+void fib6_nh_release(struct fib6_nh *fib6_nh);
 
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
-- 
cgit v1.2.3


From 2b2450ca4a2d9d772dc45e1220c04cb3ba761843 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:52 -0700
Subject: ipv6: Move gateway checks to a fib6_nh setting

The gateway setting is not per fib6_info entry but per-fib6_nh. Add a new
fib_nh_has_gw flag to fib6_nh and convert references to RTF_GATEWAY to
the new flag. For IPv6 address the flag is cheaper than checking that
nh_gw is non-0 like IPv4 does.

While this increases fib6_nh by 8-bytes, the effective allocation size of
a fib6_info is unchanged. The 8 bytes is recovered later with a
fib_nh_common change.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h   | 1 +
 include/net/ip6_route.h | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2d2a468b3d6d..3b04b318cf13 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -126,6 +126,7 @@ struct rt6_exception {
 
 struct fib6_nh {
 	struct in6_addr		nh_gw;
+	bool			fib_nh_has_gw;
 	struct net_device	*nh_dev;
 	struct lwtunnel_state	*nh_lwtstate;
 
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 7ab119936e69..95cd8a2f6284 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,8 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
-	       RTF_GATEWAY;
+	return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) &&
+		f6i->fib6_nh.fib_nh_has_gw;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From 6d3d07b45c86f984424ccbad110ca500397fd18c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:53 -0700
Subject: ipv6: Refactor fib6_ignore_linkdown

fib6_ignore_linkdown takes a fib6_info but only looks at the net_device
and its IPv6 config. Change it to take a net_device over a fib6_info as
its input argument.

In addition, move it to a header file to make the check inline and usable
later with IPv4 code without going through the ipv6 stub, and rename to
ip6_ignore_linkdown since it is only checking the setting based on the
ipv6 struct on a device.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 269ec27385e9..ec8e6784a6f7 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -425,6 +425,14 @@ static inline void in6_dev_hold(struct inet6_dev *idev)
 	refcount_inc(&idev->refcnt);
 }
 
+/* called with rcu_read_lock held */
+static inline bool ip6_ignore_linkdown(const struct net_device *dev)
+{
+	const struct inet6_dev *idev = __in6_dev_get(dev);
+
+	return !!idev->cnf.ignore_routes_with_linkdown;
+}
+
 void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);
 
 static inline void in6_ifa_put(struct inet6_ifaddr *ifp)
-- 
cgit v1.2.3


From b75ed8b1aa9c3a99702159c3be8b0c1d54972ae5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:55 -0700
Subject: ipv4: Rename fib_nh entries

Rename fib_nh entries that will be moved to a fib_nh_common struct.
Specifically, the device, oif, gateway, flags, scope, lwtstate,
nh_weight and nh_upper_bound are common with all nexthop definitions.
In the process shorten fib_nh_lwtstate to fib_nh_lws to avoid really
long lines.

Rename only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h       | 24 ++++++++++++------------
 include/trace/events/fib.h |  7 +++++--
 2 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 5a4df0ba175e..029acd333d29 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -77,26 +77,26 @@ struct fnhe_hash_bucket {
 #define FNHE_RECLAIM_DEPTH	5
 
 struct fib_nh {
-	struct net_device	*nh_dev;
+	struct net_device	*fib_nh_dev;
 	struct hlist_node	nh_hash;
 	struct fib_info		*nh_parent;
-	unsigned int		nh_flags;
-	unsigned char		nh_scope;
+	unsigned int		fib_nh_flags;
+	unsigned char		fib_nh_scope;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int			nh_weight;
-	atomic_t		nh_upper_bound;
+	int			fib_nh_weight;
+	atomic_t		fib_nh_upper_bound;
 #endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	__u32			nh_tclassid;
 #endif
-	int			nh_oif;
-	__be32			nh_gw;
+	int			fib_nh_oif;
+	__be32			fib_nh_gw4;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
-	struct lwtunnel_state	*nh_lwtstate;
+	struct lwtunnel_state	*fib_nh_lws;
 };
 
 /*
@@ -125,7 +125,7 @@ struct fib_info {
 	int			fib_nhs;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
-#define fib_dev		fib_nh[0].nh_dev
+#define fib_dev		fib_nh[0].fib_nh_dev
 };
 
 
@@ -180,9 +180,9 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
 	  atomic_read(&(net)->ipv4.dev_addr_genid)) ?	\
 	 FIB_RES_NH(res).nh_saddr :			\
 	 fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
-#define FIB_RES_GW(res)			(FIB_RES_NH(res).nh_gw)
-#define FIB_RES_DEV(res)		(FIB_RES_NH(res).nh_dev)
-#define FIB_RES_OIF(res)		(FIB_RES_NH(res).nh_oif)
+#define FIB_RES_GW(res)			(FIB_RES_NH(res).fib_nh_gw4)
+#define FIB_RES_DEV(res)		(FIB_RES_NH(res).fib_nh_dev)
+#define FIB_RES_OIF(res)		(FIB_RES_NH(res).fib_nh_oif)
 
 #define FIB_RES_PREFSRC(net, res)	((res).fi->fib_prefsrc ? : \
 					 FIB_RES_SADDR(net, res))
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 6271bab63bfb..61ea7a24c8e5 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -63,13 +63,16 @@ TRACE_EVENT(fib_table_lookup,
 		}
 
 		if (nh) {
+			struct net_device *dev;
+
 			p32 = (__be32 *) __entry->saddr;
 			*p32 = nh->nh_saddr;
 
 			p32 = (__be32 *) __entry->gw;
-			*p32 = nh->nh_gw;
+			*p32 = nh->fib_nh_gw4;
 
-			__assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-");
+			dev = nh->fib_nh_dev;
+			__assign_str(name, dev ? dev->name : "-");
 		} else {
 			p32 = (__be32 *) __entry->saddr;
 			*p32 = 0;
-- 
cgit v1.2.3


From ad1601ae0260551f85691ca1ac814773fdcec239 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:56 -0700
Subject: ipv6: Rename fib6_nh entries

Rename fib6_nh entries that will be moved to a fib_nh_common struct.
Specifically, the device, gateway, flags, and lwtstate are common
with all nexthop definitions. In some places new temporary variables
are declared or local variables renamed to maintain line lengths.

Rename only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h       | 16 ++++++++--------
 include/net/ip6_route.h     |  8 +++++---
 include/trace/events/fib6.h |  6 +++---
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3b04b318cf13..aff8570725c8 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -125,14 +125,14 @@ struct rt6_exception {
 #define FIB6_MAX_DEPTH 5
 
 struct fib6_nh {
-	struct in6_addr		nh_gw;
+	struct in6_addr		fib_nh_gw6;
 	bool			fib_nh_has_gw;
-	struct net_device	*nh_dev;
-	struct lwtunnel_state	*nh_lwtstate;
+	struct net_device	*fib_nh_dev;
+	struct lwtunnel_state	*fib_nh_lws;
 
-	unsigned int		nh_flags;
-	atomic_t		nh_upper_bound;
-	int			nh_weight;
+	unsigned int		fib_nh_flags;
+	atomic_t		fib_nh_upper_bound;
+	int			fib_nh_weight;
 };
 
 struct fib6_info {
@@ -442,7 +442,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
 
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.nh_dev;
+	return f6i->fib6_nh.fib_nh_dev;
 }
 
 int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
@@ -453,7 +453,7 @@ void fib6_nh_release(struct fib6_nh *fib6_nh);
 static inline
 struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.nh_lwtstate;
+	return f6i->fib6_nh.fib_nh_lws;
 }
 
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 95cd8a2f6284..342180a7285c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -274,9 +274,11 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
 
 static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
 {
-	return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev &&
-	       ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) &&
-	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
+	struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh;
+
+	return nha->fib_nh_dev == nhb->fib_nh_dev &&
+	       ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
+	       !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
 }
 
 static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index b088b54d699c..6d05ebdd669c 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -62,8 +62,8 @@ TRACE_EVENT(fib6_table_lookup,
 			__entry->dport = 0;
 		}
 
-		if (f6i->fib6_nh.nh_dev) {
-			__assign_str(name, f6i->fib6_nh.nh_dev);
+		if (f6i->fib6_nh.fib_nh_dev) {
+			__assign_str(name, f6i->fib6_nh.fib_nh_dev);
 		} else {
 			__assign_str(name, "-");
 		}
@@ -75,7 +75,7 @@ TRACE_EVENT(fib6_table_lookup,
 
 		} else if (f6i) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = f6i->fib6_nh.nh_gw;
+			*in6 = f6i->fib6_nh.fib_nh_gw6;
 		}
 	),
 
-- 
cgit v1.2.3


From f1741730dd18828fe3ea5fa91c22f41cf001c625 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:57 -0700
Subject: net: Add fib_nh_common and update fib_nh and fib6_nh

Add fib_nh_common struct with common nexthop attributes. Convert
fib_nh and fib6_nh to use it. Use macros to move existing
fib_nh_* references to the new nh_common.nhc_*.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 10 ++--------
 include/net/ip_fib.h  | 41 +++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index aff8570725c8..58dbb4e82908 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -19,6 +19,7 @@
 #include <linux/notifier.h>
 #include <net/dst.h>
 #include <net/flow.h>
+#include <net/ip_fib.h>
 #include <net/netlink.h>
 #include <net/inetpeer.h>
 #include <net/fib_notifier.h>
@@ -125,14 +126,7 @@ struct rt6_exception {
 #define FIB6_MAX_DEPTH 5
 
 struct fib6_nh {
-	struct in6_addr		fib_nh_gw6;
-	bool			fib_nh_has_gw;
-	struct net_device	*fib_nh_dev;
-	struct lwtunnel_state	*fib_nh_lws;
-
-	unsigned int		fib_nh_flags;
-	atomic_t		fib_nh_upper_bound;
-	int			fib_nh_weight;
+	struct fib_nh_common	nh_common;
 };
 
 struct fib6_info {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 029acd333d29..70548b1a6322 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -76,27 +76,48 @@ struct fnhe_hash_bucket {
 #define FNHE_HASH_SIZE		(1 << FNHE_HASH_SHIFT)
 #define FNHE_RECLAIM_DEPTH	5
 
+struct fib_nh_common {
+	struct net_device	*nhc_dev;
+	int			nhc_oif;
+	unsigned int		nhc_flags;
+	struct lwtunnel_state	*nhc_lwtstate;
+	unsigned char		nhc_scope;
+	u8			nhc_family;
+	u8			nhc_has_gw:1,
+				unused:7;
+	union {
+		__be32          ipv4;
+		struct in6_addr ipv6;
+	} nhc_gw;
+
+	int			nhc_weight;
+	atomic_t		nhc_upper_bound;
+};
+
 struct fib_nh {
-	struct net_device	*fib_nh_dev;
+	struct fib_nh_common	nh_common;
 	struct hlist_node	nh_hash;
 	struct fib_info		*nh_parent;
-	unsigned int		fib_nh_flags;
-	unsigned char		fib_nh_scope;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-	int			fib_nh_weight;
-	atomic_t		fib_nh_upper_bound;
-#endif
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	__u32			nh_tclassid;
 #endif
-	int			fib_nh_oif;
-	__be32			fib_nh_gw4;
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
-	struct lwtunnel_state	*fib_nh_lws;
+#define fib_nh_family		nh_common.nhc_family
+#define fib_nh_dev		nh_common.nhc_dev
+#define fib_nh_oif		nh_common.nhc_oif
+#define fib_nh_flags		nh_common.nhc_flags
+#define fib_nh_lws		nh_common.nhc_lwtstate
+#define fib_nh_scope		nh_common.nhc_scope
+#define fib_nh_family		nh_common.nhc_family
+#define fib_nh_has_gw		nh_common.nhc_has_gw
+#define fib_nh_gw4		nh_common.nhc_gw.ipv4
+#define fib_nh_gw6		nh_common.nhc_gw.ipv6
+#define fib_nh_weight		nh_common.nhc_weight
+#define fib_nh_upper_bound	nh_common.nhc_upper_bound
 };
 
 /*
-- 
cgit v1.2.3


From 979e276ebebd537782797c439c9cb42b6d3aba27 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 27 Mar 2019 20:53:58 -0700
Subject: net: Use common nexthop init and release helpers

With fib_nh_common in place, move common initialization and release
code into helpers used by both ipv4 and ipv6. For the moment, the init
is just the lwt encap and the release is both the netdev reference and
the lwt state reference. More will be added later.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 70548b1a6322..12a6d759cf57 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -441,6 +441,10 @@ int fib_nh_init(struct net *net, struct fib_nh *fib_nh,
 		struct fib_config *cfg, int nh_weight,
 		struct netlink_ext_ack *extack);
 void fib_nh_release(struct net *net, struct fib_nh *fib_nh);
+int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap,
+		       u16 fc_encap_type, void *cfg, gfp_t gfp_flags,
+		       struct netlink_ext_ack *extack);
+void fib_nh_common_release(struct fib_nh_common *nhc);
 
 /* Exported by fib_trie.c */
 void fib_trie_init(void);
-- 
cgit v1.2.3


From 3616d08bcbb564c7765187cd45ad392e49bad73a Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 22 Mar 2019 06:06:09 -0700
Subject: ipv6: Move ipv6 stubs to a separate header file

The number of stubs is growing and has nothing to do with addrconf.
Move the definition of the stubs to a separate header file and update
users. In the move, drop the vxlan specific comment before ipv6_stub.

Code move only; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/addrconf.h   | 47 ------------------------------------
 include/net/ipv6_stubs.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/udp_tunnel.h |  2 +-
 3 files changed, 64 insertions(+), 48 deletions(-)
 create mode 100644 include/net/ipv6_stubs.h

(limited to 'include')

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index ec8e6784a6f7..2f67ae854ff0 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -238,53 +238,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 
 void ipv6_mc_dad_complete(struct inet6_dev *idev);
 
-/* A stub used by vxlan module. This is ugly, ideally these
- * symbols should be built into the core kernel.
- */
-struct ipv6_stub {
-	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
-				 const struct in6_addr *addr);
-	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
-				 const struct in6_addr *addr);
-	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
-			       struct dst_entry **dst, struct flowi6 *fl6);
-	int (*ipv6_route_input)(struct sk_buff *skb);
-
-	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
-	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
-					 struct flowi6 *fl6, int flags);
-	struct fib6_info *(*fib6_table_lookup)(struct net *net,
-					      struct fib6_table *table,
-					      int oif, struct flowi6 *fl6,
-					      int flags);
-	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
-						   struct fib6_info *f6i,
-						   struct flowi6 *fl6, int oif,
-						   const struct sk_buff *skb,
-						   int strict);
-	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
-				 struct in6_addr *saddr);
-
-	void (*udpv6_encap_enable)(void);
-	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
-			      const struct in6_addr *solicited_addr,
-			      bool router, bool solicited, bool override, bool inc_opt);
-	struct neigh_table *nd_tbl;
-};
-extern const struct ipv6_stub *ipv6_stub __read_mostly;
-
-/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
-struct ipv6_bpf_stub {
-	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
-			  bool force_bind_address_no_port, bool with_lock);
-	struct sock *(*udp6_lib_lookup)(struct net *net,
-					const struct in6_addr *saddr, __be16 sport,
-					const struct in6_addr *daddr, __be16 dport,
-					int dif, int sdif, struct udp_table *tbl,
-					struct sk_buff *skb);
-};
-extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
-
 /*
  * identify MLD packets for MLD filter exceptions
  */
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
new file mode 100644
index 000000000000..d8d9c0b0e8c0
--- /dev/null
+++ b/include/net/ipv6_stubs.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IPV6_STUBS_H
+#define _IPV6_STUBS_H
+
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/flow.h>
+#include <net/neighbour.h>
+#include <net/sock.h>
+
+/* structs from net/ip6_fib.h */
+struct fib6_info;
+
+/* This is ugly, ideally these symbols should be built
+ * into the core kernel.
+ */
+struct ipv6_stub {
+	int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
+				 const struct in6_addr *addr);
+	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
+			       struct dst_entry **dst, struct flowi6 *fl6);
+	int (*ipv6_route_input)(struct sk_buff *skb);
+
+	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
+	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
+					 struct flowi6 *fl6, int flags);
+	struct fib6_info *(*fib6_table_lookup)(struct net *net,
+					      struct fib6_table *table,
+					      int oif, struct flowi6 *fl6,
+					      int flags);
+	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
+						   struct fib6_info *f6i,
+						   struct flowi6 *fl6, int oif,
+						   const struct sk_buff *skb,
+						   int strict);
+	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
+				 struct in6_addr *saddr);
+
+	void (*udpv6_encap_enable)(void);
+	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
+			      const struct in6_addr *solicited_addr,
+			      bool router, bool solicited, bool override, bool inc_opt);
+	struct neigh_table *nd_tbl;
+};
+extern const struct ipv6_stub *ipv6_stub __read_mostly;
+
+/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
+struct ipv6_bpf_stub {
+	int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+			  bool force_bind_address_no_port, bool with_lock);
+	struct sock *(*udp6_lib_lookup)(struct net *net,
+				     const struct in6_addr *saddr, __be16 sport,
+				     const struct in6_addr *daddr, __be16 dport,
+				     int dif, int sdif, struct udp_table *tbl,
+				     struct sk_buff *skb);
+};
+extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+
+#endif
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index b8137953fea3..4b1f95e08307 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -7,7 +7,7 @@
 
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
-#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
 #endif
 
 struct udp_port_cfg {
-- 
cgit v1.2.3


From 18b6f717483a835fb98de9f0df6c724df9324e78 Mon Sep 17 00:00:00 2001
From: wenxu <wenxu@ucloud.cn>
Date: Thu, 28 Mar 2019 12:43:23 +0800
Subject: openvswitch: Make metadata_dst tunnel work in IP_TUNNEL_INFO_BRIDGE
 mode

There is currently no support for the multicast/broadcast aspects
of VXLAN in ovs. In the datapath flow the tun_dst must be specified.
But in the IP_TUNNEL_INFO_BRIDGE mode the tun_dst can not be specified,
and the packet can be forwarded through the fdb table of the vxlan device.
In this mode the broadcast/multicast packet can be sent through the
following ways in ovs.

ovs-vsctl add-port br0 vxlan -- set in vxlan type=vxlan \
        options:key=1000 options:remote_ip=flow
ovs-ofctl add-flow br0 in_port=LOCAL,dl_dst=ff:ff:ff:ff:ff:ff, \
        action=output:vxlan

bridge fdb append ff:ff:ff:ff:ff:ff dev vxlan_sys_4789 dst 172.168.0.1 \
        src_vni 1000 vni 1000 self
bridge fdb append ff:ff:ff:ff:ff:ff dev vxlan_sys_4789 dst 172.168.0.2 \
        src_vni 1000 vni 1000 self

Signed-off-by: wenxu <wenxu@ucloud.cn>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 0cac5d802c6a..f271f1ec50ae 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -364,6 +364,7 @@ enum ovs_tunnel_key_attr {
 	OVS_TUNNEL_KEY_ATTR_IPV6_DST,		/* struct in6_addr dst IPv6 address. */
 	OVS_TUNNEL_KEY_ATTR_PAD,
 	OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,	/* struct erspan_metadata */
+	OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE,	/* No argument. IPV4_INFO_BRIDGE mode.*/
 	__OVS_TUNNEL_KEY_ATTR_MAX
 };
 
-- 
cgit v1.2.3


From a4e76ba6b4994773fbe7a4eed8228e47862ac8a3 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Sun, 31 Mar 2019 06:49:41 +0000
Subject: mlxsw: spectrum_acl: Rename rehash_dis trace

The name of the trace is no longer correct, since there is no disable of
rehash done. So name it "rehash_rollback_failed".

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/mlxsw.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/mlxsw.h b/include/trace/events/mlxsw.h
index 6a4cfaef33a2..19a25ed323a5 100644
--- a/include/trace/events/mlxsw.h
+++ b/include/trace/events/mlxsw.h
@@ -93,7 +93,7 @@ TRACE_EVENT(mlxsw_sp_acl_tcam_vregion_migrate_end,
 		  __entry->mlxsw_sp, __entry->vregion)
 );
 
-TRACE_EVENT(mlxsw_sp_acl_tcam_vregion_rehash_dis,
+TRACE_EVENT(mlxsw_sp_acl_tcam_vregion_rehash_rollback_failed,
 	TP_PROTO(const struct mlxsw_sp *mlxsw_sp,
 		 const struct mlxsw_sp_acl_tcam_vregion *vregion),
 
-- 
cgit v1.2.3


From a2c7023f7075ca9b80f944d3f20f60e6574538e2 Mon Sep 17 00:00:00 2001
From: Xiaofei Shen <xiaofeis@codeaurora.org>
Date: Fri, 29 Mar 2019 11:04:58 +0530
Subject: net: dsa: read mac address from DT for slave device

Before creating a slave netdevice, get the mac address from DTS and
apply in case it is valid.

Signed-off-by: Xiaofei Shen <xiaofeis@codeaurora.org>
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index ae480bba11f5..0cfc2f828b87 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -140,6 +140,7 @@ struct dsa_port {
 	unsigned int		index;
 	const char		*name;
 	const struct dsa_port	*cpu_dp;
+	const char		*mac;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
 	u8			stp_state;
-- 
cgit v1.2.3


From 97cdcf37b57e3f204be3000b9eab9686f38b4356 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 1 Apr 2019 16:42:13 +0200
Subject: net: place xmit recursion in softnet data

This fills a hole in softnet data, so no change in structure size.

Also prepares for xmit_more placement in the same spot;
skb->xmit_more will be removed in followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 78f5ec4ebf64..2b25824642fa 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2659,14 +2659,6 @@ void netdev_freemem(struct net_device *dev);
 void synchronize_net(void);
 int init_dummy_netdev(struct net_device *dev);
 
-DECLARE_PER_CPU(int, xmit_recursion);
-#define XMIT_RECURSION_LIMIT	10
-
-static inline int dev_recursion_level(void)
-{
-	return this_cpu_read(xmit_recursion);
-}
-
 struct net_device *dev_get_by_index(struct net *net, int ifindex);
 struct net_device *__dev_get_by_index(struct net *net, int ifindex);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
@@ -3015,6 +3007,11 @@ struct softnet_data {
 #ifdef CONFIG_XFRM_OFFLOAD
 	struct sk_buff_head	xfrm_backlog;
 #endif
+	/* written and read only by owning cpu: */
+	struct {
+		u16 recursion;
+		u8  more;
+	} xmit;
 #ifdef CONFIG_RPS
 	/* input_queue_head should be written by cpu owning this struct,
 	 * and only read by other cpus. Worth using a cache line.
@@ -3050,6 +3047,28 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd,
 
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
+static inline int dev_recursion_level(void)
+{
+	return __this_cpu_read(softnet_data.xmit.recursion);
+}
+
+#define XMIT_RECURSION_LIMIT	10
+static inline bool dev_xmit_recursion(void)
+{
+	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+			XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+	__this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+	__this_cpu_dec(softnet_data.xmit.recursion);
+}
+
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
@@ -4409,6 +4428,11 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
 	return ops->ndo_start_xmit(skb, dev);
 }
 
+static inline bool netdev_xmit_more(void)
+{
+	return __this_cpu_read(softnet_data.xmit.more);
+}
+
 static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
 					    struct netdev_queue *txq, bool more)
 {
-- 
cgit v1.2.3


From 6b16f9ee89b8d5709f24bc3ac89ae8b5452c0d7c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 1 Apr 2019 16:42:14 +0200
Subject: net: move skb->xmit_more hint to softnet data

There are two reasons for this.

First, the xmit_more flag conceptually doesn't fit into the skb, as
xmit_more is not a property related to the skb.
It's only a hint to the driver that the stack is about to transmit another
packet immediately.

Second, it was only done this way to not have to pass another argument
to ndo_start_xmit().

We can place xmit_more in the softnet data, next to the device recursion.
The recursion counter is already written to on each transmit. The "more"
indicator is placed right next to it.

Drivers can use the netdev_xmit_more() helper instead of skb->xmit_more
to check the "more packets coming" hint.

skb->xmit_more is retained (but always 0) to not cause build breakage.

This change takes care of the simple s/skb->xmit_more/netdev_xmit_more()/
conversions.  Remaining drivers are converted in the next patches.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2b25824642fa..eb9f05e0863d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4424,7 +4424,7 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
 					      struct sk_buff *skb, struct net_device *dev,
 					      bool more)
 {
-	skb->xmit_more = more ? 1 : 0;
+	__this_cpu_write(softnet_data.xmit.more, more);
 	return ops->ndo_start_xmit(skb, dev);
 }
 
-- 
cgit v1.2.3


From 4f296edeb9d4cf76b876869461a7ae627c307110 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 1 Apr 2019 16:42:17 +0200
Subject: drivers: net: aurora: use netdev_xmit_more helper

This is the last driver using always-0 skb->xmit_more.
Switch it to netdev_xmit_more and remove the now unused xmit_more flag
from sk_buff.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9027a8c4219f..69b5538adcea 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -657,7 +657,6 @@ typedef unsigned char *sk_buff_data_t;
  *	@tc_index: Traffic control index
  *	@hash: the packet hash
  *	@queue_mapping: Queue mapping for multiqueue devices
- *	@xmit_more: More SKBs are pending for this queue
  *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
  *	@active_extensions: active extensions (skb_ext_id types)
  *	@ndisc_nodetype: router type (from link layer)
@@ -764,7 +763,6 @@ struct sk_buff {
 				fclone:2,
 				peeked:1,
 				head_frag:1,
-				xmit_more:1,
 				pfmemalloc:1;
 #ifdef CONFIG_SKB_EXTENSIONS
 	__u8			active_extensions;
-- 
cgit v1.2.3


From 38702cce547a74493687fd8bb925fbb5c3898ce3 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Fri, 29 Mar 2019 15:37:51 -0700
Subject: net/mlx5: Remove unused MLX5_*_DOORBELL_LOCK macros

MLX5_*_DOORBELL_LOCK macros provided a way to avoid locking for
mlx5_write64 on 64-bit platforms where it's not necessary. Currently all
calls to mlx5_write64 don't use a spinlock, so the macros became unused.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/doorbell.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h
index 0787de28f2fc..9ef3f9d00154 100644
--- a/include/linux/mlx5/doorbell.h
+++ b/include/linux/mlx5/doorbell.h
@@ -42,10 +42,6 @@
  * PCI so we won't worry about it.
  */
 
-#define MLX5_DECLARE_DOORBELL_LOCK(name)
-#define MLX5_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
-#define MLX5_GET_DOORBELL_LOCK(ptr)      (NULL)
-
 static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
 				spinlock_t *doorbell_lock)
 {
@@ -59,10 +55,6 @@ static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
  * MMIO writes.
  */
 
-#define MLX5_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
-#define MLX5_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
-#define MLX5_GET_DOORBELL_LOCK(ptr)      (ptr)
-
 static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
 				spinlock_t *doorbell_lock)
 {
-- 
cgit v1.2.3


From bbf29f618e8c5bfd6efdad5fdc050a84bab795ab Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@mellanox.com>
Date: Fri, 29 Mar 2019 15:37:52 -0700
Subject: net/mlx5: Remove spinlock support from mlx5_write64

As there is no user of mlx5_write64 that passes a spinlock to
mlx5_write64, remove this functionality and simplify the function.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/cq.h       |  2 +-
 include/linux/mlx5/doorbell.h | 31 +++++++++----------------------
 2 files changed, 10 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index 612c8c2f2466..769326ea1d9b 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -170,7 +170,7 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd,
 	doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci);
 	doorbell[1] = cpu_to_be32(cq->cqn);
 
-	mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL);
+	mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL);
 }
 
 static inline void mlx5_cq_hold(struct mlx5_core_cq *cq)
diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h
index 9ef3f9d00154..5c267707e1df 100644
--- a/include/linux/mlx5/doorbell.h
+++ b/include/linux/mlx5/doorbell.h
@@ -36,38 +36,25 @@
 #define MLX5_BF_OFFSET	      0x800
 #define MLX5_CQ_DOORBELL      0x20
 
-#if BITS_PER_LONG == 64
 /* Assume that we can just write a 64-bit doorbell atomically.  s390
  * actually doesn't have writeq() but S/390 systems don't even have
  * PCI so we won't worry about it.
+ *
+ * Note that the write is not atomic on 32-bit systems! In contrast to 64-bit
+ * ones, it requires proper locking. mlx5_write64 doesn't do any locking, so use
+ * it at your own discretion, protected by some kind of lock on 32 bits.
+ *
+ * TODO: use write{q,l}_relaxed()
  */
 
-static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
-				spinlock_t *doorbell_lock)
+static inline void mlx5_write64(__be32 val[2], void __iomem *dest)
 {
+#if BITS_PER_LONG == 64
 	__raw_writeq(*(u64 *)val, dest);
-}
-
 #else
-
-/* Just fall back to a spinlock to protect the doorbell if
- * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
- * MMIO writes.
- */
-
-static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
-				spinlock_t *doorbell_lock)
-{
-	unsigned long flags;
-
-	if (doorbell_lock)
-		spin_lock_irqsave(doorbell_lock, flags);
 	__raw_writel((__force u32) val[0], dest);
 	__raw_writel((__force u32) val[1], dest + 4);
-	if (doorbell_lock)
-		spin_unlock_irqrestore(doorbell_lock, flags);
-}
-
 #endif
+}
 
 #endif /* MLX5_DOORBELL_H */
-- 
cgit v1.2.3


From 52c368dc3da7beb7b283133024af1b6d07bf93b9 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 29 Mar 2019 15:37:55 -0700
Subject: net/mlx5: Move health and page alloc init to mdev_init

Software structure initialization should be in mdev_init stage.

This provides a better logical separation of mlx5 core device
initialization flow and will help to seamlessly support creating different
mlx5 device types such as PF, VF and SF mlx5 sub-function virtual device.

This patch does not change any functionality.

Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index c5454f985e1d..d7f5c0e8c47a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -883,6 +883,7 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome);
 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
 int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
 int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
+void mlx5_health_flush(struct mlx5_core_dev *dev);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-- 
cgit v1.2.3


From aa8106f137b93628d531ef5ecbbcbecef99370d7 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Fri, 29 Mar 2019 15:38:01 -0700
Subject: net/mlx5: Add explicit bar address field

Add bar_addr field to store bar-0 address to avoid calling
pci_resource_start with hard-coded bar-0 as parameter.
Also note that different mlx5 device types will have bar_addr
on different bars.

This patch does not change any functionality.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d7f5c0e8c47a..c0ee597f5457 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -658,6 +658,7 @@ struct mlx5_core_dev {
 	u64			sys_image_guid;
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
+	phys_addr_t             bar_addr;
 	enum mlx5_device_state	state;
 	/* sync interface state */
 	struct mutex		intf_state_mutex;
-- 
cgit v1.2.3


From 4039049b5c462d3bb9ee8a68c4375582f037d5f2 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Fri, 29 Mar 2019 15:38:03 -0700
Subject: net/mlx5: Expose MPEIN (Management PCIE INfo) register layout

Expose PRM layout for handling MPEIN (Management PCIE Info). It will be
used in the downstream patch for querying MPEIN via the driver.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 51 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index c0ee597f5457..0bfb95e30e47 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -133,6 +133,7 @@ enum {
 	MLX5_REG_MTRC_CONF	 = 0x9041,
 	MLX5_REG_MTRC_STDB	 = 0x9042,
 	MLX5_REG_MTRC_CTRL	 = 0x9043,
+	MLX5_REG_MPEIN		 = 0x9050,
 	MLX5_REG_MPCNT		 = 0x9051,
 	MLX5_REG_MTPPS		 = 0x9053,
 	MLX5_REG_MTPPSE		 = 0x9054,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 5decffe565fb..d31712af5a7b 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -8025,6 +8025,52 @@ struct mlx5_ifc_ppcnt_reg_bits {
 	union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
 };
 
+struct mlx5_ifc_mpein_reg_bits {
+	u8         reserved_at_0[0x2];
+	u8         depth[0x6];
+	u8         pcie_index[0x8];
+	u8         node[0x8];
+	u8         reserved_at_18[0x8];
+
+	u8         capability_mask[0x20];
+
+	u8         reserved_at_40[0x8];
+	u8         link_width_enabled[0x8];
+	u8         link_speed_enabled[0x10];
+
+	u8         lane0_physical_position[0x8];
+	u8         link_width_active[0x8];
+	u8         link_speed_active[0x10];
+
+	u8         num_of_pfs[0x10];
+	u8         num_of_vfs[0x10];
+
+	u8         bdf0[0x10];
+	u8         reserved_at_b0[0x10];
+
+	u8         max_read_request_size[0x4];
+	u8         max_payload_size[0x4];
+	u8         reserved_at_c8[0x5];
+	u8         pwr_status[0x3];
+	u8         port_type[0x4];
+	u8         reserved_at_d4[0xb];
+	u8         lane_reversal[0x1];
+
+	u8         reserved_at_e0[0x14];
+	u8         pci_power[0xc];
+
+	u8         reserved_at_100[0x20];
+
+	u8         device_status[0x10];
+	u8         port_state[0x8];
+	u8         reserved_at_138[0x8];
+
+	u8         reserved_at_140[0x10];
+	u8         receiver_detect_result[0x10];
+
+	u8         reserved_at_160[0x20];
+};
+
 struct mlx5_ifc_mpcnt_reg_bits {
 	u8         reserved_at_0[0x8];
 	u8         pcie_index[0x8];
@@ -8344,7 +8390,9 @@ struct mlx5_ifc_pcam_reg_bits {
 };
 
 struct mlx5_ifc_mcam_enhanced_features_bits {
-	u8         reserved_at_0[0x74];
+	u8         reserved_at_0[0x6e];
+	u8         pci_status_and_power[0x1];
+	u8         reserved_at_6f[0x5];
 	u8         mark_tx_action_cnp[0x1];
 	u8         mark_tx_action_cqe[0x1];
 	u8         dynamic_tx_overflow[0x1];
@@ -8944,6 +8992,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
 	struct mlx5_ifc_ppad_reg_bits ppad_reg;
 	struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
+	struct mlx5_ifc_mpein_reg_bits mpein_reg;
 	struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
 	struct mlx5_ifc_pplm_reg_bits pplm_reg;
 	struct mlx5_ifc_pplr_reg_bits pplr_reg;
-- 
cgit v1.2.3


From 045925e3fe5b98e402337a176d154252c56cef2e Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 27 Mar 2019 21:58:44 +0100
Subject: net: phy: add genphy_read_abilities

Similar to genphy_c45_pma_read_abilities() add a function to dynamically
detect the abilities of a Clause 22 PHY. This is mainly copied from
genphy_config_init().

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 34084892a466..ad88f063e50f 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1075,6 +1075,7 @@ void phy_attached_info(struct phy_device *phydev);
 
 /* Clause 22 PHY */
 int genphy_config_init(struct phy_device *phydev);
+int genphy_read_abilities(struct phy_device *phydev);
 int genphy_setup_forced(struct phy_device *phydev);
 int genphy_restart_aneg(struct phy_device *phydev);
 int genphy_config_eee_advert(struct phy_device *phydev);
-- 
cgit v1.2.3


From 06ee7115b0d1742de745ad143fb5e06d77d27fba Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 1 Apr 2019 21:27:40 -0700
Subject: bpf: add verifier stats and log_level bit 2

In order to understand the verifier bottlenecks add various stats
and extend log_level:
log_level 1 and 2 are kept as-is:
bit 0 - level=1 - print every insn and verifier state at branch points
bit 1 - level=2 - print every insn and verifier state at every insn
bit 2 - level=4 - print verifier error and stats at the end of verification

When the verifier rejects the program, libbpf tries to load the program twice.
Once with log_level=0 (no messages, only error code is reported to user space)
and second time with log_level=1 to tell the user why the verifier rejected it.

With introduction of bit 2 - level=4 the libbpf can choose to always use that
level and load programs once, since the verification speed is not affected and
in case of error the verbose message will be available.

Note that the verifier stats are not part of uapi just like all other
verbose messages. They're expected to change in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7d8228d1c898..f7e15eeb60bb 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -248,6 +248,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 	return log->len_used >= log->len_total - 1;
 }
 
+#define BPF_LOG_LEVEL1	1
+#define BPF_LOG_LEVEL2	2
+#define BPF_LOG_STATS	4
+#define BPF_LOG_LEVEL	(BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2)
+#define BPF_LOG_MASK	(BPF_LOG_LEVEL | BPF_LOG_STATS)
+
 static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
 {
 	return log->level && log->ubuf && !bpf_verifier_log_full(log);
@@ -284,6 +290,21 @@ struct bpf_verifier_env {
 	struct bpf_verifier_log log;
 	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
+	/* number of instructions analyzed by the verifier */
+	u32 insn_processed;
+	/* total verification time */
+	u64 verification_time;
+	/* maximum number of verifier states kept in 'branching' instructions */
+	u32 max_states_per_insn;
+	/* total number of allocated verifier states */
+	u32 total_states;
+	/* some states are freed during program analysis.
+	 * this is peak number of states. this number dominates kernel
+	 * memory consumption during verification
+	 */
+	u32 peak_states;
+	/* longest register parentage chain walked for liveness marking */
+	u32 longest_mark_read_walk;
 };
 
 __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
-- 
cgit v1.2.3


From 9f4686c41bdff051f557accb531af79dd1773687 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 1 Apr 2019 21:27:41 -0700
Subject: bpf: improve verification speed by dropping states

Branch instructions, branch targets and calls in a bpf program are
the places where the verifier remembers states that led to successful
verification of the program.
These states are used to prune brute force program analysis.
For unprivileged programs there is a limit of 64 states per such
'branching' instructions (maximum length is tracked by max_states_per_insn
counter introduced in the previous patch).
Simply reducing this threshold to 32 or lower increases insn_processed
metric to the point that small valid programs get rejected.
For root programs there is no limit and cilium programs can have
max_states_per_insn to be 100 or higher.
Walking 100+ states multiplied by number of 'branching' insns during
verification consumes significant amount of cpu time.
It turned out that a simple LRU-like mechanism can be used to remove states
that are unlikely to be helpful in future search pruning.
This patch introduces hit_cnt and miss_cnt counters:
hit_cnt - this many times this state successfully pruned the search
miss_cnt - this many times this state was not equivalent to other states
(and those other states were added to the state list)

The heuristic introduced in this patch is:
if (sl->miss_cnt > sl->hit_cnt * 3 + 3)
  /* drop this state from future considerations */

Higher numbers increase max_states_per_insn (allow more states to be
considered for pruning) and slow verification speed, but do not meaningfully
reduce insn_processed metric.
Lower numbers drop too many states and insn_processed increases too much.
Many different formulas were considered.
This one is simple and works well enough in practice.
(the analysis was done on selftests/progs/* and on cilium programs)

The end result is this heuristic improves verification speed by 10 times.
Large synthetic programs that used to take a second or more now take
1/10 of a second.
In cases where max_states_per_insn used to be 100 or more, now it's ~10.

There is a slight increase in insn_processed for cilium progs:
                       before   after
bpf_lb-DLB_L3.o 	1831	1838
bpf_lb-DLB_L4.o 	3029	3218
bpf_lb-DUNKNOWN.o 	1064	1064
bpf_lxc-DDROP_ALL.o	26309	26935
bpf_lxc-DUNKNOWN.o	33517	34439
bpf_netdev.o		9713	9721
bpf_overlay.o		6184	6184
bpf_lcx_jit.o		37335	39389
And 2-3 times improvement in the verification speed.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f7e15eeb60bb..fc8254d6b569 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -207,6 +207,7 @@ struct bpf_verifier_state {
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
 	struct bpf_verifier_state_list *next;
+	int miss_cnt, hit_cnt;
 };
 
 /* Possible states for alu_state member. */
@@ -280,6 +281,7 @@ struct bpf_verifier_env {
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
+	struct bpf_verifier_state_list *free_list;
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
-- 
cgit v1.2.3


From c04c0d2b968ac45d6ef020316808ef6c82325a82 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Mon, 1 Apr 2019 21:27:45 -0700
Subject: bpf: increase complexity limit and maximum program size

Large verifier speed improvements allow to increase
verifier complexity limit.
Now regardless of the program composition and its size it takes
little time for the verifier to hit insn_processed limit.
On typical x86 machine non-debug kernel processes 1M instructions
in 1/10 of a second.
(before these speed improvements specially crafted programs
could be hitting multi-second verification times)
Full kasan kernel with debug takes ~1 second for the same 1M insns.
Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M.
Also increase the number of instructions per program
from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit.
4k limit was confusing to users, since small programs with hundreds
of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit.
Sometimes adding more insns and bpf_trace_printk debug statements
would make the verifier accept the program while removing
code would make the verifier reject it.
Some user space applications started to add #define MAX_FOO to
their programs and do:
  MAX_FOO=100;
again:
  compile with MAX_FOO;
  try to load;
  if (fails_to_load) { reduce MAX_FOO; goto again; }
to be able to fit maximum amount of processing into single program.
Other users artificially split their single program into a set of programs
and use all 32 iterations of tail_calls to increase compute limits.
And the most advanced folks used unlimited tc-bpf filter list
to execute many bpf programs.
Essentially the users managed to work around the 4k insn limit.
This patch removes the limit for root programs from uapi.
BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit
and success to load the program no longer depends on program size,
but on 'smartness' of the verifier only.
The verifier will continue to get smarter with every kernel release.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f62897198844..a445194b5fb6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -421,6 +421,7 @@ struct bpf_array {
 	};
 };
 
+#define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 32
 
 struct bpf_event_entry {
-- 
cgit v1.2.3


From 4950c2ba49cc6f2b38dbedcfa0ff67acf761419a Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 2 Apr 2019 20:43:30 +0200
Subject: net: phy: fix autoneg mismatch case in genphy_read_status

The original patch didn't consider the case that autoneg process
finishes successfully but both link partners have no mode in common.
In this case there's no link, nevertheless we may be interested in
what the link partner advertised.

Like phydev->link we set phydev->autoneg_complete in
genphy_update_link() and use the stored value in genphy_read_status().
This way we don't have to read register BMSR again.

Fixes: b6163f194c69 ("net: phy: improve genphy_read_status")
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index ad88f063e50f..ab7439b3da2b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -390,6 +390,7 @@ struct phy_device {
 	unsigned autoneg:1;
 	/* The most recently read link state */
 	unsigned link:1;
+	unsigned autoneg_complete:1;
 
 	/* Interrupts are enabled */
 	unsigned interrupts:1;
-- 
cgit v1.2.3


From 0af7e7c128eb33f2dc16ed088ced00675785d628 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:54 -0700
Subject: ipv4: Update fib_table_lookup tracepoint to take common nexthop

Update fib_table_lookup tracepoint to take a fib_nh_common struct and
dump the v6 gateway address if the nexthop uses it.

Over the years saddr has not proven useful and the output of the
tracepoint produces very long lines. Since saddr is not part of
fib_nh_common, drop it. If it needs to be added later, fib_nh which
contains saddr can be obtained from a fib_nh_common via container_of.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib.h | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 61ea7a24c8e5..7f83b6eafc5c 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -13,9 +13,9 @@
 TRACE_EVENT(fib_table_lookup,
 
 	TP_PROTO(u32 tb_id, const struct flowi4 *flp,
-		 const struct fib_nh *nh, int err),
+		 const struct fib_nh_common *nhc, int err),
 
-	TP_ARGS(tb_id, flp, nh, err),
+	TP_ARGS(tb_id, flp, nhc, err),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -28,14 +28,17 @@ TRACE_EVENT(fib_table_lookup,
 		__field(	__u8,	flags		)
 		__array(	__u8,	src,	4	)
 		__array(	__u8,	dst,	4	)
-		__array(	__u8,	gw,	4	)
-		__array(	__u8,	saddr,	4	)
+		__array(	__u8,	gw4,	4	)
+		__array(	__u8,	gw6,	16	)
 		__field(	u16,	sport		)
 		__field(	u16,	dport		)
 		__dynamic_array(char,  name,   IFNAMSIZ )
 	),
 
 	TP_fast_assign(
+		struct in6_addr in6_zero = {};
+		struct net_device *dev;
+		struct in6_addr *in6;
 		__be32 *p32;
 
 		__entry->tb_id = tb_id;
@@ -62,33 +65,37 @@ TRACE_EVENT(fib_table_lookup,
 			__entry->dport = 0;
 		}
 
-		if (nh) {
-			struct net_device *dev;
+		dev = nhc ? nhc->nhc_dev : NULL;
+		__assign_str(name, dev ? dev->name : "-");
 
-			p32 = (__be32 *) __entry->saddr;
-			*p32 = nh->nh_saddr;
+		if (nhc) {
+			if (nhc->nhc_family == AF_INET) {
+				p32 = (__be32 *) __entry->gw4;
+				*p32 = nhc->nhc_gw.ipv4;
 
-			p32 = (__be32 *) __entry->gw;
-			*p32 = nh->fib_nh_gw4;
+				in6 = (struct in6_addr *)__entry->gw6;
+				*in6 = in6_zero;
+			} else if (nhc->nhc_family == AF_INET6) {
+				p32 = (__be32 *) __entry->gw4;
+				*p32 = 0;
 
-			dev = nh->fib_nh_dev;
-			__assign_str(name, dev ? dev->name : "-");
+				in6 = (struct in6_addr *)__entry->gw6;
+				*in6 = nhc->nhc_gw.ipv6;
+			}
 		} else {
-			p32 = (__be32 *) __entry->saddr;
+			p32 = (__be32 *) __entry->gw4;
 			*p32 = 0;
 
-			p32 = (__be32 *) __entry->gw;
-			*p32 = 0;
-
-			__assign_str(name, "-");
+			in6 = (struct in6_addr *)__entry->gw6;
+			*in6 = in6_zero;
 		}
 	),
 
-	TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d",
+	TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4/%pI6c err %d",
 		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
 		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
 		  __entry->tos, __entry->scope, __entry->flags,
-		  __get_str(name), __entry->gw, __entry->saddr, __entry->err)
+		  __get_str(name), __entry->gw4, __entry->gw6, __entry->err)
 );
 #endif /* _TRACE_FIB_H */
 
-- 
cgit v1.2.3


From eba618abacade71669eb67c3360eecfee810cc88 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:55 -0700
Subject: ipv4: Add fib_nh_common to fib_result

Most of the ipv4 code only needs data from fib_nh_common. Add
fib_nh_common selection to fib_result and update users to use it.

Right now, fib_nh_common in fib_result will point to a fib_nh struct
that is embedded within a fib_info:

        fib_info  --> fib_nh
                      fib_nh
                      ...
                      fib_nh
                        ^
    fib_result->nhc ----+

Later, nhc can point to a fib_nh within a nexthop struct:

        fib_info --> nexthop --> fib_nh
                                   ^
    fib_result->nhc ---------------+

or for a nexthop group:

        fib_info --> nexthop --> nexthop --> fib_nh
                                 nexthop --> fib_nh
                                 ...
                                 nexthop --> fib_nh
                                               ^
    fib_result->nhc ---------------------------+

In all cases nhsel within fib_result will point to which leg in the
multipath route is used.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 47 +++++++++++++++++++++--------------------------
 1 file changed, 21 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 12a6d759cf57..1f4a3b8bf584 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -156,15 +156,16 @@ struct fib_rule;
 
 struct fib_table;
 struct fib_result {
-	__be32		prefix;
-	unsigned char	prefixlen;
-	unsigned char	nh_sel;
-	unsigned char	type;
-	unsigned char	scope;
-	u32		tclassid;
-	struct fib_info *fi;
-	struct fib_table *table;
-	struct hlist_head *fa_head;
+	__be32			prefix;
+	unsigned char		prefixlen;
+	unsigned char		nh_sel;
+	unsigned char		type;
+	unsigned char		scope;
+	u32			tclassid;
+	struct fib_nh_common	*nhc;
+	struct fib_info		*fi;
+	struct fib_table	*table;
+	struct hlist_head	*fa_head;
 };
 
 struct fib_result_nl {
@@ -182,11 +183,10 @@ struct fib_result_nl {
 	int             err;
 };
 
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-#define FIB_RES_NH(res)		((res).fi->fib_nh[(res).nh_sel])
-#else /* CONFIG_IP_ROUTE_MULTIPATH */
-#define FIB_RES_NH(res)		((res).fi->fib_nh[0])
-#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
+{
+	return &fi->fib_nh[nhsel].nh_common;
+}
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
@@ -195,18 +195,11 @@ struct fib_result_nl {
 #endif
 
 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh);
+__be32 fib_result_prefsrc(struct net *net, struct fib_result *res);
 
-#define FIB_RES_SADDR(net, res)				\
-	((FIB_RES_NH(res).nh_saddr_genid ==		\
-	  atomic_read(&(net)->ipv4.dev_addr_genid)) ?	\
-	 FIB_RES_NH(res).nh_saddr :			\
-	 fib_info_update_nh_saddr((net), &FIB_RES_NH(res)))
-#define FIB_RES_GW(res)			(FIB_RES_NH(res).fib_nh_gw4)
-#define FIB_RES_DEV(res)		(FIB_RES_NH(res).fib_nh_dev)
-#define FIB_RES_OIF(res)		(FIB_RES_NH(res).fib_nh_oif)
-
-#define FIB_RES_PREFSRC(net, res)	((res).fi->fib_prefsrc ? : \
-					 FIB_RES_SADDR(net, res))
+#define FIB_RES_NHC(res)		((res).nhc)
+#define FIB_RES_DEV(res)	(FIB_RES_NHC(res)->nhc_dev)
+#define FIB_RES_OIF(res)	(FIB_RES_NHC(res)->nhc_oif)
 
 struct fib_entry_notifier_info {
 	struct fib_notifier_info info; /* must be first */
@@ -453,10 +446,12 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias);
 static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 {
 #ifdef CONFIG_IP_ROUTE_CLASSID
+	struct fib_nh_common *nhc = res->nhc;
+	struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	u32 rtag;
 #endif
-	*itag = FIB_RES_NH(*res).nh_tclassid<<16;
+	*itag = nh->nh_tclassid << 16;
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	rtag = res->tclassid;
 	if (*itag == 0)
-- 
cgit v1.2.3


From c0a720770c01e67374b15f348f17a52409f6545c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 Apr 2019 14:11:58 -0700
Subject: ipv6: Flip to fib_nexthop_info

Export fib_nexthop_info and fib_add_nexthop for use by IPv6 code.
Remove rt6_nexthop_info and rt6_add_nexthop in favor of the IPv4
versions. Update fib_nexthop_info for IPv6 linkdown check and
RTA_GATEWAY for AF_INET6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1f4a3b8bf584..3ce07841dc3b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -492,4 +492,9 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr);
 int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct fib_dump_filter *filter,
 			  struct netlink_callback *cb);
+
+int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh,
+		     unsigned int *flags, bool skip_oif);
+int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh,
+		    int nh_weight);
 #endif  /* _NET_FIB_H */
-- 
cgit v1.2.3


From 28b05b92886871bdd8e6a9df73e3a15845fe8ef4 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 3 Apr 2019 08:28:35 +0200
Subject: net: use correct this_cpu primitive in dev_recursion_level

syzbot reports:
BUG: using __this_cpu_read() in preemptible code:
caller is dev_recursion_level include/linux/netdevice.h:3052 [inline]
 __this_cpu_preempt_check+0x246/0x270 lib/smp_processor_id.c:47
 dev_recursion_level include/linux/netdevice.h:3052 [inline]
 ip6_skb_dst_mtu include/net/ip6_route.h:245 [inline]

I erroneously downgraded a this_cpu_read to __this_cpu_read when
moving dev_recursion_level() around.

Reported-by: syzbot+51471b4aae195285a4a3@syzkaller.appspotmail.com
Fixes: 97cdcf37b57e ("net: place xmit recursion in softnet data")
Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eb9f05e0863d..521eb869555e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3049,7 +3049,7 @@ DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 static inline int dev_recursion_level(void)
 {
-	return __this_cpu_read(softnet_data.xmit.recursion);
+	return this_cpu_read(softnet_data.xmit.recursion);
 }
 
 #define XMIT_RECURSION_LIMIT	10
-- 
cgit v1.2.3


From 407dd706fb5245c138f3a972f8aaa1c8a09a574c Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 3 Apr 2019 14:24:15 +0200
Subject: net: devlink: convert devlink_port_attrs bools to bits

In order to save space in the struct, convert bools to bits.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 31d5cec4d06b..4a1e3452a4ce 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -41,10 +41,10 @@ struct devlink {
 };
 
 struct devlink_port_attrs {
-	bool set;
+	u8 set:1,
+	   split:1;
 	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
-	bool split;
 	u32 split_subport_number;
 };
 
-- 
cgit v1.2.3


From bec5267cded268acdf679b651778c300d204e9f2 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 3 Apr 2019 14:24:16 +0200
Subject: net: devlink: extend port attrs for switch ID

Extend devlink_port_attrs_set() to pass switch ID for ports which are
part of switch and store it in port attrs. For other ports, this is
NULL.

Note that this allows the driver to group devlink ports into one or more
switches according to the actual topology.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4a1e3452a4ce..0f7968761204 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -42,10 +42,12 @@ struct devlink {
 
 struct devlink_port_attrs {
 	u8 set:1,
-	   split:1;
+	   split:1,
+	   switch_port:1;
 	enum devlink_port_flavour flavour;
 	u32 port_number; /* same value as "split group" */
 	u32 split_subport_number;
+	struct netdev_phys_item_id switch_id;
 };
 
 struct devlink_port {
@@ -582,7 +584,9 @@ void devlink_port_type_clear(struct devlink_port *devlink_port);
 void devlink_port_attrs_set(struct devlink_port *devlink_port,
 			    enum devlink_port_flavour flavour,
 			    u32 port_number, bool split,
-			    u32 split_subport_number);
+			    u32 split_subport_number,
+			    const unsigned char *switch_id,
+			    unsigned char switch_id_len);
 int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 			u32 size, u16 ingress_pools_count,
 			u16 egress_pools_count, u16 ingress_tc_count,
-- 
cgit v1.2.3


From 7e1146e8c10c00f859843817da8ecc5d902ea409 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Wed, 3 Apr 2019 14:24:17 +0200
Subject: net: devlink: introduce devlink_compat_switch_id_get() helper

Introduce devlink_compat_switch_id_get() helper which fills up switch_id
according to passed netdev pointer. Call it directly from
dev_get_port_parent_id() as a fallback when ndo_get_port_parent_id
is not defined for given netdev.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 0f7968761204..70c7d1ac8344 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -743,6 +743,8 @@ void devlink_compat_running_version(struct net_device *dev,
 int devlink_compat_flash_update(struct net_device *dev, const char *file_name);
 int devlink_compat_phys_port_name_get(struct net_device *dev,
 				      char *name, size_t len);
+int devlink_compat_switch_id_get(struct net_device *dev,
+				 struct netdev_phys_item_id *ppid);
 
 #else
 
@@ -764,6 +766,13 @@ devlink_compat_phys_port_name_get(struct net_device *dev,
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_compat_switch_id_get(struct net_device *dev,
+			     struct netdev_phys_item_id *ppid)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
-- 
cgit v1.2.3


From 5d3c537f907036c1f18bd325ffc356e24cde664c Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Sun, 24 Mar 2019 09:21:40 +0200
Subject: net/mlx5: Handle event of power detection in the PCIE slot

Handle event of power state change in the PCIE slot. When the event
occurs, check if query power state and PCI power fields is supported. If
so, read these fields from MPEIN (management PCIE info) register and
issue a corresponding message.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index f93a5598b942..db7dca75d726 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -361,6 +361,7 @@ enum {
 
 enum {
 	MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1,
+	MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT = 0x5,
 };
 
 enum {
-- 
cgit v1.2.3


From ff302db965b57c141297911ea647d36d11fedfbe Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 2 Apr 2019 10:07:45 +1100
Subject: rhashtable: allow rht_bucket_var to return NULL.

Rather than returning a pointer to a static nulls, rht_bucket_var()
now returns NULL if the bucket doesn't exist.
This will make the next patch, which stores a bitlock in the
bucket pointer, somewhat cleaner.

This change involves introducing __rht_bucket_nested() which is
like rht_bucket_nested(), but doesn't provide the static nulls,
and changing rht_bucket_nested() to call this and possibly
provide a static nulls - as is still needed for the non-var case.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 86dfa417848d..0c9175aeab8a 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -265,6 +265,8 @@ void rhashtable_destroy(struct rhashtable *ht);
 
 struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
 					    unsigned int hash);
+struct rhash_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl,
+					      unsigned int hash);
 struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
 						   struct bucket_table *tbl,
 						   unsigned int hash);
@@ -294,7 +296,7 @@ static inline struct rhash_head __rcu *const *rht_bucket(
 static inline struct rhash_head __rcu **rht_bucket_var(
 	struct bucket_table *tbl, unsigned int hash)
 {
-	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
+	return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
 				     &tbl->buckets[hash];
 }
 
@@ -890,6 +892,8 @@ static inline int __rhashtable_remove_fast_one(
 	spin_lock_bh(lock);
 
 	pprev = rht_bucket_var(tbl, hash);
+	if (!pprev)
+		goto out;
 	rht_for_each_from(he, *pprev, tbl, hash) {
 		struct rhlist_head *list;
 
@@ -934,6 +938,7 @@ static inline int __rhashtable_remove_fast_one(
 		break;
 	}
 
+out:
 	spin_unlock_bh(lock);
 
 	if (err > 0) {
@@ -1042,6 +1047,8 @@ static inline int __rhashtable_replace_fast(
 	spin_lock_bh(lock);
 
 	pprev = rht_bucket_var(tbl, hash);
+	if (!pprev)
+		goto out;
 	rht_for_each_from(he, *pprev, tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
@@ -1053,7 +1060,7 @@ static inline int __rhashtable_replace_fast(
 		err = 0;
 		break;
 	}
-
+out:
 	spin_unlock_bh(lock);
 
 	return err;
-- 
cgit v1.2.3


From 8f0db018006a421956965e1149234c4e8db718ee Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 2 Apr 2019 10:07:45 +1100
Subject: rhashtable: use bit_spin_locks to protect hash bucket.

This patch changes rhashtables to use a bit_spin_lock on BIT(1) of the
bucket pointer to lock the hash chain for that bucket.

The benefits of a bit spin_lock are:
 - no need to allocate a separate array of locks.
 - no need to have a configuration option to guide the
   choice of the size of this array
 - locking cost is often a single test-and-set in a cache line
   that will have to be loaded anyway.  When inserting at, or removing
   from, the head of the chain, the unlock is free - writing the new
   address in the bucket head implicitly clears the lock bit.
   For __rhashtable_insert_fast() we ensure this always happens
   when adding a new key.
 - even when locking costs 2 updates (lock and unlock), they are
   in a cacheline that needs to be read anyway.

The cost of using a bit spin_lock is a little bit of code complexity,
which I think is quite manageable.

Bit spin_locks are sometimes inappropriate because they are not fair -
if multiple CPUs repeatedly contend for the same lock, one CPU can
easily be starved.  This is not a credible situation with rhashtable.
Multiple CPUs may want to repeatedly add or remove objects, but they
will typically do so at different buckets, so they will attempt to
acquire different locks.

As we have more bit-locks than we previously had spinlocks (by at
least a factor of two) we can expect slightly less contention to
go with the slightly better cache behavior and reduced memory
consumption.

To enhance type checking, a new struct is introduced to represent the
  pointer plus lock-bit
that is stored in the bucket-table.  This is "struct rhash_lock_head"
and is empty.  A pointer to this needs to be cast to either an
unsigned long, or a "struct rhash_head *" to be useful.
Variables of this type are most often called "bkt".

Previously "pprev" would sometimes point to a bucket, and sometimes a
->next pointer in an rhash_head.  As these are now different types,
pprev is NULL when it would have pointed to the bucket. In that case,
'bkt' is used, together with correct locking protocol.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable-types.h |   2 -
 include/linux/rhashtable.h       | 261 +++++++++++++++++++++++++--------------
 2 files changed, 165 insertions(+), 98 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 763d613ce2c2..57467cbf4c5b 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -48,7 +48,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @head_offset: Offset of rhash_head in struct to be hashed
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
- * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
  * @automatic_shrinking: Enable automatic shrinking of tables
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
@@ -62,7 +61,6 @@ struct rhashtable_params {
 	unsigned int		max_size;
 	u16			min_size;
 	bool			automatic_shrinking;
-	u8			locks_mul;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 0c9175aeab8a..ccbbafdf5547 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -24,12 +24,27 @@
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
 #include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
 
 #include <linux/rhashtable-types.h>
 /*
+ * Objects in an rhashtable have an embedded struct rhash_head
+ * which is linked into as hash chain from the hash table - or one
+ * of two or more hash tables when the rhashtable is being resized.
  * The end of the chain is marked with a special nulls marks which has
- * the least significant bit set.
+ * the least significant bit set but otherwise stores the address of
+ * the hash bucket.  This allows us to be sure we've found the end
+ * of the right list.
+ * The value stored in the hash bucket has BIT(1) used as a lock bit.
+ * This bit must be atomically set before any changes are made to
+ * the chain.  To avoid dereferencing this pointer without clearing
+ * the bit first, we use an opaque 'struct rhash_lock_head *' for the
+ * pointer stored in the bucket.  This struct needs to be defined so
+ * that rcu_dereference() works on it, but it has no content so a
+ * cast is needed for it to be useful.  This ensures it isn't
+ * used by mistake without clearing the lock bit first.
  */
+struct rhash_lock_head {};
 
 /* Maximum chain length before rehash
  *
@@ -52,8 +67,6 @@
  * @nest: Number of bits of first-level nested table.
  * @rehash: Current bucket being rehashed
  * @hash_rnd: Random seed to fold into hash
- * @locks_mask: Mask to apply before accessing locks[]
- * @locks: Array of spinlocks protecting individual buckets
  * @walkers: List of active walkers
  * @rcu: RCU structure for freeing the table
  * @future_tbl: Table under construction during rehashing
@@ -64,16 +77,70 @@ struct bucket_table {
 	unsigned int		size;
 	unsigned int		nest;
 	u32			hash_rnd;
-	unsigned int		locks_mask;
-	spinlock_t		*locks;
 	struct list_head	walkers;
 	struct rcu_head		rcu;
 
 	struct bucket_table __rcu *future_tbl;
 
-	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
+	struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
+/*
+ * We lock a bucket by setting BIT(1) in the pointer - this is always
+ * zero in real pointers and in the nulls marker.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer.  In that case
+ * we cannot get a lock.  For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there.  In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ */
+
+static inline void rht_lock(struct rhash_lock_head **bkt)
+{
+	local_bh_disable();
+	bit_spin_lock(1, (unsigned long *)bkt);
+}
+
+static inline void rht_unlock(struct rhash_lock_head **bkt)
+{
+	bit_spin_unlock(1, (unsigned long *)bkt);
+	local_bh_enable();
+}
+
+static inline void rht_assign_unlock(struct rhash_lock_head **bkt,
+				     struct rhash_head *obj)
+{
+	struct rhash_head **p = (struct rhash_head **)bkt;
+
+	rcu_assign_pointer(*p, obj);
+	preempt_enable();
+	__release(bitlock);
+	local_bh_enable();
+}
+
+/*
+ * If 'p' is a bucket head and might be locked:
+ *   rht_ptr() returns the address without the lock bit.
+ *   rht_ptr_locked() returns the address WITH the lock bit.
+ */
+static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p)
+{
+	return (void *)(((unsigned long)p) & ~BIT(1));
+}
+
+static inline struct rhash_lock_head __rcu *rht_ptr_locked(const
+							   struct rhash_head *p)
+{
+	return (void *)(((unsigned long)p) | BIT(1));
+}
+
 /*
  * NULLS_MARKER() expects a hash value with the low
  * bits mostly likely to be significant, and it discards
@@ -206,25 +273,6 @@ static inline bool rht_grow_above_max(const struct rhashtable *ht,
 	return atomic_read(&ht->nelems) >= ht->max_elems;
 }
 
-/* The bucket lock is selected based on the hash and protects mutations
- * on a group of hash buckets.
- *
- * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
- * a single lock always covers both buckets which may both contains
- * entries which link to the same bucket of the old table during resizing.
- * This allows to simplify the locking as locking the bucket in both
- * tables during resize always guarantee protection.
- *
- * IMPORTANT: When holding the bucket lock of both the old and new table
- * during expansions and shrinking, the old bucket lock must always be
- * acquired first.
- */
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
-					  unsigned int hash)
-{
-	return &tbl->locks[hash & tbl->locks_mask];
-}
-
 #ifdef CONFIG_PROVE_LOCKING
 int lockdep_rht_mutex_is_held(struct rhashtable *ht);
 int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
@@ -263,13 +311,13 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void *arg);
 void rhashtable_destroy(struct rhashtable *ht);
 
-struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
-					    unsigned int hash);
-struct rhash_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl,
-					      unsigned int hash);
-struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
-						   struct bucket_table *tbl,
+struct rhash_lock_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+						 unsigned int hash);
+struct rhash_lock_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl,
 						   unsigned int hash);
+struct rhash_lock_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
+							struct bucket_table *tbl,
+							unsigned int hash);
 
 #define rht_dereference(p, ht) \
 	rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
@@ -286,21 +334,21 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
 #define rht_entry(tpos, pos, member) \
 	({ tpos = container_of(pos, typeof(*tpos), member); 1; })
 
-static inline struct rhash_head __rcu *const *rht_bucket(
+static inline struct rhash_lock_head __rcu *const *rht_bucket(
 	const struct bucket_table *tbl, unsigned int hash)
 {
 	return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) :
 				     &tbl->buckets[hash];
 }
 
-static inline struct rhash_head __rcu **rht_bucket_var(
+static inline struct rhash_lock_head __rcu **rht_bucket_var(
 	struct bucket_table *tbl, unsigned int hash)
 {
 	return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) :
 				     &tbl->buckets[hash];
 }
 
-static inline struct rhash_head __rcu **rht_bucket_insert(
+static inline struct rhash_lock_head __rcu **rht_bucket_insert(
 	struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash)
 {
 	return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) :
@@ -326,7 +374,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_from(pos, *rht_bucket(tbl, hash), tbl, hash)
+	rht_for_each_from(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash)
 
 /**
  * rht_for_each_entry_from - iterate over hash chain from given head
@@ -351,7 +399,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_from(tpos, pos, *rht_bucket(tbl, hash),	\
+	rht_for_each_entry_from(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \
 				    tbl, hash, member)
 
 /**
@@ -367,7 +415,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * remove the loop cursor from the list.
  */
 #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	      \
-	for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \
+	for (pos = rht_dereference_bucket(rht_ptr(*rht_bucket(tbl, hash)),    \
+					  tbl, hash),			      \
 	     next = !rht_is_a_nulls(pos) ?				      \
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	      \
@@ -402,8 +451,12 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_rcu(pos, tbl, hash)				\
-	rht_for_each_rcu_from(pos, *rht_bucket(tbl, hash), tbl, hash)
+#define rht_for_each_rcu(pos, tbl, hash)			\
+	for (({barrier(); }),						\
+	     pos = rht_ptr(rht_dereference_bucket_rcu(			\
+				   *rht_bucket(tbl, hash), tbl, hash));	\
+	     !rht_is_a_nulls(pos);					\
+	     pos = rcu_dereference_raw(pos->next))
 
 /**
  * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head
@@ -437,7 +490,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
-	rht_for_each_entry_rcu_from(tpos, pos, *rht_bucket(tbl, hash), \
+	rht_for_each_entry_rcu_from(tpos, pos,				   \
+					rht_ptr(*rht_bucket(tbl, hash)),   \
 					tbl, hash, member)
 
 /**
@@ -483,7 +537,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 		.ht = ht,
 		.key = key,
 	};
-	struct rhash_head __rcu * const *head;
+	struct rhash_lock_head __rcu * const *bkt;
 	struct bucket_table *tbl;
 	struct rhash_head *he;
 	unsigned int hash;
@@ -491,9 +545,10 @@ static inline struct rhash_head *__rhashtable_lookup(
 	tbl = rht_dereference_rcu(ht->tbl, ht);
 restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
-	head = rht_bucket(tbl, hash);
+	bkt = rht_bucket(tbl, hash);
 	do {
-		rht_for_each_rcu_from(he, *head, tbl, hash) {
+		he = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash));
+		rht_for_each_rcu_from(he, he, tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
@@ -503,7 +558,7 @@ restart:
 		/* An object might have been moved to a different hash chain,
 		 * while we walk along it - better check and retry.
 		 */
-	} while (he != RHT_NULLS_MARKER(head));
+	} while (he != RHT_NULLS_MARKER(bkt));
 
 	/* Ensure we see any new tables. */
 	smp_rmb();
@@ -599,10 +654,10 @@ static inline void *__rhashtable_insert_fast(
 		.ht = ht,
 		.key = key,
 	};
+	struct rhash_lock_head __rcu **bkt;
 	struct rhash_head __rcu **pprev;
 	struct bucket_table *tbl;
 	struct rhash_head *head;
-	spinlock_t *lock;
 	unsigned int hash;
 	int elasticity;
 	void *data;
@@ -611,23 +666,22 @@ static inline void *__rhashtable_insert_fast(
 
 	tbl = rht_dereference_rcu(ht->tbl, ht);
 	hash = rht_head_hashfn(ht, tbl, obj, params);
-	lock = rht_bucket_lock(tbl, hash);
-	spin_lock_bh(lock);
+	elasticity = RHT_ELASTICITY;
+	bkt = rht_bucket_insert(ht, tbl, hash);
+	data = ERR_PTR(-ENOMEM);
+	if (!bkt)
+		goto out;
+	pprev = NULL;
+	rht_lock(bkt);
 
 	if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
 slow_path:
-		spin_unlock_bh(lock);
+		rht_unlock(bkt);
 		rcu_read_unlock();
 		return rhashtable_insert_slow(ht, key, obj);
 	}
 
-	elasticity = RHT_ELASTICITY;
-	pprev = rht_bucket_insert(ht, tbl, hash);
-	data = ERR_PTR(-ENOMEM);
-	if (!pprev)
-		goto out;
-
-	rht_for_each_from(head, *pprev, tbl, hash) {
+	rht_for_each_from(head, rht_ptr(*bkt), tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -643,7 +697,7 @@ slow_path:
 		data = rht_obj(ht, head);
 
 		if (!rhlist)
-			goto out;
+			goto out_unlock;
 
 
 		list = container_of(obj, struct rhlist_head, rhead);
@@ -652,9 +706,13 @@ slow_path:
 		RCU_INIT_POINTER(list->next, plist);
 		head = rht_dereference_bucket(head->next, tbl, hash);
 		RCU_INIT_POINTER(list->rhead.next, head);
-		rcu_assign_pointer(*pprev, obj);
-
-		goto good;
+		if (pprev) {
+			rcu_assign_pointer(*pprev, obj);
+			rht_unlock(bkt);
+		} else
+			rht_assign_unlock(bkt, obj);
+		data = NULL;
+		goto out;
 	}
 
 	if (elasticity <= 0)
@@ -662,12 +720,13 @@ slow_path:
 
 	data = ERR_PTR(-E2BIG);
 	if (unlikely(rht_grow_above_max(ht, tbl)))
-		goto out;
+		goto out_unlock;
 
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		goto slow_path;
 
-	head = rht_dereference_bucket(*pprev, tbl, hash);
+	/* Inserting at head of list makes unlocking free. */
+	head = rht_ptr(rht_dereference_bucket(*bkt, tbl, hash));
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -677,20 +736,21 @@ slow_path:
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(*pprev, obj);
-
 	atomic_inc(&ht->nelems);
+	rht_assign_unlock(bkt, obj);
+
 	if (rht_grow_above_75(ht, tbl))
 		schedule_work(&ht->run_work);
 
-good:
 	data = NULL;
-
 out:
-	spin_unlock_bh(lock);
 	rcu_read_unlock();
 
 	return data;
+
+out_unlock:
+	rht_unlock(bkt);
+	goto out;
 }
 
 /**
@@ -699,9 +759,9 @@ out:
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
  * on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
  *
  * It is safe to call this function from atomic context.
  *
@@ -728,9 +788,9 @@ static inline int rhashtable_insert_fast(
  * @list:	pointer to hash list head inside object
  * @params:	hash table parameters
  *
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
  * on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
  *
  * It is safe to call this function from atomic context.
  *
@@ -751,9 +811,9 @@ static inline int rhltable_insert_key(
  * @list:	pointer to hash list head inside object
  * @params:	hash table parameters
  *
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
  * on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
  *
  * It is safe to call this function from atomic context.
  *
@@ -880,21 +940,20 @@ static inline int __rhashtable_remove_fast_one(
 	struct rhash_head *obj, const struct rhashtable_params params,
 	bool rhlist)
 {
+	struct rhash_lock_head __rcu **bkt;
 	struct rhash_head __rcu **pprev;
 	struct rhash_head *he;
-	spinlock_t * lock;
 	unsigned int hash;
 	int err = -ENOENT;
 
 	hash = rht_head_hashfn(ht, tbl, obj, params);
-	lock = rht_bucket_lock(tbl, hash);
+	bkt = rht_bucket_var(tbl, hash);
+	if (!bkt)
+		return -ENOENT;
+	pprev = NULL;
+	rht_lock(bkt);
 
-	spin_lock_bh(lock);
-
-	pprev = rht_bucket_var(tbl, hash);
-	if (!pprev)
-		goto out;
-	rht_for_each_from(he, *pprev, tbl, hash) {
+	rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -934,13 +993,17 @@ static inline int __rhashtable_remove_fast_one(
 			}
 		}
 
-		rcu_assign_pointer(*pprev, obj);
-		break;
+		if (pprev) {
+			rcu_assign_pointer(*pprev, obj);
+			rht_unlock(bkt);
+		} else {
+			rht_assign_unlock(bkt, obj);
+		}
+		goto unlocked;
 	}
 
-out:
-	spin_unlock_bh(lock);
-
+	rht_unlock(bkt);
+unlocked:
 	if (err > 0) {
 		atomic_dec(&ht->nelems);
 		if (unlikely(ht->p.automatic_shrinking &&
@@ -1029,9 +1092,9 @@ static inline int __rhashtable_replace_fast(
 	struct rhash_head *obj_old, struct rhash_head *obj_new,
 	const struct rhashtable_params params)
 {
+	struct rhash_lock_head __rcu **bkt;
 	struct rhash_head __rcu **pprev;
 	struct rhash_head *he;
-	spinlock_t *lock;
 	unsigned int hash;
 	int err = -ENOENT;
 
@@ -1042,27 +1105,33 @@ static inline int __rhashtable_replace_fast(
 	if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
 		return -EINVAL;
 
-	lock = rht_bucket_lock(tbl, hash);
+	bkt = rht_bucket_var(tbl, hash);
+	if (!bkt)
+		return -ENOENT;
 
-	spin_lock_bh(lock);
+	pprev = NULL;
+	rht_lock(bkt);
 
-	pprev = rht_bucket_var(tbl, hash);
-	if (!pprev)
-		goto out;
-	rht_for_each_from(he, *pprev, tbl, hash) {
+	rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
 		}
 
 		rcu_assign_pointer(obj_new->next, obj_old->next);
-		rcu_assign_pointer(*pprev, obj_new);
+		if (pprev) {
+			rcu_assign_pointer(*pprev, obj_new);
+			rht_unlock(bkt);
+		} else {
+			rht_assign_unlock(bkt, obj_new);
+		}
 		err = 0;
-		break;
+		goto unlocked;
 	}
-out:
-	spin_unlock_bh(lock);
 
+	rht_unlock(bkt);
+
+unlocked:
 	return err;
 }
 
-- 
cgit v1.2.3


From 149212f07856b25a9d342bfd6d736519b2ef66dc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 2 Apr 2019 10:07:45 +1100
Subject: rhashtable: add lockdep tracking to bucket bit-spin-locks.

Native bit_spin_locks are not tracked by lockdep.

The bit_spin_locks used for rhashtable buckets are local
to the rhashtable implementation, so there is little opportunity
for the sort of misuse that lockdep might detect.
However locks are held while a hash function or compare
function is called, and if one of these took a lock,
a misbehaviour is possible.

As it is quite easy to add lockdep support this unlikely
possibility seems to be enough justification.

So create a lockdep class for bucket bit_spin_lock and attach
through a lockdep_map in each bucket_table.

Without the 'nested' annotation in rhashtable_rehash_one(), lockdep
correctly reports a possible problem as this lock is taken
while another bucket lock (in another table) is held.  This
confirms that the added support works.
With the correct nested annotation in place, lockdep reports
no problems.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 51 ++++++++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index ccbbafdf5547..460c0eaf6b96 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -82,6 +82,8 @@ struct bucket_table {
 
 	struct bucket_table __rcu *future_tbl;
 
+	struct lockdep_map	dep_map;
+
 	struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
@@ -102,23 +104,38 @@ struct bucket_table {
  * this is safe.
  */
 
-static inline void rht_lock(struct rhash_lock_head **bkt)
+static inline void rht_lock(struct bucket_table *tbl,
+			    struct rhash_lock_head **bkt)
 {
 	local_bh_disable();
 	bit_spin_lock(1, (unsigned long *)bkt);
+	lock_map_acquire(&tbl->dep_map);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+				   struct rhash_lock_head **bucket,
+				   unsigned int subclass)
+{
+	local_bh_disable();
+	bit_spin_lock(1, (unsigned long *)bucket);
+	lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
 }
 
-static inline void rht_unlock(struct rhash_lock_head **bkt)
+static inline void rht_unlock(struct bucket_table *tbl,
+			      struct rhash_lock_head **bkt)
 {
+	lock_map_release(&tbl->dep_map);
 	bit_spin_unlock(1, (unsigned long *)bkt);
 	local_bh_enable();
 }
 
-static inline void rht_assign_unlock(struct rhash_lock_head **bkt,
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+				     struct rhash_lock_head **bkt,
 				     struct rhash_head *obj)
 {
 	struct rhash_head **p = (struct rhash_head **)bkt;
 
+	lock_map_release(&tbl->dep_map);
 	rcu_assign_pointer(*p, obj);
 	preempt_enable();
 	__release(bitlock);
@@ -672,11 +689,11 @@ static inline void *__rhashtable_insert_fast(
 	if (!bkt)
 		goto out;
 	pprev = NULL;
-	rht_lock(bkt);
+	rht_lock(tbl, bkt);
 
 	if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
 slow_path:
-		rht_unlock(bkt);
+		rht_unlock(tbl, bkt);
 		rcu_read_unlock();
 		return rhashtable_insert_slow(ht, key, obj);
 	}
@@ -708,9 +725,9 @@ slow_path:
 		RCU_INIT_POINTER(list->rhead.next, head);
 		if (pprev) {
 			rcu_assign_pointer(*pprev, obj);
-			rht_unlock(bkt);
+			rht_unlock(tbl, bkt);
 		} else
-			rht_assign_unlock(bkt, obj);
+			rht_assign_unlock(tbl, bkt, obj);
 		data = NULL;
 		goto out;
 	}
@@ -737,7 +754,7 @@ slow_path:
 	}
 
 	atomic_inc(&ht->nelems);
-	rht_assign_unlock(bkt, obj);
+	rht_assign_unlock(tbl, bkt, obj);
 
 	if (rht_grow_above_75(ht, tbl))
 		schedule_work(&ht->run_work);
@@ -749,7 +766,7 @@ out:
 	return data;
 
 out_unlock:
-	rht_unlock(bkt);
+	rht_unlock(tbl, bkt);
 	goto out;
 }
 
@@ -951,7 +968,7 @@ static inline int __rhashtable_remove_fast_one(
 	if (!bkt)
 		return -ENOENT;
 	pprev = NULL;
-	rht_lock(bkt);
+	rht_lock(tbl, bkt);
 
 	rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) {
 		struct rhlist_head *list;
@@ -995,14 +1012,14 @@ static inline int __rhashtable_remove_fast_one(
 
 		if (pprev) {
 			rcu_assign_pointer(*pprev, obj);
-			rht_unlock(bkt);
+			rht_unlock(tbl, bkt);
 		} else {
-			rht_assign_unlock(bkt, obj);
+			rht_assign_unlock(tbl, bkt, obj);
 		}
 		goto unlocked;
 	}
 
-	rht_unlock(bkt);
+	rht_unlock(tbl, bkt);
 unlocked:
 	if (err > 0) {
 		atomic_dec(&ht->nelems);
@@ -1110,7 +1127,7 @@ static inline int __rhashtable_replace_fast(
 		return -ENOENT;
 
 	pprev = NULL;
-	rht_lock(bkt);
+	rht_lock(tbl, bkt);
 
 	rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) {
 		if (he != obj_old) {
@@ -1121,15 +1138,15 @@ static inline int __rhashtable_replace_fast(
 		rcu_assign_pointer(obj_new->next, obj_old->next);
 		if (pprev) {
 			rcu_assign_pointer(*pprev, obj_new);
-			rht_unlock(bkt);
+			rht_unlock(tbl, bkt);
 		} else {
-			rht_assign_unlock(bkt, obj_new);
+			rht_assign_unlock(tbl, bkt, obj_new);
 		}
 		err = 0;
 		goto unlocked;
 	}
 
-	rht_unlock(bkt);
+	rht_unlock(tbl, bkt);
 
 unlocked:
 	return err;
-- 
cgit v1.2.3


From b262a69582a4676c7378a73077b7bb186c7c5b2a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:22 +0100
Subject: xfrm: place af number into xfrm_mode struct

This will be useful to know if we're supposed to decode ipv4 or ipv6.

While at it, make the unregister function return void.  All module_exit
functions just did BUG(); there is never a point in doing error checks
if there is no way to handle such an error.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 85386becbaea..9a155063c25f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -482,8 +482,9 @@ struct xfrm_mode {
 
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
-	unsigned int encap;
-	int flags;
+	u8 encap;
+	u8 family;
+	u8 flags;
 };
 
 /* Flags for xfrm_mode. */
@@ -491,8 +492,8 @@ enum {
 	XFRM_MODE_FLAG_TUNNEL = 1,
 };
 
-int xfrm_register_mode(struct xfrm_mode *mode, int family);
-int xfrm_unregister_mode(struct xfrm_mode *mode, int family);
+int xfrm_register_mode(struct xfrm_mode *mode);
+void xfrm_unregister_mode(struct xfrm_mode *mode);
 
 static inline int xfrm_af2proto(unsigned int family)
 {
-- 
cgit v1.2.3


From c2d305e51038167dd9de8d476c72f667d84cad8b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:24 +0100
Subject: xfrm: remove input indirection from xfrm_mode

No need for any indirection or abstraction here, both functions
are pretty much the same and quite small, they also have no external
dependencies.

xfrm_prepare_input can then be made static.

With allmodconfig build, size increase of vmlinux is 25 bytes:

Before:
   text   data     bss     dec      filename
15730207  6936924 4046908 26714039  vmlinux

After:
15730208  6936948 4046908 26714064 vmlinux

v2: Fix INET_XFRM_MODE_TRANSPORT name in is-enabled test (Sabrina Dubroca)
    change copied comment to refer to transport and network header,
    not skb->{h,nh}, which don't exist anymore. (Sabrina)
    make xfrm_prepare_input static (Eyal Birger)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9a155063c25f..2c5fc9cc367d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -436,16 +436,6 @@ struct xfrm_mode {
 	 */
 	int (*input2)(struct xfrm_state *x, struct sk_buff *skb);
 
-	/*
-	 * This is the actual input entry point.
-	 *
-	 * For transport mode and equivalent this would be identical to
-	 * input2 (which does not need to be set).  While tunnel mode
-	 * and equivalent would set this to the tunnel encapsulation function
-	 * xfrm4_prepare_input that would in turn call input2.
-	 */
-	int (*input)(struct xfrm_state *x, struct sk_buff *skb);
-
 	/*
 	 * Add encapsulation header.
 	 *
@@ -1606,7 +1596,6 @@ int xfrm_init_replay(struct xfrm_state *x);
 int xfrm_state_mtu(struct xfrm_state *x, int mtu);
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload);
 int xfrm_init_state(struct xfrm_state *x);
-int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
 int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
 int xfrm_trans_queue(struct sk_buff *skb,
-- 
cgit v1.2.3


From 0c620e97b3490890facbbe06d5deed9b024de255 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:25 +0100
Subject: xfrm: remove output indirection from xfrm_mode

Same as input indirection.  Only exception: we need to export
xfrm_outer_mode_output for pktgen.

Increases size of vmlinux by about 163 bytes:
Before:
   text    data     bss     dec      filename
15730208  6936948 4046908 26714064   vmlinux

After:
15730311  6937008 4046908 26714227   vmlinux

xfrm_inner_extract_output has no more external callers, make it static.

v2: add IS_ENABLED(IPV6) guard in xfrm6_prepare_output
    add two missing breaks in xfrm_outer_mode_output (Sabrina Dubroca)
    add WARN_ON_ONCE for 'call AF_INET6 related output function, but
    CONFIG_IPV6=n' case.
    make xfrm_inner_extract_output static

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 2c5fc9cc367d..01e7e9c0e8a9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -449,17 +449,6 @@ struct xfrm_mode {
 	 */
 	int (*output2)(struct xfrm_state *x,struct sk_buff *skb);
 
-	/*
-	 * This is the actual output entry point.
-	 *
-	 * For transport mode and equivalent this would be identical to
-	 * output2 (which does not need to be set).  While tunnel mode
-	 * and equivalent would set this to a tunnel encapsulation function
-	 * (xfrm4_prepare_output or xfrm6_prepare_output) that would in turn
-	 * call output2.
-	 */
-	int (*output)(struct xfrm_state *x, struct sk_buff *skb);
-
 	/*
 	 * Adjust pointers into the packet and do GSO segmentation.
 	 */
@@ -1603,7 +1592,11 @@ int xfrm_trans_queue(struct sk_buff *skb,
 				   struct sk_buff *));
 int xfrm_output_resume(struct sk_buff *skb, int err);
 int xfrm_output(struct sock *sk, struct sk_buff *skb);
-int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb);
+
+#if IS_ENABLED(CONFIG_NET_PKTGEN)
+int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb);
+#endif
+
 void xfrm_local_error(struct sk_buff *skb, int mtu);
 int xfrm4_extract_header(struct sk_buff *skb);
 int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb);
@@ -1622,7 +1615,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 }
 
 int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb);
-int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb);
 int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
@@ -1649,7 +1641,6 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
 __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr);
 __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr);
 int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb);
-int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb);
 int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
-- 
cgit v1.2.3


From 303c5fab1272888b22088fbdd08cb770205ccb7a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:26 +0100
Subject: xfrm: remove xmit indirection from xfrm_mode

There are only two versions (tunnel and transport). The ip/ipv6 versions
only differ in sizeof(iphdr) vs ipv6hdr.

Place this in the core and use x->outer_mode->encap type to call the
correct adjustment helper.

Before:
   text   data    bss     dec      filename
15730311  6937008 4046908 26714227 vmlinux

After:
15730428  6937008 4046908 26714344 vmlinux

(about 117 byte increase)

v2: use family from x->outer_mode, not inner

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 01e7e9c0e8a9..07966a27e4a4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -454,11 +454,6 @@ struct xfrm_mode {
 	 */
 	struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features);
 
-	/*
-	 * Adjust pointers into the packet when IPsec is done at layer2.
-	 */
-	void (*xmit)(struct xfrm_state *x, struct sk_buff *skb);
-
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
-- 
cgit v1.2.3


From 7613b92b1ae37141704948b77e8762c5de896510 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:27 +0100
Subject: xfrm: remove gso_segment indirection from xfrm_mode

These functions are small and we only have versions for tunnel
and transport mode for ipv4 and ipv6 respectively.

Just place the 'transport or tunnel' conditional in the protocol
specific function instead of using an indirection.

Before:
    3226       12       0     3238   net/ipv4/esp4_offload.o
    7004      492       0     7496   net/ipv4/ip_vti.o
    3339       12       0     3351   net/ipv6/esp6_offload.o
   11294      460       0    11754   net/ipv6/ip6_vti.o
    1180       72       0     1252   net/ipv4/xfrm4_mode_beet.o
     428       48       0      476   net/ipv4/xfrm4_mode_transport.o
    1271       48       0     1319   net/ipv4/xfrm4_mode_tunnel.o
    1083       60       0     1143   net/ipv6/xfrm6_mode_beet.o
     172       48       0      220   net/ipv6/xfrm6_mode_ro.o
     429       48       0      477   net/ipv6/xfrm6_mode_transport.o
    1164       48       0     1212   net/ipv6/xfrm6_mode_tunnel.o
15730428  6937008 4046908 26714344   vmlinux

After:
    3461       12       0     3473   net/ipv4/esp4_offload.o
    7000      492       0     7492   net/ipv4/ip_vti.o
    3574       12       0     3586   net/ipv6/esp6_offload.o
   11295      460       0    11755   net/ipv6/ip6_vti.o
    1180       64       0     1244   net/ipv4/xfrm4_mode_beet.o
     171       40       0      211   net/ipv4/xfrm4_mode_transport.o
    1163       40       0     1203   net/ipv4/xfrm4_mode_tunnel.o
    1083       52       0     1135   net/ipv6/xfrm6_mode_beet.o
     172       40       0      212   net/ipv6/xfrm6_mode_ro.o
     172       40       0      212   net/ipv6/xfrm6_mode_transport.o
    1056       40       0     1096   net/ipv6/xfrm6_mode_tunnel.o
15730424  6937008 4046908 26714340   vmlinux

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 07966a27e4a4..de103a6d1ef8 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -449,11 +449,6 @@ struct xfrm_mode {
 	 */
 	int (*output2)(struct xfrm_state *x,struct sk_buff *skb);
 
-	/*
-	 * Adjust pointers into the packet and do GSO segmentation.
-	 */
-	struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features);
-
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
-- 
cgit v1.2.3


From b3284df1c86f7ac078dcb8fb250fe3d6437e740c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:28 +0100
Subject: xfrm: remove input2 indirection from xfrm_mode

No external dependencies on any module, place this in the core.
Increase is about 1800 bytes for xfrm_input.o.

The beet helpers get added to internal header, as they can be reused
from xfrm_output.c in the next patch (kernel contains several
copies of them in the xfrm{4,6}_mode_beet.c files).

Before:
   text    data     bss     dec filename
   5578     176    2364    8118 net/xfrm/xfrm_input.o
   1180      64       0    1244 net/ipv4/xfrm4_mode_beet.o
    171      40       0     211 net/ipv4/xfrm4_mode_transport.o
   1163      40       0    1203 net/ipv4/xfrm4_mode_tunnel.o
   1083      52       0    1135 net/ipv6/xfrm6_mode_beet.o
    172      40       0     212 net/ipv6/xfrm6_mode_ro.o
    172      40       0     212 net/ipv6/xfrm6_mode_transport.o
   1056      40       0    1096 net/ipv6/xfrm6_mode_tunnel.o

After:
   text    data     bss     dec filename
   7373     200    2364    9937 net/xfrm/xfrm_input.o
    587      44       0     631 net/ipv4/xfrm4_mode_beet.o
    171      32       0     203 net/ipv4/xfrm4_mode_transport.o
    649      32       0     681 net/ipv4/xfrm4_mode_tunnel.o
    625      44       0     669 net/ipv6/xfrm6_mode_beet.o
    172      32       0     204 net/ipv6/xfrm6_mode_ro.o
    172      32       0     204 net/ipv6/xfrm6_mode_transport.o
    599      32       0     631 net/ipv6/xfrm6_mode_tunnel.o

v2: pass inner_mode to xfrm_inner_mode_encap_remove to fix
    AF_UNSPEC selector breakage (bisected by Benedict Wong)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index de103a6d1ef8..bdda545cf740 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	/*
-	 * Remove encapsulation header.
-	 *
-	 * The IP header will be moved over the top of the encapsulation
-	 * header.
-	 *
-	 * On entry, the transport header shall point to where the IP header
-	 * should be and the network header shall be set to where the IP
-	 * header currently is.  skb->data shall point to the start of the
-	 * payload.
-	 */
-	int (*input2)(struct xfrm_state *x, struct sk_buff *skb);
-
 	/*
 	 * Add encapsulation header.
 	 *
-- 
cgit v1.2.3


From 1de70830066b72b6a8e259e5363f6c0bc4ba7bbc Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:29 +0100
Subject: xfrm: remove output2 indirection from xfrm_mode

similar to previous patch: no external module dependencies,
so we can avoid the indirection by placing this in the core.

This change removes the last indirection from xfrm_mode and the
xfrm4|6_mode_{beet,tunnel}.c modules contain (almost) no code anymore.

Before:
   text    data     bss     dec     hex filename
   3957     136       0    4093     ffd net/xfrm/xfrm_output.o
    587      44       0     631     277 net/ipv4/xfrm4_mode_beet.o
    649      32       0     681     2a9 net/ipv4/xfrm4_mode_tunnel.o
    625      44       0     669     29d net/ipv6/xfrm6_mode_beet.o
    599      32       0     631     277 net/ipv6/xfrm6_mode_tunnel.o
After:
   text    data     bss     dec     hex filename
   5359     184       0    5543    15a7 net/xfrm/xfrm_output.o
    171      24       0     195      c3 net/ipv4/xfrm4_mode_beet.o
    171      24       0     195      c3 net/ipv4/xfrm4_mode_tunnel.o
    172      24       0     196      c4 net/ipv6/xfrm6_mode_beet.o
    172      24       0     196      c4 net/ipv6/xfrm6_mode_tunnel.o

v2: fold the *encap_add functions into xfrm*_prepare_output
    preserve (move) output2 comment (Sabrina)
    use x->outer_mode->encap, not inner
    fix a build breakage on ppc (kbuild robot)

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index bdda545cf740..4351444c10fc 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	/*
-	 * Add encapsulation header.
-	 *
-	 * On exit, the transport header will be set to the start of the
-	 * encapsulation header to be filled in by x->type->output and
-	 * the mac header will be set to the nextheader (protocol for
-	 * IPv4) field of the extension header directly preceding the
-	 * encapsulation header, or in its absence, that of the top IP
-	 * header.  The value of the network header will always point
-	 * to the top IP header while skb->data will point to the payload.
-	 */
-	int (*output2)(struct xfrm_state *x,struct sk_buff *skb);
-
 	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
-- 
cgit v1.2.3


From 733a5fac2f15b55b9059230d098ed04341d2d884 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:30 +0100
Subject: xfrm: remove afinfo pointer from xfrm_mode

Adds an EXPORT_SYMBOL for afinfo_get_rcu, as it will now be called from
ipv6 in case of CONFIG_IPV6=m.

This change has virtually no effect on vmlinux size, but it reduces
afinfo size and allows followup patch to make xfrm modes const.

v2: mark if (afinfo) tests as likely (Sabrina)
    re-fetch afinfo according to inner_mode in xfrm_prepare_input().

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4351444c10fc..8d1c9506bcf6 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -423,7 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	struct xfrm_state_afinfo *afinfo;
 	struct module *owner;
 	u8 encap;
 	u8 family;
-- 
cgit v1.2.3


From 4c145dce26013763490df88f2473714f5bc7857d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:31 +0100
Subject: xfrm: make xfrm modes builtin

After the previous changes, xfrm_mode contains no function pointers anymore
and all modules defining such a struct contain no code except init/exit
functions to register the xfrm_mode struct with the xfrm core.

Just place the xfrm modes in the core and remove the modules;
the run-time xfrm_mode register/unregister functionality is removed.

Before:

    text    data     bss      dec filename
    7523     200    2364    10087 net/xfrm/xfrm_input.o
   40003     628     440    41071 net/xfrm/xfrm_state.o
15730338 6937080 4046908 26714326 vmlinux

After:

    7389     200    2364    9953  net/xfrm/xfrm_input.o
   40574     656     440   41670  net/xfrm/xfrm_state.o
15730084 6937068 4046908 26714060 vmlinux

The xfrm*_mode_{transport,tunnel,beet} modules are gone.

v2: replace CONFIG_INET6_XFRM_MODE_* IS_ENABLED guards with CONFIG_IPV6
    ones rather than removing them.

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 8d1c9506bcf6..4ca79cdc3460 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -234,9 +234,9 @@ struct xfrm_state {
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	const struct xfrm_type	*type;
-	struct xfrm_mode	*inner_mode;
-	struct xfrm_mode	*inner_mode_iaf;
-	struct xfrm_mode	*outer_mode;
+	const struct xfrm_mode	*inner_mode;
+	const struct xfrm_mode	*inner_mode_iaf;
+	const struct xfrm_mode	*outer_mode;
 
 	const struct xfrm_type_offload	*type_offload;
 
@@ -347,7 +347,6 @@ struct xfrm_state_afinfo {
 	struct module			*owner;
 	const struct xfrm_type		*type_map[IPPROTO_MAX];
 	const struct xfrm_type_offload	*type_offload_map[IPPROTO_MAX];
-	struct xfrm_mode		*mode_map[XFRM_MODE_MAX];
 
 	int			(*init_flags)(struct xfrm_state *x);
 	void			(*init_tempsel)(struct xfrm_selector *sel,
@@ -423,7 +422,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
 struct xfrm_mode {
-	struct module *owner;
 	u8 encap;
 	u8 family;
 	u8 flags;
@@ -434,9 +432,6 @@ enum {
 	XFRM_MODE_FLAG_TUNNEL = 1,
 };
 
-int xfrm_register_mode(struct xfrm_mode *mode);
-void xfrm_unregister_mode(struct xfrm_mode *mode);
-
 static inline int xfrm_af2proto(unsigned int family)
 {
 	switch(family) {
@@ -449,7 +444,7 @@ static inline int xfrm_af2proto(unsigned int family)
 	}
 }
 
-static inline struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
+static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
 {
 	if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
 	    (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
-- 
cgit v1.2.3


From c9500d7b7de8ff6ac88ee3e38b782889f1616593 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Mar 2019 21:16:32 +0100
Subject: xfrm: store xfrm_mode directly, not its address

This structure is now only 4 bytes, so it's more efficient
to cache a copy rather than its address.

No significant size difference in allmodconfig vmlinux.

With non-modular kernel that has all XFRM options enabled, this
series reduces vmlinux image size by ~11kb. All xfrm_mode
indirections are gone and all modes are built-in.

before (ipsec-next master):
    text      data      bss         dec   filename
21071494   7233140 11104324    39408958   vmlinux.master

after this series:
21066448   7226772 11104324    39397544   vmlinux.patched

With an allmodconfig kernel, the size increase is only 362 bytes,
even though all the xfrm config options removed in this series are
modular.

before:
    text      data     bss      dec   filename
15731286   6936912 4046908 26715106   vmlinux.master

after this series:
15731492   6937068  4046908  26715468 vmlinux

Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 4ca79cdc3460..77eb578a0384 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -132,6 +132,17 @@ struct xfrm_state_offload {
 	u8			flags;
 };
 
+struct xfrm_mode {
+	u8 encap;
+	u8 family;
+	u8 flags;
+};
+
+/* Flags for xfrm_mode. */
+enum {
+	XFRM_MODE_FLAG_TUNNEL = 1,
+};
+
 /* Full description of state of transformer. */
 struct xfrm_state {
 	possible_net_t		xs_net;
@@ -234,9 +245,9 @@ struct xfrm_state {
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	const struct xfrm_type	*type;
-	const struct xfrm_mode	*inner_mode;
-	const struct xfrm_mode	*inner_mode_iaf;
-	const struct xfrm_mode	*outer_mode;
+	struct xfrm_mode	inner_mode;
+	struct xfrm_mode	inner_mode_iaf;
+	struct xfrm_mode	outer_mode;
 
 	const struct xfrm_type_offload	*type_offload;
 
@@ -421,17 +432,6 @@ struct xfrm_type_offload {
 int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
 
-struct xfrm_mode {
-	u8 encap;
-	u8 family;
-	u8 flags;
-};
-
-/* Flags for xfrm_mode. */
-enum {
-	XFRM_MODE_FLAG_TUNNEL = 1,
-};
-
 static inline int xfrm_af2proto(unsigned int family)
 {
 	switch(family) {
@@ -448,9 +448,9 @@ static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, i
 {
 	if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
 	    (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
-		return x->inner_mode;
+		return &x->inner_mode;
 	else
-		return x->inner_mode_iaf;
+		return &x->inner_mode_iaf;
 }
 
 struct xfrm_tmpl {
@@ -1990,7 +1990,7 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x,
 			tunnel = true;
 		break;
 	}
-	if (tunnel && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL))
+	if (tunnel && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL))
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.2.3


From 1e1b11b6a1111cd9e8af1fd6ccda270a9fa3eacf Mon Sep 17 00:00:00 2001
From: vamsi krishna <vamsin@codeaurora.org>
Date: Fri, 1 Feb 2019 18:34:51 +0530
Subject: nl80211/cfg80211: Specify band specific min RSSI thresholds with
 sched scan

This commit adds the support to specify the RSSI thresholds per
band for each match set. This enhances the current behavior which
specifies a single rssi_threshold across all the bands by
introducing the rssi_threshold_per_band. These per band rssi
thresholds are referred through NL80211_BAND_* (enum nl80211_band)
variables as attribute types. Such attributes/values per each
band are nested through NL80211_ATTR_SCHED_SCAN_MIN_RSSI.
These band specific rssi thresholds shall take precedence over
the current rssi_thold per match set.
Drivers indicate this support through
%NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD.
These per band rssi attributes/values do not specify
"default RSSI filter" as done by
NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to stay backward compatible.
That said, these per band rssi values have to be specified for
the corresponding matchset.

Signed-off-by: vamsi krishna <vamsin@codeaurora.org>
Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
[rebase on refactoring, add policy]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  8 ++++++++
 include/uapi/linux/nl80211.h | 13 +++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index bb307a11ee63..b13234a486e7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1832,11 +1832,19 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
  * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match
  *	or no match (RSSI only)
  * @rssi_thold: don't report scan results below this threshold (in s32 dBm)
+ * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied
+ *	for filtering out scan results received. Drivers advertize this support
+ *	of band specific rssi based filtering through the feature capability
+ *	%NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band
+ *	specific rssi thresholds take precedence over rssi_thold, if specified.
+ *	If not specified for any band, it will be assigned with rssi_thold of
+ *	corresponding matchset.
  */
 struct cfg80211_match_set {
 	struct cfg80211_ssid ssid;
 	u8 bssid[ETH_ALEN];
 	s32 rssi_thold;
+	s32 per_band_rssi_thold[NUM_NL80211_BANDS];
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index dd4f86ee286e..4a9404958fbe 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3638,6 +3638,14 @@ enum nl80211_reg_rule_attr {
  *	value as specified by &struct nl80211_bss_select_rssi_adjust.
  * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching
  *	(this cannot be used together with SSID).
+ * @NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI: Nested attribute that carries the
+ *	band specific minimum rssi thresholds for the bands defined in
+ *	enum nl80211_band. The minimum rssi threshold value(s32) specific to a
+ *	band shall be encapsulated in attribute with type value equals to one
+ *	of the NL80211_BAND_* defined in enum nl80211_band. For example, the
+ *	minimum rssi threshold value for 2.4GHZ band shall be encapsulated
+ *	within an attribute of type NL80211_BAND_2GHZ. And one or more of such
+ *	attributes will be nested within this attribute.
  * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter
  *	attribute number currently defined
  * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use
@@ -3650,6 +3658,7 @@ enum nl80211_sched_scan_match_attr {
 	NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI,
 	NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST,
 	NL80211_SCHED_SCAN_MATCH_ATTR_BSSID,
+	NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI,
 
 	/* keep last */
 	__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST,
@@ -5343,6 +5352,9 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching
  *	(set/del PMKSA operations) in AP mode.
  *
+ * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports
+ *	filtering of sched scan results using band specific RSSI thresholds.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5384,6 +5396,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER,
 	NL80211_EXT_FEATURE_AIRTIME_FAIRNESS,
 	NL80211_EXT_FEATURE_AP_PMKSA_CACHING,
+	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3


From ab60633c7136c300f15a390f3469d7c4be15a055 Mon Sep 17 00:00:00 2001
From: Narayanraddi Masti <team.nmasti@gmail.com>
Date: Thu, 7 Feb 2019 12:16:05 -0800
Subject: mac80211: Add support for NL80211_STA_INFO_AIRTIME_LINK_METRIC

Add support for mesh airtime link metric attribute
NL80211_STA_INFO_AIRTIME_LINK_METRIC.

Signed-off-by: Narayanraddi Masti <team.nmasti@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 3 +++
 include/uapi/linux/nl80211.h | 2 ++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index b13234a486e7..5859a5e02454 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1327,6 +1327,7 @@ struct cfg80211_tid_stats {
  * @fcs_err_count: number of packets (MPDUs) received from this station with
  *	an FCS error. This counter should be incremented only when TA of the
  *	received packet with an FCS error matches the peer MAC address.
+ * @airtime_link_metric: mesh airtime link metric.
  */
 struct station_info {
 	u64 filled;
@@ -1381,6 +1382,8 @@ struct station_info {
 
 	u32 rx_mpdu_count;
 	u32 fcs_err_count;
+
+	u32 airtime_link_metric;
 };
 
 #if IS_ENABLED(CONFIG_CFG80211)
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 4a9404958fbe..07457f4aea00 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3139,6 +3139,7 @@ enum nl80211_sta_bss_param {
  * @NL80211_STA_INFO_TX_DURATION: aggregate PPDU duration for all frames
  *	sent to the station (u64, usec)
  * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16)
+ * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station
  * @__NL80211_STA_INFO_AFTER_LAST: internal
  * @NL80211_STA_INFO_MAX: highest possible station info attribute
  */
@@ -3184,6 +3185,7 @@ enum nl80211_sta_info {
 	NL80211_STA_INFO_CONNECTED_TO_GATE,
 	NL80211_STA_INFO_TX_DURATION,
 	NL80211_STA_INFO_AIRTIME_WEIGHT,
+	NL80211_STA_INFO_AIRTIME_LINK_METRIC,
 
 	/* keep last */
 	__NL80211_STA_INFO_AFTER_LAST,
-- 
cgit v1.2.3


From cb74e9775871f8c82a1297cf76209f10ab5bbe3d Mon Sep 17 00:00:00 2001
From: Sunil Dutt <usdutt@codeaurora.org>
Date: Wed, 20 Feb 2019 16:18:07 +0530
Subject: cfg80211/nl80211: Offload OWE processing to user space in AP mode

This interface allows the host driver to offload OWE processing
to user space. This intends to support OWE (Opportunistic Wireless
Encryption) AKM by the drivers that implement SME but rely on the
user space for the cryptographic/OWE processing in AP mode. Such
drivers are not capable of processing/deriving the DH IE.

A new NL80211 command - NL80211_CMD_UPDATE_OWE_INFO is introduced
to send the request/event between the host driver and user space.

Driver shall provide the OWE info (MAC address and DH IE) of
the peer to user space for cryptographic processing of the DH IE
through the event. Accordingly, the user space shall update the
OWE info/DH IE to the driver.

Following is the sequence in AP mode for OWE authentication.

Driver passes the OWE info obtained from the peer in the
Association Request to the user space through the event
cfg80211_update_owe_info_event. User space shall process the
OWE info received and generate new OWE info. This OWE info is
passed to the driver through NL80211_CMD_UPDATE_OWE_INFO
request. Driver eventually uses this OWE info to send the
Association Response to the peer.

This OWE info in the command interface carries the IEs that include
PMKID of the peer if the PMKSA is still valid or an updated DH IE
for generating a new PMKSA with the peer.

Signed-off-by: Liangwei Dong <liangwei@codeaurora.org>
Signed-off-by: Sunil Dutt <usdutt@codeaurora.org>
Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
[remove policy initialization - no longer exists]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/nl80211.h |  7 +++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5859a5e02454..70432fd638af 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3110,6 +3110,32 @@ struct cfg80211_pmsr_request {
 	struct cfg80211_pmsr_request_peer peers[];
 };
 
+/**
+ * struct cfg80211_update_owe_info - OWE Information
+ *
+ * This structure provides information needed for the drivers to offload OWE
+ * (Opportunistic Wireless Encryption) processing to the user space.
+ *
+ * Commonly used across update_owe_info request and event interfaces.
+ *
+ * @peer: MAC address of the peer device for which the OWE processing
+ *	has to be done.
+ * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info
+ *	processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space
+ *	cannot give you the real status code for failures. Used only for
+ *	OWE update request command interface (user space to driver).
+ * @ie: IEs obtained from the peer or constructed by the user space. These are
+ *	the IEs of the remote peer in the event from the host driver and
+ *	the constructed IEs by the user space in the request interface.
+ * @ie_len: Length of IEs in octets.
+ */
+struct cfg80211_update_owe_info {
+	u8 peer[ETH_ALEN] __aligned(2);
+	u16 status;
+	const u8 *ie;
+	size_t ie_len;
+};
+
 /**
  * struct cfg80211_ops - backend description for wireless configuration
  *
@@ -3447,6 +3473,10 @@ struct cfg80211_pmsr_request {
  *	Statistics should be cumulative, currently no way to reset is provided.
  * @start_pmsr: start peer measurement (e.g. FTM)
  * @abort_pmsr: abort peer measurement
+ *
+ * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
+ *	but offloading OWE processing to the user space will get the updated
+ *	DH IE through this interface.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3761,6 +3791,8 @@ struct cfg80211_ops {
 			      struct cfg80211_pmsr_request *request);
 	void	(*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev,
 			      struct cfg80211_pmsr_request *request);
+	int	(*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
+				   struct cfg80211_update_owe_info *owe_info);
 };
 
 /*
@@ -7219,4 +7251,14 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev,
 #define wiphy_WARN(wiphy, format, args...)			\
 	WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args);
 
+/**
+ * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space
+ * @netdev: network device
+ * @owe_info: peer's owe info
+ * @gfp: allocation flags
+ */
+void cfg80211_update_owe_info_event(struct net_device *netdev,
+				    struct cfg80211_update_owe_info *owe_info,
+				    gfp_t gfp);
+
 #endif /* __NET_CFG80211_H */
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 07457f4aea00..a99d75bef598 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1065,6 +1065,11 @@
  *	indicated by %NL80211_ATTR_WIPHY_FREQ and other attributes
  *	determining the width and type.
  *
+ * @NL80211_CMD_UPDATE_OWE_INFO: This interface allows the host driver to
+ *	offload OWE processing to user space. This intends to support
+ *	OWE AKM by the host drivers that implement SME but rely
+ *	on the user space for the cryptographic/DH IE processing in AP mode.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1285,6 +1290,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_NOTIFY_RADAR,
 
+	NL80211_CMD_UPDATE_OWE_INFO,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
-- 
cgit v1.2.3


From fd69c399c7d6262086b6b820757c6aeaa71feeba Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 8 Apr 2019 10:15:59 +0200
Subject: datagram: remove redundant 'peeked' argument

After commit a297569fe00a ("net/udp: do not touch skb->peeked unless
really needed") the 'peeked' argument of __skb_try_recv_datagram()
and friends is always equal to !!'flags & MSG_PEEK'.

Since such argument is really a boolean info, and the callers have
already 'flags & MSG_PEEK' handy, we can remove it and clean-up the
code a bit.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 include/net/udp.h      | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 69b5538adcea..a06275a618f0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3370,17 +3370,17 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
 					  unsigned int flags,
 					  void (*destructor)(struct sock *sk,
 							   struct sk_buff *skb),
-					  int *peeked, int *off, int *err,
+					  int *off, int *err,
 					  struct sk_buff **last);
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
 					void (*destructor)(struct sock *sk,
 							   struct sk_buff *skb),
-					int *peeked, int *off, int *err,
+					int *off, int *err,
 					struct sk_buff **last);
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
 				    void (*destructor)(struct sock *sk,
 						       struct sk_buff *skb),
-				    int *peeked, int *off, int *err);
+				    int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
 __poll_t datagram_poll(struct file *file, struct socket *sock,
diff --git a/include/net/udp.h b/include/net/udp.h
index fd6d948755c8..d8ce937bc395 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -269,13 +269,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
 struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
-			       int noblock, int *peeked, int *off, int *err);
+			       int noblock, int *off, int *err);
 static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
 					   int noblock, int *err)
 {
-	int peeked, off = 0;
+	int off = 0;
 
-	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
+	return __skb_recv_udp(sk, flags, noblock, &off, err);
 }
 
 int udp_v4_early_demux(struct sk_buff *skb);
-- 
cgit v1.2.3


From 3b15d09f7e6db44065aaba5fd16dc7420035c5ad Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Thu, 28 Feb 2019 13:13:26 +0800
Subject: time: Introduce jiffies64_to_msecs()

There is a similar helper in net/netfilter/nf_tables_api.c;
this may become a common request someday, so move it to
time.c

Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/jiffies.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index fa928242567d..1b6d31da7cbc 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -297,6 +297,7 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 }
 
 extern u64 jiffies64_to_nsecs(u64 j);
+extern u64 jiffies64_to_msecs(u64 j);
 
 extern unsigned long __msecs_to_jiffies(const unsigned int m);
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
-- 
cgit v1.2.3


From 84c0d5e96f3ae20344fb3a79161eab18905dae56 Mon Sep 17 00:00:00 2001
From: Jacky Hu <hengqing.hu@gmail.com>
Date: Tue, 26 Mar 2019 18:31:21 +0800
Subject: ipvs: allow tunneling with gue encapsulation

ipip packets are blocked in some public cloud environments. This patch
allows gue encapsulation with the tunneling method, which makes
tunneling work in those environments.

Signed-off-by: Jacky Hu <hengqing.hu@gmail.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h        |  5 +++++
 include/uapi/linux/ip_vs.h | 11 +++++++++++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 047f9a5ccaad..2ac40135b576 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -600,6 +600,9 @@ struct ip_vs_dest_user_kern {
 
 	/* Address family of addr */
 	u16			af;
+
+	u16			tun_type;	/* tunnel type */
+	__be16			tun_port;	/* tunnel port */
 };
 
 
@@ -660,6 +663,8 @@ struct ip_vs_dest {
 	atomic_t		conn_flags;	/* flags to copy to conn */
 	atomic_t		weight;		/* server weight */
 	atomic_t		last_weight;	/* server latest weight */
+	__u16			tun_type;	/* tunnel type */
+	__be16			tun_port;	/* tunnel port */
 
 	refcount_t		refcnt;		/* reference counter */
 	struct ip_vs_stats      stats;          /* statistics */
diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h
index 1c916b2f89dc..e34f436fc79d 100644
--- a/include/uapi/linux/ip_vs.h
+++ b/include/uapi/linux/ip_vs.h
@@ -124,6 +124,13 @@
 
 #define IP_VS_PEDATA_MAXLEN     255
 
+/* Tunnel types */
+enum {
+	IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0,	/* IPIP */
+	IP_VS_CONN_F_TUNNEL_TYPE_GUE,		/* GUE */
+	IP_VS_CONN_F_TUNNEL_TYPE_MAX,
+};
+
 /*
  *	The struct ip_vs_service_user and struct ip_vs_dest_user are
  *	used to set IPVS rules through setsockopt.
@@ -392,6 +399,10 @@ enum {
 
 	IPVS_DEST_ATTR_STATS64,		/* nested attribute for dest stats */
 
+	IPVS_DEST_ATTR_TUN_TYPE,	/* tunnel type */
+
+	IPVS_DEST_ATTR_TUN_PORT,	/* tunnel port */
+
 	__IPVS_DEST_ATTR_MAX,
 };
 
-- 
cgit v1.2.3


From 01902f8c85bfde343a4c2b7428d18762442f3a25 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Tue, 26 Mar 2019 20:06:20 +0800
Subject: netfilter: optimize nf_inet_addr_cmp

optimize nf_inet_addr_cmp by 64bit xor computation
similar to ipv6_addr_equal()

Signed-off-by: Yuan Linsi <yuanlinsi01@baidu.com>
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 72cb19c3db6a..4e0145ea033e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -24,10 +24,17 @@ static inline int NF_DROP_GETERR(int verdict)
 static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1,
 				   const union nf_inet_addr *a2)
 {
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	const unsigned long *ul1 = (const unsigned long *)a1;
+	const unsigned long *ul2 = (const unsigned long *)a2;
+
+	return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL;
+#else
 	return a1->all[0] == a2->all[0] &&
 	       a1->all[1] == a2->all[1] &&
 	       a1->all[2] == a2->all[2] &&
 	       a1->all[3] == a2->all[3];
+#endif
 }
 
 static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
-- 
cgit v1.2.3


From d164385ec572cbe3335a635ac308760e126d4ec0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:24 +0100
Subject: netfilter: nat: add inet family nat support

We need minimal support from the nat core for this, as we do not
want to register additional base hooks.

When an inet hook is registered, internally register ipv4 and ipv6
hooks for them and unregister those when inet hooks are removed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index cf332c4e0b32..423cda2c6542 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -69,9 +69,9 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum,
 #endif
 }
 
-int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
+int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 		       const struct nf_hook_ops *nat_ops, unsigned int ops_count);
-void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
+void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops,
 			  unsigned int ops_count);
 
 unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
@@ -98,6 +98,9 @@ void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
 int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops);
 void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
 
+int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops);
+void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops);
+
 unsigned int
 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
 	       const struct nf_hook_state *state);
-- 
cgit v1.2.3


From c1deb065cf3b5bcd483e3f03479f930edb151b99 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:25 +0100
Subject: netfilter: nf_tables: merge route type into core

very little code, so it really doesn't make sense to have extra
modules or even a kconfig knob for this.

Merge them and make functionality available unconditionally.
The merge makes inet family route support trivial, so add it
as well here.

Before:
   text	   data	    bss	    dec	    hex	filename
    835	    832	      0	   1667	    683 nft_chain_route_ipv4.ko
    870	    832	      0	   1702	    6a6	nft_chain_route_ipv6.ko
 111568	   2556	    529	 114653	  1bfdd	nf_tables.ko

After:
   text	   data	    bss	    dec	    hex	filename
 113133	   2556	    529	 116218	  1c5fa	nf_tables.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h    | 15 +++++++++++++++
 include/net/netfilter/nf_tables.h |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 471e9467105b..12113e502656 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -87,6 +87,21 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 }
 
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb);
+
+static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb)
+{
+#if IS_MODULE(CONFIG_IPV6)
+	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
+
+	if (!v6_ops)
+		return -EHOSTUNREACH;
+
+	return v6_ops->route_me_harder(net, skb);
+#else
+	return ip6_route_me_harder(net, skb);
+#endif
+}
+
 __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
 			unsigned int dataoff, u_int8_t protocol);
 
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3e9ab643eedf..55dff3ab44c7 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1411,4 +1411,6 @@ struct nft_trans_flowtable {
 int __init nft_chain_filter_init(void);
 void nft_chain_filter_fini(void);
 
+void __init nft_chain_route_init(void);
+void nft_chain_route_fini(void);
 #endif /* _NET_NF_TABLES_H */
-- 
cgit v1.2.3


From 4806e975729f99c7908d1688a143f1e16d464e6c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Mar 2019 09:22:26 +0100
Subject: netfilter: replace NF_NAT_NEEDED with IS_ENABLED(CONFIG_NF_NAT)

NF_NAT_NEEDED is true whenever nat support for either ipv4 or ipv6 is
enabled.  Now that the af-specific nat configuration switches have been
removed, IS_ENABLED(CONFIG_NF_NAT) has the same effect.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                   | 2 +-
 include/net/netfilter/nf_conntrack_expect.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 4e0145ea033e..a7252f3baeb0 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -367,7 +367,7 @@ extern struct nf_nat_hook __rcu *nf_nat_hook;
 static inline void
 nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 {
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
 	struct nf_nat_hook *nat_hook;
 
 	rcu_read_lock();
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 006e430d1cdf..93ce6b0daaba 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -48,7 +48,7 @@ struct nf_conntrack_expect {
 	/* Expectation class */
 	unsigned int class;
 
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
 	union nf_inet_addr saved_addr;
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
-- 
cgit v1.2.3


From 22c7652cdaa8cd33ce78bacceb4e826a3f795873 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Wed, 27 Mar 2019 11:36:26 +0100
Subject: netfilter: nft_osf: Add version option support

Add version option support to the nftables "osf" expression.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink_osf.h  | 11 ++++++++---
 include/uapi/linux/netfilter/nf_tables.h |  6 ++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h
index c6000046c966..788613f36935 100644
--- a/include/linux/netfilter/nfnetlink_osf.h
+++ b/include/linux/netfilter/nfnetlink_osf.h
@@ -21,13 +21,18 @@ struct nf_osf_finger {
 	struct nf_osf_user_finger	finger;
 };
 
+struct nf_osf_data {
+	const char *genre;
+	const char *version;
+};
+
 bool nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 		  int hooknum, struct net_device *in, struct net_device *out,
 		  const struct nf_osf_info *info, struct net *net,
 		  const struct list_head *nf_osf_fingers);
 
-const char *nf_osf_find(const struct sk_buff *skb,
-			const struct list_head *nf_osf_fingers,
-			const int ttl_check);
+bool nf_osf_find(const struct sk_buff *skb,
+		 const struct list_head *nf_osf_fingers,
+		 const int ttl_check, struct nf_osf_data *data);
 
 #endif /* _NFOSF_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a66c8de006cc..061bb3eb20c3 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1522,15 +1522,21 @@ enum nft_flowtable_hook_attributes {
  *
  * @NFTA_OSF_DREG: destination register (NLA_U32: nft_registers)
  * @NFTA_OSF_TTL: Value of the TTL osf option (NLA_U8)
+ * @NFTA_OSF_FLAGS: flags (NLA_U32)
  */
 enum nft_osf_attributes {
 	NFTA_OSF_UNSPEC,
 	NFTA_OSF_DREG,
 	NFTA_OSF_TTL,
+	NFTA_OSF_FLAGS,
 	__NFTA_OSF_MAX,
 };
 #define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1)
 
+enum nft_osf_flags {
+	NFT_OSF_F_VERSION = (1 << 0),
+};
+
 /**
  * enum nft_device_attributes - nf_tables device netlink attributes
  *
-- 
cgit v1.2.3


From 3b0a081db1f730373993c7a27936778402a3322c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 4 Apr 2019 10:58:20 +0200
Subject: netfilter: make two functions static

They have no external callers anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 1 -
 include/net/netfilter/nf_tables.h  | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index bf384b3eedb8..1f852ef7b098 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -317,7 +317,6 @@ struct xt_table_info *xt_replace_table(struct xt_table *table,
 				       int *error);
 
 struct xt_match *xt_find_match(u8 af, const char *name, u8 revision);
-struct xt_target *xt_find_target(u8 af, const char *name, u8 revision);
 struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision);
 struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision);
 int xt_find_revision(u8 af, const char *name, u8 revision, int target,
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 55dff3ab44c7..2d5a0a1a87b8 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -475,8 +475,6 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
 			      enum nft_trans_phase phase);
 int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 		       struct nft_set_binding *binding);
-void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
-			  struct nft_set_binding *binding, bool commit);
 void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);
 
 /**
-- 
cgit v1.2.3


From 3b8b11f96616c2e763ebcc093b778b309fc07a92 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Fri, 5 Apr 2019 21:23:13 +0200
Subject: net: phy: improve link partner capability detection

genphy_read_status() so far checks phydev->supported, not the actual
PHY capabilities. This can make a difference if the supported speeds
have been limited by of_set_phy_supported() or phy_set_max_speed().

It seems that this issue only affects the link partner advertisements
as displayed by ethtool. Also this patch wouldn't apply to older
kernels because linkmode bitmaps have been introduced recently.
Therefore net-next.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index ab7439b3da2b..0f9552b17ee7 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -345,6 +345,7 @@ struct phy_c45_device_ids {
  * is_c45:  Set to true if this phy uses clause 45 addressing.
  * is_internal: Set to true if this phy is internal to a MAC.
  * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc.
+ * is_gigabit_capable: Set to true if PHY supports 1000Mbps
  * has_fixups: Set to true if this phy has fixups/quirks.
  * suspended: Set to true if this phy has been suspended successfully.
  * sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal.
@@ -382,6 +383,7 @@ struct phy_device {
 	unsigned is_c45:1;
 	unsigned is_internal:1;
 	unsigned is_pseudo_fixed_link:1;
+	unsigned is_gigabit_capable:1;
 	unsigned has_fixups:1;
 	unsigned suspended:1;
 	unsigned sysfs_links:1;
-- 
cgit v1.2.3


From 1aefd3de7bc667115bb77cb0bc21e874c7e190fc Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:24 -0700
Subject: ipv6: Add fib6_nh_init and release to stubs

Add fib6_nh_init and fib6_nh_release to ipv6_stubs. If fib6_nh_init fails,
callers should not invoke fib6_nh_release, so there is no reason to have
a dummy stub for the case where IPv6 is not enabled.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6_stubs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index d8d9c0b0e8c0..453b55bf6723 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -12,6 +12,8 @@
 
 /* structs from net/ip6_fib.h */
 struct fib6_info;
+struct fib6_nh;
+struct fib6_config;
 
 /* This is ugly, ideally these symbols should be built
  * into the core kernel.
@@ -40,6 +42,10 @@ struct ipv6_stub {
 	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
 				 struct in6_addr *saddr);
 
+	int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh,
+			    struct fib6_config *cfg, gfp_t gfp_flags,
+			    struct netlink_ext_ack *extack);
+	void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
 	void (*udpv6_encap_enable)(void);
 	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
 			      const struct in6_addr *solicited_addr,
-- 
cgit v1.2.3


From 71df5777aaaeff673c242a49b945b1b96fe81718 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:25 -0700
Subject: ipv6: Add neighbor helpers that use the ipv6 stub

Add ipv6 helpers to handle ndisc references via the stub. Update
bpf_ipv6_fib_lookup to use __ipv6_neigh_lookup_noref_stub instead of
the open code ___neigh_lookup_noref with the stub.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ndisc.h | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

(limited to 'include')

diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index ddfbb591e2c5..366150053043 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -2,6 +2,8 @@
 #ifndef _NDISC_H
 #define _NDISC_H
 
+#include <net/ipv6_stubs.h>
+
 /*
  *	ICMP codes for neighbour discovery messages
  */
@@ -379,6 +381,14 @@ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev
 	return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev);
 }
 
+static inline
+struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev,
+						 const void *pkey)
+{
+	return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
+				     ndisc_hashfn, pkey, dev);
+}
+
 static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey)
 {
 	struct neighbour *n;
@@ -409,6 +419,36 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev,
 	rcu_read_unlock_bh();
 }
 
+static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
+					     const void *pkey)
+{
+	struct neighbour *n;
+
+	rcu_read_lock_bh();
+	n = __ipv6_neigh_lookup_noref_stub(dev, pkey);
+	if (n) {
+		unsigned long now = jiffies;
+
+		/* avoid dirtying neighbour */
+		if (n->confirmed != now)
+			n->confirmed = now;
+	}
+	rcu_read_unlock_bh();
+}
+
+/* uses ipv6_stub and is meant for use outside of IPv6 core */
+static inline struct neighbour *ip_neigh_gw6(struct net_device *dev,
+					     const void *addr)
+{
+	struct neighbour *neigh;
+
+	neigh = __ipv6_neigh_lookup_noref_stub(dev, addr);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false);
+
+	return neigh;
+}
+
 int ndisc_init(void);
 int ndisc_late_init(void);
 
-- 
cgit v1.2.3


From bdf004677107e3b847c5db09c9fbf8edefa24996 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:26 -0700
Subject: net: Replace nhc_has_gw with nhc_gw_family

Allow the gateway in a fib_nh_common to be from a different address
family than the outer fib{6}_nh. To that end, replace nhc_has_gw with
nhc_gw_family and update users of nhc_has_gw to check nhc_gw_family.
Now nhc_family is used to know if the nh_common is part of a fib_nh
or fib6_nh (used for container_of to get to route family specific data),
and nhc_gw_family represents the address family for the gateway.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h    | 2 +-
 include/net/ip_fib.h       | 7 +++----
 include/trace/events/fib.h | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 342180a7285c..5909fc421305 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -69,7 +69,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
 	return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) &&
-		f6i->fib6_nh.fib_nh_has_gw;
+		f6i->fib6_nh.fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 3ce07841dc3b..c68a40435ee0 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -83,8 +83,8 @@ struct fib_nh_common {
 	struct lwtunnel_state	*nhc_lwtstate;
 	unsigned char		nhc_scope;
 	u8			nhc_family;
-	u8			nhc_has_gw:1,
-				unused:7;
+	u8			nhc_gw_family;
+
 	union {
 		__be32          ipv4;
 		struct in6_addr ipv6;
@@ -112,8 +112,7 @@ struct fib_nh {
 #define fib_nh_flags		nh_common.nhc_flags
 #define fib_nh_lws		nh_common.nhc_lwtstate
 #define fib_nh_scope		nh_common.nhc_scope
-#define fib_nh_family		nh_common.nhc_family
-#define fib_nh_has_gw		nh_common.nhc_has_gw
+#define fib_nh_gw_family	nh_common.nhc_gw_family
 #define fib_nh_gw4		nh_common.nhc_gw.ipv4
 #define fib_nh_gw6		nh_common.nhc_gw.ipv6
 #define fib_nh_weight		nh_common.nhc_weight
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 7f83b6eafc5c..6f2a4dc35e37 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -69,13 +69,13 @@ TRACE_EVENT(fib_table_lookup,
 		__assign_str(name, dev ? dev->name : "-");
 
 		if (nhc) {
-			if (nhc->nhc_family == AF_INET) {
+			if (nhc->nhc_gw_family == AF_INET) {
 				p32 = (__be32 *) __entry->gw4;
 				*p32 = nhc->nhc_gw.ipv4;
 
 				in6 = (struct in6_addr *)__entry->gw6;
 				*in6 = in6_zero;
-			} else if (nhc->nhc_family == AF_INET6) {
+			} else if (nhc->nhc_gw_family == AF_INET6) {
 				p32 = (__be32 *) __entry->gw4;
 				*p32 = 0;
 
-- 
cgit v1.2.3


From 1550c171935d264f522581fd037db5e64a716bb6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:27 -0700
Subject: ipv4: Prepare rtable for IPv6 gateway

To allow the gateway to be either an IPv4 or IPv6 address, remove
rt_uses_gateway from rtable and replace with rt_gw_family. If
rt_gw_family is set it implies rt_uses_gateway. Rename rt_gateway
to rt_gw4 to represent the IPv4 version.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 9883dc82f723..96912b099c08 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -55,12 +55,12 @@ struct rtable {
 	unsigned int		rt_flags;
 	__u16			rt_type;
 	__u8			rt_is_input;
-	__u8			rt_uses_gateway;
+	u8			rt_gw_family;
 
 	int			rt_iif;
 
 	/* Info on neighbour */
-	__be32			rt_gateway;
+	__be32			rt_gw4;
 
 	/* Miscellaneous cached information */
 	u32			rt_mtu_locked:1,
@@ -82,8 +82,8 @@ static inline bool rt_is_output_route(const struct rtable *rt)
 
 static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
 {
-	if (rt->rt_gateway)
-		return rt->rt_gateway;
+	if (rt->rt_gw_family == AF_INET)
+		return rt->rt_gw4;
 	return daddr;
 }
 
-- 
cgit v1.2.3


From f35b794b3b405e2478654ea875bc0b29fe1a1bc5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:28 -0700
Subject: ipv4: Prepare fib_config for IPv6 gateway

Similar to rtable, fib_config needs to allow the gateway to be either an
IPv4 or an IPv6 address. To that end, rename fc_gw to fc_gw4 to mean an
IPv4 address and add fc_gw_family. Checks on 'is a gateway set' are changed
to see if fc_gw_family is set. In the process prepare the code for a
fc_gw_family == AF_INET6.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index c68a40435ee0..1f72ad553c31 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -32,10 +32,11 @@ struct fib_config {
 	u8			fc_protocol;
 	u8			fc_scope;
 	u8			fc_type;
-	/* 3 bytes unused */
+	u8			fc_gw_family;
+	/* 2 bytes unused */
 	u32			fc_table;
 	__be32			fc_dst;
-	__be32			fc_gw;
+	__be32			fc_gw4;
 	int			fc_oif;
 	u32			fc_flags;
 	u32			fc_priority;
-- 
cgit v1.2.3


From 0f5f7d7bf6e6bda4dffe7b42812a16ada6ea9816 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:29 -0700
Subject: ipv4: Add support to rtable for ipv6 gateway

Add support for an IPv6 gateway to rtable. Since a gateway is either
IPv4 or IPv6, make it a union with rt_gw4 where rt_gw_family decides
which address is in use.

When dumping the route data, encode an ipv6 nexthop using RTA_VIA.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 96912b099c08..5d28a2509b58 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -60,7 +60,10 @@ struct rtable {
 	int			rt_iif;
 
 	/* Info on neighbour */
-	__be32			rt_gw4;
+	union {
+		__be32		rt_gw4;
+		struct in6_addr	rt_gw6;
+	};
 
 	/* Miscellaneous cached information */
 	u32			rt_mtu_locked:1,
-- 
cgit v1.2.3


From a4ea5d43c807be28545625c1e0641905022fa0d1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:30 -0700
Subject: ipv4: Add support to fib_config for IPv6 gateway

Add support for an IPv6 gateway to fib_config. Since a gateway is either
IPv4 or IPv6, make it a union with fc_gw4 where fc_gw_family decides
which address is in use. Update current checks on family and gw4 to
handle ipv6 as well.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 1f72ad553c31..f1c452f618a9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -36,7 +36,10 @@ struct fib_config {
 	/* 2 bytes unused */
 	u32			fc_table;
 	__be32			fc_dst;
-	__be32			fc_gw4;
+	union {
+		__be32		fc_gw4;
+		struct in6_addr	fc_gw6;
+	};
 	int			fc_oif;
 	u32			fc_flags;
 	u32			fc_priority;
-- 
cgit v1.2.3


From 0353f28231c79416191326810e7fe656b69c63b7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:33 -0700
Subject: neighbor: Add skip_cache argument to neigh_output

A later patch allows an IPv6 gateway with an IPv4 route. The neighbor
entry will exist in the v6 ndisc table and the cached header will contain
the ipv6 protocol which is wrong for an IPv4 packet. For an IPv4 packet to
use the v6 neighbor entry, neigh_output needs to skip the cached header
and just use the output callback for the neigh entry.

A future patchset can look at expanding the hh_cache to handle 2
protocols. For now, IPv6 gateways with an IPv4 route will take the
extra overhead of generating the header.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7c1ab9edba03..3e5438bd0101 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -498,11 +498,12 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb
 	return dev_queue_xmit(skb);
 }
 
-static inline int neigh_output(struct neighbour *n, struct sk_buff *skb)
+static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
+			       bool skip_cache)
 {
 	const struct hh_cache *hh = &n->hh;
 
-	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len)
+	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
 		return neigh_hh_output(hh, skb);
 	else
 		return n->output(n, skb);
-- 
cgit v1.2.3


From 5c9f7c1dfc2e0776551ef1ceb335187c6698d1ff Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:34 -0700
Subject: ipv4: Add helpers for neigh lookup for nexthop

A common theme in the output path is looking up a neigh entry for a
nexthop, either the gateway in an rtable or a fallback to the daddr
in the skb:

        nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr);
        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

To allow the nexthop to be an IPv6 address we need to consider the
family of the nexthop and then call __ipv{4,6}_neigh_lookup_noref based
on it.

To make this simpler, add a ip_neigh_gw4 helper similar to ip_neigh_gw6
added in an earlier patch which handles:

        neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);

And then add a second one, ip_neigh_for_gw, that calls either
ip_neigh_gw4 or ip_neigh_gw6 based on the address family of the gateway.

Update the output paths in the VRF driver and core v4 code to use
ip_neigh_for_gw simplifying the family based lookup and making both
ready for a v6 nexthop.

ipv4_neigh_lookup has a different need - the potential to resolve a
passed in address in addition to any gateway in the rtable or skb. Since
this is a one-off, add ip_neigh_gw4 and ip_neigh_gw6 directly. The
difference between __neigh_create used by the helpers and neigh_create
called by ipv4_neigh_lookup is taking a refcount, so add rcu_read_lock_bh
and bump the refcnt on the neigh entry.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 5d28a2509b58..96f6c9ae33c2 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -29,6 +29,8 @@
 #include <net/flow.h>
 #include <net/inet_sock.h>
 #include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/ndisc.h>
 #include <linux/in_route.h>
 #include <linux/rtnetlink.h>
 #include <linux/rcupdate.h>
@@ -350,4 +352,34 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 	return hoplimit;
 }
 
+static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
+					     __be32 daddr)
+{
+	struct neighbour *neigh;
+
+	neigh = __ipv4_neigh_lookup_noref(dev, daddr);
+	if (unlikely(!neigh))
+		neigh = __neigh_create(&arp_tbl, &daddr, dev, false);
+
+	return neigh;
+}
+
+static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
+						struct sk_buff *skb,
+						bool *is_v6gw)
+{
+	struct net_device *dev = rt->dst.dev;
+	struct neighbour *neigh;
+
+	if (likely(rt->rt_gw_family == AF_INET)) {
+		neigh = ip_neigh_gw4(dev, rt->rt_gw4);
+	} else if (rt->rt_gw_family == AF_INET6) {
+		neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
+		*is_v6gw = true;
+	} else {
+		neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr);
+	}
+	return neigh;
+}
+
 #endif	/* _ROUTE_H */
-- 
cgit v1.2.3


From 19a9d136f198cd7c4e26ea6897a0cf067d3f7ecb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:39 -0700
Subject: ipv4: Flag fib_info with a fib_nh using IPv6 gateway

Until support is added to the offload drivers, they need to be able to
reject routes with an IPv6 gateway. To that end add a flag to fib_info
that indicates if any fib_nh has a v6 gateway. The flag allows the drivers
to efficiently know the use of a v6 gateway without walking all fib_nh
tied to a fib_info each time a route is added.

Update mlxsw and rocker to reject the routes with extack message as to why.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f1c452f618a9..337106469ec5 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -147,6 +147,7 @@ struct fib_info {
 #define fib_rtt fib_metrics->metrics[RTAX_RTT-1]
 #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1]
 	int			fib_nhs;
+	bool			fib_nh_is_v6;
 	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].fib_nh_dev
-- 
cgit v1.2.3


From d15662682db232da77136cd348f4c9df312ca6f9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 5 Apr 2019 16:30:40 -0700
Subject: ipv4: Allow ipv6 gateway with ipv4 routes

Add support for RTA_VIA and allow an IPv6 nexthop for v4 routes:
   $ ip ro add 172.16.1.0/24 via inet6 2001:db8::1 dev eth0
   $ ip ro ls
   ...
   172.16.1.0/24 via inet6 2001:db8::1 dev eth0

For convenience and simplicity, userspace can use RTA_VIA to specify
AF_INET or AF_INET6 gateway.

The common fib_nexthop_info dump function compares the gateway address
family to the nh_common family to know if the gateway should be encoded
as RTA_VIA or RTA_GATEWAY.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 337106469ec5..d8195c77e247 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -401,6 +401,8 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net,
 /* Exported by fib_frontend.c */
 extern const struct nla_policy rtm_ipv4_policy[];
 void ip_fib_init(void);
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+		    struct netlink_ext_ack *extack);
 __be32 fib_compute_spec_dst(struct sk_buff *skb);
 bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev);
 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
-- 
cgit v1.2.3


From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Apr 2019 23:20:03 +0200
Subject: bpf: implement lookup-free direct value access for maps

This generic extension to BPF maps allows for directly loading
an address residing inside a BPF map value as a single BPF
ldimm64 instruction!

The idea is similar to what BPF_PSEUDO_MAP_FD does today, which
is a special src_reg flag for ldimm64 instruction that indicates
that inside the first part of the double insns's imm field is a
file descriptor which the verifier then replaces as a full 64bit
address of the map into both imm parts. For the newly added
BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following:
the first part of the double insns's imm field is again a file
descriptor corresponding to the map, and the second part of the
imm field is an offset into the value. The verifier will then
replace both imm parts with an address that points into the BPF
map value at the given value offset for maps that support this
operation. Currently supported is array map with single entry.
It is possible to support more than just single map element by
reusing both 16bit off fields of the insns as a map index, so
full array map lookup could be expressed that way. It hasn't
been implemented here due to lack of concrete use case, but
could easily be done so in future in a compatible way, since
both off fields right now have to be 0 and would correctly
denote a map index 0.

BPF_PSEUDO_MAP_VALUE is a distinct flag because otherwise, with
BPF_PSEUDO_MAP_FD, we could not distinguish a load of the map pointer
from a load of the map's value at offset 0. Changing
BPF_PSEUDO_MAP_FD's encoding to be off by one to differentiate between a
regular map pointer and a map value pointer would add unnecessary
complexity and raise the barrier for debuggability, and is thus less
suitable. Using the second part of the imm field as an offset
into the value does /not/ come with limitations since maximum
possible value size is in u32 universe anyway.

This optimization allows for efficiently retrieving an address
to a map value memory area without having to issue a helper call
which needs to prepare registers according to calling convention,
etc, without needing the extra NULL test, and without having to
add the offset in an additional instruction to the value base
pointer. The verifier then treats the destination register as
PTR_TO_MAP_VALUE with constant reg->off from the user passed
offset from the second imm field, and guarantees that this is
within bounds of the map value. Any subsequent operations are
normally treated as typical map value handling without anything
extra needed from verification side.

The two map operations for direct value access have been added to
array map for now. In future other types could be supported as
well depending on the use case. The main use case for this commit
is to allow for BPF loader support for global variables that
reside in .data/.rodata/.bss sections such that we can directly
load the address of them with minimal additional infrastructure
required. Loader support has been added in subsequent commits for
libbpf library.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  6 ++++++
 include/linux/bpf_verifier.h |  4 ++++
 include/uapi/linux/bpf.h     | 13 ++++++++++++-
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a445194b5fb6..bd93a592dd29 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -57,6 +57,12 @@ struct bpf_map_ops {
 			     const struct btf *btf,
 			     const struct btf_type *key_type,
 			     const struct btf_type *value_type);
+
+	/* Direct value access helpers. */
+	int (*map_direct_value_addr)(const struct bpf_map *map,
+				     u64 *imm, u32 off);
+	int (*map_direct_value_meta)(const struct bpf_map *map,
+				     u64 imm, u32 *off);
 };
 
 struct bpf_map {
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index fc8254d6b569..b3ab61fe1932 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -224,6 +224,10 @@ struct bpf_insn_aux_data {
 		unsigned long map_state;	/* pointer/poison value for maps */
 		s32 call_imm;			/* saved imm field of call insn */
 		u32 alu_limit;			/* limit for add/sub register with pointer */
+		struct {
+			u32 map_index;		/* index into used_maps[] */
+			u32 map_off;		/* offset from value base address */
+		};
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
 	int sanitize_stack_off; /* stack slot to be cleared */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 837024512baf..26cfb5b2c964 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -255,8 +255,19 @@ enum bpf_attach_type {
  */
 #define BPF_F_ANY_ALIGNMENT	(1U << 1)
 
-/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */
+/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
+ * two extensions:
+ *
+ * insn[0].src_reg:  BPF_PSEUDO_MAP_FD   BPF_PSEUDO_MAP_VALUE
+ * insn[0].imm:      map fd              map fd
+ * insn[1].imm:      0                   offset into value
+ * insn[0].off:      0                   0
+ * insn[1].off:      0                   0
+ * ldimm64 rewrite:  address of map      address of map[0]+offset
+ * verifier type:    CONST_PTR_TO_MAP    PTR_TO_MAP_VALUE
+ */
 #define BPF_PSEUDO_MAP_FD	1
+#define BPF_PSEUDO_MAP_VALUE	2
 
 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
  * offset to another bpf function
-- 
cgit v1.2.3


From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Apr 2019 23:20:05 +0200
Subject: bpf: add program side {rd, wr}only support for maps

This work adds two new map creation flags BPF_F_RDONLY_PROG
and BPF_F_WRONLY_PROG in order to allow for read-only or
write-only BPF maps from a BPF program side.

Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only
applies to system call side, meaning the BPF program has full
read/write access to the map as usual while bpf(2) calls with
map fd can either only read or write into the map depending
on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allow
for the exact opposite such that the verifier is going to reject
program loads if a write into a read-only map or a read from a
write-only map is detected. For the read-only map case also some
helpers are forbidden for programs that would alter the map
state such as map deletion, update, etc. As opposed to the two
BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well
as BPF_F_WRONLY_PROG really do correspond to the map lifetime.

We've enabled this generic map extension to various non-special
maps holding normal user data: array, hash, lru, lpm, local
storage, queue and stack. Further generic map types could be
followed up in future depending on use-case. Main use case
here is to forbid writes into .rodata map values from verifier
side.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      | 29 +++++++++++++++++++++++++++++
 include/uapi/linux/bpf.h |  6 +++++-
 2 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bd93a592dd29..be20804631b5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -430,6 +430,35 @@ struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 32
 
+#define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
+				 BPF_F_RDONLY_PROG |	\
+				 BPF_F_WRONLY |		\
+				 BPF_F_WRONLY_PROG)
+
+#define BPF_MAP_CAN_READ	BIT(0)
+#define BPF_MAP_CAN_WRITE	BIT(1)
+
+static inline u32 bpf_map_flags_to_cap(struct bpf_map *map)
+{
+	u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
+
+	/* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is
+	 * not possible.
+	 */
+	if (access_flags & BPF_F_RDONLY_PROG)
+		return BPF_MAP_CAN_READ;
+	else if (access_flags & BPF_F_WRONLY_PROG)
+		return BPF_MAP_CAN_WRITE;
+	else
+		return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE;
+}
+
+static inline bool bpf_map_flags_access_ok(u32 access_flags)
+{
+	return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) !=
+	       (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
+}
+
 struct bpf_event_entry {
 	struct perf_event *event;
 	struct file *perf_file;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 26cfb5b2c964..d275446d807c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -294,7 +294,7 @@ enum bpf_attach_type {
 
 #define BPF_OBJ_NAME_LEN 16U
 
-/* Flags for accessing BPF object */
+/* Flags for accessing BPF object from syscall side. */
 #define BPF_F_RDONLY		(1U << 3)
 #define BPF_F_WRONLY		(1U << 4)
 
@@ -304,6 +304,10 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED		(1U << 6)
 
+/* Flags for accessing BPF object from program side. */
+#define BPF_F_RDONLY_PROG	(1U << 7)
+#define BPF_F_WRONLY_PROG	(1U << 8)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
-- 
cgit v1.2.3


From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Apr 2019 23:20:06 +0200
Subject: bpf: add syscall side map freeze support

This patch adds a new BPF_MAP_FREEZE command which allows one to
"freeze" the map globally as read-only / immutable from the syscall
side.

Map permission handling has been refactored into map_get_sys_perms()
and drops FMODE_CAN_WRITE in case of locked map. Main use case is
to allow for setting up .rodata sections from the BPF ELF which
are loaded into the kernel, meaning BPF loader first allocates
map, sets up map value by copying .rodata section into it and once
complete, it calls BPF_MAP_FREEZE on the map fd to prevent further
modifications.

Right now BPF_MAP_FREEZE only takes map fd as argument while remaining
bpf_attr members are required to be zero. I didn't add write-only
locking here as counterpart since I don't have a concrete use-case
for it on my side, and I think it makes probably more sense to wait
once there is actually one. In that case bpf_attr can be extended
as usual with a flag field and/or others where flag 0 means that
we lock the map read-only hence this doesn't prevent to add further
extensions to BPF_MAP_FREEZE upon need.

A map creation flag like BPF_F_WRONCE was not considered for a couple
of reasons: i) in case of a generic implementation, a map can consist
of more than just one element, thus there could be multiple map
updates needed to set the map into a state where it can then be
made immutable, ii) WRONCE indicates exact one-time write before
it is then set immutable. A generic implementation would set a bit
atomically on map update entry (if unset), indicating that every
subsequent update from then onwards will need to bail out there.
However, map updates can fail, so upon failure that flag would need
to be unset again and the update attempt would need to be repeated
for it to be eventually made immutable. While this can be made
race-free, this approach feels less clean and in combination with
reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE
command directly sets the flag and caller has the guarantee that
map is immutable from syscall side upon successful return for any
future syscall invocations that would alter the map state, which
is also more intuitive from an API point of view. A command name
such as BPF_MAP_LOCK has been avoided as it's too close with BPF
map spin locks (which already has BPF_F_LOCK flag). BPF_MAP_FREEZE
is so far only enabled for privileged users.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      | 3 ++-
 include/uapi/linux/bpf.h | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index be20804631b5..65f7094c40b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -87,7 +87,8 @@ struct bpf_map {
 	struct btf *btf;
 	u32 pages;
 	bool unpriv_array;
-	/* 51 bytes hole */
+	bool frozen; /* write-once */
+	/* 48 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d275446d807c..af1cbd951f26 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -105,6 +105,7 @@ enum bpf_cmd {
 	BPF_BTF_GET_FD_BY_ID,
 	BPF_TASK_FD_QUERY,
 	BPF_MAP_LOOKUP_AND_DELETE_ELEM,
+	BPF_MAP_FREEZE,
 };
 
 enum bpf_map_type {
-- 
cgit v1.2.3


From f063c889c9458354a92b235a51cbb60d30321070 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Apr 2019 23:20:08 +0200
Subject: bpf: add specification for BTF Var and DataSec kinds

This adds the BTF specification and UAPI bits for supporting BTF Var
and DataSec kinds. This is following LLVM upstream commit ac4082b77e07
("[BPF] Add BTF Var and DataSec Support") which has been merged recently.
Var itself is for describing a global variable and DataSec to describe
ELF sections e.g. data/bss/rodata sections that hold one or multiple
global variables.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/btf.h | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index 7b7475ef2f17..9310652ca4f9 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -39,11 +39,11 @@ struct btf_type {
 	 *             struct, union and fwd
 	 */
 	__u32 info;
-	/* "size" is used by INT, ENUM, STRUCT and UNION.
+	/* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC.
 	 * "size" tells the size of the type it is describing.
 	 *
 	 * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
-	 * FUNC and FUNC_PROTO.
+	 * FUNC, FUNC_PROTO and VAR.
 	 * "type" is a type_id referring to another type.
 	 */
 	union {
@@ -70,8 +70,10 @@ struct btf_type {
 #define BTF_KIND_RESTRICT	11	/* Restrict	*/
 #define BTF_KIND_FUNC		12	/* Function	*/
 #define BTF_KIND_FUNC_PROTO	13	/* Function Proto	*/
-#define BTF_KIND_MAX		13
-#define NR_BTF_KINDS		14
+#define BTF_KIND_VAR		14	/* Variable	*/
+#define BTF_KIND_DATASEC	15	/* Section	*/
+#define BTF_KIND_MAX		BTF_KIND_DATASEC
+#define NR_BTF_KINDS		(BTF_KIND_MAX + 1)
 
 /* For some specific BTF_KIND, "struct btf_type" is immediately
  * followed by extra data.
@@ -138,4 +140,26 @@ struct btf_param {
 	__u32	type;
 };
 
+enum {
+	BTF_VAR_STATIC = 0,
+	BTF_VAR_GLOBAL_ALLOCATED,
+};
+
+/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe
+ * additional information related to the variable such as its linkage.
+ */
+struct btf_var {
+	__u32	linkage;
+};
+
+/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo"
+ * to describe all BTF_KIND_VAR types it contains along with it's
+ * in-section offset as well as size.
+ */
+struct btf_var_secinfo {
+	__u32	type;
+	__u32	offset;
+	__u32	size;
+};
+
 #endif /* _UAPI__LINUX_BTF_H__ */
-- 
cgit v1.2.3


From 2824ecb7010f6a20e9a4140512b798469ab066cc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 9 Apr 2019 23:20:10 +0200
Subject: bpf: allow for key-less BTF in array map

Given we'll be reusing BPF array maps for global data/bss/rodata
sections, we need a way to associate BTF DataSec type as its map
value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR()
macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR")
to get initial map to type association going. While more use cases
for it are discouraged, this also won't work for global data since
the use of array map is a BPF loader detail and therefore unknown
at compilation time. For array maps with just a single entry we make
an exception in terms of BTF in that key type is declared optional
if value type is of DataSec type. The latter LLVM is guaranteed to
emit and it also aligns with how we regard global data maps as just
a plain buffer area reusing existing map facilities for allowing
things like introspection with existing tools.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/btf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/btf.h b/include/linux/btf.h
index 455d31b55828..64cdf2a23d42 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -51,6 +51,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
 			   const struct btf_member *m,
 			   u32 expected_offset, u32 expected_size);
 int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
+bool btf_type_is_void(const struct btf_type *t);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
-- 
cgit v1.2.3


From b6d9ccb1125049941590ea895c38e1167badba5f Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Thu, 28 Mar 2019 15:27:31 +0200
Subject: net/mlx5: E-Switch, don't use hardcoded values for FDB prios

When creating the FDB prios, use the enum values already defined and not
the hardcoded values.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/fs.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 9df51da04621..3eeb04154317 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -75,6 +75,11 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_EGRESS,
 };
 
+enum {
+	FDB_FAST_PATH,
+	FDB_SLOW_PATH,
+};
+
 struct mlx5_flow_table;
 struct mlx5_flow_group;
 struct mlx5_flow_namespace;
-- 
cgit v1.2.3


From d9cb06759eca5a420072b937d2a2a670db474008 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Thu, 28 Mar 2019 15:27:32 +0200
Subject: net/mlx5: E-Switch, add a new prio to be used by the RDMA side

Create a new prio in the FDB; it will be used when inserting steering rules
into the FDB from the RDMA side. We create a new PRIO so rules from the
net side and rules from the RDMA side won't be inserted into the same PRIO;
each side has its own sandbox to play in.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
 include/linux/mlx5/fs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 3eeb04154317..fd91df3a4e09 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -76,6 +76,7 @@ enum mlx5_flow_namespace_type {
 };
 
 enum {
+	FDB_BYPASS_PATH,
 	FDB_FAST_PATH,
 	FDB_SLOW_PATH,
 };
-- 
cgit v1.2.3


From 1f5e6fdd6aec7929e67afad1e42e35d894a119ae Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:38 +0200
Subject: net: sched: prefer qdisc_is_empty() over direct qlen access

When checking for root qdisc queue length, do not access directly q.qlen.
In the following patches we will move back qlen accounting to per CPU
values for NOLOCK qdiscs.

Instead, prefer the qdisc_is_empty() helper usage.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0aea0e262452..7ecb6127e980 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -747,7 +747,7 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev)
 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
 		const struct Qdisc *q = rcu_dereference(txq->qdisc);
 
-		if (q->q.qlen) {
+		if (!qdisc_is_empty(q)) {
 			rcu_read_unlock();
 			return false;
 		}
-- 
cgit v1.2.3


From 9c01c9f1f2a3ddbddbf3b233cc6bfa86f5a59af0 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:39 +0200
Subject: net: sched: always do stats accounting according to TCQ_F_CPUSTATS

The core sched implementation checks independently for NOLOCK flag
to acquire/release the root spin lock and for qdisc_is_percpu_stats()
to account per CPU values in many places.

This change updates the last few places checking TCQ_F_NOLOCK to
do per CPU stats accounting according to the qdisc_is_percpu_stats()
value.

The above allows cleaning up the dev_requeue_skb() implementation a bit
and makes stats updates always consistent with a single flag.

v1 -> v2:
 - do not move qdisc_is_empty definition, fix build issue

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7ecb6127e980..ed56474cfe3b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -146,9 +146,14 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
 	return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
 }
 
+static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
+{
+	return q->flags & TCQ_F_CPUSTATS;
+}
+
 static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
 {
-	if (qdisc->flags & TCQ_F_NOLOCK)
+	if (qdisc_is_percpu_stats(qdisc))
 		return qdisc->empty;
 	return !qdisc->q.qlen;
 }
@@ -490,7 +495,7 @@ static inline u32 qdisc_qlen_sum(const struct Qdisc *q)
 {
 	u32 qlen = q->qstats.qlen;
 
-	if (q->flags & TCQ_F_NOLOCK)
+	if (qdisc_is_percpu_stats(q))
 		qlen += atomic_read(&q->q.atomic_qlen);
 	else
 		qlen += q->q.qlen;
@@ -817,11 +822,6 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	return sch->enqueue(skb, sch, to_free);
 }
 
-static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
-{
-	return q->flags & TCQ_F_CPUSTATS;
-}
-
 static inline void _bstats_update(struct gnet_stats_basic_packed *bstats,
 				  __u64 bytes, __u32 packets)
 {
@@ -1113,8 +1113,13 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 
 	if (skb) {
 		skb = __skb_dequeue(&sch->gso_skb);
-		qdisc_qstats_backlog_dec(sch, skb);
-		sch->q.qlen--;
+		if (qdisc_is_percpu_stats(sch)) {
+			qdisc_qstats_cpu_backlog_dec(sch, skb);
+			qdisc_qstats_atomic_qlen_dec(sch);
+		} else {
+			qdisc_qstats_backlog_dec(sch, skb);
+			sch->q.qlen--;
+		}
 	} else {
 		skb = sch->dequeue(sch);
 	}
-- 
cgit v1.2.3


From 8a53e616de294873fec1a75ddb77ecb3d225cee0 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:40 +0200
Subject: net: sched: when clearing NOLOCK, clear TCQ_F_CPUSTATS, too

Since stats updating is always consistent with TCQ_F_CPUSTATS flag,
we can disable it at qdisc creation time flipping such bit.

In my experiments, if the NOLOCK flag is cleared, per CPU stats
accounting does not give any measurable performance gain, but it
wastes some memory.

Let's clear TCQ_F_CPUSTATS together with NOLOCK, when enslaving
a NOLOCK qdisc to 'lock' one.

Use stats update helper inside pfifo_fast, to cope correctly with
TCQ_F_CPUSTATS flag change.

As a side effect, the q.qlen value for any child qdisc is always
consistent for all lock classful qdiscs.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ed56474cfe3b..f069011524ba 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -1106,6 +1106,32 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
 	return skb;
 }
 
+static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
+						 struct sk_buff *skb)
+{
+	if (qdisc_is_percpu_stats(sch)) {
+		qdisc_qstats_cpu_backlog_dec(sch, skb);
+		qdisc_bstats_cpu_update(sch, skb);
+		qdisc_qstats_atomic_qlen_dec(sch);
+	} else {
+		qdisc_qstats_backlog_dec(sch, skb);
+		qdisc_bstats_update(sch, skb);
+		sch->q.qlen--;
+	}
+}
+
+static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
+						 unsigned int pkt_len)
+{
+	if (qdisc_is_percpu_stats(sch)) {
+		qdisc_qstats_atomic_qlen_inc(sch);
+		this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
+	} else {
+		sch->qstats.backlog += pkt_len;
+		sch->q.qlen++;
+	}
+}
+
 /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
 static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 {
-- 
cgit v1.2.3


From 73eb628ddfd3884d1e58a8022de2e78de7807fc6 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 10 Apr 2019 14:32:41 +0200
Subject: Revert: "net: sched: put back q.qlen into a single location"

This reverts commit 46b1c18f9deb ("net: sched: put back q.qlen into
a single location").
After the previous patch, when a NOLOCK qdisc is enslaved to a
locking qdisc it switches to global stats accounting. As a consequence,
when a classful qdisc accesses directly a child qdisc's qlen, such
qdisc is not doing per CPU accounting and qlen value is consistent.

In the control path nobody uses qlen directly since commit
e5f0e8f8e45 ("net: sched: introduce and use qdisc tree flush/purge
helpers"), so we can remove the contended atomic ops from the
datapath.

v1 -> v2:
 - complete the qdisc_qstats_atomic_qlen_dec() ->
   qdisc_qstats_cpu_qlen_dec() replacement, fix build issue
 - more descriptive commit message

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f069011524ba..e8f85cd2afce 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -52,10 +52,7 @@ struct qdisc_size_table {
 struct qdisc_skb_head {
 	struct sk_buff	*head;
 	struct sk_buff	*tail;
-	union {
-		u32		qlen;
-		atomic_t	atomic_qlen;
-	};
+	__u32		qlen;
 	spinlock_t	lock;
 };
 
@@ -486,19 +483,27 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 	BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
+static inline int qdisc_qlen_cpu(const struct Qdisc *q)
+{
+	return this_cpu_ptr(q->cpu_qstats)->qlen;
+}
+
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
 	return q->q.qlen;
 }
 
-static inline u32 qdisc_qlen_sum(const struct Qdisc *q)
+static inline int qdisc_qlen_sum(const struct Qdisc *q)
 {
-	u32 qlen = q->qstats.qlen;
+	__u32 qlen = q->qstats.qlen;
+	int i;
 
-	if (qdisc_is_percpu_stats(q))
-		qlen += atomic_read(&q->q.atomic_qlen);
-	else
+	if (qdisc_is_percpu_stats(q)) {
+		for_each_possible_cpu(i)
+			qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
+	} else {
 		qlen += q->q.qlen;
+	}
 
 	return qlen;
 }
@@ -889,14 +894,14 @@ static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
 	this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb));
 }
 
-static inline void qdisc_qstats_atomic_qlen_inc(struct Qdisc *sch)
+static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
 {
-	atomic_inc(&sch->q.atomic_qlen);
+	this_cpu_inc(sch->cpu_qstats->qlen);
 }
 
-static inline void qdisc_qstats_atomic_qlen_dec(struct Qdisc *sch)
+static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
 {
-	atomic_dec(&sch->q.atomic_qlen);
+	this_cpu_dec(sch->cpu_qstats->qlen);
 }
 
 static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
@@ -1112,7 +1117,7 @@ static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
 	if (qdisc_is_percpu_stats(sch)) {
 		qdisc_qstats_cpu_backlog_dec(sch, skb);
 		qdisc_bstats_cpu_update(sch, skb);
-		qdisc_qstats_atomic_qlen_dec(sch);
+		qdisc_qstats_cpu_qlen_dec(sch);
 	} else {
 		qdisc_qstats_backlog_dec(sch, skb);
 		qdisc_bstats_update(sch, skb);
@@ -1124,7 +1129,7 @@ static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
 						 unsigned int pkt_len)
 {
 	if (qdisc_is_percpu_stats(sch)) {
-		qdisc_qstats_atomic_qlen_inc(sch);
+		qdisc_qstats_cpu_qlen_inc(sch);
 		this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
 	} else {
 		sch->qstats.backlog += pkt_len;
@@ -1141,7 +1146,7 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 		skb = __skb_dequeue(&sch->gso_skb);
 		if (qdisc_is_percpu_stats(sch)) {
 			qdisc_qstats_cpu_backlog_dec(sch, skb);
-			qdisc_qstats_atomic_qlen_dec(sch);
+			qdisc_qstats_cpu_qlen_dec(sch);
 		} else {
 			qdisc_qstats_backlog_dec(sch, skb);
 			sch->q.qlen--;
-- 
cgit v1.2.3


From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Tue, 9 Apr 2019 11:49:09 -0700
Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN

Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN:
* ctx_in/ctx_size_in - input context
* ctx_out/ctx_size_out - output context

The intended use case is to pass some meta data to the test runs that
operate on skb (this has been brought up at a recent LPC).

For programs that use bpf_prog_test_run_skb, support __sk_buff input and
output. Initially, from input __sk_buff, copy _only_ cb and priority into
skb, all other non-zero fields are prohibited (with EINVAL).
If the user has set ctx_out/ctx_size_out, copy the potentially modified
__sk_buff back to the userspace.

We require all fields of input __sk_buff except the ones we explicitly
support to be set to zero. The expectation is that in the future we might
add support for more fields and we want to fail explicitly if the user
runs the program on the kernel where we don't yet support them.

The API is intentionally vague (i.e. we don't explicitly add __sk_buff
to bpf_attr, but ctx_in) to potentially let other test_run types use
this interface in the future (this can be xdp_md for xdp types for
example).

v4:
  * don't copy more than allowed in bpf_ctx_init [Martin]

v3:
  * handle case where ctx_in is NULL, but ctx_out is not [Martin]
  * convert size==0 checks to ptr==NULL checks and add some extra ptr
    checks [Martin]

v2:
  * Addressed comments from Martin Lau

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index af1cbd951f26..31a27dd337dc 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -412,6 +412,13 @@ union bpf_attr {
 		__aligned_u64	data_out;
 		__u32		repeat;
 		__u32		duration;
+		__u32		ctx_size_in;	/* input: len of ctx_in */
+		__u32		ctx_size_out;	/* input/output: len of ctx_out
+						 *   returns ENOSPC if ctx_out
+						 *   is too small.
+						 */
+		__aligned_u64	ctx_in;
+		__aligned_u64	ctx_out;
 	} test;
 
 	struct { /* anonymous struct used by BPF_*_GET_*_ID */
-- 
cgit v1.2.3


From bf8981a2aa082d9d64771b47c8a1c9c388d8cd40 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Apr 2019 10:44:06 +0200
Subject: netfilter: nf_nat: merge ip/ip6 masquerade headers

Both are now implemented by nf_nat_masquerade.c, so no need to keep
different headers.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_nat_masquerade.h | 15 ---------------
 include/net/netfilter/ipv6/nf_nat_masquerade.h | 11 -----------
 include/net/netfilter/nf_nat_masquerade.h      | 21 +++++++++++++++++++++
 3 files changed, 21 insertions(+), 26 deletions(-)
 delete mode 100644 include/net/netfilter/ipv4/nf_nat_masquerade.h
 delete mode 100644 include/net/netfilter/ipv6/nf_nat_masquerade.h
 create mode 100644 include/net/netfilter/nf_nat_masquerade.h

(limited to 'include')

diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h
deleted file mode 100644
index 13d55206bb9f..000000000000
--- a/include/net/netfilter/ipv4/nf_nat_masquerade.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NF_NAT_MASQUERADE_IPV4_H_
-#define _NF_NAT_MASQUERADE_IPV4_H_
-
-#include <net/netfilter/nf_nat.h>
-
-unsigned int
-nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
-		       const struct nf_nat_range2 *range,
-		       const struct net_device *out);
-
-int nf_nat_masquerade_ipv4_register_notifier(void);
-void nf_nat_masquerade_ipv4_unregister_notifier(void);
-
-#endif /*_NF_NAT_MASQUERADE_IPV4_H_ */
diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h
deleted file mode 100644
index 2917bf95c437..000000000000
--- a/include/net/netfilter/ipv6/nf_nat_masquerade.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NF_NAT_MASQUERADE_IPV6_H_
-#define _NF_NAT_MASQUERADE_IPV6_H_
-
-unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
-		       const struct net_device *out);
-int nf_nat_masquerade_ipv6_register_notifier(void);
-void nf_nat_masquerade_ipv6_unregister_notifier(void);
-
-#endif /* _NF_NAT_MASQUERADE_IPV6_H_ */
diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h
new file mode 100644
index 000000000000..cafe71822a53
--- /dev/null
+++ b/include/net/netfilter/nf_nat_masquerade.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NF_NAT_MASQUERADE_H_
+#define _NF_NAT_MASQUERADE_H_
+
+#include <net/netfilter/nf_nat.h>
+
+unsigned int
+nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
+		       const struct nf_nat_range2 *range,
+		       const struct net_device *out);
+
+int nf_nat_masquerade_ipv4_register_notifier(void);
+void nf_nat_masquerade_ipv4_unregister_notifier(void);
+
+unsigned int
+nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+		       const struct net_device *out);
+int nf_nat_masquerade_ipv6_register_notifier(void);
+void nf_nat_masquerade_ipv6_unregister_notifier(void);
+
+#endif /*_NF_NAT_MASQUERADE_H_ */
-- 
cgit v1.2.3


From 610a43149cabd0c7aa7bed19cbcf05a0249ab32a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 9 Apr 2019 10:44:08 +0200
Subject: netfilter: nf_nat_masquerade: unify ipv4/6 notifier registration

The only reason for having two different register functions was that
ipt_MASQUERADE and ip6t_MASQUERADE were two different modules.

The previous patch merged those into xt_MASQUERADE, so we can merge this too.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_masquerade.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h
index cafe71822a53..54a14d643c34 100644
--- a/include/net/netfilter/nf_nat_masquerade.h
+++ b/include/net/netfilter/nf_nat_masquerade.h
@@ -9,13 +9,11 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
 		       const struct nf_nat_range2 *range,
 		       const struct net_device *out);
 
-int nf_nat_masquerade_ipv4_register_notifier(void);
-void nf_nat_masquerade_ipv4_unregister_notifier(void);
+int nf_nat_masquerade_inet_register_notifiers(void);
+void nf_nat_masquerade_inet_unregister_notifiers(void);
 
 unsigned int
 nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
 		       const struct net_device *out);
-int nf_nat_masquerade_ipv6_register_notifier(void);
-void nf_nat_masquerade_ipv6_unregister_notifier(void);
 
 #endif /*_NF_NAT_MASQUERADE_H_ */
-- 
cgit v1.2.3


From c695865c5c9803f14eef2c99d8a49d9ad60a3383 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 11 Apr 2019 09:12:02 -0700
Subject: bpf: fix missing bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit b0b9395d865e ("bpf: support input __sk_buff context in
BPF_PROG_TEST_RUN") started using bpf_check_uarg_tail_zero in
BPF_PROG_TEST_RUN. However, bpf_check_uarg_tail_zero is not defined
for !CONFIG_BPF_SYSCALL:

net/bpf/test_run.c: In function ‘bpf_ctx_init’:
net/bpf/test_run.c:142:9: error: implicit declaration of function ‘bpf_check_uarg_tail_zero’ [-Werror=implicit-function-declaration]
   err = bpf_check_uarg_tail_zero(data_in, max_size, size);
         ^~~~~~~~~~~~~~~~~~~~~~~~

Let's not build net/bpf/test_run.c when CONFIG_BPF_SYSCALL is not set.

Reported-by: kbuild test robot <lkp@intel.com>
Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 65f7094c40b4..e4d4c1771ab0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -483,14 +483,6 @@ typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type,
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy);
 
-int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
-			  union bpf_attr __user *uattr);
-int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
-			  union bpf_attr __user *uattr);
-int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
-				     const union bpf_attr *kattr,
-				     union bpf_attr __user *uattr);
-
 /* an array of programs to be executed under rcu_lock.
  *
  * Typical usage:
@@ -681,6 +673,13 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type);
 int array_map_alloc_check(union bpf_attr *attr);
 
+int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
+			  union bpf_attr __user *uattr);
+int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
+			  union bpf_attr __user *uattr);
+int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
+				     const union bpf_attr *kattr,
+				     union bpf_attr __user *uattr);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -792,6 +791,27 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog,
+					const union bpf_attr *kattr,
+					union bpf_attr __user *uattr)
+{
+	return -ENOTSUPP;
+}
+
+static inline int bpf_prog_test_run_skb(struct bpf_prog *prog,
+					const union bpf_attr *kattr,
+					union bpf_attr __user *uattr)
+{
+	return -ENOTSUPP;
+}
+
+static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
+						   const union bpf_attr *kattr,
+						   union bpf_attr __user *uattr)
+{
+	return -ENOTSUPP;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
-- 
cgit v1.2.3


From 58dfc900faff6db7eb9bf01555622e0b6c74c262 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Tue, 9 Apr 2019 15:06:41 +0100
Subject: bpf: add layer 2 encap support to bpf_skb_adjust_room

commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags")
introduced support to bpf_skb_adjust_room for GSO-friendly GRE
and UDP encapsulation.

For GSO to work for skbs, the inner headers (mac and network) need to
be marked.  For L3 encapsulation using bpf_skb_adjust_room, the mac
and network headers are identical.  Here we provide a way of specifying
the inner mac header length for cases where L2 encap is desired.  Such
an approach can support encapsulated ethernet headers, MPLS headers etc.
For example to convert from a packet of form [eth][ip][tcp] to
[eth][ip][udp][inner mac][ip][tcp], something like the following could
be done:

	headroom = sizeof(iph) + sizeof(struct udphdr) + inner_maclen;

	ret = bpf_skb_adjust_room(skb, headroom, BPF_ADJ_ROOM_MAC,
				  BPF_F_ADJ_ROOM_ENCAP_L4_UDP |
				  BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
				  BPF_F_ADJ_ROOM_ENCAP_L2(inner_maclen));

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 31a27dd337dc..2e96d0b4bf65 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1523,6 +1523,10 @@ union bpf_attr {
  *		* **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
  *		  Use with ENCAP_L3 flags to further specify the tunnel type.
  *
+ *		* **BPF_F_ADJ_ROOM_ENCAP_L2(len) **:
+ *		  Use with ENCAP_L3/L4 flags to further specify the tunnel
+ *		  type; **len** is the length of the inner MAC header.
+ *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
  * 		previously done by the verifier are invalidated and must be
@@ -2664,10 +2668,16 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_adjust_room flags. */
 #define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
 
+#define	BPF_ADJ_ROOM_ENCAP_L2_MASK	0xff
+#define	BPF_ADJ_ROOM_ENCAP_L2_SHIFT	56
+
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)
+#define	BPF_F_ADJ_ROOM_ENCAP_L2(len)	(((__u64)len & \
+					  BPF_ADJ_ROOM_ENCAP_L2_MASK) \
+					 << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
 
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
-- 
cgit v1.2.3


From cc3a86c802f0ba9a2627aef256d95ae3b3fa6e91 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 9 Apr 2019 14:41:12 -0700
Subject: ipv6: Change rt6_probe to take a fib6_nh

rt6_probe sends probes for gateways in a nexthop. As such, it really
depends on a fib6_nh, not a fib entry. Move last_probe to fib6_nh and
update rt6_probe to take a fib6_nh struct.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 58dbb4e82908..2e9235adfa0d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -127,6 +127,10 @@ struct rt6_exception {
 
 struct fib6_nh {
 	struct fib_nh_common	nh_common;
+
+#ifdef CONFIG_IPV6_ROUTER_PREF
+	unsigned long		last_probe;
+#endif
 };
 
 struct fib6_info {
@@ -155,10 +159,6 @@ struct fib6_info {
 	struct rt6_info * __percpu	*rt6i_pcpu;
 	struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
 
-#ifdef CONFIG_IPV6_ROUTER_PREF
-	unsigned long			last_probe;
-#endif
-
 	u32				fib6_metric;
 	u8				fib6_protocol;
 	u8				fib6_type;
-- 
cgit v1.2.3


From 971502d77faa50a37c89bc6d172450294ad9a5fd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Apr 2019 16:36:41 +0200
Subject: bridge: netfilter: unroll NF_HOOK helper in bridge input path

Replace NF_HOOK() based invocation of the netfilter hooks with a private
copy of nf_hook_slow().

This copy has one difference: it can return the rx handler value expected
by the stack, i.e. RX_HANDLER_CONSUMED or RX_HANDLER_PASS.

This is needed by the next patch to invoke the ebtables
"broute" table via the standard netfilter hooks rather than the custom
"br_should_route_hook" indirection that is used now.

When the skb is to be "brouted", we must return RX_HANDLER_PASS from the
bridge rx input handler, but there is no way to indicate this via
NF_HOOK(), unless perhaps by some hack such as exposing bridge_cb in the
netfilter core or a percpu flag.

  text    data     bss     dec   filename
  3369      56       0    3425   net/bridge/br_input.o.before
  3458      40       0    3498   net/bridge/br_input.o.after

This allows removal of the "br_should_route_hook" in the next patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index a50a69f5334c..7239105d9d2e 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -119,4 +119,7 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family,
 	return queue;
 }
 
+int nf_queue(struct sk_buff *skb, struct nf_hook_state *state,
+	     const struct nf_hook_entries *entries, unsigned int index,
+	     unsigned int verdict);
 #endif /* _NF_QUEUE_H */
-- 
cgit v1.2.3


From 223fd0adfa8af36d5d9b5d38016e579ee052f367 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 11 Apr 2019 16:36:42 +0200
Subject: bridge: broute: make broute a real ebtables table

This makes broute a normal ebtables table, hooking at PREROUTING.
The broute hook is removed.

It uses skb->cb to signal to bridge rx handler that the skb should be
routed instead of being bridged.

This change is backwards compatible with ebtables as no userspace visible
parts are changed.

This means we can also remove the !ops test in ebt_register_table;
it was only there for the broute table's sake.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/if_bridge.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 627b788ba0ff..ef0819ced0fc 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -56,9 +56,6 @@ struct br_ip_list {
 
 extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
 
-typedef int br_should_route_hook_t(struct sk_buff *skb);
-extern br_should_route_hook_t __rcu *br_should_route_hook;
-
 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)
 int br_multicast_list_adjacent(struct net_device *dev,
 			       struct list_head *br_ip_list);
-- 
cgit v1.2.3


From 013b96ec64616b57fc631b304dfcecc5bc288f90 Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Thu, 11 Apr 2019 15:02:07 -0700
Subject: sctp: Pass sk_buff_head explicitly to sctp_ulpq_tail_event().

Now the SKB list implementation assumption can be removed.

And now that we know that the list head is always non-NULL
we can remove the code blocks dealing with that as well.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/ulpqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index bb0ecba3db2b..f4ac7117ff29 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -59,7 +59,7 @@ void sctp_ulpq_free(struct sctp_ulpq *);
 int sctp_ulpq_tail_data(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
 
 /* Add a new event for propagation to the ULP. */
-int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sctp_ulpevent *ev);
+int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sk_buff_head *skb_list);
 
 /* Renege previously received chunks.  */
 void sctp_ulpq_renege(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
-- 
cgit v1.2.3


From 7b146cebe30cb481b0f70d85779da938da818637 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 27 Feb 2019 12:59:24 -0800
Subject: bpf: Sysctl hook

Containerized applications may run as root, and this may create problems
for the whole host. Specifically, such applications may change a sysctl
and affect applications in other containers.

Furthermore, in existing infrastructure it may not be possible to simply
disable writing to sysctls completely. Instead, such a process should be
gradual, with the ability to log which sysctls are being changed by a
container, investigate, limit the set of writable sysctls to the
currently used ones (so that new ones cannot be changed) and eventually
reduce this set to zero.

The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and
attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis.

The new program type has access to the following minimal context:
	struct bpf_sysctl {
		__u32	write;
	};

Where @write indicates whether sysctl is being read (= 0) or written (=
1).

Helpers to access sysctl name and value will be introduced separately.

BPF_CGROUP_SYSCTL attach point is added to sysctl code right before
passing control to ctl_table->proc_handler so that BPF program can
either allow or deny access to sysctl.

Suggested-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h | 18 ++++++++++++++++++
 include/linux/bpf_types.h  |  1 +
 include/linux/filter.h     |  8 ++++++++
 include/uapi/linux/bpf.h   |  9 +++++++++
 4 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index a4c644c1c091..b1c45da20a26 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -17,6 +17,8 @@ struct bpf_map;
 struct bpf_prog;
 struct bpf_sock_ops_kern;
 struct bpf_cgroup_storage;
+struct ctl_table;
+struct ctl_table_header;
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -109,6 +111,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
+				   struct ctl_table *table, int write,
+				   enum bpf_attach_type type);
+
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
 {
@@ -253,6 +259,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 									      \
 	__ret;								      \
 })
+
+
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
+						       BPF_CGROUP_SYSCTL);     \
+	__ret;								       \
+})
+
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -321,6 +338,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 08bf2f1fe553..d26991a16894 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -28,6 +28,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6074aa064b54..a17732057880 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -33,6 +33,8 @@ struct bpf_prog_aux;
 struct xdp_rxq_info;
 struct xdp_buff;
 struct sock_reuseport;
+struct ctl_table;
+struct ctl_table_header;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -1177,4 +1179,10 @@ struct bpf_sock_ops_kern {
 					 */
 };
 
+struct bpf_sysctl_kern {
+	struct ctl_table_header *head;
+	struct ctl_table *table;
+	int write;
+};
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e96d0b4bf65..cc2a2466d5f3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -167,6 +167,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
+	BPF_PROG_TYPE_CGROUP_SYSCTL,
 };
 
 enum bpf_attach_type {
@@ -188,6 +189,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_SENDMSG,
 	BPF_LIRC_MODE2,
 	BPF_FLOW_DISSECTOR,
+	BPF_CGROUP_SYSCTL,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -3308,4 +3310,11 @@ struct bpf_line_info {
 struct bpf_spin_lock {
 	__u32	val;
 };
+
+struct bpf_sysctl {
+	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
+				 * Allows 1,2,4-byte read, but no write.
+				 */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 808649fb787d918a48a360a668ee4ee9023f0c11 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Wed, 27 Feb 2019 13:28:48 -0800
Subject: bpf: Introduce bpf_sysctl_get_name helper

Add bpf_sysctl_get_name() helper to copy the sysctl name (/proc/sys/
entry) into a buffer provided by the BPF_PROG_TYPE_CGROUP_SYSCTL program.

By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem".

If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied,
e.g. "tcp_mem".

Documentation for the new helper is provided in bpf.h UAPI.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cc2a2466d5f3..9c8a2f3ccb9b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2506,6 +2506,22 @@ union bpf_attr {
  * 	Return
  * 		0 if iph and th are a valid SYN cookie ACK, or a negative error
  * 		otherwise.
+ *
+ * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags)
+ *	Description
+ *		Get name of sysctl in /proc/sys/ and copy it into provided by
+ *		program buffer *buf* of size *buf_len*.
+ *
+ *		The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ *		If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
+ *		copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
+ *		only (e.g. "tcp_mem").
+ *	Return
+ *		Number of character copied (not including the trailing NUL).
+ *
+ *		**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ *		truncated name in this case).
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2608,7 +2624,8 @@ union bpf_attr {
 	FN(skb_ecn_set_ce),		\
 	FN(get_listener_sock),		\
 	FN(skc_lookup_tcp),		\
-	FN(tcp_check_syncookie),
+	FN(tcp_check_syncookie),	\
+	FN(sysctl_get_name),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2681,6 +2698,9 @@ enum bpf_func_id {
 					  BPF_ADJ_ROOM_ENCAP_L2_MASK) \
 					 << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
 
+/* BPF_FUNC_sysctl_get_name flags. */
+#define BPF_F_SYSCTL_BASE_NAME		(1ULL << 0)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
-- 
cgit v1.2.3


From 1d11b3016cec4ed9770b98e82a61708c8f4926e7 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Thu, 28 Feb 2019 19:22:15 -0800
Subject: bpf: Introduce bpf_sysctl_get_current_value helper

Add bpf_sysctl_get_current_value() helper to copy the current sysctl
value into a buffer provided by the BPF_PROG_TYPE_CGROUP_SYSCTL program.

It provides the same string that user space sees when reading the
corresponding file in /proc/sys/, including the newline, etc.

Documentation for the new helper is provided in bpf.h UAPI.

Since current value is kept in ctl_table->data in a parsed form,
ctl_table->proc_handler() with write=0 is called to read that data and
convert it to a string. Such a string can later be parsed by a program
using helpers that will be introduced separately.

Unfortunately, it's not trivial to provide an API to access parsed data
due to the variety of data representations (string, intvec, uintvec,
ulongvec, custom structures, even NULL, etc). Instead, it's assumed that
the user knows how to handle the specific sysctl they're interested in
and that appropriate helpers can be used.

Since ctl_table->proc_handler() expects a __user buffer, the
kernel-allocated buffer where the value is stored is converted to __user.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h   |  2 ++
 include/uapi/linux/bpf.h | 22 +++++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index a17732057880..f254ff92819f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1182,6 +1182,8 @@ struct bpf_sock_ops_kern {
 struct bpf_sysctl_kern {
 	struct ctl_table_header *head;
 	struct ctl_table *table;
+	void *cur_val;
+	size_t cur_len;
 	int write;
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9c8a2f3ccb9b..063543afa359 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2522,6 +2522,25 @@ union bpf_attr {
  *
  *		**-E2BIG** if the buffer wasn't big enough (*buf* will contain
  *		truncated name in this case).
+ *
+ * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
+ *	Description
+ *		Get current value of sysctl as it is presented in /proc/sys
+ *		(incl. newline, etc), and copy it as a string into provided
+ *		by program buffer *buf* of size *buf_len*.
+ *
+ *		The whole value is copied, no matter what file position user
+ *		space issued e.g. sys_read at.
+ *
+ *		The buffer is always NUL terminated, unless it's zero-sized.
+ *	Return
+ *		Number of character copied (not including the trailing NUL).
+ *
+ *		**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ *		truncated name in this case).
+ *
+ *		**-EINVAL** if current value was unavailable, e.g. because
+ *		sysctl is uninitialized and read returns -EIO for it.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2625,7 +2644,8 @@ union bpf_attr {
 	FN(get_listener_sock),		\
 	FN(skc_lookup_tcp),		\
 	FN(tcp_check_syncookie),	\
-	FN(sysctl_get_name),
+	FN(sysctl_get_name),		\
+	FN(sysctl_get_current_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From 4e63acdff864654cee0ac5aaeda3913798ee78f6 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Thu, 7 Mar 2019 18:38:43 -0800
Subject: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers

Add helpers to work with new value being written to sysctl by user
space.

bpf_sysctl_get_new_value() copies value being written to sysctl into
provided buffer.

bpf_sysctl_set_new_value() overrides the new value being written by user
space with the one from the provided buffer. The buffer should contain a
string representation of the value, similar to what can be seen in
/proc/sys/.

Both helpers can be used only on sysctl write.

File position matters and can be managed by an interface that will be
introduced separately. E.g. if user space calls sys_write to a file in
/proc/sys/ at file position = X, where X > 0, then the value set by
bpf_sysctl_set_new_value() will be written starting from X. If program
wants to override whole value with specified buffer, file position has
to be set to zero.

Documentation for the new helpers is provided in bpf.h UAPI.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h |  8 +++++---
 include/linux/filter.h     |  3 +++
 include/uapi/linux/bpf.h   | 38 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 45 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index b1c45da20a26..1e97271f9a10 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -113,7 +113,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
-				   enum bpf_attach_type type);
+				   void __user *buf, size_t *pcount,
+				   void **new_buf, enum bpf_attach_type type);
 
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
@@ -261,11 +262,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 })
 
 
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write)			       \
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf)       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)						       \
 		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
+						       buf, count, nbuf,       \
 						       BPF_CGROUP_SYSCTL);     \
 	__ret;								       \
 })
@@ -338,7 +340,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f254ff92819f..a23653f9460c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1184,6 +1184,9 @@ struct bpf_sysctl_kern {
 	struct ctl_table *table;
 	void *cur_val;
 	size_t cur_len;
+	void *new_val;
+	size_t new_len;
+	int new_updated;
 	int write;
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 063543afa359..547b8258d731 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2541,6 +2541,40 @@ union bpf_attr {
  *
  *		**-EINVAL** if current value was unavailable, e.g. because
  *		sysctl is uninitialized and read returns -EIO for it.
+ *
+ * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
+ *	Description
+ *		Get new value being written by user space to sysctl (before
+ *		the actual write happens) and copy it as a string into
+ *		provided by program buffer *buf* of size *buf_len*.
+ *
+ *		User space may write new value at file position > 0.
+ *
+ *		The buffer is always NUL terminated, unless it's zero-sized.
+ *	Return
+ *		Number of character copied (not including the trailing NUL).
+ *
+ *		**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ *		truncated name in this case).
+ *
+ *		**-EINVAL** if sysctl is being read.
+ *
+ * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len)
+ *	Description
+ *		Override new value being written by user space to sysctl with
+ *		value provided by program in buffer *buf* of size *buf_len*.
+ *
+ *		*buf* should contain a string in same form as provided by user
+ *		space on sysctl write.
+ *
+ *		User space may write new value at file position > 0. To override
+ *		the whole sysctl value file position should be set to zero.
+ *	Return
+ *		0 on success.
+ *
+ *		**-E2BIG** if the *buf_len* is too big.
+ *
+ *		**-EINVAL** if sysctl is being read.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2645,7 +2679,9 @@ union bpf_attr {
 	FN(skc_lookup_tcp),		\
 	FN(tcp_check_syncookie),	\
 	FN(sysctl_get_name),		\
-	FN(sysctl_get_current_value),
+	FN(sysctl_get_current_value),	\
+	FN(sysctl_get_new_value),	\
+	FN(sysctl_set_new_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From e1550bfe0de47e30484ba91de1e50a91ec1c31f5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Thu, 7 Mar 2019 18:50:52 -0800
Subject: bpf: Add file_pos field to bpf_sysctl ctx

Add file_pos field to bpf_sysctl context to read and write sysctl file
position at which sysctl is being accessed (read or written).

The field can be used to e.g. override whole sysctl value on write to
sysctl even when sys_write is called by user space with file_pos > 0. Or
BPF program may reject such accesses.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h | 9 +++++----
 include/linux/filter.h     | 3 +++
 include/uapi/linux/bpf.h   | 3 +++
 3 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 1e97271f9a10..cb3c6b3b89c8 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -114,7 +114,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
 				   void __user *buf, size_t *pcount,
-				   void **new_buf, enum bpf_attach_type type);
+				   loff_t *ppos, void **new_buf,
+				   enum bpf_attach_type type);
 
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
@@ -262,12 +263,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 })
 
 
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf)       \
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf)  \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)						       \
 		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
-						       buf, count, nbuf,       \
+						       buf, count, pos, nbuf,  \
 						       BPF_CGROUP_SYSCTL);     \
 	__ret;								       \
 })
@@ -340,7 +341,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a23653f9460c..fb0edad75971 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1188,6 +1188,9 @@ struct bpf_sysctl_kern {
 	size_t new_len;
 	int new_updated;
 	int write;
+	loff_t *ppos;
+	/* Temporary "register" for indirect stores to ppos. */
+	u64 tmp_reg;
 };
 
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 547b8258d731..89976de909af 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3391,6 +3391,9 @@ struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
 				 */
+	__u32	file_pos;	/* Sysctl file position to read from, write to.
+				 * Allows 1,2,4-byte read and 4-byte write.
+				 */
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.2.3


From 57c3bb725a3dd97d960d7e1cd0845d88de53217f Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Mon, 18 Mar 2019 16:57:10 -0700
Subject: bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types

Currently the way to pass result from BPF helper to BPF program is to
provide memory area defined by pointer and size: func(void *, size_t).

It works great for generic use-case, but for simple types, such as int,
it's overkill and consumes two arguments when it could use just one.

Introduce new argument types ARG_PTR_TO_INT and ARG_PTR_TO_LONG to be
able to pass result from helper to program via pointer to int and long
correspondingly: func(int *) or func(long *).

New argument types are similar to ARG_PTR_TO_MEM with the following
differences:
* they don't require corresponding ARG_CONST_SIZE argument, predefined
  access sizes are used instead (32bit for int, 64bit for long);
* it's possible to use more than one such argument in a helper;
* provided pointers have to be aligned.

It's easy to introduce similar ARG_PTR_TO_CHAR and ARG_PTR_TO_SHORT
argument types. It's not done due to lack of use-case though.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e4d4c1771ab0..fd06ada941ad 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -202,6 +202,8 @@ enum bpf_arg_type {
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
+	ARG_PTR_TO_INT,		/* pointer to int */
+	ARG_PTR_TO_LONG,	/* pointer to long */
 };
 
 /* type of values returned from helper functions */
-- 
cgit v1.2.3


From d7a4cb9b6705a89937d12c8158a35a3145dc967a Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Mon, 18 Mar 2019 17:55:26 -0700
Subject: bpf: Introduce bpf_strtol and bpf_strtoul helpers

Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned
long correspondingly. It's similar to user space strtol(3) and
strtoul(3) with a few changes to the API:

* instead of NUL-terminated C string the helpers expect buffer and
  buffer length;

* resulting long or unsigned long is returned in a separate
  result-argument;

* return value is used to indicate success or failure, on success number
  of consumed bytes is returned that can be used to identify position to
  read next if the buffer is expected to contain multiple integers;

* instead of *base* argument, *flags* is used that provides base in 5
  LSB, other bits are reserved for future use;

* number of supported bases is limited.

Documentation for the new helpers is provided in bpf.h UAPI.

The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs to
be able to convert string input to e.g. "ulongvec" output.

E.g. "net/ipv4/tcp_mem" consists of three ulong integers. They can be
parsed by calling bpf_strtoul three times.

Implementation notes:

Implementation includes "../../lib/kstrtox.h" to reuse integer parsing
functions. It's done exactly the same way as fs/proc/base.c already does.

Unfortunately existing kstrtoX functions can't be used directly since
they fail if any invalid character is present right after integer in the
string. Existing simple_strtoX functions can't be used either since
they're obsolete and don't handle overflow properly.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fd06ada941ad..f15432d90728 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -989,6 +989,8 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
 extern const struct bpf_func_proto bpf_spin_lock_proto;
 extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
+extern const struct bpf_func_proto bpf_strtol_proto;
+extern const struct bpf_func_proto bpf_strtoul_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 89976de909af..c26be24fd5e2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2575,6 +2575,53 @@ union bpf_attr {
  *		**-E2BIG** if the *buf_len* is too big.
  *
  *		**-EINVAL** if sysctl is being read.
+ *
+ * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res)
+ *	Description
+ *		Convert the initial part of the string from buffer *buf* of
+ *		size *buf_len* to a long integer according to the given base
+ *		and save the result in *res*.
+ *
+ *		The string may begin with an arbitrary amount of white space
+ *		(as determined by isspace(3)) followed by a single optional '-'
+ *		sign.
+ *
+ *		Five least significant bits of *flags* encode base, other bits
+ *		are currently unused.
+ *
+ *		Base must be either 8, 10, 16 or 0 to detect it automatically
+ *		similar to user space strtol(3).
+ *	Return
+ *		Number of characters consumed on success. Must be positive but
+ *		no more than buf_len.
+ *
+ *		**-EINVAL** if no valid digits were found or unsupported base
+ *		was provided.
+ *
+ *		**-ERANGE** if resulting value was out of range.
+ *
+ * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res)
+ *	Description
+ *		Convert the initial part of the string from buffer *buf* of
+ *		size *buf_len* to an unsigned long integer according to the
+ *		given base and save the result in *res*.
+ *
+ *		The string may begin with an arbitrary amount of white space
+ *		(as determined by isspace(3)).
+ *
+ *		Five least significant bits of *flags* encode base, other bits
+ *		are currently unused.
+ *
+ *		Base must be either 8, 10, 16 or 0 to detect it automatically
+ *		similar to user space strtoul(3).
+ *	Return
+ *		Number of characters consumed on success. Must be positive but
+ *		no more than buf_len.
+ *
+ *		**-EINVAL** if no valid digits were found or unsupported base
+ *		was provided.
+ *
+ *		**-ERANGE** if resulting value was out of range.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2681,7 +2728,9 @@ union bpf_attr {
 	FN(sysctl_get_name),		\
 	FN(sysctl_get_current_value),	\
 	FN(sysctl_get_new_value),	\
-	FN(sysctl_set_new_value),
+	FN(sysctl_set_new_value),	\
+	FN(strtol),			\
+	FN(strtoul),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
-- 
cgit v1.2.3


From e4edbe3c1f44c84f319149aeb998e7e36b3b897f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 12 Apr 2019 11:52:07 +1000
Subject: rhashtable: fix some __rcu annotation errors

With these annotations, the rhashtable now gets no
warnings when compiled with "C=1" for sparse checking.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 460c0eaf6b96..2711cbf01b64 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -40,7 +40,7 @@
  * the chain.  To avoid dereferencing this pointer without clearing
  * the bit first, we use an opaque 'struct rhash_lock_head *' for the
  * pointer stored in the bucket.  This struct needs to be defined so
- * that rcu_derefernce() works on it, but it has no content so a
+ * that rcu_dereference() works on it, but it has no content so a
  * cast is needed for it to be useful.  This ensures it isn't
  * used by mistake with clearing the lock bit first.
  */
@@ -130,10 +130,10 @@ static inline void rht_unlock(struct bucket_table *tbl,
 }
 
 static inline void rht_assign_unlock(struct bucket_table *tbl,
-				     struct rhash_lock_head **bkt,
+				     struct rhash_lock_head __rcu **bkt,
 				     struct rhash_head *obj)
 {
-	struct rhash_head **p = (struct rhash_head **)bkt;
+	struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
 
 	lock_map_release(&tbl->dep_map);
 	rcu_assign_pointer(*p, obj);
@@ -556,6 +556,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 	};
 	struct rhash_lock_head __rcu * const *bkt;
 	struct bucket_table *tbl;
+	struct rhash_head __rcu *head;
 	struct rhash_head *he;
 	unsigned int hash;
 
@@ -564,8 +565,8 @@ restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	bkt = rht_bucket(tbl, hash);
 	do {
-		he = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash));
-		rht_for_each_rcu_from(he, he, tbl, hash) {
+		head = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash));
+		rht_for_each_rcu_from(he, head, tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
-- 
cgit v1.2.3


From c5783311a1248c437614d438b69c5f31fe483ecb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 12 Apr 2019 11:52:08 +1000
Subject: rhashtable: reorder some inline functions and macros.

This patch only moves some code around, it doesn't
change the code at all.
A subsequent patch will benefit from this as it needs
to add calls to functions which are now defined before the
call-site, but weren't before.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 142 ++++++++++++++++++++++-----------------------
 1 file changed, 71 insertions(+), 71 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 2711cbf01b64..c504cd820736 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -87,77 +87,6 @@ struct bucket_table {
 	struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-/*
- * We lock a bucket by setting BIT(1) in the pointer - this is always
- * zero in real pointers and in the nulls marker.
- * bit_spin_locks do not handle contention well, but the whole point
- * of the hashtable design is to achieve minimum per-bucket contention.
- * A nested hash table might not have a bucket pointer.  In that case
- * we cannot get a lock.  For remove and replace the bucket cannot be
- * interesting and doesn't need locking.
- * For insert we allocate the bucket if this is the last bucket_table,
- * and then take the lock.
- * Sometimes we unlock a bucket by writing a new pointer there.  In that
- * case we don't need to unlock, but we do need to reset state such as
- * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
- * provides the same release semantics that bit_spin_unlock() provides,
- * this is safe.
- */
-
-static inline void rht_lock(struct bucket_table *tbl,
-			    struct rhash_lock_head **bkt)
-{
-	local_bh_disable();
-	bit_spin_lock(1, (unsigned long *)bkt);
-	lock_map_acquire(&tbl->dep_map);
-}
-
-static inline void rht_lock_nested(struct bucket_table *tbl,
-				   struct rhash_lock_head **bucket,
-				   unsigned int subclass)
-{
-	local_bh_disable();
-	bit_spin_lock(1, (unsigned long *)bucket);
-	lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
-}
-
-static inline void rht_unlock(struct bucket_table *tbl,
-			      struct rhash_lock_head **bkt)
-{
-	lock_map_release(&tbl->dep_map);
-	bit_spin_unlock(1, (unsigned long *)bkt);
-	local_bh_enable();
-}
-
-static inline void rht_assign_unlock(struct bucket_table *tbl,
-				     struct rhash_lock_head __rcu **bkt,
-				     struct rhash_head *obj)
-{
-	struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
-
-	lock_map_release(&tbl->dep_map);
-	rcu_assign_pointer(*p, obj);
-	preempt_enable();
-	__release(bitlock);
-	local_bh_enable();
-}
-
-/*
- * If 'p' is a bucket head and might be locked:
- *   rht_ptr() returns the address without the lock bit.
- *   rht_ptr_locked() returns the address WITH the lock bit.
- */
-static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p)
-{
-	return (void *)(((unsigned long)p) & ~BIT(1));
-}
-
-static inline struct rhash_lock_head __rcu *rht_ptr_locked(const
-							   struct rhash_head *p)
-{
-	return (void *)(((unsigned long)p) | BIT(1));
-}
-
 /*
  * NULLS_MARKER() expects a hash value with the low
  * bits mostly likely to be significant, and it discards
@@ -372,6 +301,77 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert(
 				     &tbl->buckets[hash];
 }
 
+/*
+ * We lock a bucket by setting BIT(1) in the pointer - this is always
+ * zero in real pointers and in the nulls marker.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer.  In that case
+ * we cannot get a lock.  For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there.  In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
+ * provides the same release semantics that bit_spin_unlock() provides,
+ * this is safe.
+ */
+
+static inline void rht_lock(struct bucket_table *tbl,
+			    struct rhash_lock_head **bkt)
+{
+	local_bh_disable();
+	bit_spin_lock(1, (unsigned long *)bkt);
+	lock_map_acquire(&tbl->dep_map);
+}
+
+static inline void rht_lock_nested(struct bucket_table *tbl,
+				   struct rhash_lock_head **bucket,
+				   unsigned int subclass)
+{
+	local_bh_disable();
+	bit_spin_lock(1, (unsigned long *)bucket);
+	lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
+}
+
+static inline void rht_unlock(struct bucket_table *tbl,
+			      struct rhash_lock_head **bkt)
+{
+	lock_map_release(&tbl->dep_map);
+	bit_spin_unlock(1, (unsigned long *)bkt);
+	local_bh_enable();
+}
+
+/*
+ * If 'p' is a bucket head and might be locked:
+ *   rht_ptr() returns the address without the lock bit.
+ *   rht_ptr_locked() returns the address WITH the lock bit.
+ */
+static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p)
+{
+	return (void *)(((unsigned long)p) & ~BIT(1));
+}
+
+static inline struct rhash_lock_head __rcu *rht_ptr_locked(const
+							   struct rhash_head *p)
+{
+	return (void *)(((unsigned long)p) | BIT(1));
+}
+
+static inline void rht_assign_unlock(struct bucket_table *tbl,
+				     struct rhash_lock_head __rcu **bkt,
+				     struct rhash_head *obj)
+{
+	struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
+
+	lock_map_release(&tbl->dep_map);
+	rcu_assign_pointer(*p, obj);
+	preempt_enable();
+	__release(bitlock);
+	local_bh_enable();
+}
+
 /**
  * rht_for_each_from - iterate over hash chain from given head
  * @pos:	the &struct rhash_head to use as a loop cursor.
-- 
cgit v1.2.3


From adc6a3ab192eb40fb9d8b093c87d9aa785af4513 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 12 Apr 2019 11:52:08 +1000
Subject: rhashtable: move dereference inside rht_ptr()

Rather than dereferencing a pointer to a bucket and then passing the
result to rht_ptr(), we now pass in the pointer and do the dereference
in rht_ptr().

This requires that we pass in the tbl and hash as well to support RCU
checks, and means that the various rht_for_each functions can expect a
pointer that can be dereferenced without further care.

There are two places where we dereference a bucket pointer
where there is no testable protection - in each case we know
that we must have exclusive access without having taken a lock.
The previous code used rht_dereference() to pretend that holding
the mutex provided protection, but holding the mutex never provides
protection for accessing buckets.

So instead introduce rht_ptr_exclusive() that can be used when
there is known to be exclusive access without holding any locks.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 69 ++++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index c504cd820736..b54e6436547e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -344,12 +344,28 @@ static inline void rht_unlock(struct bucket_table *tbl,
 }
 
 /*
- * If 'p' is a bucket head and might be locked:
- *   rht_ptr() returns the address without the lock bit.
- *   rht_ptr_locked() returns the address WITH the lock bit.
+ * Where 'bkt' is a bucket and might be locked:
+ *   rht_ptr() dereferences that pointer and clears the lock bit.
+ *   rht_ptr_exclusive() dereferences in a context where exclusive
+ *            access is guaranteed, such as when destroying the table.
  */
-static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p)
+static inline struct rhash_head *rht_ptr(
+	struct rhash_lock_head __rcu * const *bkt,
+	struct bucket_table *tbl,
+	unsigned int hash)
 {
+	const struct rhash_lock_head *p =
+		rht_dereference_bucket_rcu(*bkt, tbl, hash);
+
+	return (void *)(((unsigned long)p) & ~BIT(1));
+}
+
+static inline struct rhash_head *rht_ptr_exclusive(
+	struct rhash_lock_head __rcu * const *bkt)
+{
+	const struct rhash_lock_head *p =
+		rcu_dereference_protected(*bkt, 1);
+
 	return (void *)(((unsigned long)p) & ~BIT(1));
 }
 
@@ -380,8 +396,8 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each_from(pos, head, tbl, hash) \
-	for (pos = rht_dereference_bucket(head, tbl, hash); \
-	     !rht_is_a_nulls(pos); \
+	for (pos = head;			\
+	     !rht_is_a_nulls(pos);		\
 	     pos = rht_dereference_bucket((pos)->next, tbl, hash))
 
 /**
@@ -391,7 +407,8 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_from(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash)
+	rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash),  \
+			  tbl, hash)
 
 /**
  * rht_for_each_entry_from - iterate over hash chain from given head
@@ -403,7 +420,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)	\
-	for (pos = rht_dereference_bucket(head, tbl, hash);		\
+	for (pos = head;						\
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	\
 	     pos = rht_dereference_bucket((pos)->next, tbl, hash))
 
@@ -416,8 +433,9 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_from(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \
-				    tbl, hash, member)
+	rht_for_each_entry_from(tpos, pos,				\
+				rht_ptr(rht_bucket(tbl, hash), tbl, hash), \
+				tbl, hash, member)
 
 /**
  * rht_for_each_entry_safe - safely iterate over hash chain of given type
@@ -432,8 +450,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  * remove the loop cursor from the list.
  */
 #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	      \
-	for (pos = rht_dereference_bucket(rht_ptr(*rht_bucket(tbl, hash)),    \
-					  tbl, hash),			      \
+	for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash),		      \
 	     next = !rht_is_a_nulls(pos) ?				      \
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	      \
@@ -454,7 +471,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_rcu_from(pos, head, tbl, hash)			\
 	for (({barrier(); }),						\
-	     pos = rht_dereference_bucket_rcu(head, tbl, hash);		\
+	     pos = head;						\
 	     !rht_is_a_nulls(pos);					\
 	     pos = rcu_dereference_raw(pos->next))
 
@@ -469,10 +486,9 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_rcu(pos, tbl, hash)			\
-	for (({barrier(); }),						\
-	     pos = rht_ptr(rht_dereference_bucket_rcu(			\
-				   *rht_bucket(tbl, hash), tbl, hash));	\
-	     !rht_is_a_nulls(pos);					\
+	for (({barrier(); }),					\
+	     pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash);	\
+	     !rht_is_a_nulls(pos);				\
 	     pos = rcu_dereference_raw(pos->next))
 
 /**
@@ -490,7 +506,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
 	for (({barrier(); }),						    \
-	     pos = rht_dereference_bucket_rcu(head, tbl, hash);		    \
+	     pos = head;						    \
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	    \
 	     pos = rht_dereference_bucket_rcu(pos->next, tbl, hash))
 
@@ -508,8 +524,9 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
 	rht_for_each_entry_rcu_from(tpos, pos,				   \
-					rht_ptr(*rht_bucket(tbl, hash)),   \
-					tbl, hash, member)
+				    rht_ptr(rht_bucket(tbl, hash),	   \
+					    tbl, hash),			   \
+				    tbl, hash, member)
 
 /**
  * rhl_for_each_rcu - iterate over rcu hash table list
@@ -556,7 +573,6 @@ static inline struct rhash_head *__rhashtable_lookup(
 	};
 	struct rhash_lock_head __rcu * const *bkt;
 	struct bucket_table *tbl;
-	struct rhash_head __rcu *head;
 	struct rhash_head *he;
 	unsigned int hash;
 
@@ -565,8 +581,7 @@ restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	bkt = rht_bucket(tbl, hash);
 	do {
-		head = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash));
-		rht_for_each_rcu_from(he, head, tbl, hash) {
+		rht_for_each_rcu_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
@@ -699,7 +714,7 @@ slow_path:
 		return rhashtable_insert_slow(ht, key, obj);
 	}
 
-	rht_for_each_from(head, rht_ptr(*bkt), tbl, hash) {
+	rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -744,7 +759,7 @@ slow_path:
 		goto slow_path;
 
 	/* Inserting at head of list makes unlocking free. */
-	head = rht_ptr(rht_dereference_bucket(*bkt, tbl, hash));
+	head = rht_ptr(bkt, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -971,7 +986,7 @@ static inline int __rhashtable_remove_fast_one(
 	pprev = NULL;
 	rht_lock(tbl, bkt);
 
-	rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) {
+	rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -1130,7 +1145,7 @@ static inline int __rhashtable_replace_fast(
 	pprev = NULL;
 	rht_lock(tbl, bkt);
 
-	rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) {
+	rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
-- 
cgit v1.2.3


From f4712b46a529ca2da078c82d5d99d367c7ebf82b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 12 Apr 2019 11:52:08 +1000
Subject: rhashtable: replace rht_ptr_locked() with rht_assign_locked()

The only times rht_ptr_locked() is used, it is to store a new
value in a bucket-head.  This is the only time it makes sense
to use it too.  So replace it by a function which does the
whole task:  Sets the lock bit and assigns to a bucket head.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index b54e6436547e..882bc0fcea4b 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -316,6 +316,7 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert(
  * local_bh. For that we have rht_assign_unlock().  As rcu_assign_pointer()
  * provides the same release semantics that bit_spin_unlock() provides,
  * this is safe.
+ * When we write to a bucket without unlocking, we use rht_assign_locked().
  */
 
 static inline void rht_lock(struct bucket_table *tbl,
@@ -369,10 +370,12 @@ static inline struct rhash_head *rht_ptr_exclusive(
 	return (void *)(((unsigned long)p) & ~BIT(1));
 }
 
-static inline struct rhash_lock_head __rcu *rht_ptr_locked(const
-							   struct rhash_head *p)
+static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
+				     struct rhash_head *obj)
 {
-	return (void *)(((unsigned long)p) | BIT(1));
+	struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
+
+	rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(1)));
 }
 
 static inline void rht_assign_unlock(struct bucket_table *tbl,
-- 
cgit v1.2.3


From ca0b709d1a07b1fe1fb356d8d58f220287f85672 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 12 Apr 2019 11:52:08 +1000
Subject: rhashtable: use BIT(0) for locking.

As reported by Guenter Roeck, the new bit-locking using
BIT(1) doesn't work on the m68k architecture.  m68k only requires
2-byte alignment for words and longwords, so there is only one
unused bit in pointers to structs - we currently use two, one for the
NULLS marker at the end of the linked list, and one for the bit-lock
in the head of the list.

The two uses don't need to conflict as we never need the head of the
list to be a NULLS marker - the marker is only needed to check if an
object has moved to a different table, and the bucket head cannot
move.  The NULLS marker is only needed in a ->next pointer.

As we already have different types for the bucket head pointer (struct
rhash_lock_head) and the ->next pointers (struct rhash_head), it is
fairly easy to treat the lsb differently in each.

So: Initialize buckets heads to NULL, and use the lsb for locking.
When loading the pointer from the bucket head, if it is NULL (ignoring
the lock bit), report as being the expected NULLS marker.
When storing a value into a bucket head, if it is a NULLS marker,
store NULL instead.

And convert all places that used bit 1 for locking, to use bit 0.

Fixes: 8f0db018006a ("rhashtable: use bit_spin_locks to protect hash bucket.")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 882bc0fcea4b..f7714d3b46bd 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -35,7 +35,7 @@
  * the least significant bit set but otherwise stores the address of
  * the hash bucket.  This allows us to be be sure we've found the end
  * of the right list.
- * The value stored in the hash bucket has BIT(2) used as a lock bit.
+ * The value stored in the hash bucket has BIT(0) used as a lock bit.
  * This bit must be atomically set before any changes are made to
  * the chain.  To avoid dereferencing this pointer without clearing
  * the bit first, we use an opaque 'struct rhash_lock_head *' for the
@@ -91,15 +91,19 @@ struct bucket_table {
  * NULLS_MARKER() expects a hash value with the low
  * bits mostly likely to be significant, and it discards
  * the msb.
- * We git it an address, in which the bottom 2 bits are
+ * We give it an address, in which the bottom bit is
  * always 0, and the msb might be significant.
  * So we shift the address down one bit to align with
  * expectations and avoid losing a significant bit.
+ *
+ * We never store the NULLS_MARKER in the hash table
+ * itself as we need the lsb for locking.
+ * Instead we store a NULL
  */
 #define	RHT_NULLS_MARKER(ptr)	\
 	((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
 #define INIT_RHT_NULLS_HEAD(ptr)	\
-	((ptr) = RHT_NULLS_MARKER(&(ptr)))
+	((ptr) = NULL)
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
@@ -302,8 +306,9 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert(
 }
 
 /*
- * We lock a bucket by setting BIT(1) in the pointer - this is always
- * zero in real pointers and in the nulls marker.
+ * We lock a bucket by setting BIT(0) in the pointer - this is always
+ * zero in real pointers.  The NULLS mark is never stored in the bucket,
+ * rather we store NULL if the bucket is empty.
  * bit_spin_locks do not handle contention well, but the whole point
  * of the hashtable design is to achieve minimum per-bucket contention.
  * A nested hash table might not have a bucket pointer.  In that case
@@ -323,7 +328,7 @@ static inline void rht_lock(struct bucket_table *tbl,
 			    struct rhash_lock_head **bkt)
 {
 	local_bh_disable();
-	bit_spin_lock(1, (unsigned long *)bkt);
+	bit_spin_lock(0, (unsigned long *)bkt);
 	lock_map_acquire(&tbl->dep_map);
 }
 
@@ -332,7 +337,7 @@ static inline void rht_lock_nested(struct bucket_table *tbl,
 				   unsigned int subclass)
 {
 	local_bh_disable();
-	bit_spin_lock(1, (unsigned long *)bucket);
+	bit_spin_lock(0, (unsigned long *)bucket);
 	lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_);
 }
 
@@ -340,7 +345,7 @@ static inline void rht_unlock(struct bucket_table *tbl,
 			      struct rhash_lock_head **bkt)
 {
 	lock_map_release(&tbl->dep_map);
-	bit_spin_unlock(1, (unsigned long *)bkt);
+	bit_spin_unlock(0, (unsigned long *)bkt);
 	local_bh_enable();
 }
 
@@ -358,7 +363,9 @@ static inline struct rhash_head *rht_ptr(
 	const struct rhash_lock_head *p =
 		rht_dereference_bucket_rcu(*bkt, tbl, hash);
 
-	return (void *)(((unsigned long)p) & ~BIT(1));
+	if ((((unsigned long)p) & ~BIT(0)) == 0)
+		return RHT_NULLS_MARKER(bkt);
+	return (void *)(((unsigned long)p) & ~BIT(0));
 }
 
 static inline struct rhash_head *rht_ptr_exclusive(
@@ -367,7 +374,9 @@ static inline struct rhash_head *rht_ptr_exclusive(
 	const struct rhash_lock_head *p =
 		rcu_dereference_protected(*bkt, 1);
 
-	return (void *)(((unsigned long)p) & ~BIT(1));
+	if (!p)
+		return RHT_NULLS_MARKER(bkt);
+	return (void *)(((unsigned long)p) & ~BIT(0));
 }
 
 static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
@@ -375,7 +384,9 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt,
 {
 	struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
 
-	rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(1)));
+	if (rht_is_a_nulls(obj))
+		obj = NULL;
+	rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(0)));
 }
 
 static inline void rht_assign_unlock(struct bucket_table *tbl,
@@ -384,6 +395,8 @@ static inline void rht_assign_unlock(struct bucket_table *tbl,
 {
 	struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt;
 
+	if (rht_is_a_nulls(obj))
+		obj = NULL;
 	lock_map_release(&tbl->dep_map);
 	rcu_assign_pointer(*p, obj);
 	preempt_enable();
-- 
cgit v1.2.3


From 9dde27de3e5efa0d032f3c891a0ca833a0d31911 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Apr 2019 17:15:07 +0800
Subject: sctp: implement memory accounting on rx path

sk_forward_alloc's updating is also done on rx path, but to be consistent
we change to use sk_mem_charge() in sctp_skb_set_owner_r().

In sctp_eat_data(), it's not enough to check sctp_memory_pressure only,
which doesn't work for mem_cgroup_sockets_enabled, so we change to use
sk_under_memory_pressure().

When it's under memory pressure, sk_mem_reclaim() and sk_rmem_schedule()
should be called on both the RENEGE and CHUNK DELIVERY paths to exit the
memory pressure status as soon as possible.

Note that sk_rmem_schedule() is using datalen to make things easy there.

Reported-by: Matteo Croce <mcroce@redhat.com>
Tested-by: Matteo Croce <mcroce@redhat.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 1d13ec3f2707..eefdfa5abf6e 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -421,7 +421,7 @@ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 	/*
 	 * This mimics the behavior of skb_set_owner_r
 	 */
-	sk->sk_forward_alloc -= event->rmem_len;
+	sk_mem_charge(sk, event->rmem_len);
 }
 
 /* Tests if the list has one and only one entry. */
-- 
cgit v1.2.3


From bfb35c27c65fce60a12e188135ae1344d1b89e24 Mon Sep 17 00:00:00 2001
From: Alan Maguire <alan.maguire@oracle.com>
Date: Fri, 12 Apr 2019 12:27:34 +0100
Subject: bpf: fix whitespace for ENCAP_L2 defines in bpf.h

replace tab after #define with space in line with rest of definitions

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c26be24fd5e2..704bb69514a2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2792,14 +2792,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_adjust_room flags. */
 #define BPF_F_ADJ_ROOM_FIXED_GSO	(1ULL << 0)
 
-#define	BPF_ADJ_ROOM_ENCAP_L2_MASK	0xff
-#define	BPF_ADJ_ROOM_ENCAP_L2_SHIFT	56
+#define BPF_ADJ_ROOM_ENCAP_L2_MASK	0xff
+#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT	56
 
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4	(1ULL << 1)
 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6	(1ULL << 2)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE	(1ULL << 3)
 #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP	(1ULL << 4)
-#define	BPF_F_ADJ_ROOM_ENCAP_L2(len)	(((__u64)len & \
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len)	(((__u64)len & \
 					  BPF_ADJ_ROOM_ENCAP_L2_MASK) \
 					 << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
 
-- 
cgit v1.2.3


From 725721a6506eea53bfde81a34e91a06d6162c216 Mon Sep 17 00:00:00 2001
From: Viet Hoang Tran <hoang.tran@uclouvain.be>
Date: Mon, 15 Apr 2019 09:54:55 +0000
Subject: bpf: allow clearing all sock_ops callback flags

The helper function bpf_sock_ops_cb_flags_set() can be used to both
set and clear the sock_ops callback flags. However, its current
behavior is not consistent. BPF program may clear a flag if more than
one were set, or replace a flag with another one, but cannot clear all
flags.

This patch also updates the documentation to clarify the ability to
clear flags of this helper function.

Signed-off-by: Hoang Tran <hoang.tran@uclouvain.be>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 704bb69514a2..eaf2d3284248 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1737,12 +1737,19 @@ union bpf_attr {
  * 		error if an eBPF program tries to set a callback that is not
  * 		supported in the current kernel.
  *
- * 		The supported callback values that *argval* can combine are:
+ * 		*argval* is a flag array which can combine these flags:
  *
  * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
  * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
  * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
  *
+ * 		Therefore, this function can be used to clear a callback flag by
+ * 		setting the appropriate bit to zero. e.g. to disable the RTO
+ * 		callback:
+ *
+ * 		**bpf_sock_ops_cb_flags_set(bpf_sock,**
+ * 			**bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
+ *
  * 		Here are some examples of where one could call such eBPF
  * 		program:
  *
-- 
cgit v1.2.3


From ba0509b6881efd0c8b26c36490cba87d8fb324c0 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Fri, 12 Apr 2019 17:07:37 +0200
Subject: net: core: introduce build_skb_around

The function build_skb() also has the responsibility to allocate and clear
the SKB structure. Introduce a new function build_skb_around(), that moves
the responsibility of allocation and clearing to the caller. This allows
the caller to use the kmem_cache (slab/slub) bulk allocation API.

The next patch uses this function combined with kmem_cache_alloc_bulk.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Song Liu <songliubraving@fb.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a06275a618f0..e81f2b0e8a83 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1042,6 +1042,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
 			    int node);
 struct sk_buff *__build_skb(void *data, unsigned int frag_size);
 struct sk_buff *build_skb(void *data, unsigned int frag_size);
+struct sk_buff *build_skb_around(struct sk_buff *skb,
+				 void *data, unsigned int frag_size);
 
 /**
  * alloc_skb - allocate a network buffer
-- 
cgit v1.2.3


From b1d40991506aa9f1de310a2e74ef8e3bec6ba215 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:35:59 -0700
Subject: ipv6: Rename fib6_multipath_select and pass fib6_result

Add 'struct fib6_result' to hold the fib entry and fib6_nh from a fib
lookup as separate entries, similar to what IPv4 now has with fib_result.

Rename fib6_multipath_select to fib6_select_path, pass fib6_result to
it, and set f6i and nh in the result once a path selection is done.
Call fib6_select_path unconditionally for path selection which means
moving the sibling and oif check to fib6_select_path. To handle the two
different call paths (2 only call multipath_select if flowi6_oif == 0 and
the other always calls it), add a new have_oif_match that controls the
sibling walk if relevant.

Update callers of fib6_multipath_select accordingly and have them use the
fib6_info and fib6_nh from the result.

This is needed for multipath nexthop objects where a single f6i can
point to multiple fib6_nh (similar to IPv4).

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    | 13 ++++++++-----
 include/net/ipv6_stubs.h |  9 ++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 2e9235adfa0d..c4d818041663 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -190,6 +190,11 @@ struct rt6_info {
 	unsigned short			rt6i_nfheader_len;
 };
 
+struct fib6_result {
+	struct fib6_nh		*nh;
+	struct fib6_info	*f6i;
+};
+
 #define for_each_fib6_node_rt_rcu(fn)					\
 	for (rt = rcu_dereference((fn)->leaf); rt;			\
 	     rt = rcu_dereference(rt->fib6_next))
@@ -391,11 +396,9 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
 				    int oif, struct flowi6 *fl6, int strict);
 
-struct fib6_info *fib6_multipath_select(const struct net *net,
-					struct fib6_info *match,
-					struct flowi6 *fl6, int oif,
-					const struct sk_buff *skb, int strict);
-
+void fib6_select_path(const struct net *net, struct fib6_result *res,
+		      struct flowi6 *fl6, int oif, bool have_oif_match,
+		      const struct sk_buff *skb, int strict);
 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
 				   const struct in6_addr *daddr,
 				   const struct in6_addr *saddr);
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 453b55bf6723..5df36d6a2613 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -14,6 +14,7 @@
 struct fib6_info;
 struct fib6_nh;
 struct fib6_config;
+struct fib6_result;
 
 /* This is ugly, ideally these symbols should be built
  * into the core kernel.
@@ -34,11 +35,9 @@ struct ipv6_stub {
 					      struct fib6_table *table,
 					      int oif, struct flowi6 *fl6,
 					      int flags);
-	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
-						   struct fib6_info *f6i,
-						   struct flowi6 *fl6, int oif,
-						   const struct sk_buff *skb,
-						   int strict);
+	void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
+				 struct flowi6 *fl6, int oif, bool oif_match,
+				 const struct sk_buff *skb, int strict);
 	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
 				 struct in6_addr *saddr);
 
-- 
cgit v1.2.3


From b748f26092626332f73e71d75e4390de6b8bdf9b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:06 -0700
Subject: ipv6: Pass fib6_result to ip6_mtu_from_fib6 and fib6_mtu

Change ip6_mtu_from_fib6 and fib6_mtu to take a fib6_result over a
fib6_info. Update both to use the fib6_nh from fib6_result.

Since the signature of ip6_mtu_from_fib6 is already changing, add const
to daddr and saddr.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h  | 5 +++--
 include/net/ipv6_stubs.h | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 5909fc421305..46bbd8ff9cc6 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -302,8 +302,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 	return mtu;
 }
 
-u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
-		      struct in6_addr *saddr);
+u32 ip6_mtu_from_fib6(const struct fib6_result *res,
+		      const struct in6_addr *daddr,
+		      const struct in6_addr *saddr);
 
 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 				   struct net_device *dev, struct sk_buff *skb,
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 5df36d6a2613..0d16b9ec0485 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -38,8 +38,9 @@ struct ipv6_stub {
 	void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
 				 struct flowi6 *fl6, int oif, bool oif_match,
 				 const struct sk_buff *skb, int strict);
-	u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr,
-				 struct in6_addr *saddr);
+	u32 (*ip6_mtu_from_fib6)(const struct fib6_result *res,
+				 const struct in6_addr *daddr,
+				 const struct in6_addr *saddr);
 
 	int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh,
 			    struct fib6_config *cfg, gfp_t gfp_flags,
-- 
cgit v1.2.3


From 8ff2e5b26cb84b1b0f502c0b7a3c62e4c4d86acc Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:09 -0700
Subject: ipv6: Pass fib6_result to fib6_table_lookup tracepoint

Change fib6_table_lookup tracepoint to take the fib6_result and use
the fib6_info and fib6_nh from it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/fib6.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 6d05ebdd669c..70e252d926ea 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -12,10 +12,10 @@
 
 TRACE_EVENT(fib6_table_lookup,
 
-	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
+	TP_PROTO(const struct net *net, const struct fib6_result *res,
 		 struct fib6_table *table, const struct flowi6 *flp),
 
-	TP_ARGS(net, f6i, table, flp),
+	TP_ARGS(net, res, table, flp),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
@@ -39,7 +39,7 @@ TRACE_EVENT(fib6_table_lookup,
 		struct in6_addr *in6;
 
 		__entry->tb_id = table->tb6_id;
-		__entry->err = ip6_rt_type_to_error(f6i->fib6_type);
+		__entry->err = ip6_rt_type_to_error(res->f6i->fib6_type);
 		__entry->oif = flp->flowi6_oif;
 		__entry->iif = flp->flowi6_iif;
 		__entry->tos = ip6_tclass(flp->flowlabel);
@@ -62,20 +62,20 @@ TRACE_EVENT(fib6_table_lookup,
 			__entry->dport = 0;
 		}
 
-		if (f6i->fib6_nh.fib_nh_dev) {
-			__assign_str(name, f6i->fib6_nh.fib_nh_dev);
+		if (res->nh && res->nh->fib_nh_dev) {
+			__assign_str(name, res->nh->fib_nh_dev);
 		} else {
 			__assign_str(name, "-");
 		}
-		if (f6i == net->ipv6.fib6_null_entry) {
+		if (res->f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
 
 			in6 = (struct in6_addr *)__entry->gw;
 			*in6 = in6_zero;
 
-		} else if (f6i) {
+		} else if (res->nh) {
 			in6 = (struct in6_addr *)__entry->gw;
-			*in6 = f6i->fib6_nh.fib_nh_gw6;
+			*in6 = res->nh->fib_nh_gw6;
 		}
 	),
 
-- 
cgit v1.2.3


From effda4dd97e878ab83336bec7411cc41b5cc6d37 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:10 -0700
Subject: ipv6: Pass fib6_result to fib lookups

Change fib6_lookup and fib6_table_lookup to take a fib6_result and set
f6i and nh rather than returning a fib6_info. For now both always
return 0.

A later patch set can make these more like the IPv4 counterparts and
return EINVAL, EACCES, etc. based on fib6_type.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h    |  9 +++++----
 include/net/ipv6_stubs.h | 11 +++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c4d818041663..cb3277cd1413 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -389,12 +389,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 /* called with rcu lock held; can return error pointer
  * caller needs to select path
  */
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
-			      int flags);
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+		struct fib6_result *res, int flags);
 
 /* called with rcu lock held; caller needs to select path */
-struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
-				    int oif, struct flowi6 *fl6, int strict);
+int fib6_table_lookup(struct net *net, struct fib6_table *table,
+		      int oif, struct flowi6 *fl6, struct fib6_result *res,
+		      int strict);
 
 void fib6_select_path(const struct net *net, struct fib6_result *res,
 		      struct flowi6 *fl6, int oif, bool have_oif_match,
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
index 0d16b9ec0485..6c0c4fde16f8 100644
--- a/include/net/ipv6_stubs.h
+++ b/include/net/ipv6_stubs.h
@@ -29,12 +29,11 @@ struct ipv6_stub {
 	int (*ipv6_route_input)(struct sk_buff *skb);
 
 	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
-	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
-					 struct flowi6 *fl6, int flags);
-	struct fib6_info *(*fib6_table_lookup)(struct net *net,
-					      struct fib6_table *table,
-					      int oif, struct flowi6 *fl6,
-					      int flags);
+	int (*fib6_lookup)(struct net *net, int oif, struct flowi6 *fl6,
+			   struct fib6_result *res, int flags);
+	int (*fib6_table_lookup)(struct net *net, struct fib6_table *table,
+				 int oif, struct flowi6 *fl6,
+				 struct fib6_result *res, int flags);
 	void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
 				 struct flowi6 *fl6, int oif, bool oif_match,
 				 const struct sk_buff *skb, int strict);
-- 
cgit v1.2.3


From 7d21fec90438941b44b699ae73673d2f8a3a9d76 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 14:36:11 -0700
Subject: ipv6: Add fib6_type and fib6_flags to fib6_result

Add the fib6_flags and fib6_type to fib6_result. Update the lookup helpers
to set them and update post fib lookup users to use the version from the
result.

This allows nexthop objects to have blackhole nexthop.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h       | 2 ++
 include/trace/events/fib6.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index cb3277cd1413..6b7557b71c8c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -193,6 +193,8 @@ struct rt6_info {
 struct fib6_result {
 	struct fib6_nh		*nh;
 	struct fib6_info	*f6i;
+	u32			fib6_flags;
+	u8			fib6_type;
 };
 
 #define for_each_fib6_node_rt_rcu(fn)					\
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 70e252d926ea..c6abdcc77c12 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -39,7 +39,7 @@ TRACE_EVENT(fib6_table_lookup,
 		struct in6_addr *in6;
 
 		__entry->tb_id = table->tb6_id;
-		__entry->err = ip6_rt_type_to_error(res->f6i->fib6_type);
+		__entry->err = ip6_rt_type_to_error(res->fib6_type);
 		__entry->oif = flp->flowi6_oif;
 		__entry->iif = flp->flowi6_iif;
 		__entry->tos = ip6_tclass(flp->flowlabel);
-- 
cgit v1.2.3


From b8fb1ab46169ac016a8552a6455bb0bfc401f8e2 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 16 Apr 2019 17:31:43 -0700
Subject: net ipv6: Prevent neighbor add if protocol is disabled on device

Disabling IPv6 on an interface removes existing entries but nothing prevents
new entries from being manually added. To that end, add a new neigh_table
operation, allow_add, that is called on RTM_NEWNEIGH to see if neighbor
entries are allowed on a given device. If IPv6 is disabled on the device,
allow_add returns false and passes a message back to the user via extack.

  $ echo 1 > /proc/sys/net/ipv6/conf/eth1/disable_ipv6
  $ ip -6 neigh add fe80::4c88:bff:fe21:2704 dev eth1 lladdr de:ad:be:ef:01:01
  Error: IPv6 is disabled on this device.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 3e5438bd0101..50a67bd6a434 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -205,6 +205,8 @@ struct neigh_table {
 	int			(*pconstructor)(struct pneigh_entry *);
 	void			(*pdestructor)(struct pneigh_entry *);
 	void			(*proxy_redo)(struct sk_buff *skb);
+	bool			(*allow_add)(const struct net_device *dev,
+					     struct netlink_ext_ack *extack);
 	char			*id;
 	struct neigh_parms	parms;
 	struct list_head	parms_list;
-- 
cgit v1.2.3


From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Wed, 17 Apr 2019 16:35:49 -0400
Subject: ipv6: Add rate limit mask for ICMPv6 messages

To make ICMPv6 closer to ICMPv4, add a ratemask parameter. Since the ICMP
message types use larger numeric values, a simple bitmask doesn't fit.
I use a large bitmap. The input and output are in the form of a list of
ranges. Set the default to rate limit all error messages but Packet Too
Big. For Packet Too Big, use the ratemask instead of a hard-coded check.

There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow()
aren't called. This patch only adds them to icmpv6_echo_reply().

Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says
that it is also acceptable to rate limit informational messages. Thus,
I removed the current hard-coded behavior of icmpv6_mask_allow() that
doesn't rate limit informational messages.

v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL
    isn't defined, expand the description in ip-sysctl.txt and remove
    unnecessary conditional before kfree().
v3: Inline the bitmap instead of dynamically allocating it. A pointer
    to it is still needed because of the way proc_do_large_bitmap works.

Signed-off-by: Stephen Suryaputra <ssuryaextr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h    | 3 +++
 include/uapi/linux/icmpv6.h | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 64e29b58bb5e..5e61b5a8635d 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -8,6 +8,7 @@
 #ifndef __NETNS_IPV6_H__
 #define __NETNS_IPV6_H__
 #include <net/dst_ops.h>
+#include <uapi/linux/icmpv6.h>
 
 struct ctl_table_header;
 
@@ -35,6 +36,8 @@ struct netns_sysctl_ipv6 {
 	int icmpv6_echo_ignore_all;
 	int icmpv6_echo_ignore_multicast;
 	int icmpv6_echo_ignore_anycast;
+	DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1);
+	unsigned long *icmpv6_ratemask_ptr;
 	int anycast_src_echo_reply;
 	int ip_nonlocal_bind;
 	int fwmark_reflect;
diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index 325395f56bfa..2622b5a3e616 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -90,6 +90,8 @@ struct icmp6hdr {
 #define ICMPV6_TIME_EXCEED		3
 #define ICMPV6_PARAMPROB		4
 
+#define ICMPV6_ERRMSG_MAX       127
+
 #define ICMPV6_INFOMSG_MASK		0x80
 
 #define ICMPV6_ECHO_REQUEST		128
@@ -110,6 +112,8 @@ struct icmp6hdr {
 
 #define ICMPV6_MRDISC_ADV		151
 
+#define ICMPV6_MSG_MAX          255
+
 /*
  *	Codes for Destination Unreachable
  */
-- 
cgit v1.2.3


From a06eaaf7913cab9dd6cb8ece4f78cfd7a802872a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 17 Apr 2019 13:51:59 -0700
Subject: net: skb: remove unused asserts

We are discouraging the use of BUG() these days, remove the
unused ASSERT macros from skbuff.h.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a06275a618f0..e4ee92089dd6 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2100,8 +2100,6 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
 			  unsigned int truesize);
 
-#define SKB_PAGE_ASSERT(skb) 	BUG_ON(skb_shinfo(skb)->nr_frags)
-#define SKB_FRAG_ASSERT(skb) 	BUG_ON(skb_has_frag_list(skb))
 #define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))
 
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
-- 
cgit v1.2.3


From 71dd6c0dff51b5f1fef2e9dfa6f6a948aac975f3 Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Wed, 17 Apr 2019 23:59:21 +0200
Subject: net: phy: add support for reset-controller

This commit adds support for PHY reset pins handled by a reset controller.

Signed-off-by: David Bauer <mail@david-bauer.net>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 3e99ae3ed87f..6eaf71500ef6 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -40,6 +40,7 @@ struct mdio_device {
 	int addr;
 	int flags;
 	struct gpio_desc *reset;
+	struct reset_control *reset_ctrl;
 	unsigned int reset_assert_delay;
 	unsigned int reset_deassert_delay;
 };
-- 
cgit v1.2.3


From 6110ed2db3a41f3b9d676e58ac3d4637c2b497c4 Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Wed, 17 Apr 2019 23:59:22 +0200
Subject: net: mdio: rename mdio_device reset to reset_gpio

This renames the GPIO reset of mdio devices from 'reset' to
'reset_gpio' to better differentiate between GPIO and
reset-controller driven reset line.

Signed-off-by: David Bauer <mail@david-bauer.net>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mdio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mdio.h b/include/linux/mdio.h
index 6eaf71500ef6..9dc16d5705a1 100644
--- a/include/linux/mdio.h
+++ b/include/linux/mdio.h
@@ -39,7 +39,7 @@ struct mdio_device {
 	/* Bus address of the MDIO device (0-31) */
 	int addr;
 	int flags;
-	struct gpio_desc *reset;
+	struct gpio_desc *reset_gpio;
 	struct reset_control *reset_ctrl;
 	unsigned int reset_assert_delay;
 	unsigned int reset_deassert_delay;
-- 
cgit v1.2.3


From 8c8b3458d0b91b2230f76fbe1b0280568f10d19f Mon Sep 17 00:00:00 2001
From: Mike Manning <mmanning@vyatta.att-mail.com>
Date: Thu, 18 Apr 2019 18:35:31 +0100
Subject: vlan: support binding link state to vlan member bridge ports

In the case of vlan filtering on bridges, the bridge may also have the
corresponding vlan devices as upper devices. Currently the link state
of vlan devices is transferred from the lower device. So this is up if
the bridge is in admin up state and there is at least one bridge port
that is up, regardless of the vlan that the port is a member of.

The link state of the vlan device may need to track only the state of
the subset of ports that are also members of the corresponding vlan,
rather than that of all ports.

Add a flag to specify a vlan bridge binding mode, by which the link
state is no longer automatically transferred from the lower device,
but is instead determined by the bridge ports that are members of the
vlan.

Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_vlan.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_vlan.h b/include/uapi/linux/if_vlan.h
index 7a0e8bd65b6b..90a2c89afc8f 100644
--- a/include/uapi/linux/if_vlan.h
+++ b/include/uapi/linux/if_vlan.h
@@ -32,10 +32,11 @@ enum vlan_ioctl_cmds {
 };
 
 enum vlan_flags {
-	VLAN_FLAG_REORDER_HDR	= 0x1,
-	VLAN_FLAG_GVRP		= 0x2,
-	VLAN_FLAG_LOOSE_BINDING	= 0x4,
-	VLAN_FLAG_MVRP		= 0x8,
+	VLAN_FLAG_REORDER_HDR		= 0x1,
+	VLAN_FLAG_GVRP			= 0x2,
+	VLAN_FLAG_LOOSE_BINDING		= 0x4,
+	VLAN_FLAG_MVRP			= 0x8,
+	VLAN_FLAG_BRIDGE_BINDING	= 0x10,
 };
 
 enum vlan_name_types {
-- 
cgit v1.2.3


From c7cbdbf29f488a19982cd9f4a109887f18028bbb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 17 Apr 2019 22:51:48 +0200
Subject: net: rework SIOCGSTAMP ioctl handling

The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many
socket protocol handlers, and all of those end up calling the same
sock_get_timestamp()/sock_get_timestampns() helper functions, which
results in a lot of duplicate code.

With the introduction of 64-bit time_t on 32-bit architectures, this
gets worse, as we then need four different ioctl commands in each
socket protocol implementation.

To simplify that, let's add a new .gettstamp() operation in
struct proto_ops, and move ioctl implementation into the common
sock_ioctl()/compat_sock_ioctl_trans() functions that these all go
through.

We can reuse the sock_get_timestamp() implementation, but generalize
it so it can deal with both native and compat mode, as well as
timeval and timespec structures.

Acked-by: Stefan Schmidt <stefan@datenfreihafen.org>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
Link: https://lore.kernel.org/lkml/CAK8P3a038aDQQotzua_QtKGhq8O9n+rdiz2=WDCp82ys8eUT+A@mail.gmail.com/
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/net.h  | 2 ++
 include/net/compat.h | 3 ---
 include/net/sock.h   | 4 ++--
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/net.h b/include/linux/net.h
index c606c72311d0..50bf5206ead6 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -161,6 +161,8 @@ struct proto_ops {
 	int	 	(*compat_ioctl) (struct socket *sock, unsigned int cmd,
 				      unsigned long arg);
 #endif
+	int		(*gettstamp) (struct socket *sock, void __user *userstamp,
+				      bool timeval, bool time32);
 	int		(*listen)    (struct socket *sock, int len);
 	int		(*shutdown)  (struct socket *sock, int flags);
 	int		(*setsockopt)(struct socket *sock, int level,
diff --git a/include/net/compat.h b/include/net/compat.h
index 4c6d75612b6c..f277653c7e17 100644
--- a/include/net/compat.h
+++ b/include/net/compat.h
@@ -30,9 +30,6 @@ struct compat_cmsghdr {
 	compat_int_t	cmsg_type;
 };
 
-int compat_sock_get_timestamp(struct sock *, struct timeval __user *);
-int compat_sock_get_timestampns(struct sock *, struct timespec __user *);
-
 #else /* defined(CONFIG_COMPAT) */
 /*
  * To avoid compiler warnings:
diff --git a/include/net/sock.h b/include/net/sock.h
index bdd77bbce7d8..784cd19d5ff7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1614,6 +1614,8 @@ int sock_setsockopt(struct socket *sock, int level, int op,
 
 int sock_getsockopt(struct socket *sock, int level, int op,
 		    char __user *optval, int __user *optlen);
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+		   bool timeval, bool time32);
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 				    int noblock, int *errcode);
 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
@@ -2503,8 +2505,6 @@ static inline bool sk_listener(const struct sock *sk)
 }
 
 void sock_enable_timestamp(struct sock *sk, int flag);
-int sock_get_timestamp(struct sock *, struct timeval __user *);
-int sock_get_timestampns(struct sock *, struct timespec __user *);
 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
 		       int type);
 
-- 
cgit v1.2.3


From 0768e17073dc527ccd18ed5f96ce85f9985e9115 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 17 Apr 2019 22:56:11 +0200
Subject: net: socket: implement 64-bit timestamps

The 'timeval' and 'timespec' data structures used for socket timestamps
are going to be redefined in user space based on 64-bit time_t in future
versions of the C library to deal with the y2038 overflow problem,
which breaks the ABI definition.

Unlike many modern ioctl commands, SIOCGSTAMP and SIOCGSTAMPNS do not
use the _IOR() macro to encode the size of the transferred data, so it
remains ambiguous whether the application uses the old or new layout.

The best workaround I could find is rather ugly: we redefine the command
code based on the size of the respective data structure with a ternary
operator. This lets it get evaluated as late as possible, hopefully after
that structure is visible to the caller. We cannot use an #ifdef here,
because linux/sockios.h might have been included before any libc header
that could determine the size of time_t.

The ioctl implementation now interprets the new command codes as always
referring to the 64-bit structure on all architectures, while the old
architecture specific command code still refers to the old architecture
specific layout. The new command number is only used when they are
actually different.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/asm-generic/sockios.h |  4 ++--
 include/uapi/linux/sockios.h       | 21 +++++++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/asm-generic/sockios.h b/include/uapi/asm-generic/sockios.h
index 64f658c7cec2..44fa3ed70483 100644
--- a/include/uapi/asm-generic/sockios.h
+++ b/include/uapi/asm-generic/sockios.h
@@ -8,7 +8,7 @@
 #define FIOGETOWN	0x8903
 #define SIOCGPGRP	0x8904
 #define SIOCATMARK	0x8905
-#define SIOCGSTAMP	0x8906		/* Get stamp (timeval) */
-#define SIOCGSTAMPNS	0x8907		/* Get stamp (timespec) */
+#define SIOCGSTAMP_OLD	0x8906		/* Get stamp (timeval) */
+#define SIOCGSTAMPNS_OLD 0x8907		/* Get stamp (timespec) */
 
 #endif /* __ASM_GENERIC_SOCKIOS_H */
diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h
index d393e9ed3964..7d1bccbbef78 100644
--- a/include/uapi/linux/sockios.h
+++ b/include/uapi/linux/sockios.h
@@ -19,6 +19,7 @@
 #ifndef _LINUX_SOCKIOS_H
 #define _LINUX_SOCKIOS_H
 
+#include <asm/bitsperlong.h>
 #include <asm/sockios.h>
 
 /* Linux-specific socket ioctls */
@@ -27,6 +28,26 @@
 
 #define SOCK_IOC_TYPE	0x89
 
+/*
+ * the timeval/timespec data structure layout is defined by libc,
+ * so we need to cover both possible versions on 32-bit.
+ */
+/* Get stamp (timeval) */
+#define SIOCGSTAMP_NEW	 _IOR(SOCK_IOC_TYPE, 0x06, long long[2])
+/* Get stamp (timespec) */
+#define SIOCGSTAMPNS_NEW _IOR(SOCK_IOC_TYPE, 0x07, long long[2])
+
+#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
+/* on 64-bit and x32, avoid the ?: operator */
+#define SIOCGSTAMP	SIOCGSTAMP_OLD
+#define SIOCGSTAMPNS	SIOCGSTAMPNS_OLD
+#else
+#define SIOCGSTAMP	((sizeof(struct timeval))  == 8 ? \
+			 SIOCGSTAMP_OLD   : SIOCGSTAMP_NEW)
+#define SIOCGSTAMPNS	((sizeof(struct timespec)) == 8 ? \
+			 SIOCGSTAMPNS_OLD : SIOCGSTAMPNS_NEW)
+#endif
+
 /* Routing table calls. */
 #define SIOCADDRT	0x890B		/* add routing table entry	*/
 #define SIOCDELRT	0x890C		/* delete routing table entry	*/
-- 
cgit v1.2.3


From 42e5425aa0dfd8a6cdd7e177cfd9703df05c7411 Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Thu, 18 Apr 2019 21:02:19 +0700
Subject: tipc: introduce new socket option TIPC_SOCK_RECVQ_USED

When using TIPC_SOCK_RECVQ_DEPTH for getsockopt(), it returns the
number of buffers in receive socket buffer which is not so helpful
for user space applications.

This commit introduces the new option TIPC_SOCK_RECVQ_USED which
returns the current allocated bytes of the receive socket buffer.
This helps user space applications dimension their buffer usage to
avoid buffer overload issues.

Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tipc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h
index 6b2fd4d9655f..7df026ea6aff 100644
--- a/include/uapi/linux/tipc.h
+++ b/include/uapi/linux/tipc.h
@@ -190,6 +190,7 @@ struct sockaddr_tipc {
 #define TIPC_MCAST_REPLICAST    134     /* Default: TIPC selects. No arg */
 #define TIPC_GROUP_JOIN         135     /* Takes struct tipc_group_req* */
 #define TIPC_GROUP_LEAVE        136     /* No argument */
+#define TIPC_SOCK_RECVQ_USED    137     /* Default: none (read only) */
 
 /*
  * Flag values
-- 
cgit v1.2.3


From 4e54507ab1a9da05238b986292f6cb702e6696c7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 21 Apr 2019 08:49:01 -0700
Subject: ipv6: Simplify rt6_qualify_for_ecmp

After commit c7a1ce397ada ("ipv6: Change addrconf_f6i_alloc to use
ip6_route_info_create"), the gateway is no longer filled in for fib6_nh
structs in a prefix route. Accordingly, the RTF_ADDRCONF flag check can
be dropped from the 'rt6_qualify_for_ecmp'.

Further, RTF_DYNAMIC is only set in rt6_info instances, so it can be
removed from the check as well.

This reduces rt6_qualify_for_ecmp and the mlxsw version to just checking
if the nexthop has a gateway which is the real indication of whether
entries can be coalesced into a multipath route.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 46bbd8ff9cc6..518d97fbe074 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) &&
-		f6i->fib6_nh.fib_nh_gw_family;
+	return f6i->fib6_nh.fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From be659b8d3c79afc54e087ebf8d849685d7b0d395 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 21 Apr 2019 17:39:18 -0700
Subject: ipv6: Restore RTF_ADDRCONF check in rt6_qualify_for_ecmp

The RTF_ADDRCONF flag filters out routes added by RA's in determining
which routes can be appended to an existing one to create a multipath
route. Restore the flag check and add a comment to document the RA piece.

Fixes: 4e54507ab1a9 ("ipv6: Simplify rt6_qualify_for_ecmp")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 518d97fbe074..df9cebc2b20c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -68,7 +68,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
 {
-	return f6i->fib6_nh.fib_nh_gw_family;
+	/* the RTF_ADDRCONF flag filters out RA's */
+	return !(f6i->fib6_flags & RTF_ADDRCONF) &&
+		f6i->fib6_nh.fib_nh_gw_family;
 }
 
 void ip6_route_input(struct sk_buff *skb);
-- 
cgit v1.2.3


From 7df737e991069d75eec1ded1c8b37e81b8c54df9 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Fri, 19 Apr 2019 07:44:54 -0700
Subject: bpf: remove global variables

Move three global variables protected by bpf_verifier_lock into
'struct bpf_verifier_env' to allow parallel verification.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b3ab61fe1932..1305ccbd8fe6 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -295,6 +295,11 @@ struct bpf_verifier_env {
 	const struct bpf_line_info *prev_linfo;
 	struct bpf_verifier_log log;
 	struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1];
+	struct {
+		int *insn_state;
+		int *insn_stack;
+		int cur_stack;
+	} cfg;
 	u32 subprog_cnt;
 	/* number of instructions analyzed by the verifier */
 	u32 insn_processed;
-- 
cgit v1.2.3


From 0b13c9bb96f6edf03018030f07929a16b4dd77a6 Mon Sep 17 00:00:00 2001
From: "Daniel T. Lee" <danieltimlee@gmail.com>
Date: Sun, 21 Apr 2019 00:50:42 +0900
Subject: include/net/tcp.h: whitespace cleanup at tcp_v4_check

This patch makes a trivial whitespace fix to the function
tcp_v4_check in the include/net/tcp.h file.

It has a stylistic issue, which is "space required after that ','",
and it can be confirmed with the ./scripts/checkpatch.pl tool.

    ERROR: space required after that ',' (ctx:VxV)
    #29: FILE: include/net/tcp.h:1317:
    +	        return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
         	                              ^

Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 68ee02523b87..7cf1181630a3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1314,7 +1314,7 @@ static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq)
 static inline __sum16 tcp_v4_check(int len, __be32 saddr,
 				   __be32 daddr, __wsum base)
 {
-	return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base);
+	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base);
 }
 
 static inline bool tcp_checksum_complete(struct sk_buff *skb)
-- 
cgit v1.2.3


From 7e5f4cdb284be5ff862f84ccda084e2847f73fbb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 20 Apr 2019 09:27:27 -0700
Subject: ipv6: Remove fib6_info_nh_lwt

fib6_info_nh_lwt is no longer used; remove it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 6b7557b71c8c..352f767bea81 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -450,12 +450,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
 		 struct netlink_ext_ack *extack);
 void fib6_nh_release(struct fib6_nh *fib6_nh);
 
-static inline
-struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i)
-{
-	return f6i->fib6_nh.fib_nh_lws;
-}
-
 void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
 		     unsigned int flags);
 
-- 
cgit v1.2.3


From 3c618c1dbb8859625c643121ac80af9a6723533f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sat, 20 Apr 2019 09:28:20 -0700
Subject: net: Rename net/nexthop.h net/rtnh.h

The header contains rtnh_ macros so rename the file accordingly.
Allows a later patch to use the nexthop.h name for the new
nexthop code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/nexthop.h | 34 ----------------------------------
 include/net/rtnh.h    | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 34 deletions(-)
 delete mode 100644 include/net/nexthop.h
 create mode 100644 include/net/rtnh.h

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
deleted file mode 100644
index 902ff382a6dc..000000000000
--- a/include/net/nexthop.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __NET_NEXTHOP_H
-#define __NET_NEXTHOP_H
-
-#include <linux/rtnetlink.h>
-#include <net/netlink.h>
-
-static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining)
-{
-	return remaining >= (int)sizeof(*rtnh) &&
-	       rtnh->rtnh_len >= sizeof(*rtnh) &&
-	       rtnh->rtnh_len <= remaining;
-}
-
-static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh,
-                                         int *remaining)
-{
-	int totlen = NLA_ALIGN(rtnh->rtnh_len);
-
-	*remaining -= totlen;
-	return (struct rtnexthop *) ((char *) rtnh + totlen);
-}
-
-static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh)
-{
-	return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh)));
-}
-
-static inline int rtnh_attrlen(const struct rtnexthop *rtnh)
-{
-	return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh));
-}
-
-#endif
diff --git a/include/net/rtnh.h b/include/net/rtnh.h
new file mode 100644
index 000000000000..aa2cfc508f7c
--- /dev/null
+++ b/include/net/rtnh.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_RTNH_H
+#define __NET_RTNH_H
+
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+
+static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining)
+{
+	return remaining >= (int)sizeof(*rtnh) &&
+	       rtnh->rtnh_len >= sizeof(*rtnh) &&
+	       rtnh->rtnh_len <= remaining;
+}
+
+static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh,
+                                         int *remaining)
+{
+	int totlen = NLA_ALIGN(rtnh->rtnh_len);
+
+	*remaining -= totlen;
+	return (struct rtnexthop *) ((char *) rtnh + totlen);
+}
+
+static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh)
+{
+	return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh)));
+}
+
+static inline int rtnh_attrlen(const struct rtnexthop *rtnh)
+{
+	return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh));
+}
+
+#endif
-- 
cgit v1.2.3


From a79eda3aaf30bc73752487fff5160cf67e99e313 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:42 -0400
Subject: net: psample: drop include of module.h from psample.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

There doesn't appear to be anything in psample.h that is module
related, and build coverage doesn't appear to show any other
files/drivers relying implicitly on getting it from here.

So it appears we are simply free to just remove it in this case.

Cc: Yotam Gigi <yotam.gi@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/psample.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/psample.h b/include/net/psample.h
index 9b80f814ab04..37a4df2325b2 100644
--- a/include/net/psample.h
+++ b/include/net/psample.h
@@ -3,7 +3,6 @@
 #define __NET_PSAMPLE_H
 
 #include <uapi/linux/psample.h>
-#include <linux/module.h>
 #include <linux/list.h>
 
 struct psample_group {
-- 
cgit v1.2.3


From c517796ea91d11dd3f0ae7ff61a12fe5d4941eb0 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:43 -0400
Subject: net: ife: drop include of module.h from net/ife.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

There doesn't appear to be anything in net/ife.h that is module
related, and build coverage doesn't appear to show any other
files/drivers relying implicitly on getting it from here.

So it appears we are simply free to just remove it in this case.

Cc: Yotam Gigi <yotam.gi@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ife.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ife.h b/include/net/ife.h
index e117617e3c34..7e2538d8585b 100644
--- a/include/net/ife.h
+++ b/include/net/ife.h
@@ -4,7 +4,6 @@
 
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
-#include <linux/module.h>
 #include <uapi/linux/ife.h>
 
 #if IS_ENABLED(CONFIG_NET_IFE)
-- 
cgit v1.2.3


From 113e63286697893127c3ee83471b45ad0cf8d75f Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:44 -0400
Subject: net: fib: drop include of module.h from fib_notifier.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

Since fib_notifier.h is not going into a module struct looking for
specific fields, we can just let it know that module is a struct,
just like about 60 other include/linux headers already do.

Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_notifier.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h
index c91ec732afd6..c49d7bfb5c30 100644
--- a/include/net/fib_notifier.h
+++ b/include/net/fib_notifier.h
@@ -2,10 +2,11 @@
 #define __NET_FIB_NOTIFIER_H
 
 #include <linux/types.h>
-#include <linux/module.h>
 #include <linux/notifier.h>
 #include <net/net_namespace.h>
 
+struct module;
+
 struct fib_notifier_info {
 	struct net *net;
 	int family;
-- 
cgit v1.2.3


From a130f9b27545f56880034c345da9a4efc2ba2b24 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Sat, 20 Apr 2019 23:29:45 -0400
Subject: net: tc_act: drop include of module.h from tc_ife.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

Since tc_ife.h is not going into a module struct looking for
specific fields, we can just let it know that module is a struct,
just like about 60 other include/linux headers already do.

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index 86d13b01b39d..c7f24a2da1ca 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -5,7 +5,8 @@
 #include <net/act_api.h>
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
-#include <linux/module.h>
+
+struct module;
 
 struct tcf_ife_params {
 	u8 eth_dst[ETH_ALEN];
-- 
cgit v1.2.3


From f2ad1a522e9817fba7799008e0a8dc6f8a32bf7d Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Mon, 22 Apr 2019 12:08:39 +0000
Subject: net: devlink: Add extack to shared buffer operations

Add extack to shared buffer set operations, so that meaningful error
messages could be propagated to the user.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 70c7d1ac8344..4f5e41613503 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -491,13 +491,14 @@ struct devlink_ops {
 			   struct devlink_sb_pool_info *pool_info);
 	int (*sb_pool_set)(struct devlink *devlink, unsigned int sb_index,
 			   u16 pool_index, u32 size,
-			   enum devlink_sb_threshold_type threshold_type);
+			   enum devlink_sb_threshold_type threshold_type,
+			   struct netlink_ext_ack *extack);
 	int (*sb_port_pool_get)(struct devlink_port *devlink_port,
 				unsigned int sb_index, u16 pool_index,
 				u32 *p_threshold);
 	int (*sb_port_pool_set)(struct devlink_port *devlink_port,
 				unsigned int sb_index, u16 pool_index,
-				u32 threshold);
+				u32 threshold, struct netlink_ext_ack *extack);
 	int (*sb_tc_pool_bind_get)(struct devlink_port *devlink_port,
 				   unsigned int sb_index,
 				   u16 tc_index,
@@ -507,7 +508,8 @@ struct devlink_ops {
 				   unsigned int sb_index,
 				   u16 tc_index,
 				   enum devlink_sb_pool_type pool_type,
-				   u16 pool_index, u32 threshold);
+				   u16 pool_index, u32 threshold,
+				   struct netlink_ext_ack *extack);
 	int (*sb_occ_snapshot)(struct devlink *devlink,
 			       unsigned int sb_index);
 	int (*sb_occ_max_clear)(struct devlink *devlink,
-- 
cgit v1.2.3


From f24ea52873c726bf7b54318f00ec45050222b367 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:37 +0200
Subject: xfrm: remove tos indirection from afinfo_policy

Only used by ipv4, we can read the fl4 tos value directly instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 77eb578a0384..652da5861772 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -329,7 +329,6 @@ struct xfrm_policy_afinfo {
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
-	int			(*get_tos)(const struct flowi *fl);
 	int			(*init_path)(struct xfrm_dst *path,
 					     struct dst_entry *dst,
 					     int nfheader_len);
-- 
cgit v1.2.3


From 2e8b4aa816eaaf480fe68b1086614259caf1bf3c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:38 +0200
Subject: xfrm: remove init_path indirection from afinfo_policy

handle this directly, it's only used by ipv6.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 652da5861772..b8de1622141a 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -329,9 +329,6 @@ struct xfrm_policy_afinfo {
 	void			(*decode_session)(struct sk_buff *skb,
 						  struct flowi *fl,
 						  int reverse);
-	int			(*init_path)(struct xfrm_dst *path,
-					     struct dst_entry *dst,
-					     int nfheader_len);
 	int			(*fill_dst)(struct xfrm_dst *xdst,
 					    struct net_device *dev,
 					    const struct flowi *fl);
-- 
cgit v1.2.3


From c53ac41e3720926301c623d6682bb87ce992a3b3 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Apr 2019 16:44:39 +0200
Subject: xfrm: remove decode_session indirection from afinfo_policy

No external dependencies, might as well handle this directly.
xfrm_afinfo_policy is now 40 bytes on x86_64.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index b8de1622141a..18d6b33501b9 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -326,9 +326,6 @@ struct xfrm_policy_afinfo {
 					     xfrm_address_t *saddr,
 					     xfrm_address_t *daddr,
 					     u32 mark);
-	void			(*decode_session)(struct sk_buff *skb,
-						  struct flowi *fl,
-						  int reverse);
 	int			(*fill_dst)(struct xfrm_dst *xdst,
 					    struct net_device *dev,
 					    const struct flowi *fl);
-- 
cgit v1.2.3


From bb9cd077e216b886438c5698e1cd75f762ecd3c9 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 17 Apr 2019 11:45:13 +0200
Subject: xfrm: remove unneeded export_symbols

None of them have any external callers, make them static.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 18d6b33501b9..eb5018b1cf9c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1568,7 +1568,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb);
 int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb);
-int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
 int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol);
 int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol);
 int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family);
@@ -1584,7 +1583,6 @@ int xfrm6_rcv(struct sk_buff *skb);
 int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 		     xfrm_address_t *saddr, u8 proto);
 void xfrm6_local_error(struct sk_buff *skb, u32 mtu);
-int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err);
 int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol);
 int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol);
 int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family);
-- 
cgit v1.2.3


From 756e161993824961fad4ba62c40045d9ab65bdb8 Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Fri, 8 Mar 2019 09:15:43 +0800
Subject: mmc: add SDIO identifiers for MediaTek Bluetooth devices

The SDIO identifiers for MediaTek Bluetooth devices were defined in the
MediaTek Bluetooth driver. Moving the definitions into the MMC header file
seems like common sense.

Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/mmc/sdio_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 4332199c71c2..d1a5d5df02f5 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -59,6 +59,8 @@
 #define SDIO_DEVICE_ID_MARVELL_8797_F0		0x9128
 #define SDIO_DEVICE_ID_MARVELL_8887WLAN	0x9134
 
+#define SDIO_VENDOR_ID_MEDIATEK			0x037a
+
 #define SDIO_VENDOR_ID_SIANO			0x039a
 #define SDIO_DEVICE_ID_SIANO_NOVA_B0		0x0201
 #define SDIO_DEVICE_ID_SIANO_NICE		0x0202
-- 
cgit v1.2.3


From db0a390835209c1c5dce7669de3d23a8cba10f34 Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Thu, 14 Mar 2019 05:01:58 +0800
Subject: mmc: sdio: Add helper macro for sdio_driver boilerplate

This patch introduces the module_sdio_driver macro which is a convenience
macro for SDIO driver modules similar to module_usb_driver. It is intended
to be used by drivers whose init/exit section does nothing but register/
unregister the SDIO driver. By using this macro it is possible to eliminate
a few lines of boilerplate code per SDIO driver.

Suggested-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/linux/mmc/sdio_func.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 97ca105347a6..5685805533b5 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -111,6 +111,18 @@ struct sdio_driver {
 extern int sdio_register_driver(struct sdio_driver *);
 extern void sdio_unregister_driver(struct sdio_driver *);
 
+/**
+ * module_sdio_driver() - Helper macro for registering a SDIO driver
+ * @__sdio_driver: sdio_driver struct
+ *
+ * Helper macro for SDIO drivers which do not do anything special in module
+ * init/exit. This eliminates a lot of boilerplate. Each module may only
+ * use this macro once, and calling it replaces module_init() and module_exit()
+ */
+#define module_sdio_driver(__sdio_driver) \
+	module_driver(__sdio_driver, sdio_register_driver, \
+		      sdio_unregister_driver)
+
 /*
  * SDIO I/O operations
  */
-- 
cgit v1.2.3


From 089b19a9204fc090793d389a265f54124eacb05d Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 22 Apr 2019 08:55:44 -0700
Subject: flow_dissector: switch kernel context to struct bpf_flow_dissector

struct bpf_flow_dissector has a small subset of sk_buff fields that
flow dissector BPF program is allowed to access and an optional
pointer to real skb. Real skb is used only in bpf_skb_load_bytes
helper to read non-linear data.

The real motivation for this is to be able to call flow dissector
from eth_get_headlen context where we don't have an skb and need
to dissect raw bytes.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skbuff.h       |  4 ++++
 include/net/flow_dissector.h |  7 +++++++
 include/net/sch_generic.h    | 11 ++++-------
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6f42942a443b..2b7b8228c5c3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1275,6 +1275,10 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 }
 #endif
 
+struct bpf_flow_dissector;
+bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+		      __be16 proto, int nhoff, int hlen);
+
 struct bpf_flow_keys;
 bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
 			    const struct sk_buff *skb,
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 2b26979efb48..7c5a8d9a8d2a 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -305,4 +305,11 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec
 	return ((char *)target_container) + flow_dissector->offset[key_id];
 }
 
+struct bpf_flow_dissector {
+	struct bpf_flow_keys	*flow_keys;
+	const struct sk_buff	*skb;
+	void			*data;
+	void			*data_end;
+};
+
 #endif
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e8f85cd2afce..21f434f3ac9e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -364,13 +364,10 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-	union {
-		struct {
-			unsigned int		pkt_len;
-			u16			slave_dev_queue_mapping;
-			u16			tc_classid;
-		};
-		struct bpf_flow_keys *flow_keys;
+	struct {
+		unsigned int		pkt_len;
+		u16			slave_dev_queue_mapping;
+		u16			tc_classid;
 	};
 #define QDISC_CB_PRIV_LEN 20
 	unsigned char		data[QDISC_CB_PRIV_LEN];
-- 
cgit v1.2.3


From 3cbf4ffba5eeec60f82868a5facc1962d8a44d00 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 22 Apr 2019 08:55:46 -0700
Subject: net: plumb network namespace into __skb_flow_dissect

This new argument will be used in the next patches for the
eth_get_headlen use case. eth_get_headlen calls flow dissector
with only data (without skb) so there is currently no way to
pull attached BPF flow dissector program. With this new argument,
we can amend the callers to explicitly pass network namespace
so we can use attached BPF program.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skbuff.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2b7b8228c5c3..b466fbface2e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1284,7 +1284,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
 			    const struct sk_buff *skb,
 			    struct flow_dissector *flow_dissector,
 			    struct bpf_flow_keys *flow_keys);
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(const struct net *net,
+			const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container,
 			void *data, __be16 proto, int nhoff, int hlen,
@@ -1294,8 +1295,8 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
 				    void *target_container, unsigned int flags)
 {
-	return __skb_flow_dissect(skb, flow_dissector, target_container,
-				  NULL, 0, 0, 0, flags);
+	return __skb_flow_dissect(NULL, skb, flow_dissector,
+				  target_container, NULL, 0, 0, 0, flags);
 }
 
 static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
@@ -1303,18 +1304,19 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
 					      unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
-				  NULL, 0, 0, 0, flags);
+	return __skb_flow_dissect(NULL, skb, &flow_keys_dissector,
+				  flow, NULL, 0, 0, 0, flags);
 }
 
 static inline bool
-skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb,
+skb_flow_dissect_flow_keys_basic(const struct net *net,
+				 const struct sk_buff *skb,
 				 struct flow_keys_basic *flow, void *data,
 				 __be16 proto, int nhoff, int hlen,
 				 unsigned int flags)
 {
 	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow,
+	return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
 				  data, proto, nhoff, hlen, flags);
 }
 
@@ -2492,7 +2494,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb)
 	if (skb_transport_header_was_set(skb))
 		return;
 
-	if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
+	if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
+					     NULL, 0, 0, 0, 0))
 		skb_set_transport_header(skb, keys.control.thoff);
 }
 
-- 
cgit v1.2.3


From 9b52e3f267a6835efd50ed9002d530666d16a411 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 22 Apr 2019 08:55:47 -0700
Subject: flow_dissector: handle no-skb use case

When called without skb, gather all required data from the
__skb_flow_dissect's arguments and use the recently introduced
no-skb mode of the bpf flow dissector.

Note: WARN_ON_ONCE(!net) will now trigger for eth_get_headlen users.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skbuff.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b466fbface2e..998256c2820b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1279,11 +1279,6 @@ struct bpf_flow_dissector;
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 		      __be16 proto, int nhoff, int hlen);
 
-struct bpf_flow_keys;
-bool __skb_flow_bpf_dissect(struct bpf_prog *prog,
-			    const struct sk_buff *skb,
-			    struct flow_dissector *flow_dissector,
-			    struct bpf_flow_keys *flow_keys);
 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
-- 
cgit v1.2.3


From c43f1255b866b423d2381f77eaa2cbc64a9c49aa Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 22 Apr 2019 08:55:48 -0700
Subject: net: pass net_device argument to the eth_get_headlen

Update all users of eth_get_headlen to pass network device, fetch
network namespace from it and pass it down to the flow dissector.
This commit is a noop until administrator inserts BPF flow dissector
program.

Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: intel-wired-lan@lists.osuosl.org
Cc: Yisen Zhuang <yisen.zhuang@huawei.com>
Cc: Salil Mehta <salil.mehta@huawei.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Cc: Igor Russkikh <igor.russkikh@aquantia.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/etherdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index e2f3b21cd72a..c6c1930e28a0 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -33,7 +33,7 @@ struct device;
 int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
 unsigned char *arch_get_platform_mac_address(void);
 int nvmem_get_mac_address(struct device *dev, void *addrbuf);
-u32 eth_get_headlen(void *data, unsigned int max_len);
+u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len);
 __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern const struct header_ops eth_header_ops;
 
-- 
cgit v1.2.3


From a93f7fe134543649cf2e2d8fc2c50a8f4d742915 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Mon, 22 Apr 2019 21:52:23 +0800
Subject: net: phy: marvell: add new default led configure for m88e151x

The default m88e151x LED configuration is 0x1177, which uses LED[0]
for 1000M link, LED[1] for 100M link, and LED[2] for activity.
But some boards, which use LED[0] for link and LED[1] for
activity, prefer 0x1040. To be compatible with this case,
this patch defines a new dev_flag, and sets it before connecting
the phy in the HNS3 driver. During phy initialization, the new
LED configuration is used if this dev_flag is set.

Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/marvell_phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 73d04743a2bb..af6b11d4d673 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -34,5 +34,6 @@
 /* struct phy_device dev_flags definitions */
 #define MARVELL_PHY_M1145_FLAGS_RESISTANCE	0x00000001
 #define MARVELL_PHY_M1118_DNS323_LEDS		0x00000002
+#define MARVELL_PHY_LED0_LINK_LED1_ACTIVE	0x00000004
 
 #endif /* _MARVELL_PHY_H */
-- 
cgit v1.2.3


From c2273219baa5097a4d7c1c162b992623534f34c1 Mon Sep 17 00:00:00 2001
From: Shay Agroskin <shayag@mellanox.com>
Date: Thu, 14 Mar 2019 14:54:07 +0200
Subject: net/mlx5e: XDP, Inline small packets into the TX MPWQE in XDP xmit
 flow

Upon high packet rate with multiple CPUs TX workloads, much of the HCA's
resources are spent on prefetching TX descriptors, thus affecting
transmission rates.
This patch comes to mitigate this problem by moving some workload to the
CPU and reducing the HW data prefetch overhead for small packets (<= 256B).

When forwarding packets with XDP, a packet that is smaller
than a certain size (set to ~256 bytes) would be sent inline within
its WQE TX descriptor (mem-copied), when the hardware tx queue is congested
beyond a pre-defined water-mark.

This is added to better utilize the HW resources (which now makes
one less packet data prefetch) and allow better scalability, at the
expense of CPU usage (which now 'memcpy's the packet into the WQE).

To load balance between HW and CPU and get max packet rate, we use
watermarks to detect how much the HW is congested and move the work
loads back and forth between HW and CPU.

Performance:
Tested packet rate for UDP 64Byte multi-stream
over two dual port ConnectX-5 100Gbps NICs.
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

* Tested with hyper-threading disabled

XDP_TX:

|          | before | after   |       |
| 24 rings | 51Mpps | 116Mpps | +126% |
| 1 ring   | 12Mpps | 12Mpps  | same  |

XDP_REDIRECT:

** Below is the transmit rate, not the redirection rate
which might be larger, and is not affected by this patch.

|          | before  | after   |      |
| 32 rings | 64Mpps  | 92Mpps  | +43% |
| 1 ring   | 6.4Mpps | 6.4Mpps | same |

As we can see, the feature significantly improves scaling, without
hurting single ring performance.

Signed-off-by: Shay Agroskin <shayag@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/qp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 0343c81d4c5f..3ba4edbd17a6 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -395,6 +395,7 @@ struct mlx5_wqe_signature_seg {
 
 struct mlx5_wqe_inline_seg {
 	__be32	byte_count;
+	__be32	data[0];
 };
 
 enum mlx5_sig_type {
-- 
cgit v1.2.3


From f05713e0916ca46f127641b6afa74bd1a0772423 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 22 Apr 2019 18:35:03 -0700
Subject: ipv6: convert fib6_ref to refcount_t

We suspect some issues involving fib6_ref 0 -> 1 transitions might
cause strange syzbot reports.

Let's convert fib6_ref to refcount_t to catch them earlier.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Wei Wang <weiwan@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 352f767bea81..5a4a67b38712 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -146,7 +146,7 @@ struct fib6_info {
 	struct list_head		fib6_siblings;
 	unsigned int			fib6_nsiblings;
 
-	atomic_t			fib6_ref;
+	refcount_t			fib6_ref;
 	unsigned long			expires;
 	struct dst_metrics		*fib6_metrics;
 #define fib6_pmtu		fib6_metrics->metrics[RTAX_MTU-1]
@@ -284,17 +284,17 @@ void fib6_info_destroy_rcu(struct rcu_head *head);
 
 static inline void fib6_info_hold(struct fib6_info *f6i)
 {
-	atomic_inc(&f6i->fib6_ref);
+	refcount_inc(&f6i->fib6_ref);
 }
 
 static inline bool fib6_info_hold_safe(struct fib6_info *f6i)
 {
-	return atomic_inc_not_zero(&f6i->fib6_ref);
+	return refcount_inc_not_zero(&f6i->fib6_ref);
 }
 
 static inline void fib6_info_release(struct fib6_info *f6i)
 {
-	if (f6i && atomic_dec_and_test(&f6i->fib6_ref))
+	if (f6i && refcount_dec_and_test(&f6i->fib6_ref))
 		call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
 }
 
-- 
cgit v1.2.3


From ffa8ce54be3aaf6b15abae3bbd08282b867d3a4f Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 08:23:41 -0700
Subject: lwtunnel: Pass encap and encap type attributes to lwtunnel_fill_encap

Currently, lwtunnel_fill_encap hardcodes the encap and encap type
attributes as RTA_ENCAP and RTA_ENCAP_TYPE, respectively. The nexthop
objects want to re-use this code but the encap attributes are passed to
userspace as NHA_ENCAP and NHA_ENCAP_TYPE. Since that is the only
difference, change lwtunnel_fill_encap to take the attribute type as
an input.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 671113bcb2cc..5d6c5b1fc695 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -118,8 +118,8 @@ int lwtunnel_build_state(u16 encap_type,
 			 unsigned int family, const void *cfg,
 			 struct lwtunnel_state **lws,
 			 struct netlink_ext_ack *extack);
-int lwtunnel_fill_encap(struct sk_buff *skb,
-			struct lwtunnel_state *lwtstate);
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
+			int encap_attr, int encap_type_attr);
 int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate);
 struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
@@ -219,7 +219,8 @@ static inline int lwtunnel_build_state(u16 encap_type,
 }
 
 static inline int lwtunnel_fill_encap(struct sk_buff *skb,
-				      struct lwtunnel_state *lwtstate)
+				      struct lwtunnel_state *lwtstate,
+				      int encap_attr, int encap_type_attr)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From ecc5663cce8c7d7e4eba32af4e1e3cab296c64b9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 08:48:09 -0700
Subject: net: Change nhc_flags to unsigned char

nhc_flags holds the RTNH_F flags for a given nexthop (fib{6}_nh).
All of the RTNH_F_ flags fit in an unsigned char, and since the API to
userspace (rtnh_flags and lower byte of rtm_flags) is 1 byte it can not
grow. Make nhc_flags in fib_nh_common an unsigned char and shrink the
size of the struct by 8, from 56 to 48 bytes.

Update the flags arguments for up netdevice events and fib_nexthop_info
which determines the RTNH_F flags to return on a dump/event. The RTNH_F
flags are passed in the lower byte of rtm_flags which is an unsigned int
so use a temp variable for the flags to fib_nexthop_info and combine
with rtm_flags in the caller.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 include/net/ip_fib.h    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index df9cebc2b20c..4790beaa86e0 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -182,7 +182,7 @@ int rt6_dump_route(struct fib6_info *f6i, void *p_arg);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
-void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
+void rt6_sync_up(struct net_device *dev, unsigned char nh_flags);
 void rt6_disable_ip(struct net_device *dev, unsigned long event);
 void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
 void rt6_multipath_rebalance(struct fib6_info *f6i);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index d8195c77e247..772a9e61bd84 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -83,11 +83,11 @@ struct fnhe_hash_bucket {
 struct fib_nh_common {
 	struct net_device	*nhc_dev;
 	int			nhc_oif;
-	unsigned int		nhc_flags;
-	struct lwtunnel_state	*nhc_lwtstate;
 	unsigned char		nhc_scope;
 	u8			nhc_family;
 	u8			nhc_gw_family;
+	unsigned char		nhc_flags;
+	struct lwtunnel_state	*nhc_lwtstate;
 
 	union {
 		__be32          ipv4;
@@ -425,7 +425,7 @@ int fib_unmerge(struct net *net);
 int ip_fib_check_default(__be32 gw, struct net_device *dev);
 int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force);
 int fib_sync_down_addr(struct net_device *dev, __be32 local);
-int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags);
 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu);
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -500,7 +500,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct netlink_callback *cb);
 
 int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh,
-		     unsigned int *flags, bool skip_oif);
+		     unsigned char *flags, bool skip_oif);
 int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh,
 		    int nh_weight);
 #endif  /* _NET_FIB_H */
-- 
cgit v1.2.3


From a65120bae4b7425a39c5783aa3d4fc29677eef0e Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 23 Apr 2019 18:05:33 -0700
Subject: ipv6: Use result arg in fib_lookup_arg consistently

arg.result is sometimes used as fib6_result and sometimes used to
hold the rt6_info. Add rt6_info to fib6_result and make the use
of arg.result consistent through ipv6 rules.

The rt6 entry is filled in for lookups returning a dst_entry, but not
for direct fib_lookups that just want a fib6_info.

Fixes: effda4dd97e8 ("ipv6: Pass fib6_result to fib lookups")
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a4a67b38712..40105738e2f6 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -195,6 +195,7 @@ struct fib6_result {
 	struct fib6_info	*f6i;
 	u32			fib6_flags;
 	u8			fib6_type;
+	struct rt6_info		*rt6;
 };
 
 #define for_each_fib6_node_rt_rcu(fn)					\
-- 
cgit v1.2.3


From 9fba2b9b4f1566213a4f0cec658479d8915086fa Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Sun, 31 Mar 2019 19:44:43 +0300
Subject: net/mlx5: Expose SW ICM related device memory capabilities

Add SW ICM related fields to the device memory capabilities
structure and sw ownership capability in flow table properties.

The currently supported SW ICM types are steering and header modify
and the change exposes the device memory capabilities for each
of these two types.

SW ICM memory can be allocated by SW and then be accessed by RDMA
operations for direct management of the HW packet handling tables.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 45 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 11e498442134..d96eb0916a44 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -80,6 +80,14 @@ enum {
 	MLX5_SHARED_RESOURCE_UID = 0xffff,
 };
 
+enum {
+	MLX5_OBJ_TYPE_SW_ICM = 0x0008,
+};
+
+enum {
+	MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM),
+};
+
 enum {
 	MLX5_CMD_OP_QUERY_HCA_CAP                 = 0x100,
 	MLX5_CMD_OP_QUERY_ADAPTER                 = 0x101,
@@ -357,7 +365,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8         pop_vlan_2[0x1];
 	u8         push_vlan_2[0x1];
 	u8	   reformat_and_vlan_action[0x1];
-	u8	   reserved_at_10[0x2];
+	u8	   reserved_at_10[0x1];
+	u8         sw_owner[0x1];
 	u8	   reformat_l3_tunnel_to_l2[0x1];
 	u8	   reformat_l2_to_l3_tunnel[0x1];
 	u8	   reformat_and_modify_action[0x1];
@@ -770,7 +779,19 @@ struct mlx5_ifc_device_mem_cap_bits {
 
 	u8         max_memic_size[0x20];
 
-	u8         reserved_at_c0[0x740];
+	u8         steering_sw_icm_start_address[0x40];
+
+	u8         reserved_at_100[0x8];
+	u8         log_header_modify_sw_icm_size[0x8];
+	u8         reserved_at_110[0x2];
+	u8         log_sw_icm_alloc_granularity[0x6];
+	u8         log_steering_sw_icm_size[0x8];
+
+	u8         reserved_at_120[0x20];
+
+	u8         header_modify_sw_icm_start_address[0x40];
+
+	u8         reserved_at_180[0x680];
 };
 
 enum {
@@ -919,6 +940,7 @@ enum {
 
 enum {
 	MLX5_UCTX_CAP_RAW_TX = 1UL << 0,
+	MLX5_UCTX_CAP_INTERNAL_DEV_RES = 1UL << 1,
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
@@ -2920,6 +2942,7 @@ enum {
 	MLX5_MKC_ACCESS_MODE_MTT   = 0x1,
 	MLX5_MKC_ACCESS_MODE_KLMS  = 0x2,
 	MLX5_MKC_ACCESS_MODE_KSM   = 0x3,
+	MLX5_MKC_ACCESS_MODE_SW_ICM = 0x4,
 	MLX5_MKC_ACCESS_MODE_MEMIC = 0x5,
 };
 
@@ -9491,6 +9514,19 @@ struct mlx5_ifc_uctx_bits {
 	u8         reserved_at_20[0x160];
 };
 
+struct mlx5_ifc_sw_icm_bits {
+	u8         modify_field_select[0x40];
+
+	u8	   reserved_at_40[0x18];
+	u8         log_sw_icm_size[0x8];
+
+	u8         reserved_at_60[0x20];
+
+	u8         sw_icm_start_addr[0x40];
+
+	u8         reserved_at_c0[0x140];
+};
+
 struct mlx5_ifc_create_umem_in_bits {
 	u8         opcode[0x10];
 	u8         uid[0x10];
@@ -9528,6 +9564,11 @@ struct mlx5_ifc_destroy_uctx_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_create_sw_icm_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits   hdr;
+	struct mlx5_ifc_sw_icm_bits		      sw_icm;
+};
+
 struct mlx5_ifc_mtrc_string_db_param_bits {
 	u8         string_db_base_address[0x20];
 
-- 
cgit v1.2.3


From 3e07047021d36674d9051e76454e8b6a3b599036 Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Sun, 31 Mar 2019 19:44:48 +0300
Subject: net/mlx5: Expose TIR ICM address in command outbox

Adding the TIR ICM address to the create_tir command outbox
through which the device reports the ICM address of the newly
created TIR.

The TIR address can be used for direct attachment to a steering
rule in SW managed steering mode.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index d96eb0916a44..4b37519bd6a5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -6897,14 +6897,14 @@ struct mlx5_ifc_create_tis_in_bits {
 
 struct mlx5_ifc_create_tir_out_bits {
 	u8         status[0x8];
-	u8         reserved_at_8[0x18];
+	u8         icm_address_63_40[0x18];
 
 	u8         syndrome[0x20];
 
-	u8         reserved_at_40[0x8];
+	u8         icm_address_39_32[0x8];
 	u8         tirn[0x18];
 
-	u8         reserved_at_60[0x20];
+	u8         icm_address_31_0[0x20];
 };
 
 struct mlx5_ifc_create_tir_in_bits {
-- 
cgit v1.2.3


From 96780e4f46b2fc0fc5ae2b95957002e2c42b11d3 Mon Sep 17 00:00:00 2001
From: Ariel Levkovich <lariel@mellanox.com>
Date: Sun, 31 Mar 2019 19:44:49 +0300
Subject: net/mlx5: Introduce new TIR creation core API

Introducing new TIR creation core API which allows caller
to receive back from the call the full command outbox.

This comes as a preparation for the next patch that will
retrieve the TIR ICM address from the command outbox.

Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
Reviewed-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/transobj.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h
index a261d5528ff7..dc6b1e7cb8c4 100644
--- a/include/linux/mlx5/transobj.h
+++ b/include/linux/mlx5/transobj.h
@@ -50,6 +50,9 @@ int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out);
 int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state);
 int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen,
 			 u32 *tirn);
+int mlx5_core_create_tir_out(struct mlx5_core_dev *dev,
+			     u32 *in, int inlen,
+			     u32 *out, int outlen);
 int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in,
 			 int inlen);
 void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn);
-- 
cgit v1.2.3


From d5bb334a8e171b262e48f378bd2096c0ea458265 Mon Sep 17 00:00:00 2001
From: Marcel Holtmann <marcel@holtmann.org>
Date: Wed, 24 Apr 2019 22:19:17 +0200
Subject: Bluetooth: Align minimum encryption key size for LE and BR/EDR
 connections

The minimum encryption key size for LE connections is 56 bits and to
align LE with BR/EDR, enforce 56 bits of minimum encryption key size for
BR/EDR connections as well.

Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
Signed-off-by: Johan Hedberg <johan.hedberg@intel.com>
Cc: stable@vger.kernel.org
---
 include/net/bluetooth/hci_core.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 094e61e07030..05b1b96f4d9e 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -190,6 +190,9 @@ struct adv_info {
 
 #define HCI_MAX_SHORT_NAME_LENGTH	10
 
+/* Min encryption key size to match with SMP */
+#define HCI_MIN_ENC_KEY_SIZE		7
+
 /* Default LE RPA expiry time, 15 minutes */
 #define HCI_DEFAULT_RPA_TIMEOUT		(15 * 60)
 
-- 
cgit v1.2.3


From 118c8e9ae629d35fa9b3d27a7b9d59298b1b4183 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 25 Apr 2019 14:37:23 -0700
Subject: bpf: support BPF_PROG_QUERY for BPF_FLOW_DISSECTOR attach_type

target_fd is target namespace. If there is a flow dissector BPF program
attached to that namespace, its (single) id is returned.

v5:
* drop net ref right after rcu unlock (Daniel Borkmann)

v4:
* add missing put_net (Jann Horn)

v3:
* add missing inline to skb_flow_dissector_prog_query static def
  (kbuild test robot <lkp@intel.com>)

v2:
* don't sleep in rcu critical section (Jakub Kicinski)
* check input prog_cnt (exit early)

Cc: Jann Horn <jannh@google.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skbuff.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 998256c2820b..6d58fa8a65fd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1258,11 +1258,19 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     unsigned int key_count);
 
 #ifdef CONFIG_NET
+int skb_flow_dissector_prog_query(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr);
 int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
 				       struct bpf_prog *prog);
 
 int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
 #else
+static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr,
+						union bpf_attr __user *uattr)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
 						     struct bpf_prog *prog)
 {
-- 
cgit v1.2.3


From 1d9373329bcbe0cfbb9af1738139292a9b10fe6a Mon Sep 17 00:00:00 2001
From: Shaul Triebitz <shaul.triebitz@intel.com>
Date: Fri, 15 Mar 2019 17:38:58 +0200
Subject: nl80211: increase NL80211_MAX_SUPP_REG_RULES

The iwlwifi driver creates one rule per channel, thus it needs more
rules than normal.  To solve this, increase NL80211_MAX_SUPP_REG_RULES
so iwlwifi can also fit UHB (ultra high band) channels.

Signed-off-by: Shaul Triebitz <shaul.triebitz@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index a99d75bef598..f00dbd82149e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -11,7 +11,7 @@
  * Copyright 2008 Jouni Malinen <jouni.malinen@atheros.com>
  * Copyright 2008 Colin McCabe <colin@cozybit.com>
  * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
+ * Copyright (C) 2018-2019 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -2809,7 +2809,7 @@ enum nl80211_attrs {
 
 #define NL80211_MAX_SUPP_RATES			32
 #define NL80211_MAX_SUPP_HT_RATES		77
-#define NL80211_MAX_SUPP_REG_RULES		64
+#define NL80211_MAX_SUPP_REG_RULES		128
 #define NL80211_TKIP_DATA_OFFSET_ENCR_KEY	0
 #define NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY	16
 #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY	24
-- 
cgit v1.2.3


From f7dacfb11475ba777e1e84ccec2e14b0ba5a17a3 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Fri, 15 Mar 2019 17:39:03 +0200
Subject: cfg80211: support non-inheritance element

Subelement profile may specify element IDs it doesn't inherit
from the management frame. Support it.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 1 +
 include/net/cfg80211.h    | 8 ++++++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 48703ec60d06..522881f31938 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2487,6 +2487,7 @@ enum ieee80211_eid_ext {
 	WLAN_EID_EXT_HE_MU_EDCA = 38,
 	WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME = 52,
 	WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55,
+	WLAN_EID_EXT_NON_INHERITANCE = 56,
 };
 
 /* Action category code */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 70432fd638af..777c4f021610 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5534,6 +5534,14 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
 	u64_to_ether_addr(new_bssid_u64, new_bssid);
 }
 
+/**
+ * cfg80211_is_element_inherited - returns if element ID should be inherited
+ * @element: element to check
+ * @non_inherit_element: non inheritance element
+ */
+bool cfg80211_is_element_inherited(const struct element *element,
+				   const struct element *non_inherit_element);
+
 /**
  * enum cfg80211_bss_frame_type - frame type that the BSS data came from
  * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is
-- 
cgit v1.2.3


From fe806e4992c9047affd263bcc13b2c047029a726 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Fri, 15 Mar 2019 17:39:05 +0200
Subject: cfg80211: support profile split between elements

Since an element is limited to 255 octets, a profile may be split
into several elements. Support the split as defined in the 11ax
draft 3. Detect legacy split and print a net-rate limited warning,
since there is no ROI in supporting this probably non-existent
split.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 777c4f021610..f6665f8eba5a 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5542,6 +5542,20 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
 bool cfg80211_is_element_inherited(const struct element *element,
 				   const struct element *non_inherit_element);
 
+/**
+ * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs
+ * @ie: ies
+ * @ielen: length of IEs
+ * @mbssid_elem: current MBSSID element
+ * @sub_elem: current MBSSID subelement (profile)
+ * @merged_ie: location of the merged profile
+ * @max_copy_len: max merged profile length
+ */
+size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
+			      const struct element *mbssid_elem,
+			      const struct element *sub_elem,
+			      u8 **merged_ie, size_t max_copy_len);
+
 /**
  * enum cfg80211_bss_frame_type - frame type that the BSS data came from
  * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is
-- 
cgit v1.2.3


From abaea61c79ea7a03fde7db5b48414143546b07c4 Mon Sep 17 00:00:00 2001
From: Liad Kaufman <liad.kaufman@intel.com>
Date: Fri, 15 Mar 2019 17:39:07 +0200
Subject: ieee80211: update HE IEs to D4.0 spec

Update the out-dated comments as well, and have them point to
the correct sections in the D4.0 spec.

Signed-off-by: Liad Kaufman <liad.kaufman@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 522881f31938..61f0a316c6ac 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1557,7 +1557,7 @@ struct ieee80211_vht_operation {
  * struct ieee80211_he_cap_elem - HE capabilities element
  *
  * This structure is the "HE capabilities element" fixed fields as
- * described in P802.11ax_D3.0 section 9.4.2.237.2 and 9.4.2.237.3
+ * described in P802.11ax_D4.0 section 9.4.2.242.2 and 9.4.2.242.3
  */
 struct ieee80211_he_cap_elem {
 	u8 mac_cap_info[6];
@@ -1619,12 +1619,12 @@ struct ieee80211_he_mcs_nss_supp {
  * struct ieee80211_he_operation - HE capabilities element
  *
  * This structure is the "HE operation element" fields as
- * described in P802.11ax_D3.0 section 9.4.2.238
+ * described in P802.11ax_D4.0 section 9.4.2.243
  */
 struct ieee80211_he_operation {
 	__le32 he_oper_params;
 	__le16 he_mcs_nss_set;
-	/* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */
+	/* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */
 	u8 optional[0];
 } __packed;
 
@@ -1632,7 +1632,7 @@ struct ieee80211_he_operation {
  * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field
  *
  * This structure is the "MU AC Parameter Record" fields as
- * described in P802.11ax_D2.0 section 9.4.2.240
+ * described in P802.11ax_D4.0 section 9.4.2.245
  */
 struct ieee80211_he_mu_edca_param_ac_rec {
 	u8 aifsn;
@@ -1644,7 +1644,7 @@ struct ieee80211_he_mu_edca_param_ac_rec {
  * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element
  *
  * This structure is the "MU EDCA Parameter Set element" fields as
- * described in P802.11ax_D2.0 section 9.4.2.240
+ * described in P802.11ax_D4.0 section 9.4.2.245
  */
 struct ieee80211_mu_edca_param_set {
 	u8 mu_qos_info;
@@ -2026,6 +2026,7 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info)
 #define IEEE80211_HE_OPERATION_VHT_OPER_INFO			0x00004000
 #define IEEE80211_HE_OPERATION_CO_HOSTED_BSS			0x00008000
 #define IEEE80211_HE_OPERATION_ER_SU_DISABLE			0x00010000
+#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO			0x00020000
 #define IEEE80211_HE_OPERATION_BSS_COLOR_MASK			0x3f000000
 #define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET		24
 #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x40000000
@@ -2056,6 +2057,8 @@ ieee80211_he_oper_size(const u8 *he_oper_ie)
 		oper_len += 3;
 	if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS)
 		oper_len++;
+	if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)
+		oper_len += 4;
 
 	/* Add the first byte (extension ID) to the total length */
 	oper_len++;
-- 
cgit v1.2.3


From f2af2df800d3648b1d68e02d5b8a5d77cfee8970 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 16 Mar 2019 18:06:32 +0100
Subject: mac80211: calculate hash for fq without holding fq->lock in itxq
 enqueue
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reduces lock contention on enqueue/dequeue of iTXQ packets

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/fq_impl.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index be7c0fab3478..2caa86660ab0 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -107,21 +107,23 @@ begin:
 	return skb;
 }
 
+static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb)
+{
+	u32 hash = skb_get_hash_perturb(skb, fq->perturbation);
+
+	return reciprocal_scale(hash, fq->flows_cnt);
+}
+
 static struct fq_flow *fq_flow_classify(struct fq *fq,
-					struct fq_tin *tin,
+					struct fq_tin *tin, u32 idx,
 					struct sk_buff *skb,
 					fq_flow_get_default_t get_default_func)
 {
 	struct fq_flow *flow;
-	u32 hash;
-	u32 idx;
 
 	lockdep_assert_held(&fq->lock);
 
-	hash = skb_get_hash_perturb(skb, fq->perturbation);
-	idx = reciprocal_scale(hash, fq->flows_cnt);
 	flow = &fq->flows[idx];
-
 	if (flow->tin && flow->tin != tin) {
 		flow = get_default_func(fq, tin, idx, skb);
 		tin->collisions++;
@@ -153,7 +155,7 @@ static void fq_recalc_backlog(struct fq *fq,
 }
 
 static void fq_tin_enqueue(struct fq *fq,
-			   struct fq_tin *tin,
+			   struct fq_tin *tin, u32 idx,
 			   struct sk_buff *skb,
 			   fq_skb_free_t free_func,
 			   fq_flow_get_default_t get_default_func)
@@ -163,7 +165,7 @@ static void fq_tin_enqueue(struct fq *fq,
 
 	lockdep_assert_held(&fq->lock);
 
-	flow = fq_flow_classify(fq, tin, skb, get_default_func);
+	flow = fq_flow_classify(fq, tin, idx, skb, get_default_func);
 
 	flow->tin = tin;
 	flow->backlog += skb->len;
-- 
cgit v1.2.3


From 6cdd3979a2bdc16116c5b2eb09475abf54ba9e70 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Tue, 19 Mar 2019 21:34:07 +0100
Subject: nl80211/cfg80211: Extended Key ID support

Add support for IEEE 802.11-2016 "Extended Key ID for Individually
Addressed Frames".

Extend cfg80211 and nl80211 to allow pairwise keys to be installed for
Rx only, enable Tx separately and allow Key ID 1 for pairwise keys.

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
[use NLA_POLICY_RANGE() for NL80211_KEY_MODE]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  2 ++
 include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index f6665f8eba5a..2b039802ae2e 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -485,6 +485,7 @@ struct vif_params {
  *	with the get_key() callback, must be in little endian,
  *	length given by @seq_len.
  * @seq_len: length of @seq.
+ * @mode: key install mode (RX_TX, NO_TX or SET_TX)
  */
 struct key_params {
 	const u8 *key;
@@ -492,6 +493,7 @@ struct key_params {
 	int key_len;
 	int seq_len;
 	u32 cipher;
+	enum nl80211_key_mode mode;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f00dbd82149e..e75615bf4453 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4152,6 +4152,27 @@ enum nl80211_channel_type {
 	NL80211_CHAN_HT40PLUS
 };
 
+/**
+ * enum nl80211_key_mode - Key mode
+ *
+ * @NL80211_KEY_RX_TX: (Default)
+ *	Key can be used for Rx and Tx immediately
+ *
+ * The following modes can only be selected for unicast keys and when the
+ * driver supports @NL80211_EXT_FEATURE_EXT_KEY_ID:
+ *
+ * @NL80211_KEY_NO_TX: Only allowed in combination with @NL80211_CMD_NEW_KEY:
+ *	Unicast key can only be used for Rx, Tx not allowed, yet
+ * @NL80211_KEY_SET_TX: Only allowed in combination with @NL80211_CMD_SET_KEY:
+ *	The unicast key identified by idx and mac is cleared for Tx and becomes
+ *	the preferred Tx key for the station.
+ */
+enum nl80211_key_mode {
+	NL80211_KEY_RX_TX,
+	NL80211_KEY_NO_TX,
+	NL80211_KEY_SET_TX
+};
+
 /**
  * enum nl80211_chan_width - channel width definitions
  *
@@ -4395,6 +4416,9 @@ enum nl80211_key_default_types {
  * @NL80211_KEY_DEFAULT_TYPES: A nested attribute containing flags
  *	attributes, specifying what a key should be set as default as.
  *	See &enum nl80211_key_default_types.
+ * @NL80211_KEY_MODE: the mode from enum nl80211_key_mode.
+ *	Defaults to @NL80211_KEY_RX_TX.
+ *
  * @__NL80211_KEY_AFTER_LAST: internal
  * @NL80211_KEY_MAX: highest key attribute
  */
@@ -4408,6 +4432,7 @@ enum nl80211_key_attributes {
 	NL80211_KEY_DEFAULT_MGMT,
 	NL80211_KEY_TYPE,
 	NL80211_KEY_DEFAULT_TYPES,
+	NL80211_KEY_MODE,
 
 	/* keep last */
 	__NL80211_KEY_AFTER_LAST,
@@ -5353,6 +5378,8 @@ enum nl80211_feature_flags {
  *      able to rekey an in-use key correctly. Userspace must not rekey PTK keys
  *      if this flag is not set. Ignoring this can leak clear text packets and/or
  *      freeze the connection.
+ * @NL80211_EXT_FEATURE_EXT_KEY_ID: Driver supports "Extended Key ID for
+ *      Individually Addressed Frames" from IEEE802.11-2016.
  *
  * @NL80211_EXT_FEATURE_AIRTIME_FAIRNESS: Driver supports getting airtime
  *	fairness for transmitted packets and has enabled airtime fairness
@@ -5406,6 +5433,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_AIRTIME_FAIRNESS,
 	NL80211_EXT_FEATURE_AP_PMKSA_CACHING,
 	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD,
+	NL80211_EXT_FEATURE_EXT_KEY_ID,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3


From 96fc6efb9ad9d0cd8cbb4462f0eb2a07092649e6 Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Tue, 19 Mar 2019 21:34:08 +0100
Subject: mac80211: IEEE 802.11 Extended Key ID support

Add support for Extended Key ID as defined in IEEE 802.11-2016.

 - Implement the nl80211 API for Extended Key ID
 - Extend mac80211 API to allow drivers to support Extended Key ID
 - Enable Extended Key ID by default for drivers only supporting SW
   crypto (e.g. mac80211_hwsim)
 - Allow unicast Tx usage to be suppressed (IEEE80211_KEY_FLAG_NO_AUTO_TX)
 - Select the decryption key based on the MPDU keyid
 - Enforce existing assumptions in the code that rekeys don't change the
   cipher

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
[remove module parameter]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ac2ed8ec662b..c10abca55fde 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1697,6 +1697,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);
  * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for
  *	a TKIP key if it only requires MIC space. Do not set together with
  *	@IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key.
+ * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation.
  */
 enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_GENERATE_IV_MGMT	= BIT(0),
@@ -1708,6 +1709,7 @@ enum ieee80211_key_flags {
 	IEEE80211_KEY_FLAG_RX_MGMT		= BIT(6),
 	IEEE80211_KEY_FLAG_RESERVE_TAILROOM	= BIT(7),
 	IEEE80211_KEY_FLAG_PUT_MIC_SPACE	= BIT(8),
+	IEEE80211_KEY_FLAG_NO_AUTO_TX		= BIT(9),
 };
 
 /**
@@ -2243,6 +2245,9 @@ struct ieee80211_txq {
  * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID
  *	only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set.
  *
+ * @IEEE80211_HW_EXT_KEY_ID_NATIVE: Driver and hardware are supporting Extended
+ *	Key ID and can handle two unicast keys per station for Rx and Tx.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2294,6 +2299,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN,
 	IEEE80211_HW_SUPPORTS_MULTI_BSSID,
 	IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID,
+	IEEE80211_HW_EXT_KEY_ID_NATIVE,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
-- 
cgit v1.2.3


From e96d1cd2635c05efdd01b4eafcfc50c22c40751f Mon Sep 17 00:00:00 2001
From: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Date: Fri, 29 Mar 2019 16:18:21 +0530
Subject: cfg80211: Add support to set tx power for a station associated

This patch adds transmit power setting type and transmit power level
attributes to NL80211_CMD_SET_STATION in order to facilitate adjusting
the transmit power level of a station associated with the AP.

The added attributes allow selection of automatic and limited transmit
power level, with the level defined in dBm format.

Co-developed-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 22 ++++++++++++++++++++++
 include/uapi/linux/nl80211.h | 15 +++++++++++++++
 2 files changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2b039802ae2e..2ea04e94b522 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -975,6 +975,27 @@ enum station_parameters_apply_mask {
 	STATION_PARAM_APPLY_UAPSD = BIT(0),
 	STATION_PARAM_APPLY_CAPABILITY = BIT(1),
 	STATION_PARAM_APPLY_PLINK_STATE = BIT(2),
+	STATION_PARAM_APPLY_STA_TXPOWER = BIT(3),
+};
+
+/**
+ * struct sta_txpwr - station txpower configuration
+ *
+ * Used to configure txpower for station.
+ *
+ * @power: tx power (in dBm) to be used for sending data traffic. If tx power
+ *	is not provided, the default per-interface tx power setting will be
+ *	overriding. Driver should be picking up the lowest tx power, either tx
+ *	power per-interface or per-station.
+ * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
+ *	will be less than or equal to specified from userspace, whereas if TPC
+ *	%type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power.
+ *	NL80211_TX_POWER_FIXED is not a valid configuration option for
+ *	per peer TPC.
+ */
+struct sta_txpwr {
+	s16 power;
+	enum nl80211_tx_power_setting type;
 };
 
 /**
@@ -1049,6 +1070,7 @@ struct station_parameters {
 	const struct ieee80211_he_cap_elem *he_capa;
 	u8 he_capa_len;
 	u16 airtime_weight;
+	struct sta_txpwr txpwr;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e75615bf4453..25f70dd2b583 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2315,6 +2315,15 @@ enum nl80211_commands {
  * @NL80211_ATTR_AIRTIME_WEIGHT: Station's weight when scheduled by the airtime
  *	scheduler.
  *
+ * @NL80211_ATTR_STA_TX_POWER_SETTING: Transmit power setting type (u8) for
+ *	station associated with the AP. See &enum nl80211_tx_power_setting for
+ *	possible values.
+ * @NL80211_ATTR_STA_TX_POWER: Transmit power level (s16) in dBm units. This
+ *	allows to set Tx power for a station. If this attribute is not included,
+ *	the default per-interface tx power setting will be overriding. Driver
+ *	should be picking up the lowest tx power, either tx power per-interface
+ *	or per-station.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2765,6 +2774,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_PEER_MEASUREMENTS,
 
 	NL80211_ATTR_AIRTIME_WEIGHT,
+	NL80211_ATTR_STA_TX_POWER_SETTING,
+	NL80211_ATTR_STA_TX_POWER,
 
 	/* add attributes here, update the policy in nl80211.c */
 
@@ -5391,6 +5402,9 @@ enum nl80211_feature_flags {
  * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports
  *	filtering of sched scan results using band specific RSSI thresholds.
  *
+ * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power
+ *	to a station.
+ *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
@@ -5434,6 +5448,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_AP_PMKSA_CACHING,
 	NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD,
 	NL80211_EXT_FEATURE_EXT_KEY_ID,
+	NL80211_EXT_FEATURE_STA_TX_PWR,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3


From ba905bf432f662cb907fd692a4f160e612c0408b Mon Sep 17 00:00:00 2001
From: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Date: Fri, 29 Mar 2019 16:19:09 +0530
Subject: mac80211: store tx power value from user to station

This patch introduces a new driver callback drv_sta_set_txpwr. This API will
copy the transmit power value passed from user space and call the driver
callback to set the tx power for the station.

Co-developed-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Ashok Raj Nagarajan <arnagara@codeaurora.org>
Signed-off-by: Balaji Pothunoori <bpothuno@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c10abca55fde..d66fbfe8d55d 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1889,6 +1889,24 @@ struct ieee80211_sta_rates {
 	} rate[IEEE80211_TX_RATE_TABLE_SIZE];
 };
 
+/**
+ * struct ieee80211_sta_txpwr - station txpower configuration
+ *
+ * Used to configure txpower for station.
+ *
+ * @power: indicates the tx power, in dBm, to be used when sending data frames
+ *	to the STA.
+ * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power
+ *	will be less than or equal to specified from userspace, whereas if TPC
+ *	%type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power.
+ *	NL80211_TX_POWER_FIXED is not a valid configuration option for
+ *	per peer TPC.
+ */
+struct ieee80211_sta_txpwr {
+	s16 power;
+	enum nl80211_tx_power_setting type;
+};
+
 /**
  * struct ieee80211_sta - station table entry
  *
@@ -1975,6 +1993,7 @@ struct ieee80211_sta {
 	bool support_p2p_ps;
 	u16 max_rc_amsdu_len;
 	u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS];
+	struct ieee80211_sta_txpwr txpwr;
 
 	struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
 
@@ -3800,6 +3819,9 @@ struct ieee80211_ops {
 #endif
 	void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			enum sta_notify_cmd, struct ieee80211_sta *sta);
+	int (*sta_set_txpwr)(struct ieee80211_hw *hw,
+			     struct ieee80211_vif *vif,
+			     struct ieee80211_sta *sta);
 	int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 			 struct ieee80211_sta *sta,
 			 enum ieee80211_sta_state old_state,
-- 
cgit v1.2.3


From 5809a5d54bb9eda3a388b5a712657970c2cb9f8e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 11 Apr 2019 11:59:50 +0300
Subject: cfg80211: don't pass pointer to pointer unnecessarily

The cfg80211_merge_profile() and ieee802_11_find_bssid_profile() are
a bit cleaner if we just pass the merged_ie pointer instead of a pointer
to the pointer.

This isn't a functional change, it's just a clean up.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2ea04e94b522..944de1802210 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -5578,7 +5578,7 @@ bool cfg80211_is_element_inherited(const struct element *element,
 size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
 			      const struct element *mbssid_elem,
 			      const struct element *sub_elem,
-			      u8 **merged_ie, size_t max_copy_len);
+			      u8 *merged_ie, size_t max_copy_len);
 
 /**
  * enum cfg80211_bss_frame_type - frame type that the BSS data came from
-- 
cgit v1.2.3


From 5ab92e7fe49ad74293b50fb9e6f25be5521e2f68 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanohar@codeaurora.org>
Date: Thu, 11 Apr 2019 13:47:24 -0700
Subject: cfg80211: add support to probe unexercised mesh link

Add support to allow mesh HWMP to measure link metrics on an unexercised
direct mesh path by sending some data frames to other mesh points which
are not currently selected as a primary traffic path but are only 1 hop away.
The absence of the primary path to the chosen node makes it necessary to
apply some form of marking on a chosen packet stream so that the packets
can be properly steered to the selected node for testing, and not by the
regular mesh path lookup.

Tested-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Rajkumar Manoharan <rmanohar@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  5 +++++
 include/uapi/linux/nl80211.h | 17 +++++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 944de1802210..298301525f9f 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3501,6 +3501,9 @@ struct cfg80211_update_owe_info {
  * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME
  *	but offloading OWE processing to the user space will get the updated
  *	DH IE through this interface.
+ *
+ * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame
+ *	and overrule HWMP path selection algorithm.
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3817,6 +3820,8 @@ struct cfg80211_ops {
 			      struct cfg80211_pmsr_request *request);
 	int	(*update_owe_info)(struct wiphy *wiphy, struct net_device *dev,
 				   struct cfg80211_update_owe_info *owe_info);
+	int	(*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev,
+				   const u8 *buf, size_t len);
 };
 
 /*
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 25f70dd2b583..6f09d1500960 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1070,6 +1070,21 @@
  *	OWE AKM by the host drivers that implement SME but rely
  *	on the user space for the cryptographic/DH IE processing in AP mode.
  *
+ * @NL80211_CMD_PROBE_MESH_LINK: The requirement for mesh link metric
+ *	refreshing, is that from one mesh point we be able to send some data
+ *	frames to other mesh points which are not currently selected as a
+ *	primary traffic path, but which are only 1 hop away. The absence of
+ *	the primary path to the chosen node makes it necessary to apply some
+ *	form of marking on a chosen packet stream so that the packets can be
+ *	properly steered to the selected node for testing, and not by the
+ *	regular mesh path lookup. Further, the packets must be of type data
+ *	so that the rate control (often embedded in firmware) is used for
+ *	rate selection.
+ *
+ *	Here attribute %NL80211_ATTR_MAC is used to specify connected mesh
+ *	peer MAC address and %NL80211_ATTR_FRAME is used to specify the frame
+ *	content. The frame is ethernet data.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1292,6 +1307,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_UPDATE_OWE_INFO,
 
+	NL80211_CMD_PROBE_MESH_LINK,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
-- 
cgit v1.2.3


From 8828f81ad4a2f4e89ebe6e7793c06ed767c31d53 Mon Sep 17 00:00:00 2001
From: Rajkumar Manoharan <rmanohar@codeaurora.org>
Date: Thu, 11 Apr 2019 13:47:26 -0700
Subject: mac80211: probe unexercised mesh links

The requirement for mesh link metric refreshing is that from one
mesh point we are able to send some data frames to other mesh points
which are not currently selected as a primary traffic path, but which
are only 1 hop away. The absence of the primary path to the chosen node
makes it necessary to apply some form of marking on a chosen packet
stream so that the packets can be properly steered to the selected node
for testing, and not by the regular mesh path lookup.

Tested-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Signed-off-by: Rajkumar Manoharan <rmanohar@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index d66fbfe8d55d..76a443f32fc8 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -807,6 +807,7 @@ enum mac80211_tx_info_flags {
  * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information
  * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame
  * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path
+ * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup
  *
  * These flags are used in tx_info->control.flags.
  */
@@ -816,6 +817,7 @@ enum mac80211_tx_control_flags {
 	IEEE80211_TX_CTRL_RATE_INJECT		= BIT(2),
 	IEEE80211_TX_CTRL_AMSDU			= BIT(3),
 	IEEE80211_TX_CTRL_FAST_XMIT		= BIT(4),
+	IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP	= BIT(5),
 };
 
 /*
-- 
cgit v1.2.3


From 9df1c28bb75217b244257152ab7d788bb2a386d0 Mon Sep 17 00:00:00 2001
From: Matt Mullins <mmullins@fb.com>
Date: Fri, 26 Apr 2019 11:49:47 -0700
Subject: bpf: add writable context for raw tracepoints

This is an opt-in interface that allows a tracepoint to provide a safe
buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program.
The size of the buffer must be a compile-time constant, and is checked
before allowing a BPF program to attach to a tracepoint that uses this
feature.

The pointer to this buffer will be the first argument of tracepoints
that opt in; the pointer is valid and can be bpf_probe_read() by both
BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE
programs that attach to such a tracepoint, but the buffer to which it
points may only be written by the latter.

Signed-off-by: Matt Mullins <mmullins@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h             |  2 ++
 include/linux/bpf_types.h       |  1 +
 include/linux/tracepoint-defs.h |  1 +
 include/trace/bpf_probe.h       | 27 +++++++++++++++++++++++++--
 include/uapi/linux/bpf.h        |  1 +
 5 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f15432d90728..cd6341eabd74 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -272,6 +272,7 @@ enum bpf_reg_type {
 	PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
 	PTR_TO_TCP_SOCK,	 /* reg points to struct tcp_sock */
 	PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
+	PTR_TO_TP_BUFFER,	 /* reg points to a writable raw tp's buffer */
 };
 
 /* The information passed from prog-specific *_is_valid_access
@@ -361,6 +362,7 @@ struct bpf_prog_aux {
 	u32 used_map_cnt;
 	u32 max_ctx_offset;
 	u32 max_pkt_offset;
+	u32 max_tp_access;
 	u32 stack_depth;
 	u32 id;
 	u32 func_cnt; /* used by non-func prog as the number of func progs */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index d26991a16894..a10d37bce364 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 49ba9cde7e4b..b29950a19205 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -45,6 +45,7 @@ struct bpf_raw_event_map {
 	struct tracepoint	*tp;
 	void			*bpf_func;
 	u32			num_args;
+	u32			writable_size;
 } __aligned(32);
 
 #endif
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 505dae0bed80..d6e556c0a085 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto)					\
  * to make sure that if the tracepoint handling changes, the
  * bpf probe will fail to compile unless it too is updated.
  */
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, call, proto, args)			\
+#define __DEFINE_EVENT(template, call, proto, args, size)		\
 static inline void bpf_test_probe_##call(void)				\
 {									\
 	check_trace_callback_type_##call(__bpf_trace_##template);	\
@@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = {						\
 	.tp		= &__tracepoint_##call,				\
 	.bpf_func	= (void *)__bpf_trace_##template,		\
 	.num_args	= COUNT_ARGS(args),				\
+	.writable_size	= size,						\
 };
 
+#define FIRST(x, ...) x
+
+#undef DEFINE_EVENT_WRITABLE
+#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size)	\
+static inline void bpf_test_buffer_##call(void)				\
+{									\
+	/* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \
+	 * BUILD_BUG_ON_ZERO() uses a different mechanism that is not	\
+	 * dead-code-eliminated.					\
+	 */								\
+	FIRST(proto);							\
+	(void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args)));		\
+}									\
+__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size)
+
+#undef DEFINE_EVENT
+#define DEFINE_EVENT(template, call, proto, args)			\
+	__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0)
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef DEFINE_EVENT_WRITABLE
+#undef __DEFINE_EVENT
+#undef FIRST
+
 #endif /* CONFIG_BPF_EVENTS */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index eaf2d3284248..f7fa7a34a62d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -168,6 +168,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_REUSEPORT,
 	BPF_PROG_TYPE_FLOW_DISSECTOR,
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
+	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 };
 
 enum bpf_attach_type {
-- 
cgit v1.2.3


From ea106722c76f08002b69a6983ed84dc18958ba48 Mon Sep 17 00:00:00 2001
From: Matt Mullins <mmullins@fb.com>
Date: Fri, 26 Apr 2019 11:49:48 -0700
Subject: nbd: trace sending nbd requests

This adds a tracepoint that can both observe the nbd request being sent
to the server and modify that request, e.g., by setting a flag in
the request that will cause the server to collect detailed tracing data.

The struct request * being handled is included to permit correlation
with the block tracepoints.

Signed-off-by: Matt Mullins <mmullins@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/nbd.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 include/trace/events/nbd.h

(limited to 'include')

diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h
new file mode 100644
index 000000000000..5928255ed02e
--- /dev/null
+++ b/include/trace/events/nbd.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nbd
+
+#if !defined(_TRACE_NBD_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NBD_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(nbd_send_request,
+
+	TP_PROTO(struct nbd_request *nbd_request, int index,
+		 struct request *rq),
+
+	TP_ARGS(nbd_request, index, rq),
+
+	TP_STRUCT__entry(
+		__field(struct nbd_request *, nbd_request)
+		__field(u64, dev_index)
+		__field(struct request *, request)
+	),
+
+	TP_fast_assign(
+		__entry->nbd_request = 0;
+		__entry->dev_index = index;
+		__entry->request = rq;
+	),
+
+	TP_printk("nbd%lld: request %p", __entry->dev_index, __entry->request)
+);
+
+#ifdef DEFINE_EVENT_WRITABLE
+#undef NBD_DEFINE_EVENT
+#define NBD_DEFINE_EVENT(template, call, proto, args, size)		\
+	DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto),		\
+			      PARAMS(args), size)
+#else
+#undef NBD_DEFINE_EVENT
+#define NBD_DEFINE_EVENT(template, call, proto, args, size)		\
+	DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args))
+#endif
+
+NBD_DEFINE_EVENT(nbd_send_request, nbd_send_request,
+
+	TP_PROTO(struct nbd_request *nbd_request, int index,
+		 struct request *rq),
+
+	TP_ARGS(nbd_request, index, rq),
+
+	sizeof(struct nbd_request)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 2abd2de712cd891321a06b0890a85aef1e506cb5 Mon Sep 17 00:00:00 2001
From: Andrew Hall <hall@fb.com>
Date: Fri, 26 Apr 2019 11:49:49 -0700
Subject: nbd: add tracepoints for send/receive timing

This adds four tracepoints to nbd, enabling separate tracing of payload
and header sending/receipt.

In the send path for headers that have already been sent, we also
explicitly initialize the handle so it can be referenced by the later
tracepoint.

Signed-off-by: Andrew Hall <hall@fb.com>
Signed-off-by: Matt Mullins <mmullins@fb.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/nbd.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h
index 5928255ed02e..9849956f34d8 100644
--- a/include/trace/events/nbd.h
+++ b/include/trace/events/nbd.h
@@ -7,6 +7,57 @@
 
 #include <linux/tracepoint.h>
 
+DECLARE_EVENT_CLASS(nbd_transport_event,
+
+	TP_PROTO(struct request *req, u64 handle),
+
+	TP_ARGS(req, handle),
+
+	TP_STRUCT__entry(
+		__field(struct request *, req)
+		__field(u64, handle)
+	),
+
+	TP_fast_assign(
+		__entry->req = req;
+		__entry->handle = handle;
+	),
+
+	TP_printk(
+		"nbd transport event: request %p, handle 0x%016llx",
+		__entry->req,
+		__entry->handle
+	)
+);
+
+DEFINE_EVENT(nbd_transport_event, nbd_header_sent,
+
+	TP_PROTO(struct request *req, u64 handle),
+
+	TP_ARGS(req, handle)
+);
+
+DEFINE_EVENT(nbd_transport_event, nbd_payload_sent,
+
+	TP_PROTO(struct request *req, u64 handle),
+
+	TP_ARGS(req, handle)
+);
+
+DEFINE_EVENT(nbd_transport_event, nbd_header_received,
+
+	TP_PROTO(struct request *req, u64 handle),
+
+	TP_ARGS(req, handle)
+);
+
+DEFINE_EVENT(nbd_transport_event, nbd_payload_received,
+
+	TP_PROTO(struct request *req, u64 handle),
+
+	TP_ARGS(req, handle)
+);
+
 DECLARE_EVENT_CLASS(nbd_send_request,
 
 	TP_PROTO(struct nbd_request *nbd_request, int index,
-- 
cgit v1.2.3


From e950e843367d7990b9d7ea964e3c33876d477c4b Mon Sep 17 00:00:00 2001
From: Matt Mullins <mmullins@fb.com>
Date: Fri, 26 Apr 2019 11:49:51 -0700
Subject: selftests: bpf: test writable buffers in raw tps

This tests that:
  * a BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE cannot be attached if it
    uses either:
    * a variable offset to the tracepoint buffer, or
    * an offset beyond the size of the tracepoint buffer
  * a tracer can modify the buffer provided when attached to a writable
    tracepoint in bpf_prog_test_run

Signed-off-by: Matt Mullins <mmullins@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/bpf_test_run.h | 50 +++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 include/trace/events/bpf_test_run.h

(limited to 'include')

diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h
new file mode 100644
index 000000000000..265447e3f71a
--- /dev/null
+++ b/include/trace/events/bpf_test_run.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM bpf_test_run
+
+#if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BPF_TEST_RUN_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(bpf_test_finish,
+
+	TP_PROTO(int *err),
+
+	TP_ARGS(err),
+
+	TP_STRUCT__entry(
+		__field(int, err)
+	),
+
+	TP_fast_assign(
+		__entry->err = *err;
+	),
+
+	TP_printk("bpf_test_finish with err=%d", __entry->err)
+);
+
+#ifdef DEFINE_EVENT_WRITABLE
+#undef BPF_TEST_RUN_DEFINE_EVENT
+#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size)	\
+	DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto),		\
+			      PARAMS(args), size)
+#else
+#undef BPF_TEST_RUN_DEFINE_EVENT
+#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size)	\
+	DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args))
+#endif
+
+BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish,
+
+	TP_PROTO(int *err),
+
+	TP_ARGS(err),
+
+	sizeof(int)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 26 Apr 2019 16:39:39 -0700
Subject: bpf: Introduce bpf sk local storage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After allowing a bpf prog to
- directly read the skb->sk ptr
- get the fullsock bpf_sock by "bpf_sk_fullsock()"
- get the bpf_tcp_sock by "bpf_tcp_sock()"
- get the listener sock by "bpf_get_listener_sock()"
- avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock"
  into different bpf running context.

this patch is another effort to make bpf's network programming
more intuitive to do (together with memory and performance benefit).

When bpf prog needs to store data for a sk, the current practice is to
define a map with the usual 4-tuples (src/dst ip/port) as the key.
If multiple bpf progs require to store different sk data, multiple maps
have to be defined.  Hence, wasting memory to store the duplicated
keys (i.e. 4 tuples here) in each of the bpf map.
[ The smallest key could be the sk pointer itself which requires
  some enhancement in the verifier and it is a separate topic. ]

Also, the bpf prog needs to clean up the elem when sk is freed.
Otherwise, the bpf map will become full and un-usable quickly.
The sk-free tracking currently could be done during sk state
transition (e.g. BPF_SOCK_OPS_STATE_CB).

The size of the map needs to be predefined, which usually ends up
with an over-provisioned map in production.  Even if the map were
re-sizable, while sks naturally come and go anyway, this potential re-size
operation is arguably redundant if the data can be directly connected
to the sk itself instead of proxy-ing through a bpf map.

This patch introduces sk->sk_bpf_storage to provide local storage space
at sk for bpf prog to use.  The space will be allocated when the first bpf
prog has created data for this particular sk.

The design optimizes the bpf prog's lookup (and then optionally followed by
an inline update).  bpf_spin_lock should be used if the inline update needs
to be protected.

BPF_MAP_TYPE_SK_STORAGE:
-----------------------
To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in
this patch) needs to be created.  Multiple BPF_MAP_TYPE_SK_STORAGE maps can
be created to fit different bpf progs' needs.  The map enforces
BTF to allow printing the sk-local-storage during a system-wise
sk dump (e.g. "ss -ta") in the future.

The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not to lookup/update/delete
"sk-local-storage" data from a particular sk.
Think of the map as a meta-data (or "type") of a "sk-local-storage".  This
particular "type" of "sk-local-storage" data can then be stored in any sk.

The main purposes of this map are mostly:
1. Define the size of a "sk-local-storage" type.
2. Provide a similar syscall userspace API as the map (e.g. lookup/update,
   map-id, map-btf...etc.)
3. Keep track of all sk's storages of this "type" and clean them up
   when the map is freed.

sk->sk_bpf_storage:
------------------
The main lookup/update/delete is done on sk->sk_bpf_storage (which
is a "struct bpf_sk_storage").  When doing a lookup,
the "map" pointer is now used as the "key" to search on the
sk_storage->list.  The "map" pointer is actually serving
as the "type" of the "sk-local-storage" that is being
requested.

To allow very fast lookup, it should be as fast as looking up an
array at a stable-offset.  At the same time, it is not ideal to
set a hard limit on the number of sk-local-storage "type" that the
system can have.  Hence, this patch takes a cache approach.
The last search result from sk_storage->list is cached in
sk_storage->cache[] which is a stable sized array.  Each
"sk-local-storage" type has a stable offset to the cache[] array.
In the future, a map's flag could be introduced to do cache
opt-out/enforcement if it became necessary.

The cache size is 16 (i.e. 16 types of "sk-local-storage").
Programs can share maps.  On the program side, having a few bpf_progs
running in the networking hotpath is already a lot.  The bpf_prog
should have already consolidated the existing sock-key-ed map usage
to minimize the map lookup penalty.  16 has enough runway to grow.

All sk-local-storage data will be removed from sk->sk_bpf_storage
during sk destruction.

bpf_sk_storage_get() and bpf_sk_storage_delete():
------------------------------------------------
Instead of using bpf_map_(lookup|update|delete)_elem(),
the bpf prog needs to use the new helper bpf_sk_storage_get() and
bpf_sk_storage_delete().  The verifier can then enforce the
ARG_PTR_TO_SOCKET argument.  The bpf_sk_storage_get() also allows to
"create" new elem if one does not exist in the sk.  It is done by
the new BPF_SK_STORAGE_GET_F_CREATE flag.  An optional value can also be
provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE.
The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock.  Together,
it has eliminated the potential use cases for an equivalent
bpf_map_update_elem() API (for bpf_prog) in this patch.

Misc notes:
----------
1. map_get_next_key is not supported.  From the userspace syscall
   perspective,  the map has the socket fd as the key while the map
   can be shared by pinned-file or map-id.

   Since btf is enforced, the existing "ss" could be enhanced to pretty
   print the local-storage.

   Supporting a kernel defined btf with 4 tuples as the return key could
   be explored later also.

2. The sk->sk_lock cannot be acquired.  Atomic operations are used instead.
   e.g. cmpxchg is done on the sk->sk_bpf_storage ptr.
   Please refer to the source code comments for the details in
   synchronization cases and considerations.

3. The mem is charged to the sk->sk_omem_alloc as the sk filter does.

Benchmark:
---------
Here is the benchmark data collected by turning on
the "kernel.bpf_stats_enabled" sysctl.
Two bpf progs are tested:

One bpf prog with the usual bpf hashmap (max_entries = 8192) with the
sk ptr as the key.  (The verifier is modified to support the sk ptr as the
key.  That should have shortened the key lookup time.)

Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE.

Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for
each egress skb and then bump the cnt.  netperf is used to drive
data with 4096 connected UDP sockets.

BPF_MAP_TYPE_HASH with a modified verifier (152ns per bpf run)
27: cgroup_skb  name egress_sk_map  tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633
    loaded_at 2019-04-15T13:46:39-0700  uid 0
    xlated 344B  jited 258B  memlock 4096B  map_ids 16
    btf_id 5

BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run)
30: cgroup_skb  name egress_sk_stora  tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739
    loaded_at 2019-04-15T13:47:54-0700  uid 0
    xlated 168B  jited 156B  memlock 4096B  map_ids 17
    btf_id 6

Here is a high-level picture of how the objects are organized:

       sk
    ┌──────┐
    │      │
    │      │
    │      │
    │*sk_bpf_storage─────▶ bpf_sk_storage
    └──────┘                 ┌───────┐
                 ┌───────────┤ list  │
                 │           │       │
                 │           │       │
                 │           │       │
                 │           └───────┘
                 │
                 │     elem
                 │  ┌────────┐
                 ├─▶│ snode  │
                 │  ├────────┤
                 │  │  data  │          bpf_map
                 │  ├────────┤        ┌─────────┐
                 │  │map_node│◀─┬─────┤  list   │
                 │  └────────┘  │     │         │
                 │              │     │         │
                 │     elem     │     │         │
                 │  ┌────────┐  │     └─────────┘
                 └─▶│ snode  │  │
                    ├────────┤  │
   bpf_map          │  data  │  │
 ┌─────────┐        ├────────┤  │
 │  list   ├───────▶│map_node│  │
 │         │        └────────┘  │
 │         │                    │
 │         │           elem     │
 └─────────┘        ┌────────┐  │
                 ┌─▶│ snode  │  │
                 │  ├────────┤  │
                 │  │  data  │  │
                 │  ├────────┤  │
                 │  │map_node│◀─┘
                 │  └────────┘
                 │
                 │
                 │          ┌───────┐
     sk          └──────────│ list  │
  ┌──────┐                  │       │
  │      │                  │       │
  │      │                  │       │
  │      │                  └───────┘
  │*sk_bpf_storage───────▶bpf_sk_storage
  └──────┘

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  2 ++
 include/linux/bpf_types.h    |  1 +
 include/net/bpf_sk_storage.h | 13 +++++++++++++
 include/net/sock.h           |  5 +++++
 include/uapi/linux/bpf.h     | 44 +++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 64 insertions(+), 1 deletion(-)
 create mode 100644 include/net/bpf_sk_storage.h

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cd6341eabd74..9a21848fdb07 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -184,6 +184,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_MAP_KEY,	/* pointer to stack used as map key */
 	ARG_PTR_TO_MAP_VALUE,	/* pointer to stack used as map value */
 	ARG_PTR_TO_UNINIT_MAP_VALUE,	/* pointer to valid memory used to store a map value */
+	ARG_PTR_TO_MAP_VALUE_OR_NULL,	/* pointer to stack used as map value or NULL */
 
 	/* the following constraints used to prototype bpf_memcmp() and other
 	 * functions that access data on eBPF program stack
@@ -204,6 +205,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_SOCK_COMMON,	/* pointer to sock_common */
 	ARG_PTR_TO_INT,		/* pointer to int */
 	ARG_PTR_TO_LONG,	/* pointer to long */
+	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock (fullsock) */
 };
 
 /* type of values returned from helper functions */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a10d37bce364..5a9975678d6f 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -61,6 +61,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops)
 #if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
new file mode 100644
index 000000000000..b9dcb02e756b
--- /dev/null
+++ b/include/net/bpf_sk_storage.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2019 Facebook */
+#ifndef _BPF_SK_STORAGE_H
+#define _BPF_SK_STORAGE_H
+
+struct sock;
+
+void bpf_sk_storage_free(struct sock *sk);
+
+extern const struct bpf_func_proto bpf_sk_storage_get_proto;
+extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
+
+#endif /* _BPF_SK_STORAGE_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 784cd19d5ff7..4d208c0f9c14 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -236,6 +236,8 @@ struct sock_common {
 	/* public: */
 };
 
+struct bpf_sk_storage;
+
 /**
   *	struct sock - network layer representation of sockets
   *	@__sk_common: shared layout with inet_timewait_sock
@@ -510,6 +512,9 @@ struct sock {
 #endif
 	void                    (*sk_destruct)(struct sock *sk);
 	struct sock_reuseport __rcu	*sk_reuseport_cb;
+#ifdef CONFIG_BPF_SYSCALL
+	struct bpf_sk_storage __rcu	*sk_bpf_storage;
+#endif
 	struct rcu_head		sk_rcu;
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f7fa7a34a62d..72336bac7573 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -133,6 +133,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
 	BPF_MAP_TYPE_QUEUE,
 	BPF_MAP_TYPE_STACK,
+	BPF_MAP_TYPE_SK_STORAGE,
 };
 
 /* Note that tracing related programs such as
@@ -2630,6 +2631,42 @@ union bpf_attr {
  *		was provided.
  *
  *		**-ERANGE** if resulting value was out of range.
+ *
+ * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags)
+ *	Description
+ *		Get a bpf-local-storage from a sk.
+ *
+ *		Logically, it could be thought of getting the value from
+ *		a *map* with *sk* as the **key**.  From this
+ *		perspective,  the usage is not much different from
+ *		**bpf_map_lookup_elem(map, &sk)** except this
+ *		helper enforces the key must be a **bpf_fullsock()**
+ *		and the map must be a BPF_MAP_TYPE_SK_STORAGE also.
+ *
+ *		Underneath, the value is stored locally at *sk* instead of
+ *		the map.  The *map* is used as the bpf-local-storage **type**.
+ *		The bpf-local-storage **type** (i.e. the *map*) is searched
+ *		against all bpf-local-storages residing at sk.
+ *
+ *		An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be
+ *		used such that a new bpf-local-storage will be
+ *		created if one does not exist.  *value* can be used
+ *		together with BPF_SK_STORAGE_GET_F_CREATE to specify
+ *		the initial value of a bpf-local-storage.  If *value* is
+ *		NULL, the new bpf-local-storage will be zero initialized.
+ *	Return
+ *		A bpf-local-storage pointer is returned on success.
+ *
+ *		**NULL** if not found or there was an error in adding
+ *		a new bpf-local-storage.
+ *
+ * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk)
+ *	Description
+ *		Delete a bpf-local-storage from a sk.
+ *	Return
+ *		0 on success.
+ *
+ *		**-ENOENT** if the bpf-local-storage cannot be found.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2738,7 +2775,9 @@ union bpf_attr {
 	FN(sysctl_get_new_value),	\
 	FN(sysctl_set_new_value),	\
 	FN(strtol),			\
-	FN(strtoul),
+	FN(strtoul),			\
+	FN(sk_storage_get),		\
+	FN(sk_storage_delete),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2814,6 +2853,9 @@ enum bpf_func_id {
 /* BPF_FUNC_sysctl_get_name flags. */
 #define BPF_F_SYSCTL_BASE_NAME		(1ULL << 0)
 
+/* BPF_FUNC_sk_storage_get flags */
+#define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
-- 
cgit v1.2.3


From 9e9957973c7785b1f8fa77f099cac661cc5e7e5b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 25 Apr 2019 12:32:02 -0700
Subject: net/tls: remove old exports of sk_destruct functions

tls_device_sk_destruct being set on a socket used to indicate
that socket is a kTLS device one.  That is no longer true -
now we use sk_validate_xmit_skb pointer for that purpose.
Remove the export.  tls_device_attach() needs to be moved.

While at it, remove the dead declaration of tls_sk_destruct().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index d9d0ac66f040..20196cb31ecc 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -317,7 +317,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
 int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tls_device_sendpage(struct sock *sk, struct page *page,
 			int offset, size_t size, int flags);
-void tls_device_sk_destruct(struct sock *sk);
 void tls_device_free_resources_tx(struct sock *sk);
 void tls_device_init(void);
 void tls_device_cleanup(void);
@@ -336,7 +335,6 @@ static inline u32 tls_record_start_seq(struct tls_record_info *rec)
 	return rec->end_seq - rec->len;
 }
 
-void tls_sk_destruct(struct sock *sk, struct tls_context *ctx);
 int tls_push_sg(struct sock *sk, struct tls_context *ctx,
 		struct scatterlist *sg, u16 first_offset,
 		int flags);
-- 
cgit v1.2.3


From da68b4ad02343862fee1e3e8c6315984f16a4778 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 25 Apr 2019 12:32:03 -0700
Subject: net/tls: move definition of tls ops into net/tls.h

There seems to be no reason for tls_ops to be defined in netdevice.h
which is included in a lot of places.  Don't wrap the struct/enum
declaration in ifdefs, it trickles down unnecessary ifdefs into
driver code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 23 +----------------------
 include/net/tls.h         | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c46d218a0456..44b47e9df94a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -914,34 +914,13 @@ struct xfrmdev_ops {
 };
 #endif
 
-#if IS_ENABLED(CONFIG_TLS_DEVICE)
-enum tls_offload_ctx_dir {
-	TLS_OFFLOAD_CTX_DIR_RX,
-	TLS_OFFLOAD_CTX_DIR_TX,
-};
-
-struct tls_crypto_info;
-struct tls_context;
-
-struct tlsdev_ops {
-	int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
-			   enum tls_offload_ctx_dir direction,
-			   struct tls_crypto_info *crypto_info,
-			   u32 start_offload_tcp_sn);
-	void (*tls_dev_del)(struct net_device *netdev,
-			    struct tls_context *ctx,
-			    enum tls_offload_ctx_dir direction);
-	void (*tls_dev_resync_rx)(struct net_device *netdev,
-				  struct sock *sk, u32 seq, u64 rcd_sn);
-};
-#endif
-
 struct dev_ifalias {
 	struct rcu_head rcuhead;
 	char ifalias[];
 };
 
 struct devlink;
+struct tlsdev_ops;
 
 /*
  * This structure defines the management hooks for network devices.
diff --git a/include/net/tls.h b/include/net/tls.h
index 20196cb31ecc..41a2ee643fc5 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -277,6 +277,23 @@ struct tls_context {
 	void (*unhash)(struct sock *sk);
 };
 
+enum tls_offload_ctx_dir {
+	TLS_OFFLOAD_CTX_DIR_RX,
+	TLS_OFFLOAD_CTX_DIR_TX,
+};
+
+struct tlsdev_ops {
+	int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
+			   enum tls_offload_ctx_dir direction,
+			   struct tls_crypto_info *crypto_info,
+			   u32 start_offload_tcp_sn);
+	void (*tls_dev_del)(struct net_device *netdev,
+			    struct tls_context *ctx,
+			    enum tls_offload_ctx_dir direction);
+	void (*tls_dev_resync_rx)(struct net_device *netdev,
+				  struct sock *sk, u32 seq, u64 rcd_sn);
+};
+
 struct tls_offload_context_rx {
 	/* sw must be the first member of tls_offload_context_rx */
 	struct tls_sw_context_rx sw;
-- 
cgit v1.2.3


From 63a1c95f3fe48b4e9fe0c261b376e5e527b71b25 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 25 Apr 2019 12:32:04 -0700
Subject: net/tls: byte swap device req TCP seq no upon setting

To avoid a sparse warning byteswap the be32 sequence number
before it's stored in the atomic value.  While at it drop
unnecessary brackets and use kernel's u64 type.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tls.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tls.h b/include/net/tls.h
index 41a2ee643fc5..39ea62f0c1f6 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -562,7 +562,7 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
 
-	atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1));
+	atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1);
 }
 
 
-- 
cgit v1.2.3


From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Fri, 26 Apr 2019 11:13:06 +0200
Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag

Even though the NLA_F_NESTED flag was introduced more than 11 years ago, most
netlink based interfaces (including recently added ones) are still not
setting it in kernel generated messages. Without the flag, message parsers
not aware of attribute semantics (e.g. wireshark dissector or libmnl's
mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display
the structure of their contents.

Unfortunately we cannot just add the flag everywhere as there may be
userspace applications which check nlattr::nla_type directly rather than
through a helper masking out the flags. Therefore the patch renames
nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start()
as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually
are rewritten to use nla_nest_start().

Except for changes in include/net/netlink.h, the patch was generated using
this semantic patch:

@@ expression E1, E2; @@
-nla_nest_start(E1, E2)
+nla_nest_start_noflag(E1, E2)

@@ expression E1, E2; @@
-nla_nest_start_noflag(E1, E2 | NLA_F_NESTED)
+nla_nest_start(E1, E2)

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/ipset/ip_set.h |  2 +-
 include/net/netlink.h                  | 26 +++++++++++++++++++++++---
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index f2e1e6b13ca4..965dc6c6653e 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -401,7 +401,7 @@ ip_set_get_h16(const struct nlattr *attr)
 	return ntohs(nla_get_be16(attr));
 }
 
-#define ipset_nest_start(skb, attr) nla_nest_start(skb, attr | NLA_F_NESTED)
+#define ipset_nest_start(skb, attr) nla_nest_start(skb, attr)
 #define ipset_nest_end(skb, start)  nla_nest_end(skb, start)
 
 static inline int nla_put_ipaddr4(struct sk_buff *skb, int type, __be32 ipaddr)
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 23f27b0b3cef..1f18b47f41b4 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1415,13 +1415,18 @@ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
 }
 
 /**
- * nla_nest_start - Start a new level of nested attributes
+ * nla_nest_start_noflag - Start a new level of nested attributes
  * @skb: socket buffer to add attributes to
  * @attrtype: attribute type of container
  *
- * Returns the container attribute
+ * This function exists for backward compatibility to use in APIs which never
+ * marked their nest attributes with NLA_F_NESTED flag. New APIs should use
+ * nla_nest_start() which sets the flag.
+ *
+ * Returns the container attribute or NULL on error
  */
-static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
+static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
+						   int attrtype)
 {
 	struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb);
 
@@ -1431,6 +1436,21 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
 	return start;
 }
 
+/**
+ * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
+ * @skb: socket buffer to add attributes to
+ * @attrtype: attribute type of container
+ *
+ * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED
+ * flag. This is the preferred function to use in new code.
+ *
+ * Returns the container attribute or NULL on error
+ */
+static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
+{
+	return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED);
+}
+
 /**
  * nla_nest_end - Finalize nesting of attributes
  * @skb: socket buffer the attributes are stored in
-- 
cgit v1.2.3


From 12ad5f65f030ae7b8a2425f6f79137c4217e30d4 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Fri, 26 Apr 2019 11:13:09 +0200
Subject: ipset: drop ipset_nest_start() and ipset_nest_end()

After the previous commit, both ipset_nest_start() and ipset_nest_end() are
just aliases for nla_nest_start() and nla_nest_end() so that there is no
need to keep them.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/ipset/ip_set.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 965dc6c6653e..e499d170f12d 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -401,33 +401,30 @@ ip_set_get_h16(const struct nlattr *attr)
 	return ntohs(nla_get_be16(attr));
 }
 
-#define ipset_nest_start(skb, attr) nla_nest_start(skb, attr)
-#define ipset_nest_end(skb, start)  nla_nest_end(skb, start)
-
 static inline int nla_put_ipaddr4(struct sk_buff *skb, int type, __be32 ipaddr)
 {
-	struct nlattr *__nested = ipset_nest_start(skb, type);
+	struct nlattr *__nested = nla_nest_start(skb, type);
 	int ret;
 
 	if (!__nested)
 		return -EMSGSIZE;
 	ret = nla_put_in_addr(skb, IPSET_ATTR_IPADDR_IPV4, ipaddr);
 	if (!ret)
-		ipset_nest_end(skb, __nested);
+		nla_nest_end(skb, __nested);
 	return ret;
 }
 
 static inline int nla_put_ipaddr6(struct sk_buff *skb, int type,
 				  const struct in6_addr *ipaddrptr)
 {
-	struct nlattr *__nested = ipset_nest_start(skb, type);
+	struct nlattr *__nested = nla_nest_start(skb, type);
 	int ret;
 
 	if (!__nested)
 		return -EMSGSIZE;
 	ret = nla_put_in6_addr(skb, IPSET_ATTR_IPADDR_IPV6, ipaddrptr);
 	if (!ret)
-		ipset_nest_end(skb, __nested);
+		nla_nest_end(skb, __nested);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6f455f5f4e9c28aefaefbe18ce7304b499645d75 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:27 +0200
Subject: netlink: add NLA_MIN_LEN

Rather than using NLA_UNSPEC for this type of thing, use NLA_MIN_LEN
so we can make NLA_UNSPEC be NLA_REJECT under certain conditions for
future attributes.

While at it, also use NLA_EXACT_LEN for the struct example.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 1f18b47f41b4..c77ed51c18f1 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -183,6 +183,7 @@ enum {
 	NLA_REJECT,
 	NLA_EXACT_LEN,
 	NLA_EXACT_LEN_WARN,
+	NLA_MIN_LEN,
 	__NLA_TYPE_MAX,
 };
 
@@ -212,6 +213,7 @@ enum nla_policy_validation {
  *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
  *    NLA_FLAG             Unused
  *    NLA_BINARY           Maximum length of attribute payload
+ *    NLA_MIN_LEN          Minimum length of attribute payload
  *    NLA_NESTED,
  *    NLA_NESTED_ARRAY     Length verification is done by checking len of
  *                         nested header (or empty); len field is used if
@@ -230,6 +232,7 @@ enum nla_policy_validation {
  *                         it is rejected.
  *    NLA_EXACT_LEN_WARN   Attribute should have exactly this length, a warning
  *                         is logged if it is longer, shorter is rejected.
+ *    NLA_MIN_LEN          Minimum length of attribute payload
  *    All other            Minimum length of attribute payload
  *
  * Meaning of `validation_data' field:
@@ -281,7 +284,7 @@ enum nla_policy_validation {
  * static const struct nla_policy my_policy[ATTR_MAX+1] = {
  * 	[ATTR_FOO] = { .type = NLA_U16 },
  *	[ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
- *	[ATTR_BAZ] = { .len = sizeof(struct mystruct) },
+ *	[ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) },
  *	[ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags },
  * };
  */
@@ -302,6 +305,7 @@ struct nla_policy {
 #define NLA_POLICY_EXACT_LEN(_len)	{ .type = NLA_EXACT_LEN, .len = _len }
 #define NLA_POLICY_EXACT_LEN_WARN(_len)	{ .type = NLA_EXACT_LEN_WARN, \
 					  .len = _len }
+#define NLA_POLICY_MIN_LEN(_len)	{ .type = NLA_MIN_LEN, .len = _len }
 
 #define NLA_POLICY_ETH_ADDR		NLA_POLICY_EXACT_LEN(ETH_ALEN)
 #define NLA_POLICY_ETH_ADDR_COMPAT	NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)
-- 
cgit v1.2.3


From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:28 +0200
Subject: netlink: make validation more configurable for future strictness

We currently have two levels of strict validation:

 1) liberal (default)
     - undefined (type >= max) & NLA_UNSPEC attributes accepted
     - attribute length >= expected accepted
     - garbage at end of message accepted
 2) strict (opt-in)
     - NLA_UNSPEC attributes accepted
     - attribute length >= expected accepted

Split out parsing strictness into four different options:
 * TRAILING     - check that there's no trailing data after parsing
                  attributes (in message or nested)
 * MAXTYPE      - reject attrs > max known type
 * UNSPEC       - reject attributes with NLA_UNSPEC policy entries
 * STRICT_ATTRS - strictly validate attribute size

The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().

Additionally, it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then, going
forward, prevent forgetting attribute entries. The same can apply
to the POLICY flag.

We end up with the following renames:
 * nla_parse           -> nla_parse_deprecated
 * nla_parse_strict    -> nla_parse_deprecated_strict
 * nlmsg_parse         -> nlmsg_parse_deprecated
 * nlmsg_parse_strict  -> nlmsg_parse_deprecated_strict
 * nla_parse_nested    -> nla_parse_nested_deprecated
 * nla_validate_nested -> nla_validate_nested_deprecated

Using spatch, of course:
    @@
    expression TB, MAX, HEAD, LEN, POL, EXT;
    @@
    -nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
    +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)

    @@
    expression NLH, HDRLEN, TB, MAX, POL, EXT;
    @@
    -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
    +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)

    @@
    expression NLH, HDRLEN, TB, MAX, POL, EXT;
    @@
    -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
    +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)

    @@
    expression TB, MAX, NLA, POL, EXT;
    @@
    -nla_parse_nested(TB, MAX, NLA, POL, EXT)
    +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)

    @@
    expression START, MAX, POL, EXT;
    @@
    -nla_validate_nested(START, MAX, POL, EXT)
    +nla_validate_nested_deprecated(START, MAX, POL, EXT)

    @@
    expression NLH, HDRLEN, MAX, POL, EXT;
    @@
    -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
    +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)

For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.

Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.

Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.

In effect then, this adds fully strict validation for any new command.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h |  16 ++--
 include/net/netlink.h   | 238 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 199 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 6850c7b1a3a6..897cdba13569 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -165,7 +165,7 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
 }
 
 /**
- * genlmsg_parse - parse attributes of a genetlink message
+ * genlmsg_parse_deprecated - parse attributes of a genetlink message
  * @nlh: netlink message header
  * @family: genetlink message family
  * @tb: destination array with maxtype+1 elements
@@ -173,14 +173,14 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr)
  * @policy: validation policy
  * @extack: extended ACK report struct
  */
-static inline int genlmsg_parse(const struct nlmsghdr *nlh,
-				const struct genl_family *family,
-				struct nlattr *tb[], int maxtype,
-				const struct nla_policy *policy,
-				struct netlink_ext_ack *extack)
+static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
+					   const struct genl_family *family,
+					   struct nlattr *tb[], int maxtype,
+					   const struct nla_policy *policy,
+					   struct netlink_ext_ack *extack)
 {
-	return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
-			   policy, extack);
+	return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
+			     policy, NL_VALIDATE_LIBERAL, extack);
 }
 
 /**
diff --git a/include/net/netlink.h b/include/net/netlink.h
index c77ed51c18f1..ab26a5e3558b 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -369,21 +369,48 @@ struct nl_info {
 	bool			skip_notify;
 };
 
+/**
+ * enum netlink_validation - netlink message/attribute validation levels
+ * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
+ *	extra data at the end of the message, attributes being longer than
+ *	they should be, or unknown attributes being present.
+ * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
+ * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
+ *	this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
+ * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
+ *	This can safely be set by the kernel when the given policy has no
+ *	NLA_UNSPEC anymore, and can thus be used to ensure policy entries
+ *	are enforced going forward.
+ * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
+ *	U8, U16, U32 must have exact size, etc.)
+ */
+enum netlink_validation {
+	NL_VALIDATE_LIBERAL = 0,
+	NL_VALIDATE_TRAILING = BIT(0),
+	NL_VALIDATE_MAXTYPE = BIT(1),
+	NL_VALIDATE_UNSPEC = BIT(2),
+	NL_VALIDATE_STRICT_ATTRS = BIT(3),
+};
+
+#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
+				       NL_VALIDATE_MAXTYPE)
+#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
+			    NL_VALIDATE_MAXTYPE |\
+			    NL_VALIDATE_UNSPEC |\
+			    NL_VALIDATE_STRICT_ATTRS)
+
 int netlink_rcv_skb(struct sk_buff *skb,
 		    int (*cb)(struct sk_buff *, struct nlmsghdr *,
 			      struct netlink_ext_ack *));
 int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
 		 unsigned int group, int report, gfp_t flags);
 
-int nla_validate(const struct nlattr *head, int len, int maxtype,
-		 const struct nla_policy *policy,
-		 struct netlink_ext_ack *extack);
-int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
-	      int len, const struct nla_policy *policy,
-	      struct netlink_ext_ack *extack);
-int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head,
-		     int len, const struct nla_policy *policy,
-		     struct netlink_ext_ack *extack);
+int __nla_validate(const struct nlattr *head, int len, int maxtype,
+		   const struct nla_policy *policy, unsigned int validate,
+		   struct netlink_ext_ack *extack);
+int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
+		int len, const struct nla_policy *policy, unsigned int validate,
+		struct netlink_ext_ack *extack);
 int nla_policy_len(const struct nla_policy *, int);
 struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
 size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize);
@@ -512,42 +539,121 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
 }
 
 /**
- * nlmsg_parse - parse attributes of a netlink message
+ * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ * @extack: extended ACK pointer
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be ignored and attributes from the policy are not
+ * always strictly validated (only for new attributes).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
+				       const struct nlattr *head, int len,
+				       const struct nla_policy *policy,
+				       struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, policy,
+			   NL_VALIDATE_LIBERAL, extack);
+}
+
+/**
+ * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ * @extack: extended ACK pointer
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be rejected as well as trailing data, but the
+ * policy is not completely strictly validated (only for new attributes).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
+					      const struct nlattr *head,
+					      int len,
+					      const struct nla_policy *policy,
+					      struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, policy,
+			   NL_VALIDATE_DEPRECATED_STRICT, extack);
+}
+
+/**
+ * __nlmsg_parse - parse attributes of a netlink message
  * @nlh: netlink message header
  * @hdrlen: length of family specific header
  * @tb: destination array with maxtype+1 elements
  * @maxtype: maximum attribute type to be expected
  * @policy: validation policy
+ * @validate: validation strictness
  * @extack: extended ACK report struct
  *
  * See nla_parse()
  */
-static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
-			      struct nlattr *tb[], int maxtype,
-			      const struct nla_policy *policy,
-			      struct netlink_ext_ack *extack)
+static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
+				struct nlattr *tb[], int maxtype,
+				const struct nla_policy *policy,
+				unsigned int validate,
+				struct netlink_ext_ack *extack)
 {
 	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
 		NL_SET_ERR_MSG(extack, "Invalid header length");
 		return -EINVAL;
 	}
 
-	return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
-			 nlmsg_attrlen(nlh, hdrlen), policy, extack);
+	return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
+			   nlmsg_attrlen(nlh, hdrlen), policy, validate,
+			   extack);
 }
 
-static inline int nlmsg_parse_strict(const struct nlmsghdr *nlh, int hdrlen,
-				     struct nlattr *tb[], int maxtype,
-				     const struct nla_policy *policy,
-				     struct netlink_ext_ack *extack)
+/**
+ * nlmsg_parse_deprecated - parse attributes of a netlink message
+ * @nlh: netlink message header
+ * @hdrlen: length of family specific header
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse_deprecated()
+ */
+static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
+					 struct nlattr *tb[], int maxtype,
+					 const struct nla_policy *policy,
+					 struct netlink_ext_ack *extack)
 {
-	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
-		NL_SET_ERR_MSG(extack, "Invalid header length");
-		return -EINVAL;
-	}
+	return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
+			     NL_VALIDATE_LIBERAL, extack);
+}
 
-	return nla_parse_strict(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
-				nlmsg_attrlen(nlh, hdrlen), policy, extack);
+/**
+ * nlmsg_parse_deprecated_strict - parse attributes of a netlink message
+ * @nlh: netlink message header
+ * @hdrlen: length of family specific header
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse_deprecated_strict()
+ */
+static inline int
+nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
+			      struct nlattr *tb[], int maxtype,
+			      const struct nla_policy *policy,
+			      struct netlink_ext_ack *extack)
+{
+	return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy,
+			     NL_VALIDATE_DEPRECATED_STRICT, extack);
 }
 
 /**
@@ -566,26 +672,53 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
 }
 
 /**
- * nlmsg_validate - validate a netlink message including attributes
+ * nla_validate_deprecated - Validate a stream of attributes
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ * @validate: validation strictness
+ * @extack: extended ACK report struct
+ *
+ * Validates all attributes in the specified attribute stream against the
+ * specified policy. Validation is done in liberal mode.
+ * See documenation of struct nla_policy for more details.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_validate_deprecated(const struct nlattr *head, int len,
+					  int maxtype,
+					  const struct nla_policy *policy,
+					  struct netlink_ext_ack *extack)
+{
+	return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL,
+			      extack);
+}
+
+
+/**
+ * nlmsg_validate_deprecated - validate a netlink message including attributes
  * @nlh: netlinket message header
  * @hdrlen: length of familiy specific header
  * @maxtype: maximum attribute type to be expected
  * @policy: validation policy
  * @extack: extended ACK report struct
  */
-static inline int nlmsg_validate(const struct nlmsghdr *nlh,
-				 int hdrlen, int maxtype,
-				 const struct nla_policy *policy,
-				 struct netlink_ext_ack *extack)
+static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
+					    int hdrlen, int maxtype,
+					    const struct nla_policy *policy,
+					    struct netlink_ext_ack *extack)
 {
 	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
 		return -EINVAL;
 
-	return nla_validate(nlmsg_attrdata(nlh, hdrlen),
-			    nlmsg_attrlen(nlh, hdrlen), maxtype, policy,
-			    extack);
+	return __nla_validate(nlmsg_attrdata(nlh, hdrlen),
+			      nlmsg_attrlen(nlh, hdrlen), maxtype,
+			      policy, NL_VALIDATE_LIBERAL, extack);
 }
 
+
+
 /**
  * nlmsg_report - need to report back to application?
  * @nlh: netlink message header
@@ -899,22 +1032,22 @@ nla_find_nested(const struct nlattr *nla, int attrtype)
 }
 
 /**
- * nla_parse_nested - parse nested attributes
+ * nla_parse_nested_deprecated - parse nested attributes
  * @tb: destination array with maxtype+1 elements
  * @maxtype: maximum attribute type to be expected
  * @nla: attribute containing the nested attributes
  * @policy: validation policy
  * @extack: extended ACK report struct
  *
- * See nla_parse()
+ * See nla_parse_deprecated()
  */
-static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
-				   const struct nlattr *nla,
-				   const struct nla_policy *policy,
-				   struct netlink_ext_ack *extack)
+static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
+					      const struct nlattr *nla,
+					      const struct nla_policy *policy,
+					      struct netlink_ext_ack *extack)
 {
-	return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
-			 extack);
+	return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
+			   NL_VALIDATE_LIBERAL, extack);
 }
 
 /**
@@ -1489,6 +1622,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
  * @start: container attribute
  * @maxtype: maximum attribute type to be expected
  * @policy: validation policy
+ * @validate: validation strictness
  * @extack: extended ACK report struct
  *
  * Validates all attributes in the nested attribute stream against the
@@ -1497,12 +1631,22 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
  *
  * Returns 0 on success or a negative error code.
  */
-static inline int nla_validate_nested(const struct nlattr *start, int maxtype,
-				      const struct nla_policy *policy,
-				      struct netlink_ext_ack *extack)
+static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
+					const struct nla_policy *policy,
+					unsigned int validate,
+					struct netlink_ext_ack *extack)
+{
+	return __nla_validate(nla_data(start), nla_len(start), maxtype, policy,
+			      validate, extack);
+}
+
+static inline int
+nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
+			       const struct nla_policy *policy,
+			       struct netlink_ext_ack *extack)
 {
-	return nla_validate(nla_data(start), nla_len(start), maxtype, policy,
-			    extack);
+	return __nla_validate_nested(start, maxtype, policy,
+				     NL_VALIDATE_LIBERAL, extack);
 }
 
 /**
-- 
cgit v1.2.3


From 3de644035446567017e952f16da2594d6bd195fc Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:29 +0200
Subject: netlink: re-add parse/validate functions in strict mode

This re-adds the parse and validate functions like nla_parse()
that are now actually strict after the previous rename. They were
split out into this patch to make sure everything is converted (if
not, compilation of the previous patch would fail).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 19 +++++++++++
 include/net/netlink.h   | 87 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 897cdba13569..68de579cfe5e 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -183,6 +183,25 @@ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh,
 			     policy, NL_VALIDATE_LIBERAL, extack);
 }
 
+/**
+ * genlmsg_parse - parse attributes of a genetlink message
+ * @nlh: netlink message header
+ * @family: genetlink message family
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ * @extack: extended ACK report struct
+ */
+static inline int genlmsg_parse(const struct nlmsghdr *nlh,
+				const struct genl_family *family,
+				struct nlattr *tb[], int maxtype,
+				const struct nla_policy *policy,
+				struct netlink_ext_ack *extack)
+{
+	return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype,
+			     policy, NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * genl_dump_check_consistent - check if sequence is consistent and advertise if not
  * @cb: netlink callback structure that stores the sequence number
diff --git a/include/net/netlink.h b/include/net/netlink.h
index ab26a5e3558b..e4dd874412bf 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -538,6 +538,31 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
 	return (struct nlmsghdr *) ((unsigned char *) nlh + totlen);
 }
 
+/**
+ * nla_parse - Parse a stream of attributes into a tb buffer
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @policy: validation policy
+ * @extack: extended ACK pointer
+ *
+ * Parses a stream of attributes and stores a pointer to each attribute in
+ * the tb array accessible via the attribute type. Attributes with a type
+ * exceeding maxtype will be rejected, policy must be specified, attributes
+ * will be validated in the strictest way possible.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_parse(struct nlattr **tb, int maxtype,
+			    const struct nlattr *head, int len,
+			    const struct nla_policy *policy,
+			    struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, head, len, policy,
+			   NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
  * @tb: destination array with maxtype+1 elements
@@ -617,6 +642,27 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
 			   extack);
 }
 
+/**
+ * nlmsg_parse - parse attributes of a netlink message
+ * @nlh: netlink message header
+ * @hdrlen: length of family specific header
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @validate: validation strictness
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse()
+ */
+static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
+			      struct nlattr *tb[], int maxtype,
+			      const struct nla_policy *policy,
+			      struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
+			   nlmsg_attrlen(nlh, hdrlen), policy,
+			   NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * nlmsg_parse_deprecated - parse attributes of a netlink message
  * @nlh: netlink message header
@@ -695,6 +741,28 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len,
 			      extack);
 }
 
+/**
+ * nla_validate - Validate a stream of attributes
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @maxtype: maximum attribute type to be expected
+ * @policy: validation policy
+ * @validate: validation strictness
+ * @extack: extended ACK report struct
+ *
+ * Validates all attributes in the specified attribute stream against the
+ * specified policy. Validation is done in strict mode.
+ * See documenation of struct nla_policy for more details.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
+			       const struct nla_policy *policy,
+			       struct netlink_ext_ack *extack)
+{
+	return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT,
+			      extack);
+}
 
 /**
  * nlmsg_validate_deprecated - validate a netlink message including attributes
@@ -1031,6 +1099,25 @@ nla_find_nested(const struct nlattr *nla, int attrtype)
 	return nla_find(nla_data(nla), nla_len(nla), attrtype);
 }
 
+/**
+ * nla_parse_nested - parse nested attributes
+ * @tb: destination array with maxtype+1 elements
+ * @maxtype: maximum attribute type to be expected
+ * @nla: attribute containing the nested attributes
+ * @policy: validation policy
+ * @extack: extended ACK report struct
+ *
+ * See nla_parse()
+ */
+static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
+				   const struct nlattr *nla,
+				   const struct nla_policy *policy,
+				   struct netlink_ext_ack *extack)
+{
+	return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
+			   NL_VALIDATE_STRICT, extack);
+}
+
 /**
  * nla_parse_nested_deprecated - parse nested attributes
  * @tb: destination array with maxtype+1 elements
-- 
cgit v1.2.3


From 56738f460841761abc70347c919d5c45f6f05a42 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:30 +0200
Subject: netlink: add strict parsing for future attributes

Unfortunately, we cannot add strict parsing for all attributes, as
that would break existing userspace. We currently warn about it, but
that's about all we can do.

For new attributes, however, the story is better: nobody is using
them, so we can reject bad sizes.

Also, for new attributes, we need not accept them when the policy
doesn't declare their usage.

David Ahern and I went back and forth on how to best encode this, and
the best way we found was to have a "boundary type", from which point
on new attributes have all possible validation applied, and NLA_UNSPEC
is rejected.

As we didn't want to add another argument to all functions that get a
netlink policy, the workaround is to encode that boundary in the first
entry of the policy array (which is for type 0 and thus probably not
really valid anyway). I put it into the validation union for the rare
possibility that somebody is actually using attribute 0, which would
continue to work fine unless they tried to use the extended validation,
which isn't likely. We also didn't find any in-tree users with type 0.

The reason for setting the "start strict here" attribute is that we
never really need to start strict from 0, which is invalid anyway (or
in legacy families where that isn't true, it cannot be set to strict),
so we can thus reserve the value 0 for "don't do this check" and don't
have to add the tag to all policies right now.

Thus, policies can now opt in to this validation, which we should do
for all existing policies, at least when adding new attributes.

Note that entirely *new* policies won't need to set it, as the use
of that should be using nla_parse()/nlmsg_parse() etc. which anyway
do fully strict validation now, regardless of this.

So in effect, this patch only covers the "existing command with new
attribute" case.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index e4dd874412bf..679f649748d4 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -299,6 +299,24 @@ struct nla_policy {
 		};
 		int (*validate)(const struct nlattr *attr,
 				struct netlink_ext_ack *extack);
+		/* This entry is special, and used for the attribute at index 0
+		 * only, and specifies special data about the policy, namely it
+		 * specifies the "boundary type" where strict length validation
+		 * starts for any attribute types >= this value, also, strict
+		 * nesting validation starts here.
+		 *
+		 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
+		 * for any types >= this, so need to use NLA_MIN_LEN to get the
+		 * previous pure { .len = xyz } behaviour. The advantage of this
+		 * is that types not specified in the policy will be rejected.
+		 *
+		 * For completely new families it should be set to 1 so that the
+		 * validation is enforced for all attributes. For existing ones
+		 * it should be set at least when new attributes are added to
+		 * the enum used by the policy, and be set to the new value that
+		 * was added to enforce strict validation from thereon.
+		 */
+		u16 strict_start_type;
 	};
 };
 
-- 
cgit v1.2.3


From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 26 Apr 2019 14:07:31 +0200
Subject: genetlink: optionally validate strictly/dumps

Add options to strictly validate messages and dump messages.
Validating dump messages non-strictly may sometimes be required,
so add an option for that as well.

Since none of this can really be applied to existing commands,
set the options everywhere using the following spatch:

    @@
    identifier ops;
    expression X;
    @@
    struct genl_ops ops[] = {
    ...,
     {
            .cmd = X,
    +       .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
            ...
     },
    ...
    };

For new commands one should just not copy the .validate 'opt-out'
flags and thus get strict validation.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 68de579cfe5e..9292f1c588b7 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -121,6 +121,12 @@ static inline int genl_err_attr(struct genl_info *info, int err,
 	return err;
 }
 
+enum genl_validate_flags {
+	GENL_DONT_VALIDATE_STRICT		= BIT(0),
+	GENL_DONT_VALIDATE_DUMP			= BIT(1),
+	GENL_DONT_VALIDATE_DUMP_STRICT		= BIT(2),
+};
+
 /**
  * struct genl_ops - generic netlink operations
  * @cmd: command identifier
@@ -141,6 +147,7 @@ struct genl_ops {
 	u8			cmd;
 	u8			internal_flags;
 	u8			flags;
+	u8			validate;
 };
 
 int genl_register_family(struct genl_family *family);
-- 
cgit v1.2.3


From 875138f81d71af3cfa80df57e32fe9efbc4f95bc Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:11 +0200
Subject: dsa: Move tagger name into its ops structure

Rather than keep a list to map a tagger ops to a name, place the name
into the ops structure. This removes the hard coded list, a step
towards making the taggers more dynamic.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

v2:
Move name to end of structure, keeping the hot entries at the beginning.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0cfc2f828b87..801346e31e9b 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -56,6 +56,7 @@ struct dsa_device_ops {
 	int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			    int *offset);
 	unsigned int overhead;
+	const char *name;
 };
 
 struct dsa_switch_tree {
-- 
cgit v1.2.3


From 0b42f03363706609d621c31324fae5c1250f579f Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:12 +0200
Subject: dsa: Add MODULE_ALIAS to taggers in preparation to become modules

When the tag drivers become modules, we will need to dynamically load
them based on what the switch drivers need. Add aliases to map between
the TAG protocol and the driver.

In order to do this, we need the tag protocol number as something
which the C pre-processor can stringify. Only the compiler knows the
value of an enum, CPP cannot use them. So add #defines.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 801346e31e9b..8f3d5e0825a2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -30,20 +30,33 @@ struct phy_device;
 struct fixed_phy_status;
 struct phylink_link_state;
 
+#define DSA_TAG_PROTO_NONE_VALUE		0
+#define DSA_TAG_PROTO_BRCM_VALUE		1
+#define DSA_TAG_PROTO_BRCM_PREPEND_VALUE	2
+#define DSA_TAG_PROTO_DSA_VALUE			3
+#define DSA_TAG_PROTO_EDSA_VALUE		4
+#define DSA_TAG_PROTO_GSWIP_VALUE		5
+#define DSA_TAG_PROTO_KSZ9477_VALUE		6
+#define DSA_TAG_PROTO_KSZ9893_VALUE		7
+#define DSA_TAG_PROTO_LAN9303_VALUE		8
+#define DSA_TAG_PROTO_MTK_VALUE			9
+#define DSA_TAG_PROTO_QCA_VALUE			10
+#define DSA_TAG_PROTO_TRAILER_VALUE		11
+
 enum dsa_tag_protocol {
-	DSA_TAG_PROTO_NONE = 0,
-	DSA_TAG_PROTO_BRCM,
-	DSA_TAG_PROTO_BRCM_PREPEND,
-	DSA_TAG_PROTO_DSA,
-	DSA_TAG_PROTO_EDSA,
-	DSA_TAG_PROTO_GSWIP,
-	DSA_TAG_PROTO_KSZ9477,
-	DSA_TAG_PROTO_KSZ9893,
-	DSA_TAG_PROTO_LAN9303,
-	DSA_TAG_PROTO_MTK,
-	DSA_TAG_PROTO_QCA,
-	DSA_TAG_PROTO_TRAILER,
-	DSA_TAG_LAST,		/* MUST BE LAST */
+	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
+	DSA_TAG_PROTO_BRCM		= DSA_TAG_PROTO_BRCM_VALUE,
+	DSA_TAG_PROTO_BRCM_PREPEND	= DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
+	DSA_TAG_PROTO_DSA		= DSA_TAG_PROTO_DSA_VALUE,
+	DSA_TAG_PROTO_EDSA		= DSA_TAG_PROTO_EDSA_VALUE,
+	DSA_TAG_PROTO_GSWIP		= DSA_TAG_PROTO_GSWIP_VALUE,
+	DSA_TAG_PROTO_KSZ9477		= DSA_TAG_PROTO_KSZ9477_VALUE,
+	DSA_TAG_PROTO_KSZ9893		= DSA_TAG_PROTO_KSZ9893_VALUE,
+	DSA_TAG_PROTO_LAN9303		= DSA_TAG_PROTO_LAN9303_VALUE,
+	DSA_TAG_PROTO_MTK		= DSA_TAG_PROTO_MTK_VALUE,
+	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
+	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
+	DSA_TAG_LAST,			/* MUST BE LAST */
 };
 
 struct packet_type;
@@ -59,6 +72,10 @@ struct dsa_device_ops {
 	const char *name;
 };
 
+#define DSA_TAG_DRIVER_ALIAS "dsa_tag-"
+#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto)				\
+	MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE))
+
 struct dsa_switch_tree {
 	struct list_head	list;
 
-- 
cgit v1.2.3


From 056eed2fb071c11535527fc792bdfb985a9a3e26 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:14 +0200
Subject: dsa: Add TAG protocol to tag ops

In order that we can match the tagging protocol a switch driver
requests to the tagger, we need to know what protocol the tagger
supports. Add this information to the ops structure.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

v2
Move tag protocol to end of structure to keep hot members at the beginning.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 8f3d5e0825a2..720036f48fb3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -70,6 +70,7 @@ struct dsa_device_ops {
 			    int *offset);
 	unsigned int overhead;
 	const char *name;
+	enum dsa_tag_protocol proto;
 };
 
 #define DSA_TAG_DRIVER_ALIAS "dsa_tag-"
-- 
cgit v1.2.3


From d3b8c04988ca1685700e345a82a1396df79e6291 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:15 +0200
Subject: dsa: Add boilerplate helper to register DSA tag driver modules

A DSA tag driver module will need to register the tag protocols it
implements with the DSA core. Add macros containing this boiler plate.

The registration/unregistration code is currently just a stub. A later
patch will add the real implementation.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>

v2
Fix indent of #endif
Rewrite to move list pointer into a new structure
v3
Move kdoc next to macro
Fix THIS_MODULE indentation

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 720036f48fb3..08ac05c014e3 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -594,4 +594,70 @@ int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
 int dsa_port_get_phy_sset_count(struct dsa_port *dp);
 void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up);
 
+struct dsa_tag_driver {
+	const struct dsa_device_ops *ops;
+	struct list_head list;
+	struct module *owner;
+};
+
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+			      unsigned int count,
+			      struct module *owner);
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+				unsigned int count);
+
+#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count)	\
+static int __init dsa_tag_driver_module_init(void)			\
+{									\
+	dsa_tag_drivers_register(__dsa_tag_drivers_array, __count,	\
+				 THIS_MODULE);				\
+	return 0;							\
+}									\
+module_init(dsa_tag_driver_module_init);				\
+									\
+static void __exit dsa_tag_driver_module_exit(void)			\
+{									\
+	dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count);	\
+}									\
+module_exit(dsa_tag_driver_module_exit)
+
+/**
+ * module_dsa_tag_drivers() - Helper macro for registering DSA tag
+ * drivers
+ * @__ops_array: Array of tag driver structures
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_drivers(__ops_array)				\
+dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array))
+
+#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops
+
+/* Create a static structure we can build a linked list of dsa_tag
+ * drivers
+ */
+#define DSA_TAG_DRIVER(__ops)						\
+static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = {		\
+	.ops = &__ops,							\
+}
+
+/**
+ * module_dsa_tag_driver() - Helper macro for registering a single DSA tag
+ * driver
+ * @__ops: Single tag driver structure
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_driver(__ops)					\
+DSA_TAG_DRIVER(__ops);							\
+									\
+static struct dsa_tag_driver *dsa_tag_driver_array[] =	{		\
+	&DSA_TAG_DRIVER_NAME(__ops)					\
+};									\
+module_dsa_tag_drivers(dsa_tag_driver_array)
 #endif
+
-- 
cgit v1.2.3


From f81a43e8da07ccd91c4d923a44ffffaeee39dcc8 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 19:37:21 +0200
Subject: dsa: Cleanup unneeded table and make tag structures static

Now that tag drivers dynamically register, we don't need the static
table. Remove it. This also means the tag driver structures can be
made static.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 08ac05c014e3..b550f7bb5314 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -56,7 +56,6 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_MTK		= DSA_TAG_PROTO_MTK_VALUE,
 	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
-	DSA_TAG_LAST,			/* MUST BE LAST */
 };
 
 struct packet_type;
-- 
cgit v1.2.3


From 316793fb2d907d4726b4977b6be26ec653827774 Mon Sep 17 00:00:00 2001
From: Eli Britstein <elibr@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:01 +0000
Subject: net/mlx5: E-Switch: Introduce prio tag mode

Current ConnectX HW is unable to perform VLAN pop in TX path and VLAN
push on RX path. To work around that limitation, untagged packets will be
tagged with VLAN ID 0x000 (priority tag) and pop/push actions will be
replaced by VLAN re-write actions (which are supported by the HW).
Introduce prio tag mode as a pre-step to controlling the workaround
behavior.

Signed-off-by: Eli Britstein <elibr@mellanox.com>
Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 4b37519bd6a5..eeedf3f53ed3 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -951,7 +951,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         log_max_srq_sz[0x8];
 	u8         log_max_qp_sz[0x8];
-	u8         reserved_at_90[0xb];
+	u8         reserved_at_90[0x8];
+	u8         prio_tag_required[0x1];
+	u8         reserved_at_99[0x2];
 	u8         log_max_qp[0x5];
 
 	u8         reserved_at_a0[0xb];
-- 
cgit v1.2.3


From 27b942fbbd3107d4e969ece133925cd646239ef4 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:02 +0000
Subject: net/mlx5: Get rid of storing copy of device name

Currently mlx5 core stores copy of the PCI device name in a
mlx5_priv structure and uses pr_warn, pr_err helpers.

Get rid of the copy of this name; instead store the parent device
pointer that contains name as well as dma specific parameters.
This also allows using the kernel's well-defined dev_warn, dev_err, dev_dbg
device specific print routines.

This is also a preparation patch to access non PCI parent device in
future.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 6c43191c0186..582a9680b182 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -56,7 +56,6 @@
 
 enum {
 	MLX5_BOARD_ID_LEN = 64,
-	MLX5_MAX_NAME_LEN = 16,
 };
 
 enum {
@@ -514,7 +513,6 @@ struct mlx5_rl_table {
 };
 
 struct mlx5_priv {
-	char			name[MLX5_MAX_NAME_LEN];
 	struct mlx5_eq_table	*eq_table;
 
 	/* pages stuff */
@@ -641,6 +639,7 @@ struct mlx5_fw_tracer;
 struct mlx5_vxlan;
 
 struct mlx5_core_dev {
+	struct device *device;
 	struct pci_dev	       *pdev;
 	/* sync pci state */
 	struct mutex		pci_status_mutex;
-- 
cgit v1.2.3


From d83eb50e29de36ddc819863ab7b9d2da58bccbd0 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:12 +0000
Subject: net/mlx5: Add support in RDMA RX steering

Add new flow steering namespace - MLX5_FLOW_NAMESPACE_RDMA_RX.
Flow steering rules in this namespace are used to filter
RDMA traffic.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   | 6 ++++++
 include/linux/mlx5/fs.h       | 1 +
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index f93a5598b942..28ebb6c93542 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1166,6 +1166,12 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_FLOWTABLE_SNIFFER_TX_MAX(mdev, cap) \
 	MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit_sniffer.cap)
 
+#define MLX5_CAP_FLOWTABLE_RDMA_RX(mdev, cap) \
+	MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive_rdma.cap)
+
+#define MLX5_CAP_FLOWTABLE_RDMA_RX_MAX(mdev, cap) \
+	MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive_rdma.cap)
+
 #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \
 	MLX5_GET(flow_table_eswitch_cap, \
 		 mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap)
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index fd91df3a4e09..e690ba0f965c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -73,6 +73,7 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_SNIFFER_RX,
 	MLX5_FLOW_NAMESPACE_SNIFFER_TX,
 	MLX5_FLOW_NAMESPACE_EGRESS,
+	MLX5_FLOW_NAMESPACE_RDMA_RX,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index eeedf3f53ed3..89e7194b3d97 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -598,7 +598,7 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive;
 
-	u8         reserved_at_400[0x200];
+	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_rdma;
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer;
 
-- 
cgit v1.2.3


From f6f7d6b5bd818cc84eebb55ba6bcba38cfd3b385 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:14 +0000
Subject: net/mlx5: Add new miss flow table action

Flow table supports three types of miss action:
1. Default miss action - go to default miss table according to table.
2. Go to specific table.
3. Switch domain - go to the root table of an alternative steering
   table domain.

New table miss action was added - switch_domain.
The next domain for RDMA_RX namespace is the NIC RX domain.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 89e7194b3d97..7d9264b282d1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -370,7 +370,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits {
 	u8	   reformat_l3_tunnel_to_l2[0x1];
 	u8	   reformat_l2_to_l3_tunnel[0x1];
 	u8	   reformat_and_modify_action[0x1];
-	u8         reserved_at_15[0xb];
+	u8         reserved_at_15[0x2];
+	u8	   table_miss_action_domain[0x1];
+	u8         reserved_at_18[0x8];
 	u8         reserved_at_20[0x2];
 	u8         log_max_ft_size[0x6];
 	u8         log_max_modify_header_context[0x8];
@@ -1284,6 +1286,12 @@ enum mlx5_flow_destination_type {
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM = 0x101,
 };
 
+enum mlx5_flow_table_miss_action {
+	MLX5_FLOW_TABLE_MISS_ACTION_DEF,
+	MLX5_FLOW_TABLE_MISS_ACTION_FWD,
+	MLX5_FLOW_TABLE_MISS_ACTION_SWITCH_DOMAIN,
+};
+
 struct mlx5_ifc_dest_format_struct_bits {
 	u8         destination_type[0x8];
 	u8         destination_id[0x18];
-- 
cgit v1.2.3


From 80f09dfc237f181e92968a72d97b7a4202baa453 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:16 +0000
Subject: net/mlx5: Eswitch, enable RoCE loopback traffic

When in switchdev mode, we would like to treat loopback RoCE
traffic (on eswitch manager) as RDMA and not as regular
Ethernet traffic.
In order to enable it we add a flow steering rule that forwards RoCE
loopback traffic to the HW RoCE filter (by adding an allow rule).
In addition we add a RoCE address in GID index 0, which will be
set in the RoCE loopback packet.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Acked-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/driver.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 582a9680b182..7fa95270dd59 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -512,6 +512,12 @@ struct mlx5_rl_table {
 	struct mlx5_rl_entry   *rl_entry;
 };
 
+struct mlx5_core_roce {
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *fg;
+	struct mlx5_flow_handle *allow_rule;
+};
+
 struct mlx5_priv {
 	struct mlx5_eq_table	*eq_table;
 
@@ -565,6 +571,7 @@ struct mlx5_priv {
 	struct mlx5_lag		*lag;
 	struct mlx5_devcom	*devcom;
 	unsigned long		pci_dev_data;
+	struct mlx5_core_roce	roce;
 	struct mlx5_fc_stats		fc_stats;
 	struct mlx5_rl_table            rl_table;
 
-- 
cgit v1.2.3


From 75d90e7def8e20b57c1de9e2b672c3bf9249da83 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:18 +0000
Subject: net/mlx5: Geneve, Add basic Geneve encap/decap flow table
 capabilities

Introduce support for Geneve flow specification and allow
the creation of rules that are matching on basic Geneve
protocol fields: VNI, OAM bit, protocol type, options length.

Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 7d9264b282d1..268ac126b3bb 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -307,7 +307,11 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         outer_gre_protocol[0x1];
 	u8         outer_gre_key[0x1];
 	u8         outer_vxlan_vni[0x1];
-	u8         reserved_at_1a[0x5];
+	u8         outer_geneve_vni[0x1];
+	u8         outer_geneve_oam[0x1];
+	u8         outer_geneve_protocol_type[0x1];
+	u8         outer_geneve_opt_len[0x1];
+	u8         reserved_at_1e[0x1];
 	u8         source_eswitch_port[0x1];
 
 	u8         inner_dmac[0x1];
@@ -480,7 +484,9 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 	u8         vxlan_vni[0x18];
 	u8         reserved_at_b8[0x8];
 
-	u8         reserved_at_c0[0x20];
+	u8         geneve_vni[0x18];
+	u8         reserved_at_d8[0x7];
+	u8         geneve_oam[0x1];
 
 	u8         reserved_at_e0[0xc];
 	u8         outer_ipv6_flow_label[0x14];
@@ -488,7 +494,11 @@ struct mlx5_ifc_fte_match_set_misc_bits {
 	u8         reserved_at_100[0xc];
 	u8         inner_ipv6_flow_label[0x14];
 
-	u8         reserved_at_120[0x28];
+	u8         reserved_at_120[0xa];
+	u8         geneve_opt_len[0x6];
+	u8         geneve_protocol_type[0x10];
+
+	u8         reserved_at_140[0x8];
 	u8         bth_dst_qp[0x18];
 	u8	   reserved_at_160[0x20];
 	u8	   outer_esp_spi[0x20];
-- 
cgit v1.2.3


From b169e64a24442e02cafee1586f17fcb713fe65a6 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@mellanox.com>
Date: Mon, 29 Apr 2019 18:14:20 +0000
Subject: net/mlx5: Geneve, Add flow table capabilities for Geneve decap with
 TLV options

Introduce specification for Geneve decap flow with encapsulation options
and allow creation of rules that are matching on Geneve TLV options.

Reviewed-by: Oz Shlomo <ozsh@mellanox.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/device.h   |  4 +++-
 include/linux/mlx5/mlx5_ifc.h | 50 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 28ebb6c93542..4ab801040e98 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1001,7 +1001,8 @@ enum {
 	MLX5_MATCH_OUTER_HEADERS	= 1 << 0,
 	MLX5_MATCH_MISC_PARAMETERS	= 1 << 1,
 	MLX5_MATCH_INNER_HEADERS	= 1 << 2,
-
+	MLX5_MATCH_MISC_PARAMETERS_2	= 1 << 3,
+	MLX5_MATCH_MISC_PARAMETERS_3	= 1 << 4,
 };
 
 enum {
@@ -1045,6 +1046,7 @@ enum mlx5_mpls_supported_fields {
 };
 
 enum mlx5_flex_parser_protos {
+	MLX5_FLEX_PROTO_GENEVE	      = 1 << 3,
 	MLX5_FLEX_PROTO_CW_MPLS_GRE   = 1 << 4,
 	MLX5_FLEX_PROTO_CW_MPLS_UDP   = 1 << 5,
 };
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 268ac126b3bb..6a7fc18a9fe3 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -86,6 +86,11 @@ enum {
 
 enum {
 	MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM),
+	MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11),
+};
+
+enum {
+	MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b,
 };
 
 enum {
@@ -339,7 +344,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         inner_tcp_flags[0x1];
 	u8         reserved_at_37[0x9];
 
-	u8         reserved_at_40[0x5];
+	u8         geneve_tlv_option_0_data[0x1];
+	u8         reserved_at_41[0x4];
 	u8         outer_first_mpls_over_udp[0x4];
 	u8         outer_first_mpls_over_gre[0x4];
 	u8         inner_first_mpls[0x4];
@@ -528,6 +534,12 @@ struct mlx5_ifc_fte_match_set_misc2_bits {
 	u8         reserved_at_1a0[0x60];
 };
 
+struct mlx5_ifc_fte_match_set_misc3_bits {
+	u8         reserved_at_0[0x120];
+	u8         geneve_tlv_option_0_data[0x20];
+	u8         reserved_at_140[0xc0];
+};
+
 struct mlx5_ifc_cmd_pas_bits {
 	u8         pa_h[0x20];
 
@@ -1247,9 +1259,13 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   num_of_uars_per_page[0x20];
 
 	u8         flex_parser_protocols[0x20];
-	u8         reserved_at_560[0x20];
 
-	u8         reserved_at_580[0x3c];
+	u8         max_geneve_tlv_options[0x8];
+	u8         reserved_at_568[0x3];
+	u8         max_geneve_tlv_option_data_len[0x5];
+	u8         reserved_at_570[0x10];
+
+	u8         reserved_at_580[0x1c];
 	u8         mini_cqe_resp_stride_index[0x1];
 	u8         cqe_128_always[0x1];
 	u8         cqe_compression_128[0x1];
@@ -1283,7 +1299,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         uctx_cap[0x20];
 
-	u8	   reserved_at_6c0[0x140];
+	u8         reserved_at_6c0[0x4];
+	u8         flex_parser_id_geneve_tlv_option_0[0x4];
+	u8         reserved_at_6c8[0x138];
 };
 
 enum mlx5_flow_destination_type {
@@ -1341,7 +1359,9 @@ struct mlx5_ifc_fte_match_param_bits {
 
 	struct mlx5_ifc_fte_match_set_misc2_bits misc_parameters_2;
 
-	u8         reserved_at_800[0x800];
+	struct mlx5_ifc_fte_match_set_misc3_bits misc_parameters_3;
+
+	u8         reserved_at_a00[0x600];
 };
 
 enum {
@@ -4850,6 +4870,7 @@ enum {
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS  = 0x1,
 	MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS    = 0x2,
 	MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3,
+	MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_3 = 0x4,
 };
 
 struct mlx5_ifc_query_flow_group_out_bits {
@@ -9545,6 +9566,20 @@ struct mlx5_ifc_sw_icm_bits {
 	u8         sw_icm_start_addr[0x40];
 
 	u8         reserved_at_c0[0x140];
+}; 
+
+struct mlx5_ifc_geneve_tlv_option_bits {
+	u8         modify_field_select[0x40];
+
+	u8         reserved_at_40[0x18];
+	u8         geneve_option_fte_index[0x8];
+
+	u8         option_class[0x10];
+	u8         option_type[0x8];
+	u8         reserved_at_78[0x3];
+	u8         option_data_length[0x5];
+
+	u8         reserved_at_80[0x180];
 };
 
 struct mlx5_ifc_create_umem_in_bits {
@@ -9589,6 +9624,11 @@ struct mlx5_ifc_create_sw_icm_in_bits {
 	struct mlx5_ifc_sw_icm_bits		      sw_icm;
 };
 
+struct mlx5_ifc_create_geneve_tlv_option_in_bits {
+	struct mlx5_ifc_general_obj_in_cmd_hdr_bits   hdr;
+	struct mlx5_ifc_geneve_tlv_option_bits        geneve_tlv_opt;
+};
+
 struct mlx5_ifc_mtrc_string_db_param_bits {
 	u8         string_db_base_address[0x20];
 
-- 
cgit v1.2.3


From f1f86d09ca7e35fb161a47bc54ec9cb2f4fe42d8 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 15 Apr 2019 16:43:14 -0400
Subject: netfilter: nf_tables: relocate header content to consumer

The nf_tables.h header is used in a lot of files, but it turns out
that there is only one actual user of nft_expr_clone().

Hence we relocate that function to be with the one consumer of it
and avoid having to process it with CPP for all the other files.

This will also enable a reduction in the other headers that the
nf_tables.h itself has to include just to be stand-alone, hence
a pending further significant reduction in the CPP content that
needs to get processed for each netfilter file.

Note that the explicit "inline" has been dropped as part of this
relocation.  In similar changes to this, I believe Dave has asked
this be done, so we free up gcc to make the choice of whether to
inline or not.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 2d5a0a1a87b8..706f744f7308 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -806,23 +806,6 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
 int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
 		  const struct nft_expr *expr);
 
-static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
-{
-	int err;
-
-	if (src->ops->clone) {
-		dst->ops = src->ops;
-		err = src->ops->clone(dst, src);
-		if (err < 0)
-			return err;
-	} else {
-		memcpy(dst, src, src->ops->size);
-	}
-
-	__module_get(src->ops->type->owner);
-	return 0;
-}
-
 /**
  *	struct nft_rule - nf_tables rule
  *
-- 
cgit v1.2.3


From a4cb98f32c9046fea28bcb4979182f2ff731a27a Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Mon, 15 Apr 2019 16:43:16 -0400
Subject: netfilter: nf_tables: drop include of module.h from nf_tables.h

Ideally, header files under include/linux shouldn't be adding
includes of other headers, in anticipation of their consumers,
but just the headers needed for the header itself to pass
parsing with CPP.

The module.h is particularly bad in this sense, as it itself does
include a whole bunch of other headers, due to the complexity of
module support.

Since nf_tables.h is not going into a module struct looking for
specific fields, we can just let it know that module is a struct,
just like about 60 other include/linux headers already do.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 706f744f7308..5b8624ae4a27 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -2,7 +2,6 @@
 #ifndef _NET_NF_TABLES_H
 #define _NET_NF_TABLES_H
 
-#include <linux/module.h>
 #include <linux/list.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
@@ -13,6 +12,8 @@
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netlink.h>
 
+struct module;
+
 #define NFT_JUMP_STACK_SIZE	16
 
 struct nft_pktinfo {
-- 
cgit v1.2.3


From 8f14c99c7edaaba9c0bb1727d44db6ebf157cc61 Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Sun, 7 Apr 2019 08:14:20 -0700
Subject: netfilter: conntrack: limit sysctl setting for boolean options

Use zero and one as the limits for the boolean option settings.
After this patch, the boolean options of the nf conntrack sysctl
can only be set to 0 or 1.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index f19b53130bf7..806454e767bf 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -24,9 +24,9 @@ struct nf_generic_net {
 
 struct nf_tcp_net {
 	unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX];
-	unsigned int tcp_loose;
-	unsigned int tcp_be_liberal;
-	unsigned int tcp_max_retrans;
+	int tcp_loose;
+	int tcp_be_liberal;
+	int tcp_max_retrans;
 };
 
 enum udp_conntrack {
-- 
cgit v1.2.3


From e1f172e162c0a11721f1188f12e5b4c3f9f80de6 Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 17 Apr 2019 11:46:14 -0300
Subject: netfilter: use macros to create module aliases.

Each NAT helper creates a module alias which follows a pattern.
Use macros for consistency.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ec52a8dc32fd..28bd4569aa64 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -15,6 +15,10 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 
+#define NF_NAT_HELPER_NAME(name)	"ip_nat_" name
+#define MODULE_ALIAS_NF_NAT_HELPER(name) \
+	MODULE_ALIAS(NF_NAT_HELPER_NAME(name))
+
 struct module;
 
 enum nf_ct_helper_flags {
-- 
cgit v1.2.3


From 08010a21602678932894c5e87014a282af0079cf Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 17 Apr 2019 11:46:15 -0300
Subject: netfilter: add API to manage NAT helpers.

The API allows a conntrack helper to indicate its corresponding
NAT helper which then can be loaded and reference counted.

Signed-off-by: Flavio Leitner <fbl@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 28bd4569aa64..44b5a00a9c64 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -15,7 +15,8 @@
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 
-#define NF_NAT_HELPER_NAME(name)	"ip_nat_" name
+#define NF_NAT_HELPER_PREFIX		"ip_nat_"
+#define NF_NAT_HELPER_NAME(name)	NF_NAT_HELPER_PREFIX name
 #define MODULE_ALIAS_NF_NAT_HELPER(name) \
 	MODULE_ALIAS(NF_NAT_HELPER_NAME(name))
 
@@ -58,6 +59,8 @@ struct nf_conntrack_helper {
 	unsigned int queue_num;
 	/* length of userspace private data stored in nf_conn_help->data */
 	u16 data_len;
+	/* name of NAT helper module */
+	char nat_mod_name[NF_CT_HELPER_NAME_LEN];
 };
 
 /* Must be kept in sync with the classes defined by helpers */
@@ -157,4 +160,21 @@ nf_ct_helper_expectfn_find_by_symbol(const void *symbol);
 extern struct hlist_head *nf_ct_helper_hash;
 extern unsigned int nf_ct_helper_hsize;
 
+struct nf_conntrack_nat_helper {
+	struct list_head list;
+	char mod_name[NF_CT_HELPER_NAME_LEN];	/* module name */
+	struct module *module;			/* pointer to self */
+};
+
+#define NF_CT_NAT_HELPER_INIT(name) \
+	{ \
+	.mod_name = NF_NAT_HELPER_NAME(name), \
+	.module = THIS_MODULE \
+	}
+
+void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat);
+void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat);
+int nf_nat_helper_try_module_get(const char *name, u16 l3num,
+				 u8 protonum);
+void nf_nat_helper_put(struct nf_conntrack_helper *helper);
 #endif /*_NF_CONNTRACK_HELPER_H*/
-- 
cgit v1.2.3


From 3087c3f7c23b9c54b956ee5519e97a42413ddf22 Mon Sep 17 00:00:00 2001
From: Brett Mastbergen <bmastbergen@untangle.com>
Date: Wed, 24 Apr 2019 10:48:44 -0400
Subject: netfilter: nft_ct: Add ct id support

The 'id' key returns the unique id of the conntrack entry as returned
by nf_ct_get_id().

Signed-off-by: Brett Mastbergen <bmastbergen@untangle.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 061bb3eb20c3..f0cf7b0f4f35 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -967,6 +967,7 @@ enum nft_socket_keys {
  * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address)
  * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address)
  * @NFT_CT_TIMEOUT: connection tracking timeout policy assigned to conntrack
+ * @NFT_CT_ID: conntrack id
  */
 enum nft_ct_keys {
 	NFT_CT_STATE,
@@ -993,6 +994,7 @@ enum nft_ct_keys {
 	NFT_CT_SRC_IP6,
 	NFT_CT_DST_IP6,
 	NFT_CT_TIMEOUT,
+	NFT_CT_ID,
 	__NFT_CT_MAX
 };
 #define NFT_CT_MAX		(__NFT_CT_MAX - 1)
-- 
cgit v1.2.3


From 33162e9a0590f16e1b21be764caae517e2bb310c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:43 +0300
Subject: net: dsa: Store vlan_filtering as a property of dsa_port

This allows drivers to query the VLAN setting imposed by the bridge
driver directly from DSA, instead of keeping their own state based on
the .port_vlan_filtering callback.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index b550f7bb5314..79a87913126c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -161,6 +161,7 @@ struct dsa_port {
 	const char		*mac;
 	struct device_node	*dn;
 	unsigned int		ageing_time;
+	bool			vlan_filtering;
 	u8			stp_state;
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
-- 
cgit v1.2.3


From 8f5d16f638b9a1adf544a7f8cfd11ac1c01c6e25 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:44 +0300
Subject: net: dsa: Be aware of switches where VLAN filtering is a global
 setting

On some switches, the action of whether to parse VLAN frame headers and use
that information for ingress admission is configurable, but not per
port. Such is the case for the Broadcom BCM53xx and the NXP SJA1105
families, for example. In that case, DSA can prevent the bridge core
from trying to apply different VLAN filtering settings on net devices
that belong to the same switch.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Suggested-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 79a87913126c..aab3c2029edd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -228,6 +228,11 @@ struct dsa_switch {
 	/* Number of switch port queues */
 	unsigned int		num_tx_queues;
 
+	/* Disallow bridge core from requesting different VLAN awareness
+	 * settings on ports if not hardware-supported
+	 */
+	bool			vlan_filtering_is_global;
+
 	unsigned long		*bitmap;
 	unsigned long		_bitmap;
 
-- 
cgit v1.2.3


From 145746765f06a3dbc7869c81d0165b3ab96f935a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:48 +0300
Subject: net: dsa: Keep the vlan_filtering setting in dsa_switch if it's
 global

The current behavior is not as obvious as one would assume (which is
that, if the driver set vlan_filtering_is_global = 1, then checking any
dp->vlan_filtering would yield the same result). Only the ports which
are actively enslaved into a bridge would have vlan_filtering set.

This makes it tricky for drivers to check what the global state is.
So fix this and make the struct dsa_switch hold this global setting.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index aab3c2029edd..4e0f7e9c5aa1 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -233,6 +233,11 @@ struct dsa_switch {
 	 */
 	bool			vlan_filtering_is_global;
 
+	/* In case vlan_filtering_is_global is set, the VLAN awareness state
+	 * should be retrieved from here and not from the per-port settings.
+	 */
+	bool			vlan_filtering;
+
 	unsigned long		*bitmap;
 	unsigned long		_bitmap;
 
-- 
cgit v1.2.3


From cf2d45f5ba9a730df6ec190e0345cecde80b1d8b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 28 Apr 2019 21:45:49 +0300
Subject: net: dsa: Add helper function to retrieve VLAN awareness setting

Since different types of hardware may or may not support this setting
per-port, DSA keeps it either in dsa_switch or in dsa_port.

While drivers may know the characteristics of their hardware and
retrieve it from the correct place without the need of helpers, it is
cumbersome to find out an unambiguous answer from generic DSA code.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 4e0f7e9c5aa1..1e6b4efc80b9 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -305,6 +305,16 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port)
 	return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index);
 }
 
+static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
+{
+	const struct dsa_switch *ds = dp->ds;
+
+	if (ds->vlan_filtering_is_global)
+		return ds->vlan_filtering;
+	else
+		return dp->vlan_filtering;
+}
+
 typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
 			      bool is_static, void *data);
 struct dsa_switch_ops {
-- 
cgit v1.2.3


From 93e86b3bc842c159a60e6987444bf3952adcd4db Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 28 Apr 2019 02:56:23 +0200
Subject: net: dsa: Remove legacy probing support

Now that all drivers can be probed using more traditional methods,
remove the legacy probe code.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 23 -----------------------
 1 file changed, 23 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 1e6b4efc80b9..18db7b8e7a8e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -318,15 +318,6 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
 typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
 			      bool is_static, void *data);
 struct dsa_switch_ops {
-#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
-	/*
-	 * Legacy probing.
-	 */
-	const char	*(*probe)(struct device *dsa_dev,
-				  struct device *host_dev, int sw_addr,
-				  void **priv);
-#endif
-
 	enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds,
 						  int port);
 
@@ -516,20 +507,6 @@ struct dsa_switch_driver {
 	const struct dsa_switch_ops *ops;
 };
 
-#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
-/* Legacy driver registration */
-void register_switch_driver(struct dsa_switch_driver *type);
-void unregister_switch_driver(struct dsa_switch_driver *type);
-struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev);
-
-#else
-static inline void register_switch_driver(struct dsa_switch_driver *type) { }
-static inline void unregister_switch_driver(struct dsa_switch_driver *type) { }
-static inline struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
-{
-	return NULL;
-}
-#endif
 struct net_device *dsa_dev_to_net_device(struct device *dev);
 
 /* Keep inline for faster access in hot path */
-- 
cgit v1.2.3


From b587bdaf5f820cf7dac2c1b351db97bf98e1f427 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@mellanox.com>
Date: Mon, 29 Apr 2019 12:41:45 +0300
Subject: devlink: Change devlink health locking mechanism

The devlink health reporters create/destroy and user commands currently
use the devlink->lock as a locking mechanism. Different reporters have
different rules in the driver and are being created/destroyed during
different stages of driver load/unload/running. So during execution of a
reporter recover the flow can go through another reporter's destroy and
create. Such flow leads to deadlock trying to lock a mutex already
held.

With the new locking mechanism the different reporters share mutex lock
only to protect access to shared reporters list.
Added refcount per reporter, to protect the reporters from destroy while
being used.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4f5e41613503..1c4adfb4195a 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -32,6 +32,7 @@ struct devlink {
 	struct list_head region_list;
 	u32 snapshot_id;
 	struct list_head reporter_list;
+	struct mutex reporters_lock; /* protects reporter_list */
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
-- 
cgit v1.2.3


From a3d43c0d56f1b94e74963a2fbadfb70126d92213 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 29 Apr 2019 15:48:31 -0700
Subject: taprio: Add support adding an admin schedule

The IEEE 802.1Q-2018 defines two "types" of schedules, the "Oper" (from
operational?) and "Admin" ones. Up until now, 'taprio' only had
support for the "Oper" one, added when the qdisc is created. This adds
support for the "Admin" one, which allows the .change() operation to
be supported.

Just for clarification, some quick (and dirty) definitions, the "Oper"
schedule is the currently (as in this instant) running one, and it's
read-only. The "Admin" one is the one that the system configurator has
installed, it can be changed, and it will be "promoted" to "Oper" when
its 'base-time' is reached.

The idea behind this patch is that calling something like the below,
(after taprio is already configured with an initial schedule):

$ tc qdisc change taprio dev IFACE parent root 	     \
     	   base-time X 	     	   	       	     \
     	   sched-entry <CMD> <GATES> <INTERVAL>	     \
	   ...

Will cause a new admin schedule to be created and programmed to be
"promoted" to "Oper" at instant X. If an "Admin" schedule already
exists, it will be overwritten with the new parameters.

Up until now, there was some code that was added to ease the support
of changing a single entry of a schedule, but was ultimately unused.
Now, that we have support for "change" with more well thought
semantics, updating a single entry seems to be less useful.

So we remove what is in practice dead code, and return a "not
supported" error if the user tries to use it. If changing a single
entry would make the user's life easier we may resurrect this idea,
but at this point, removing it simplifies the code.

For now, only the schedule specific bits are allowed to be added for a
new schedule, that means that 'clockid', 'num_tc', 'map' and 'queues'
cannot be modified.

Example:

$ tc qdisc change dev IFACE parent root handle 100 taprio \
      base-time $BASE_TIME \
      sched-entry S 00 500000 \
      sched-entry S 0f 500000 \
      clockid CLOCK_TAI

The only change in the netlink API introduced by this change is the
introduction of an "admin" type in the response to a dump request,
that type allows userspace to separate the "oper" schedule from the
"admin" schedule. If userspace doesn't support the "admin" type, it
will only display the "oper" schedule.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 7ee74c3474bf..d59770d0eb84 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1148,6 +1148,16 @@ enum {
 
 #define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1)
 
+/* The format for the admin sched (dump only):
+ * [TCA_TAPRIO_SCHED_ADMIN_SCHED]
+ *   [TCA_TAPRIO_ATTR_SCHED_BASE_TIME]
+ *   [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]
+ *     [TCA_TAPRIO_ATTR_SCHED_ENTRY]
+ *       [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD]
+ *       [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES]
+ *       [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL]
+ */
+
 enum {
 	TCA_TAPRIO_ATTR_UNSPEC,
 	TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */
@@ -1156,6 +1166,7 @@ enum {
 	TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */
 	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
 	TCA_TAPRIO_PAD,
+	TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
 	__TCA_TAPRIO_ATTR_MAX,
 };
 
-- 
cgit v1.2.3


From 6ca6a6654225f3cd001304d33429c817e0c0b85f Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 29 Apr 2019 15:48:32 -0700
Subject: taprio: Add support for setting the cycle-time manually

IEEE 802.1Q-2018 defines that the cycle-time of a schedule may be
overridden, so the schedule is truncated to a determined "width".

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index d59770d0eb84..7a32276838e1 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1167,6 +1167,7 @@ enum {
 	TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */
 	TCA_TAPRIO_PAD,
 	TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
+	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
 	__TCA_TAPRIO_ATTR_MAX,
 };
 
-- 
cgit v1.2.3


From c25031e993440debdd530278ce2171ce477df029 Mon Sep 17 00:00:00 2001
From: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Date: Mon, 29 Apr 2019 15:48:33 -0700
Subject: taprio: Add support for cycle-time-extension

IEEE 802.1Q-2018 defines the concept of a cycle-time-extension, so the
last entry of a schedule before the start of a new schedule can be
extended, so "too-short" entries can be avoided.

Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 7a32276838e1..8b2f993cbb77 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1168,6 +1168,7 @@ enum {
 	TCA_TAPRIO_PAD,
 	TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */
 	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */
+	TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */
 	__TCA_TAPRIO_ATTR_MAX,
 };
 
-- 
cgit v1.2.3


From 8425c41d1ef762cc15d9501d7117f009a79f3fe9 Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Tue, 30 Apr 2019 09:17:49 +0200
Subject: net: ll_temac: Extend support to non-device-tree platforms

Support initialization with platdata, so the driver can be used on
non-device-tree platforms.

For currently supported device-tree platforms, the driver should behave
as before.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/xilinx-ll-temac.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/platform_data/xilinx-ll-temac.h

(limited to 'include')

diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h
new file mode 100644
index 000000000000..82e2f80648b0
--- /dev/null
+++ b/include/linux/platform_data/xilinx-ll-temac.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_XILINX_LL_TEMAC_H
+#define __LINUX_XILINX_LL_TEMAC_H
+
+#include <linux/if_ether.h>
+#include <linux/phy.h>
+
+struct ll_temac_platform_data {
+	bool txcsum;		/* Enable/disable TX checksum */
+	bool rxcsum;		/* Enable/disable RX checksum */
+	u8 mac_addr[ETH_ALEN];	/* MAC address (6 bytes) */
+	/* Clock frequency for input to MDIO clock generator */
+	u32 mdio_clk_freq;
+	unsigned long long mdio_bus_id; /* Unique id for MDIO bus */
+	int phy_addr;		/* Address of the PHY to connect to */
+	phy_interface_t phy_interface; /* PHY interface mode */
+};
+
+#endif /* __LINUX_XILINX_LL_TEMAC_H */
-- 
cgit v1.2.3


From a3246dc41aa3c9d799478ccc8dac5d19c509a923 Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Tue, 30 Apr 2019 09:17:51 +0200
Subject: net: ll_temac: Add support for non-native register endianness

Replace the powerpc specific MMIO register access functions with the
generic big-endian mmio access functions, and add support for
little-endian access depending on configuration.

Big-endian access is maintained as the default, but little-endian can
be configured in device-tree binding or in platform data.

The temac_ior()/temac_iow() functions are replaced with macro wrappers
to avoid modifying existing code more than necessary.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/xilinx-ll-temac.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h
index 82e2f80648b0..af87927abab3 100644
--- a/include/linux/platform_data/xilinx-ll-temac.h
+++ b/include/linux/platform_data/xilinx-ll-temac.h
@@ -14,6 +14,8 @@ struct ll_temac_platform_data {
 	unsigned long long mdio_bus_id; /* Unique id for MDIO bus */
 	int phy_addr;		/* Address of the PHY to connect to */
 	phy_interface_t phy_interface; /* PHY interface mode */
+	bool reg_little_endian;	/* Little endian TEMAC register access  */
+	bool dma_little_endian;	/* Little endian DMA register access  */
 };
 
 #endif /* __LINUX_XILINX_LL_TEMAC_H */
-- 
cgit v1.2.3


From f14f5c11f051ca4a41e65017d94408e5e702ba9d Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Tue, 30 Apr 2019 09:17:54 +0200
Subject: net: ll_temac: Support indirect_mutex share within TEMAC IP

Indirect register access goes through a DCR bus bridge, which
allows only one outstanding transaction.  And to make matters
worse, each TEMAC IP block contains two Ethernet interfaces, and
although they seem to have separate registers for indirect access,
they actually share the registers.  Or to be more specific, MSW, LSW
and CTL registers are physically shared between Ethernet interfaces
in same TEMAC IP, with RDY register being (almost) specific to
the Ethernet interface.  The 0x10000 bit in RDY reflects combined
bus ready state though.

So we need to take care to synchronize not only within a single
device, but also between devices in same TEMAC IP.

This commit allows to do that with legacy platform devices.

For OF devices, the xlnx,compound parent of the temac node should be
used to find siblings, and setup a shared indirect_mutex between them.
I will leave this work to somebody else, as I don't have hardware to
test that.  No regression is introduced by that, as before this commit
using two Ethernet interfaces in same TEMAC block is simply broken.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/xilinx-ll-temac.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h
index af87927abab3..b0b8238a9b7d 100644
--- a/include/linux/platform_data/xilinx-ll-temac.h
+++ b/include/linux/platform_data/xilinx-ll-temac.h
@@ -16,6 +16,12 @@ struct ll_temac_platform_data {
 	phy_interface_t phy_interface; /* PHY interface mode */
 	bool reg_little_endian;	/* Little endian TEMAC register access  */
 	bool dma_little_endian;	/* Little endian DMA register access  */
+	/* Pre-initialized mutex to use for synchronizing indirect
+	 * register access.  When using both interfaces of a single
+	 * TEMAC IP block, the same mutex should be passed here, as
+	 * they share the same DCR bus bridge.
+	 */
+	struct mutex *indirect_mutex;
 };
 
 #endif /* __LINUX_XILINX_LL_TEMAC_H */
-- 
cgit v1.2.3


From 7e97a194aca03c6ff86f84e46e196f5c9ed5c32c Mon Sep 17 00:00:00 2001
From: Esben Haabendal <esben@geanix.com>
Date: Tue, 30 Apr 2019 09:17:58 +0200
Subject: net: ll_temac: Allow configuration of IRQ coalescing

This allows custom setup of IRQ coalescing for platforms using legacy
platform_device. The irq timeout and count parameters can be used for
tuning cpu load vs. latency.

I have maintained the 0x00000400 bit in TX_CHNL_CTRL.  It is specified as
unused in the documentation I have available.  It does not make any
difference in the hardware I have available, so it is left in to not risk
breaking other platforms where it might be used.

Signed-off-by: Esben Haabendal <esben@geanix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/xilinx-ll-temac.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h
index b0b8238a9b7d..368530f98176 100644
--- a/include/linux/platform_data/xilinx-ll-temac.h
+++ b/include/linux/platform_data/xilinx-ll-temac.h
@@ -22,6 +22,11 @@ struct ll_temac_platform_data {
 	 * they share the same DCR bus bridge.
 	 */
 	struct mutex *indirect_mutex;
+	/* DMA channel control setup */
+	u8 tx_irq_timeout;	/* TX Interrupt Delay Time-out */
+	u8 tx_irq_count;	/* TX Interrupt Coalescing Threshold Count */
+	u8 rx_irq_timeout;	/* RX Interrupt Delay Time-out */
+	u8 rx_irq_count;	/* RX Interrupt Coalescing Threshold Count */
 };
 
 #endif /* __LINUX_XILINX_LL_TEMAC_H */
-- 
cgit v1.2.3


From 91a40a48d52d13fbde3239d5839335cabd9a4eae Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Wed, 1 May 2019 03:21:05 +0000
Subject: net/mlx5: Fix broken hca cap offset

The cited commit broke the offsets of hca cap struct, fix it.
While at it, cleanup a white space introduced by the same commit.

Fixes: b169e64a2444 ("net/mlx5: Geneve, Add flow table capabilities for Geneve decap with TLV options")
Reported-by: Qian Cai <cai@lca.pw>
Cc: Yevgeny Kliteynik <kliteyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/mlx5_ifc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6a7fc18a9fe3..6b2e6b710ac0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1265,7 +1265,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         max_geneve_tlv_option_data_len[0x5];
 	u8         reserved_at_570[0x10];
 
-	u8         reserved_at_580[0x1c];
+	u8         reserved_at_580[0x3c];
 	u8         mini_cqe_resp_stride_index[0x1];
 	u8         cqe_128_always[0x1];
 	u8         cqe_compression_128[0x1];
@@ -9566,7 +9566,7 @@ struct mlx5_ifc_sw_icm_bits {
 	u8         sw_icm_start_addr[0x40];
 
 	u8         reserved_at_c0[0x140];
-}; 
+};
 
 struct mlx5_ifc_geneve_tlv_option_bits {
 	u8         modify_field_select[0x40];
-- 
cgit v1.2.3


From 0e1a2a3e6e7d37cea9f8586f6d7745b539147d9f Mon Sep 17 00:00:00 2001
From: Erez Alfasi <ereza@mellanox.com>
Date: Tue, 5 Mar 2019 15:42:23 +0200
Subject: ethtool: Add SFF-8436 and SFF-8636 max EEPROM length definitions

Added max EEPROM length defines for ethtool usage:
 #define ETH_MODULE_SFF_8636_MAX_LEN     640
 #define ETH_MODULE_SFF_8436_MAX_LEN     640

These definitions are used to determine the EEPROM
data length when reading high eeprom pages.

For example, SFF-8636 EEPROM data from page 03h
needs to be stored at data[512] - data[639].

Signed-off-by: Erez Alfasi <ereza@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/uapi/linux/ethtool.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 818ad368b586..3534ce157ae9 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1712,6 +1712,9 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define ETH_MODULE_SFF_8436		0x4
 #define ETH_MODULE_SFF_8436_LEN		256
 
+#define ETH_MODULE_SFF_8636_MAX_LEN     640
+#define ETH_MODULE_SFF_8436_MAX_LEN     640
+
 /* Reset flags */
 /* The reset() operation must clear the flags for the components which
  * were actually reset.  On successful return, the flags indicate the
-- 
cgit v1.2.3


From a708fb7b1f8dcc7a8ed949839958cd5d812dd939 Mon Sep 17 00:00:00 2001
From: Erez Alfasi <ereza@mellanox.com>
Date: Thu, 21 Mar 2019 15:02:13 +0200
Subject: net/mlx5e: ethtool, Add support for EEPROM high pages query

Add the support to read additional EEPROM information from high pages.
Information for modules such as SFF-8436 and SFF-8636:
 1) Application select table
 2) User writable EEPROM
 3) Thresholds and alarms

Signed-off-by: Erez Alfasi <ereza@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/port.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index 64e78394fc9c..de9a272c9f3d 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -60,6 +60,7 @@ enum mlx5_an_status {
 #define MLX5_I2C_ADDR_LOW		0x50
 #define MLX5_I2C_ADDR_HIGH		0x51
 #define MLX5_EEPROM_PAGE_LENGTH		256
+#define MLX5_EEPROM_HIGH_PAGE_LENGTH	128
 
 enum mlx5e_link_mode {
 	MLX5E_1000BASE_CX_SGMII	 = 0,
-- 
cgit v1.2.3


From c9bbfb378bc35fcd0b51e4a8950bd50447f39832 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Fri, 12 Apr 2019 16:14:03 -0500
Subject: net/mlx5: Remove unused mlx5_query_nic_vport_vlans

mlx5_query_nic_vport_vlans() is not used anymore. Hence remove it.
This patch doesn't change any functionality.

Signed-off-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/vport.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 0eef548b9946..3d1c6cdbbba7 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -118,10 +118,6 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev,
 				  int promisc_uc,
 				  int promisc_mc,
 				  int promisc_all);
-int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev,
-			       u16 vport,
-			       u16 vlans[],
-			       int *size);
 int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev,
 				u16 vlans[],
 				int list_size);
-- 
cgit v1.2.3


From 6f4e02193c9a9ea54dd3151cf97489fa787cd0e6 Mon Sep 17 00:00:00 2001
From: Bodong Wang <bodong@mellanox.com>
Date: Thu, 18 Apr 2019 18:24:15 -0500
Subject: net/mlx5: E-Switch, Use atomic rep state to serialize state change

When the state of rep was introduced, it was also designed to prevent
duplicate unloading of the same rep. Considering the following two
flows when an eswitch manager is at switchdev mode with n VF reps loaded.

+--------------------------------------+--------------------------------+
| cpu-0                                | cpu-1                          |
| --------                             | --------                       |
| mlx5_ib_remove                       | mlx5_eswitch_disable_sriov     |
|  mlx5_ib_unregister_vport_reps       |  esw_offloads_cleanup          |
|   mlx5_eswitch_unregister_vport_reps |   esw_offloads_unload_all_reps |
|    __unload_reps_all_vport           |    __unload_reps_all_vport     |
+--------------------------------------+--------------------------------+

These two flows will try to unload the same rep. Per original design,
once one flow unloads the rep, the state moves to REGISTERED. The 2nd
flow no longer needs to do the unload and bails out. However, as
read and write of the state is not atomic, when 1st flow is doing the
unload, the state is still LOADED, 2nd flow is able to do the same
unload action. Kernel crash will happen.

To solve this, driver should do atomic test-and-set for the state. So
that only one flow can change the rep state from LOADED to REGISTERED,
and proceed to do the actual unloading.

Since the state is changing to atomic type, all other read/write should
be atomic action as well.

Fixes: f121e0ea9586 (net/mlx5: E-Switch, Add state to eswitch vport representors)
Signed-off-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/mlx5/eswitch.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 96d8435421de..0ca77dd1429c 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -35,7 +35,7 @@ struct mlx5_eswitch_rep_if {
 	void		       (*unload)(struct mlx5_eswitch_rep *rep);
 	void		       *(*get_proto_dev)(struct mlx5_eswitch_rep *rep);
 	void			*priv;
-	u8			state;
+	atomic_t		state;
 };
 
 struct mlx5_eswitch_rep {
-- 
cgit v1.2.3


From 8b952747844526cef50fa2e0ae903f586e3cb2e4 Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@microchip.com>
Date: Fri, 3 May 2019 12:36:58 +0200
Subject: net: macb: shrink macb_platform_data structure

This structure was used intensively for machine specific values
when DT was not used. Since the removal of AVR32 from the kernel,
this structure is only used for passing clocks from PCI macb wrapper, all
other fields being 0.
All other known platforms use DT.

Remove the leftovers but make sure that PCI macb still works as
expected by using default values:
- phydev->irq is set to PHY_POLL by mdiobus_alloc()
- mii_bus->phy_mask is cleared while allocating it
- bp->phy_interface is set to PHY_INTERFACE_MODE_MII if mode not found
in DT.

This simplifies driver probe path and particularly phy handling.

Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/macb.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h
index 7815d50c26ff..2bc51b822956 100644
--- a/include/linux/platform_data/macb.h
+++ b/include/linux/platform_data/macb.h
@@ -12,19 +12,10 @@
 
 /**
  * struct macb_platform_data - platform data for MACB Ethernet
- * @phy_mask:		phy mask passed when register the MDIO bus
- *			within the driver
- * @phy_irq_pin:	PHY IRQ
- * @is_rmii:		using RMII interface?
- * @rev_eth_addr:	reverse Ethernet address byte order
  * @pclk:		platform clock
  * @hclk:		AHB clock
  */
 struct macb_platform_data {
-	u32		phy_mask;
-	int		phy_irq_pin;
-	u8		is_rmii;
-	u8		rev_eth_addr;
 	struct clk	*pclk;
 	struct clk	*hclk;
 };
-- 
cgit v1.2.3


From 554aae35007e49f533d3d10e788295f7141725bc Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Thu, 2 May 2019 23:23:29 +0300
Subject: lib: Add support for generic packing operations

This provides a unified API for accessing register bit fields
regardless of memory layout. The basic unit of data for these API
functions is the u64. The process of transforming an u64 from native CPU
encoding into the peripheral's encoding is called 'pack', and
transforming it from peripheral to native CPU encoding is 'unpack'.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/packing.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 include/linux/packing.h

(limited to 'include')

diff --git a/include/linux/packing.h b/include/linux/packing.h
new file mode 100644
index 000000000000..54667735cc67
--- /dev/null
+++ b/include/linux/packing.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2016-2018, NXP Semiconductors
+ * Copyright (c) 2018-2019, Vladimir Oltean <olteanv@gmail.com>
+ */
+#ifndef _LINUX_PACKING_H
+#define _LINUX_PACKING_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+
+#define QUIRK_MSB_ON_THE_RIGHT	BIT(0)
+#define QUIRK_LITTLE_ENDIAN	BIT(1)
+#define QUIRK_LSW32_IS_FIRST	BIT(2)
+
+enum packing_op {
+	PACK,
+	UNPACK,
+};
+
+/**
+ * packing - Convert numbers (currently u64) between a packed and an unpacked
+ *	     format. Unpacked means laid out in memory in the CPU's native
+ *	     understanding of integers, while packed means anything else that
+ *	     requires translation.
+ *
+ * @pbuf: Pointer to a buffer holding the packed value.
+ * @uval: Pointer to an u64 holding the unpacked value.
+ * @startbit: The index (in logical notation, compensated for quirks) where
+ *	      the packed value starts within pbuf. Must be larger than, or
+ *	      equal to, endbit.
+ * @endbit: The index (in logical notation, compensated for quirks) where
+ *	    the packed value ends within pbuf. Must be smaller than, or equal
+ *	    to, startbit.
+ * @op: If PACK, then uval will be treated as const pointer and copied (packed)
+ *	into pbuf, between startbit and endbit.
+ *	If UNPACK, then pbuf will be treated as const pointer and the logical
+ *	value between startbit and endbit will be copied (unpacked) to uval.
+ * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and
+ *	    QUIRK_MSB_ON_THE_RIGHT.
+ *
+ * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming
+ *	   correct usage, return code may be discarded.
+ *	   If op is PACK, pbuf is modified.
+ *	   If op is UNPACK, uval is modified.
+ */
+int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen,
+	    enum packing_op op, u8 quirks);
+
+#endif
-- 
cgit v1.2.3


From 8aa9ebccae87621d997707e4f25e53fddd7e30e4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Thu, 2 May 2019 23:23:30 +0300
Subject: net: dsa: Introduce driver for NXP SJA1105 5-port L2 switch

At this moment the following is supported:
* Link state management through phylib
* Autonomous L2 forwarding managed through iproute2 bridge commands.

IP termination must be done currently through the master netdevice,
since the switch is unmanaged at this point and using
DSA_TAG_PROTO_NONE.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Georg Waibel <georg.waibel@sensor-technik.de>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/linux/dsa/sja1105.h

(limited to 'include')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
new file mode 100644
index 000000000000..30559d1d0e1b
--- /dev/null
+++ b/include/linux/dsa/sja1105.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
+ */
+
+/* Included by drivers/net/dsa/sja1105/sja1105.h */
+
+#ifndef _NET_DSA_SJA1105_H
+#define _NET_DSA_SJA1105_H
+
+/* The switch can only be convinced to stay in unmanaged mode and not trap any
+ * link-local traffic by actually telling it to filter frames sent at the
+ * 00:00:00:00:00:00 destination MAC.
+ */
+#define SJA1105_LINKLOCAL_FILTER_A		0x000000000000ull
+#define SJA1105_LINKLOCAL_FILTER_A_MASK		0xFFFFFFFFFFFFull
+#define SJA1105_LINKLOCAL_FILTER_B		0x000000000000ull
+#define SJA1105_LINKLOCAL_FILTER_B_MASK		0xFFFFFFFFFFFFull
+
+#endif /* _NET_DSA_SJA1105_H */
-- 
cgit v1.2.3


From bf5bc3ce8a8f32a0d45b6820ede8f9fc3e9c23df Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Thu, 2 May 2019 23:23:33 +0300
Subject: ether: Add dedicated Ethertype for pseudo-802.1Q DSA tagging

There are two possible utilizations so far:

- Switch devices that don't support a native insertion/extraction header
  on the CPU port may still enjoy the benefits of port isolation with a
  custom VLAN tag.

  For this, they need to have a customizable TPID in hardware and a new
  Ethertype to distinguish between real 802.1Q traffic and the private
  tags used for port separation.

- Switches that don't support the deactivation of VLAN awareness, but
  still want to have a mode in which they accept all traffic, including
  frames that are tagged with a VLAN not configured on their ports, may
  use this as a fake to trick the hardware into thinking that the TPID
  for VLAN is something other than 0x8100.

What follows after the ETH_P_DSA_8021Q EtherType is a regular VLAN
header (TCI), however there is no other EtherType that can be used for
this purpose and doesn't already have a well-defined meaning.
ETH_P_8021AD, ETH_P_QINQ1, ETH_P_QINQ2 and ETH_P_QINQ3 expect that
another follow-up VLAN tag is present, which is not the case here.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Suggested-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 3a45b4ad71a3..3158ba672b72 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -109,6 +109,7 @@
 #define ETH_P_QINQ2	0x9200		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_QINQ3	0x9300		/* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_EDSA	0xDADA		/* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
+#define ETH_P_DSA_8021Q	0xDADB		/* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_IFE	0xED3E		/* ForCES inter-FE LFB type */
 #define ETH_P_AF_IUCV   0xFBFB		/* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
 
-- 
cgit v1.2.3


From 6666cebc5e306f49a25bd20aa8c1cb8ef8950df5 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Thu, 2 May 2019 23:23:34 +0300
Subject: net: dsa: sja1105: Add support for VLAN operations

VLAN filtering cannot be properly disabled in SJA1105. So in order to
emulate the "no VLAN awareness" behavior (not dropping traffic that is
tagged with a VID that isn't configured on the port), we need to hack
another switch feature: programmable TPID (which is 0x8100 for 802.1Q).
We are reprogramming the TPID to a bogus value which leaves the switch
thinking that all traffic is untagged, and therefore accepts it.

Under a vlan_filtering bridge, the proper TPID of ETH_P_8021Q is
installed again, and the switch starts identifying 802.1Q-tagged
traffic.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index 30559d1d0e1b..abf3977e34fd 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -7,6 +7,10 @@
 #ifndef _NET_DSA_SJA1105_H
 #define _NET_DSA_SJA1105_H
 
+#include <linux/etherdevice.h>
+
+#define ETH_P_SJA1105				ETH_P_DSA_8021Q
+
 /* The switch can only be convinced to stay in unmanaged mode and not trap any
  * link-local traffic by actually telling it to filter frames sent at the
  * 00:00:00:00:00:00 destination MAC.
-- 
cgit v1.2.3


From a27415decd84dac124c6185f1184b6c779d0a5ab Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Wed, 1 May 2019 00:10:50 +0200
Subject: net: dsa: mv88e6xxx: Pass interrupt number in platform data

Allow an interrupt number to be passed in the platform data. The
driver will then use it if not zero, otherwise it will poll for
interrupts.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/platform_data/mv88e6xxx.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/platform_data/mv88e6xxx.h b/include/linux/platform_data/mv88e6xxx.h
index 963730b44aea..21452a9365e1 100644
--- a/include/linux/platform_data/mv88e6xxx.h
+++ b/include/linux/platform_data/mv88e6xxx.h
@@ -13,6 +13,7 @@ struct dsa_mv88e6xxx_pdata {
 	unsigned int enabled_ports;
 	struct net_device *netdev;
 	u32 eeprom_len;
+	int irq;
 };
 
 #endif
-- 
cgit v1.2.3


From 141b6b2ad75d92770240de3af98d55c41ce7cd18 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Wed, 1 May 2019 19:56:59 -0700
Subject: net: add a generic tracepoint for TX queue timeout

Although devlink health report does a nice job on reporting TX
timeout and other NIC errors, unfortunately it requires drivers
to support it but currently only mlx5 has implemented it.
Before other drivers could catch up, it is useful to have a
generic tracepoint to monitor this kind of TX timeout. We have
been suffering TX timeout with different drivers, we plan to
start to monitor it with rasdaemon which just needs a new tracepoint.

Sample output:

  ksoftirqd/1-16    [001] ..s2   144.043173: net_dev_xmit_timeout: dev=ens3 driver=e1000 queue=0

Cc: Eran Ben Elisha <eranbe@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/net.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 1efd7d9b25fe..2399073c3afc 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -95,6 +95,29 @@ TRACE_EVENT(net_dev_xmit,
 		__get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
 );
 
+TRACE_EVENT(net_dev_xmit_timeout,
+
+	TP_PROTO(struct net_device *dev,
+		 int queue_index),
+
+	TP_ARGS(dev, queue_index),
+
+	TP_STRUCT__entry(
+		__string(	name,		dev->name	)
+		__string(	driver,		netdev_drivername(dev))
+		__field(	int,		queue_index	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, dev->name);
+		__assign_str(driver, netdev_drivername(dev));
+		__entry->queue_index = queue_index;
+	),
+
+	TP_printk("dev=%s driver=%s queue=%d",
+		__get_str(name), __get_str(driver), __entry->queue_index)
+);
+
 DECLARE_EVENT_CLASS(net_dev_template,
 
 	TP_PROTO(struct sk_buff *skb),
-- 
cgit v1.2.3


From 22c0ef6b1475aef4765efc4aa764b8580018123c Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 1 May 2019 21:34:43 +0200
Subject: net: phy: improve pause handling

When probing the phy device we set sym and asym pause in the "supported"
bitmap (unless the PHY tells us otherwise). However we don't know yet
whether the MAC supports pause. Simply copying phy->supported to
phy->advertising will trigger advertising pause, and that's not
what we want. Therefore add phy_advertise_supported() that copies all
modes but doesn't touch the pause bits.

In phy_support_(a)sym_pause we shouldn't set any bits in the supported
bitmap because we may set a bit the PHY intentionally disabled.
Effective pause support should be the AND-combined PHY and MAC pause
capabilities. If the MAC supports everything, then it's only relevant
what the PHY supports. If MAC supports sym pause only, then we have to
clear the asym bit in phydev->supported.
Copy the pause flags only and don't touch the modes, because a driver
may have intentionally removed a mode from phydev->advertising.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 0f9552b17ee7..4a03f8a46d33 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1154,6 +1154,7 @@ void phy_request_interrupt(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
+void phy_advertise_supported(struct phy_device *phydev);
 void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
 void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
-- 
cgit v1.2.3


From f24098f80748ea95d53603a7bb7954a41bb3ca1b Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Wed, 1 May 2019 22:14:21 +0200
Subject: net: phy: improve resuming from hibernation

I got an interesting report [0] that after resuming from hibernation
the link has 100Mbps instead of 1Gbps. The reason is that another OS had
been used whilst Linux was hibernated, and this OS slows down the link
due to WoL. Therefore, when resuming, we shouldn't expect that what
the PHY advertises is what it did when hibernating.
Easiest way to do this is removing state PHY_RESUMING. Instead always
go via PHY_UP that configures PHY advertisement.

[0] https://bugzilla.kernel.org/show_bug.cgi?id=202851

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4a03f8a46d33..073fb151b5a9 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -308,13 +308,7 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr);
  *
  * HALTED: PHY is up, but no polling or interrupts are done. Or
  * PHY is in an error state.
- *
- * - phy_start moves to RESUMING
- *
- * RESUMING: PHY was halted, but now wants to run again.
- * - If we are forcing, or aneg is done, timer moves to RUNNING
- * - If aneg is not done, timer moves to AN
- * - phy_stop moves to HALTED
+ * - phy_start moves to UP
  */
 enum phy_state {
 	PHY_DOWN = 0,
@@ -324,7 +318,6 @@ enum phy_state {
 	PHY_RUNNING,
 	PHY_NOLINK,
 	PHY_FORCING,
-	PHY_RESUMING
 };
 
 /**
-- 
cgit v1.2.3


From b424e432e770d6dd572765459d5b6a96a19c5286 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Thu, 2 May 2019 16:15:10 +0200
Subject: netlink: add validation of NLA_F_NESTED flag

Add new validation flag NL_VALIDATE_NESTED which adds three consistency
checks of the NLA_F_NESTED flag:

  - the flag is set on attributes with NLA_NESTED{,_ARRAY} policy
  - the flag is not set on attributes with other policies except NLA_UNSPEC
  - the flag is set on attribute passed to nla_parse_nested()

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>

v2: change error messages to mention NLA_F_NESTED explicitly
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 679f649748d4..395b4406f4b0 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -401,6 +401,8 @@ struct nl_info {
  *	are enforced going forward.
  * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
  *	U8, U16, U32 must have exact size, etc.)
+ * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
+ *	and unset for other policies.
  */
 enum netlink_validation {
 	NL_VALIDATE_LIBERAL = 0,
@@ -408,6 +410,7 @@ enum netlink_validation {
 	NL_VALIDATE_MAXTYPE = BIT(1),
 	NL_VALIDATE_UNSPEC = BIT(2),
 	NL_VALIDATE_STRICT_ATTRS = BIT(3),
+	NL_VALIDATE_NESTED = BIT(4),
 };
 
 #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
@@ -415,7 +418,8 @@ enum netlink_validation {
 #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
 			    NL_VALIDATE_MAXTYPE |\
 			    NL_VALIDATE_UNSPEC |\
-			    NL_VALIDATE_STRICT_ATTRS)
+			    NL_VALIDATE_STRICT_ATTRS |\
+			    NL_VALIDATE_NESTED)
 
 int netlink_rcv_skb(struct sk_buff *skb,
 		    int (*cb)(struct sk_buff *, struct nlmsghdr *,
@@ -1132,6 +1136,11 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
 				   const struct nla_policy *policy,
 				   struct netlink_ext_ack *extack)
 {
+	if (!(nla->nla_type & NLA_F_NESTED)) {
+		NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
+		return -EINVAL;
+	}
+
 	return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy,
 			   NL_VALIDATE_STRICT, extack);
 }
-- 
cgit v1.2.3


From 0f457a36626fa94026e483836fbf29e451434567 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:48 -0700
Subject: ipv4: Move cached routes to fib_nh_common

While the cached routes, nh_pcpu_rth_output and nh_rth_input, are IPv4
specific, a later patch wants to make them accessible for IPv6 nexthops
with IPv4 routes using a fib6_nh. Move the cached routes from fib_nh to
fib_nh_common and update references.

Initialization of the cached entries is moved to fib_nh_common_init,
and free is moved to fib_nh_common_release.

Change in location only, from fib_nh up to fib_nh_common; no functional
change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 772a9e61bd84..659c5081c40b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -96,6 +96,10 @@ struct fib_nh_common {
 
 	int			nhc_weight;
 	atomic_t		nhc_upper_bound;
+
+	/* v4 specific, but allows fib6_nh with v4 routes */
+	struct rtable __rcu * __percpu *nhc_pcpu_rth_output;
+	struct rtable __rcu     *nhc_rth_input;
 };
 
 struct fib_nh {
@@ -107,8 +111,6 @@ struct fib_nh {
 #endif
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
-	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
-	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
 #define fib_nh_family		nh_common.nhc_family
 #define fib_nh_dev		nh_common.nhc_dev
-- 
cgit v1.2.3


From a5995e7107eb3d9c44744d3cf47d49fabfef01f5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 30 Apr 2019 07:45:50 -0700
Subject: ipv4: Move exception bucket to nh_common

Similar to the cached routes, make IPv4 exceptions accessible when
using an IPv6 nexthop struct with IPv4 routes. Simplify the exception
functions by passing in fib_nh_common since that is all it needs,
and then cleanup the call sites that have extraneous fib_nh conversions.

As with the cached routes this is a change in location only, from fib_nh
up to fib_nh_common; no functional change intended.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 659c5081c40b..d0e28f4ab099 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -100,6 +100,7 @@ struct fib_nh_common {
 	/* v4 specific, but allows fib6_nh with v4 routes */
 	struct rtable __rcu * __percpu *nhc_pcpu_rth_output;
 	struct rtable __rcu     *nhc_rth_input;
+	struct fnhe_hash_bucket	__rcu *nhc_exceptions;
 };
 
 struct fib_nh {
@@ -111,7 +112,6 @@ struct fib_nh {
 #endif
 	__be32			nh_saddr;
 	int			nh_saddr_genid;
-	struct fnhe_hash_bucket	__rcu *nh_exceptions;
 #define fib_nh_family		nh_common.nhc_family
 #define fib_nh_dev		nh_common.nhc_dev
 #define fib_nh_oif		nh_common.nhc_oif
-- 
cgit v1.2.3


From f80c5dad7b6467b884c445ffea45985793b4b2d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Rechi=20Vita?= <jprvita@gmail.com>
Date: Thu, 2 May 2019 10:01:52 +0800
Subject: Bluetooth: Ignore CC events not matching the last HCI command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit makes the kernel not send the next queued HCI command until
a command complete arrives for the last HCI command sent to the
controller. This change avoids a problem with some buggy controllers
(seen on two SKUs of QCA9377) that send an extra command complete event
for the previous command after the kernel had already sent a new HCI
command to the controller.

The problem was reproduced when starting an active scanning procedure,
where an extra command complete event arrives for the LE_SET_RANDOM_ADDR
command. When this happens the kernel ends up not processing the
command complete for the following command, LE_SET_SCAN_PARAM, and
ultimately behaving as if a passive scanning procedure was being
performed, when in fact controller is performing an active scanning
procedure. This makes it impossible to discover BLE devices as no device
found events are sent to userspace.

This problem is reproducible on 100% of the attempts on the affected
controllers. The extra command complete event can be seen at timestamp
27.420131 in the btmon logs below.

Bluetooth monitor ver 5.50
= Note: Linux version 5.0.0+ (x86_64)                                  0.352340
= Note: Bluetooth subsystem version 2.22                               0.352343
= New Index: 80:C5:F2:8F:87:84 (Primary,USB,hci0)               [hci0] 0.352344
= Open Index: 80:C5:F2:8F:87:84                                 [hci0] 0.352345
= Index Info: 80:C5:F2:8F:87:84 (Qualcomm)                      [hci0] 0.352346
@ MGMT Open: bluetoothd (privileged) version 1.14             {0x0001} 0.352347
@ MGMT Open: btmon (privileged) version 1.14                  {0x0002} 0.352366
@ MGMT Open: btmgmt (privileged) version 1.14                {0x0003} 27.302164
@ MGMT Command: Start Discovery (0x0023) plen 1       {0x0003} [hci0] 27.302310
        Address type: 0x06
          LE Public
          LE Random
< HCI Command: LE Set Random Address (0x08|0x0005) plen 6   #1 [hci0] 27.302496
        Address: 15:60:F2:91:B2:24 (Non-Resolvable)
> HCI Event: Command Complete (0x0e) plen 4                 #2 [hci0] 27.419117
      LE Set Random Address (0x08|0x0005) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Scan Parameters (0x08|0x000b) plen 7  #3 [hci0] 27.419244
        Type: Active (0x01)
        Interval: 11.250 msec (0x0012)
        Window: 11.250 msec (0x0012)
        Own address type: Random (0x01)
        Filter policy: Accept all advertisement (0x00)
> HCI Event: Command Complete (0x0e) plen 4                 #4 [hci0] 27.420131
      LE Set Random Address (0x08|0x0005) ncmd 1
        Status: Success (0x00)
< HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2      #5 [hci0] 27.420259
        Scanning: Enabled (0x01)
        Filter duplicates: Enabled (0x01)
> HCI Event: Command Complete (0x0e) plen 4                 #6 [hci0] 27.420969
      LE Set Scan Parameters (0x08|0x000b) ncmd 1
        Status: Success (0x00)
> HCI Event: Command Complete (0x0e) plen 4                 #7 [hci0] 27.421983
      LE Set Scan Enable (0x08|0x000c) ncmd 1
        Status: Success (0x00)
@ MGMT Event: Command Complete (0x0001) plen 4        {0x0003} [hci0] 27.422059
      Start Discovery (0x0023) plen 1
        Status: Success (0x00)
        Address type: 0x06
          LE Public
          LE Random
@ MGMT Event: Discovering (0x0013) plen 2             {0x0003} [hci0] 27.422067
        Address type: 0x06
          LE Public
          LE Random
        Discovery: Enabled (0x01)
@ MGMT Event: Discovering (0x0013) plen 2             {0x0002} [hci0] 27.422067
        Address type: 0x06
          LE Public
          LE Random
        Discovery: Enabled (0x01)
@ MGMT Event: Discovering (0x0013) plen 2             {0x0001} [hci0] 27.422067
        Address type: 0x06
          LE Public
          LE Random
        Discovery: Enabled (0x01)

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/hci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index fbba43e9bef5..9a5330eed794 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -282,6 +282,7 @@ enum {
 	HCI_FORCE_BREDR_SMP,
 	HCI_FORCE_STATIC_ADDR,
 	HCI_LL_RPA_RESOLUTION,
+	HCI_CMD_PENDING,
 
 	__HCI_NUM_FLAGS,
 };
-- 
cgit v1.2.3


From 47d3d7fdb10a21c223036b58bd70ffdc24a472c4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 3 May 2019 08:24:44 -0700
Subject: ip6: fix skb leak in ip6frag_expire_frag_queue()

Since ip6frag_expire_frag_queue() now pulls the head skb
from frag queue, we should no longer use skb_get(), since
this leads to an skb leak.

Stefan Bader initially reported a problem in 4.4.stable [1] caused
by the skb_get(), so this patch should also fix this issue.

[296583.091021] kernel BUG at /build/linux-6VmqmP/linux-4.4.0/net/core/skbuff.c:1207!
[296583.091734] Call Trace:
[296583.091749]  [<ffffffff81740e50>] __pskb_pull_tail+0x50/0x350
[296583.091764]  [<ffffffff8183939a>] _decode_session6+0x26a/0x400
[296583.091779]  [<ffffffff817ec719>] __xfrm_decode_session+0x39/0x50
[296583.091795]  [<ffffffff818239d0>] icmpv6_route_lookup+0xf0/0x1c0
[296583.091809]  [<ffffffff81824421>] icmp6_send+0x5e1/0x940
[296583.091823]  [<ffffffff81753238>] ? __netif_receive_skb+0x18/0x60
[296583.091838]  [<ffffffff817532b2>] ? netif_receive_skb_internal+0x32/0xa0
[296583.091858]  [<ffffffffc0199f74>] ? ixgbe_clean_rx_irq+0x594/0xac0 [ixgbe]
[296583.091876]  [<ffffffffc04eb260>] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6]
[296583.091893]  [<ffffffff8183d431>] icmpv6_send+0x21/0x30
[296583.091906]  [<ffffffff8182b500>] ip6_expire_frag_queue+0xe0/0x120
[296583.091921]  [<ffffffffc04eb27f>] nf_ct_frag6_expire+0x1f/0x30 [nf_defrag_ipv6]
[296583.091938]  [<ffffffff810f3b57>] call_timer_fn+0x37/0x140
[296583.091951]  [<ffffffffc04eb260>] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6]
[296583.091968]  [<ffffffff810f5464>] run_timer_softirq+0x234/0x330
[296583.091982]  [<ffffffff8108a339>] __do_softirq+0x109/0x2b0

Fixes: d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6 defrag")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bader <stefan.bader@canonical.com>
Cc: Peter Oskolkov <posk@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6_frag.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 28aa9b30aece..1f77fb4dc79d 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -94,7 +94,6 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 		goto out;
 
 	head->dev = dev;
-	skb_get(head);
 	spin_unlock(&fq->q.lock);
 
 	icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
-- 
cgit v1.2.3


From 9b3040a6aafd7898ece7fc7efcbca71e42aa8069 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 5 May 2019 11:16:20 -0700
Subject: ipv4: Define __ipv4_neigh_lookup_noref when CONFIG_INET is disabled

Define __ipv4_neigh_lookup_noref to return NULL when CONFIG_INET is disabled.

Fixes: 4b2a2bfeb3f0 ("neighbor: Call __ipv4_neigh_lookup_noref in neigh_xmit")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/arp.h b/include/net/arp.h
index 977aabfcdc03..c8f580a0e6b1 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -18,6 +18,7 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32
 	return val * hash_rnd[0];
 }
 
+#ifdef CONFIG_INET
 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
 {
 	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
@@ -25,6 +26,13 @@ static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev
 
 	return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
 }
+#else
+static inline
+struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
+{
+	return NULL;
+}
+#endif
 
 static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key)
 {
-- 
cgit v1.2.3


From 522e4077e8dcdfc5b8e96469d3bc2324bc5d6466 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Sun, 28 Apr 2019 15:12:19 +0800
Subject: netfilter: slightly optimize nf_inet_addr_mask

Use 64-bit computation to slightly optimize nf_inet_addr_mask().

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index a7252f3baeb0..996bc247ef6e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -41,10 +41,19 @@ static inline void nf_inet_addr_mask(const union nf_inet_addr *a1,
 				     union nf_inet_addr *result,
 				     const union nf_inet_addr *mask)
 {
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	const unsigned long *ua = (const unsigned long *)a1;
+	unsigned long *ur = (unsigned long *)result;
+	const unsigned long *um = (const unsigned long *)mask;
+
+	ur[0] = ua[0] & um[0];
+	ur[1] = ua[1] & um[1];
+#else
 	result->all[0] = a1->all[0] & mask->all[0];
 	result->all[1] = a1->all[1] & mask->all[1];
 	result->all[2] = a1->all[2] & mask->all[2];
 	result->all[3] = a1->all[3] & mask->all[3];
+#endif
 }
 
 int netfilter_init(void);
-- 
cgit v1.2.3


From a7a7be6087b07563490725f61f4dbf4826f099e2 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:16 -0700
Subject: net/sched: add sample action to the hardware intermediate
 representation

Add sample action to the hardware intermediate representation model which
would subsequently allow it to be used by drivers for offload.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index d035183c8d03..9a6c89b2c2bb 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -118,6 +118,7 @@ enum flow_action_id {
 	FLOW_ACTION_MARK,
 	FLOW_ACTION_WAKE,
 	FLOW_ACTION_QUEUE,
+	FLOW_ACTION_SAMPLE,
 };
 
 /* This is mirroring enum pedit_header_type definition for easy mapping between
@@ -157,6 +158,12 @@ struct flow_action_entry {
 			u32		index;
 			u8		vf;
 		} queue;
+		struct {				/* FLOW_ACTION_SAMPLE */
+			struct psample_group	*psample_group;
+			u32			rate;
+			u32			trunc_size;
+			bool			truncate;
+		} sample;
 	};
 };
 
-- 
cgit v1.2.3


From f00cbf1968145afbae385a867a66c69845e30711 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:17 -0700
Subject: net/sched: use the hardware intermediate representation for matchall

Extends matchall offload to make use of the hardware intermediate
representation. More specifically, this patch moves the native TC
actions in cls_matchall offload to the newer flow_action
representation. This ultimately allows us to avoid a direct
dependency on native TC actions for matchall.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index d5e7a1af346f..c852ed502cc6 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -789,6 +789,7 @@ enum tc_matchall_command {
 struct tc_cls_matchall_offload {
 	struct tc_cls_common_offload common;
 	enum tc_matchall_command command;
+	struct flow_rule *rule;
 	struct tcf_exts *exts;
 	unsigned long cookie;
 };
-- 
cgit v1.2.3


From ab79af32b0a5606324ce04c0f04a0d2f90b94464 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:18 -0700
Subject: mlxsw: use intermediate representation for matchall offload

Updates the Mellanox spectrum driver to use the newer intermediate
representation for flow actions in matchall offloads.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 9a6c89b2c2bb..3bf67dd64be5 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -177,6 +177,17 @@ static inline bool flow_action_has_entries(const struct flow_action *action)
 	return action->num_entries;
 }
 
+/**
+ * flow_offload_has_one_action() - check if exactly one action is present
+ * @action: tc filter flow offload action
+ *
+ * Return: true when @action->num_entries is exactly 1, false otherwise.
+ */
+static inline bool flow_offload_has_one_action(const struct flow_action *action)
+{
+	return action->num_entries == 1;
+}
+
 #define flow_action_for_each(__i, __act, __actions)			\
         for (__i = 0, __act = &(__actions)->entries[0]; __i < (__actions)->num_entries; __act = &(__actions)->entries[++__i])
 
-- 
cgit v1.2.3


From dfcb19f0fae3d07f9c56f6efe2c9bbebef6826c9 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:20 -0700
Subject: net/sched: remove unused functions for matchall offload

Cleanup unused functions and variables after porting to the newer
intermediate representation.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c852ed502cc6..2d0470661277 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -371,30 +371,6 @@ static inline bool tcf_exts_has_actions(struct tcf_exts *exts)
 #endif
 }
 
-/**
- * tcf_exts_has_one_action - check if exactly one action is present
- * @exts: tc filter extensions handle
- *
- * Returns true if exactly one action is present.
- */
-static inline bool tcf_exts_has_one_action(struct tcf_exts *exts)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	return exts->nr_actions == 1;
-#else
-	return false;
-#endif
-}
-
-static inline struct tc_action *tcf_exts_first_action(struct tcf_exts *exts)
-{
-#ifdef CONFIG_NET_CLS_ACT
-	return exts->actions[0];
-#else
-	return NULL;
-#endif
-}
-
 /**
  * tcf_exts_exec - execute tc filter extensions
  * @skb: socket buffer
@@ -790,7 +766,6 @@ struct tc_cls_matchall_offload {
 	struct tc_cls_common_offload common;
 	enum tc_matchall_command command;
 	struct flow_rule *rule;
-	struct tcf_exts *exts;
 	unsigned long cookie;
 };
 
-- 
cgit v1.2.3


From fa762da94d9860f584c909621d1f8ccbe24c5d5e Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:21 -0700
Subject: net/sched: move police action structures to header

Move tcf_police_params, tcf_police and tc_police_compat structures to a
header, making them usable by other code, for example drivers that would
offload police actions to hardware.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_police.h | 70 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 include/net/tc_act/tc_police.h

(limited to 'include')

diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
new file mode 100644
index 000000000000..8b9ef3664262
--- /dev/null
+++ b/include/net/tc_act/tc_police.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_TC_POLICE_H
+#define __NET_TC_POLICE_H
+
+#include <net/act_api.h>
+
+struct tcf_police_params {
+	int			tcfp_result;
+	u32			tcfp_ewma_rate;
+	s64			tcfp_burst;
+	u32			tcfp_mtu;
+	s64			tcfp_mtu_ptoks;
+	struct psched_ratecfg	rate;
+	bool			rate_present;
+	struct psched_ratecfg	peak;
+	bool			peak_present;
+	struct rcu_head rcu;
+};
+
+struct tcf_police {
+	struct tc_action	common;
+	struct tcf_police_params __rcu *params;
+
+	spinlock_t		tcfp_lock ____cacheline_aligned_in_smp;
+	s64			tcfp_toks;
+	s64			tcfp_ptoks;
+	s64			tcfp_t_c;
+};
+
+#define to_police(pc) ((struct tcf_police *)pc)
+
+/* old policer structure from before tc actions */
+struct tc_police_compat {
+	u32			index;
+	int			action;
+	u32			limit;
+	u32			burst;
+	u32			mtu;
+	struct tc_ratespec	rate;
+	struct tc_ratespec	peakrate;
+};
+
+static inline bool is_tcf_police(const struct tc_action *act)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (act->ops && act->ops->id == TCA_ID_POLICE)
+		return true;
+#endif
+	return false;
+}
+
+static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_bh(police->params);
+	return params->rate.rate_bytes_ps;
+}
+
+static inline s64 tcf_police_tcfp_burst(const struct tc_action *act)
+{
+	struct tcf_police *police = to_police(act);
+	struct tcf_police_params *params;
+
+	params = rcu_dereference_bh(police->params);
+	return params->tcfp_burst;
+}
+
+#endif /* __NET_TC_POLICE_H */
-- 
cgit v1.2.3


From 8c8cfc6ed274e6fb86f00b53f3e7811afce29043 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:22 -0700
Subject: net/sched: add police action to the hardware intermediate
 representation

Add police action to the hardware intermediate representation which
would subsequently allow it to be used by drivers for offload.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 3bf67dd64be5..6200900434e1 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -119,6 +119,7 @@ enum flow_action_id {
 	FLOW_ACTION_WAKE,
 	FLOW_ACTION_QUEUE,
 	FLOW_ACTION_SAMPLE,
+	FLOW_ACTION_POLICE,
 };
 
 /* This is mirroring enum pedit_header_type definition for easy mapping between
@@ -164,6 +165,10 @@ struct flow_action_entry {
 			u32			trunc_size;
 			bool			truncate;
 		} sample;
+		struct {				/* FLOW_ACTION_POLICE */
+			s64			burst;
+			u64			rate_bytes_ps;
+		} police;
 	};
 };
 
-- 
cgit v1.2.3


From b7fe4ab8a6013c3c721bed91f73e76eec8fb5d89 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:23 -0700
Subject: net/sched: extend matchall offload for hardware statistics

Introduce a new command for matchall classifiers that allows hardware
to update statistics.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 2d0470661277..161fcf8516ac 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -760,12 +760,14 @@ tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd)
 enum tc_matchall_command {
 	TC_CLSMATCHALL_REPLACE,
 	TC_CLSMATCHALL_DESTROY,
+	TC_CLSMATCHALL_STATS,
 };
 
 struct tc_cls_matchall_offload {
 	struct tc_cls_common_offload common;
 	enum tc_matchall_command command;
 	struct flow_rule *rule;
+	struct flow_stats stats;
 	unsigned long cookie;
 };
 
-- 
cgit v1.2.3


From 88c44a5200849c8182eaf36535b4ceae6b90b19d Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Sat, 4 May 2019 04:46:25 -0700
Subject: net/sched: add block pointer to tc_cls_common_offload structure

Some actions like the police action are stateful and could share state
between devices. This is incompatible with offloading to multiple devices
and drivers might want to test for shared blocks when offloading.
Store a pointer to the tcf_block structure in the tc_cls_common_offload
structure to allow drivers to determine when offloads apply to a shared
block.

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 161fcf8516ac..eed98f8fcb5e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -100,6 +100,11 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		 struct tcf_result *res, bool compat_mode);
 
 #else
+static inline bool tcf_block_shared(struct tcf_block *block)
+{
+	return false;
+}
+
 static inline
 int tcf_block_get(struct tcf_block **p_block,
 		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
@@ -624,6 +629,7 @@ struct tc_cls_common_offload {
 	u32 chain_index;
 	__be16 protocol;
 	u32 prio;
+	struct tcf_block *block;
 	struct netlink_ext_ack *extack;
 };
 
@@ -725,11 +731,13 @@ static inline bool tc_in_hw(u32 flags)
 static inline void
 tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 			   const struct tcf_proto *tp, u32 flags,
+			   struct tcf_block *block,
 			   struct netlink_ext_ack *extack)
 {
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
+	cls_common->block = block;
 	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
 		cls_common->extack = extack;
 }
-- 
cgit v1.2.3


From f9bbe4477c30ece44296437ee26142b42ef8070b Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:22 +0300
Subject: net: dsa: Optional VLAN-based port separation for switches without
 tagging

This patch provides generic DSA code for using VLAN (802.1Q) tags for
the same purpose as a dedicated switch tag for injection/extraction.
It is based on the discussions and interest that has been so far
expressed in https://www.spinics.net/lists/netdev/msg556125.html.

Unlike all other DSA-supported tagging protocols, CONFIG_NET_DSA_TAG_8021Q
does not offer a complete solution for drivers (nor can it). Instead, it
provides generic code that drivers can opt into calling:
- dsa_8021q_xmit: Inserts a VLAN header with the specified contents.
  Can be called from another tagging protocol's xmit function.
  Currently the LAN9303 driver is inserting headers that are simply
  802.1Q with custom fields, so this is an opportunity for code reuse.
- dsa_8021q_rcv: Retrieves the TPID and TCI from a VLAN-tagged skb.
  Removing the VLAN header is left as a decision for the caller to make.
- dsa_port_setup_8021q_tagging: For each user port, installs an Rx VID
  and a Tx VID, for proper untagged traffic identification on ingress
  and steering on egress. Also sets up the VLAN trunk on the upstream
  (CPU or DSA) port. Drivers are intentionally left to call this
  function explicitly, depending on the context and hardware support.
  The expected switch behavior and VLAN semantics should not be violated
  under any conditions. That is, after calling
  dsa_port_setup_8021q_tagging, the hardware should still pass all
  ingress traffic, be it tagged or untagged.

For uniformity with the other tagging protocols, a module for the
dsa_8021q_netdev_ops structure is registered, but the typical usage is
to set up another tagging protocol which selects CONFIG_NET_DSA_TAG_8021Q,
and calls the API from tag_8021q.h. Null function definitions are also
provided so that a "depends on" is not forced in the Kconfig.

This tagging protocol only works when switch ports are standalone, or
when they are added to a VLAN-unaware bridge. It will probably remain
this way for the reasons below.

When added to a bridge that has vlan_filtering 1, the bridge core will
install its own VLANs and reset the pvids through switchdev. For the
bridge core, switchdev is a write-only pipe. All VLAN-related state is
kept in the bridge core and nothing is read from DSA/switchdev or from
the driver. So the bridge core will break this port separation because
it will install the vlan_default_pvid into all switchdev ports.

Even if we could teach the bridge driver about switchdev preference of a
certain vlan_default_pvid (task difficult in itself since the current
setting is per-bridge but we would need it per-port), there would still
exist many other challenges.

Firstly, in the DSA rcv callback, a driver would have to perform an
iterative reverse lookup to find the correct switch port. That is
because the port is a bridge slave, so its Rx VID (port PVID) is subject
to user configuration. How would we ensure that the user doesn't reset
the pvid to a different value (which would make an O(1) translation
impossible), or to a non-unique value within this DSA switch tree (which
would make any translation impossible)?

Finally, not all switch ports are equal in DSA, and that makes it
difficult for the bridge to be completely aware of this anyway.
The CPU port needs to transmit tagged packets (VLAN trunk) in order for
the DSA rcv code to be able to decode source information.
But the bridge code has absolutely no idea which switch port is the CPU
port, if nothing else then just because there is no netdevice registered
by DSA for the CPU port.
Also DSA does not currently allow the user to specify that they want the
CPU port to do VLAN trunking anyway. VLANs are added to the CPU port
using the same flags as they were added on the user port.

So the VLANs installed by dsa_port_setup_8021q_tagging per driver
request should remain private from the bridge's and user's perspective,
and should not alter the VLAN semantics observed by the user.

In the current implementation a VLAN range ending at 4095 (VLAN_N_VID)
is reserved for this purpose. Each port receives a unique Rx VLAN and a
unique Tx VLAN. Separate VLANs are needed for Rx and Tx because they
serve different purposes: on Rx the switch must process traffic as
untagged and process it with a port-based VLAN, but with care not to
hinder bridging. On the other hand, the Tx VLAN is where the
reachability restrictions are imposed, since by tagging frames in the
xmit callback we are telling the switch onto which port to steer the
frame.

Some general guidance on how this support might be employed for
real-life hardware (some comments made by Florian Fainelli):

- If the hardware supports VLAN tag stacking, it should somehow back
  up its private VLAN settings when the bridge tries to override them.
  Then the driver could re-apply them as outer tags. Dedicating an outer
  tag per bridge device would allow identical inner tag VID numbers to
  co-exist, yet preserve broadcast domain isolation.

- If the switch cannot handle VLAN tag stacking, it should disable this
  port separation when added as slave to a vlan_filtering bridge, in
  that case having reduced functionality.

- Drivers for old switches that don't support the entire VLAN_N_VID
  range will need to rework the current range selection mechanism.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/8021q.h | 76 +++++++++++++++++++++++++++++++++++++++++++++++
 include/net/dsa.h         |  2 ++
 2 files changed, 78 insertions(+)
 create mode 100644 include/linux/dsa/8021q.h

(limited to 'include')

diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h
new file mode 100644
index 000000000000..3911e0586478
--- /dev/null
+++ b/include/linux/dsa/8021q.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
+ */
+
+#ifndef _NET_DSA_8021Q_H
+#define _NET_DSA_8021Q_H
+
+#include <linux/types.h>
+
+struct dsa_switch;
+struct sk_buff;
+struct net_device;
+struct packet_type;
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q)
+
+int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index,
+				 bool enabled);
+
+struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
+			       u16 tpid, u16 tci);
+
+struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev,
+			      struct packet_type *pt, u16 *tpid, u16 *tci);
+
+u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port);
+
+u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port);
+
+int dsa_8021q_rx_switch_id(u16 vid);
+
+int dsa_8021q_rx_source_port(u16 vid);
+
+#else /* tagger disabled: no-op stubs, safe to include from any file */
+
+static inline int dsa_port_setup_8021q_tagging(struct dsa_switch *ds,
+					       int index, bool enabled)
+{
+	return 0;
+}
+
+static inline struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb,
+	struct net_device *netdev, u16 tpid, u16 tci)
+{
+	return NULL;
+}
+
+static inline struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb,
+	struct net_device *netdev, struct packet_type *pt, u16 *tpid, u16 *tci)
+{
+	return NULL;
+}
+
+static inline u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port)
+{
+	return 0;
+}
+
+static inline u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port)
+{
+	return 0;
+}
+
+static inline int dsa_8021q_rx_switch_id(u16 vid)
+{
+	return 0;
+}
+
+static inline int dsa_8021q_rx_source_port(u16 vid)
+{
+	return 0;
+}
+
+#endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */
+
+#endif /* _NET_DSA_8021Q_H */
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 18db7b8e7a8e..69f3714f42ba 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -42,6 +42,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_MTK_VALUE			9
 #define DSA_TAG_PROTO_QCA_VALUE			10
 #define DSA_TAG_PROTO_TRAILER_VALUE		11
+#define DSA_TAG_PROTO_8021Q_VALUE		12
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -56,6 +57,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_MTK		= DSA_TAG_PROTO_MTK_VALUE,
 	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
+	DSA_TAG_PROTO_8021Q		= DSA_TAG_PROTO_8021Q_VALUE,
 };
 
 struct packet_type;
-- 
cgit v1.2.3


From cc1939e4b3aaf534fb2f3706820012036825731c Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:23 +0300
Subject: net: dsa: Allow drivers to filter packets they can decode source port
 from

Frames get processed by DSA and redirected to switch port net devices
based on the ETH_P_XDSA multiplexed packet_type handler found by the
network stack when calling eth_type_trans().

The running assumption is that once the DSA .rcv function is called, DSA
is always able to decode the switch tag in order to change the skb->dev
from its master.

However there are tagging protocols (such as the new DSA_TAG_PROTO_SJA1105,
user of DSA_TAG_PROTO_8021Q) where this assumption is not completely
true, since switch tagging piggybacks on the absence of a vlan_filtering
bridge. Moreover, management traffic (BPDU, PTP) for this switch doesn't
rely on switch tagging, but on a different mechanism. So it would make
sense to at least be able to terminate that.

Having DSA receive traffic it can't decode would put it in an impossible
situation: the eth_type_trans() function would invoke the DSA .rcv(),
which could not change skb->dev, then eth_type_trans() would be invoked
again, which again would call the DSA .rcv, and the packet would never
be able to exit the DSA filter and would spiral in a loop until the
whole system dies.

This happens because eth_type_trans() doesn't actually look at the skb
(so as to identify a potential tag) when it deems it as being
ETH_P_XDSA. It just checks whether skb->dev has a DSA private pointer
installed (therefore it's a DSA master) and that there exists a .rcv
callback (everybody except DSA_TAG_PROTO_NONE has that). This is
understandable as there are many switch tags out there, and exhaustively
checking for all of them is far from ideal.

The solution lies in introducing a filtering function for each tagging
protocol. In the absence of a filtering function, all traffic is passed
to the .rcv DSA callback. The tagging protocol should see the filtering
function as a pre-validation that it can decode the incoming skb. The
traffic that doesn't match the filter will bypass the DSA .rcv callback
and be left on the master netdevice, which wasn't previously possible.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 69f3714f42ba..c90ceeec7d1f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -69,6 +69,11 @@ struct dsa_device_ops {
 			       struct packet_type *pt);
 	int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			    int *offset);
+	/* Used to determine which traffic should match the DSA filter in
+	 * eth_type_trans, and which, if any, should bypass it and be processed
+	 * as regular on the master net device.
+	 */
+	bool (*filter)(const struct sk_buff *skb, struct net_device *dev);
 	unsigned int overhead;
 	const char *name;
 	enum dsa_tag_protocol proto;
@@ -148,6 +153,7 @@ struct dsa_port {
 	struct dsa_switch_tree *dst;
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev,
 			       struct packet_type *pt);
+	bool (*filter)(const struct sk_buff *skb, struct net_device *dev);
 
 	enum {
 		DSA_PORT_TYPE_UNUSED = 0,
@@ -520,6 +526,15 @@ static inline bool netdev_uses_dsa(struct net_device *dev)
 	return false;
 }
 
+static inline bool dsa_can_decode(const struct sk_buff *skb,
+				  struct net_device *dev)
+{
+#if IS_ENABLED(CONFIG_NET_DSA)
+	return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev);
+#endif
+	return false;
+}
+
 struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n);
 void dsa_unregister_switch(struct dsa_switch *ds);
 int dsa_register_switch(struct dsa_switch *ds);
-- 
cgit v1.2.3


From b68b0dd0fb2d91056d5241a19960cf47d4a80f05 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:24 +0300
Subject: net: dsa: Keep private info in the skb->cb

Map a DSA structure over the 48-byte control block that will hold
skb info on transmit and receive. This is only for use within the DSA
processing layer (e.g. communicating between DSA core and tagger) and
not for passing info around with other layers such as the master net
device.

Also add a DSA_SKB_CB_PRIV() macro which retrieves a pointer to the
space up to 48 bytes that the DSA structure does not use. This space can
be used for drivers to add their own private info.

One use is for the PTP timestamping code path. When cloning a skb,
annotate the original with a pointer to the clone, which the driver can
then find easily and place the timestamp to. This avoids the need of a
separate queue to hold clones and a way to match an original to a cloned
skb.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index c90ceeec7d1f..d628587e0bde 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -83,6 +83,37 @@ struct dsa_device_ops {
 #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto)				\
 	MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE))
 
+struct dsa_skb_cb {
+	struct sk_buff *clone;
+};
+
+struct __dsa_skb_cb {
+	struct dsa_skb_cb cb;
+	u8 priv[48 - sizeof(struct dsa_skb_cb)];
+};
+
+#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb))
+
+#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb))
+
+#define DSA_SKB_CB_COPY(nskb, skb)		\
+	do { *__DSA_SKB_CB(nskb) = *__DSA_SKB_CB(skb); } while (0)
+
+#define DSA_SKB_CB_ZERO(skb)			\
+	do { *__DSA_SKB_CB(skb) = (struct __dsa_skb_cb) {0}; } while (0)
+
+#define DSA_SKB_CB_PRIV(skb)			\
+	((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv))
+
+#define DSA_SKB_CB_CLONE(_clone, _skb)				\
+	do {	/* locals: evaluate each argument only once */	\
+		struct sk_buff *__clone = (_clone);		\
+		struct sk_buff *__skb = (_skb);			\
+								\
+		DSA_SKB_CB_COPY(__clone, __skb);		\
+		DSA_SKB_CB(__skb)->clone = __clone;		\
+	} while (0)
+
 struct dsa_switch_tree {
 	struct list_head	list;
 
-- 
cgit v1.2.3


From 97a69a0dea9a048c6769249f1552de5f56731524 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:25 +0300
Subject: net: dsa: Add support for deferred xmit

Some hardware needs to take work to get convinced to receive frames on
the CPU port (such as the sja1105 which takes temporary L2 forwarding
rules over SPI that last for a single frame). Such work needs a
sleepable context, and because the regular .ndo_start_xmit is atomic,
this cannot be done in the tagger. So introduce a generic DSA mechanism
that sets up a transmit skb queue and a workqueue for deferred
transmission.

The new driver callback (.port_deferred_xmit) is in dsa_switch and not
in the tagger because the operations that require sleeping typically
also involve interacting with the hardware, and not simply skb
manipulations. Therefore having it there simplifies the structure a bit
and makes it unnecessary to export functions from the driver to the
tagger.

The driver is responsible for calling dsa_enqueue_skb which transfers it
to the master netdevice. This is so that it has a chance of performing
some more work afterwards, such as cleanup or TX timestamping.

To tell DSA that skb xmit deferral is required, I have thought about
changing the return type of the tagger .xmit from struct sk_buff * into
an enum dsa_tx_t that could potentially encode a DSA_XMIT_DEFER value.

But the trailer tagger is reallocating every skb on xmit and therefore
making a valid use of the pointer return value. So instead of reworking
the API in complicated ways, right now a boolean property in the newly
introduced DSA_SKB_CB is set.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d628587e0bde..0260b73938e2 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -85,6 +85,7 @@ struct dsa_device_ops {
 
 struct dsa_skb_cb {
 	struct sk_buff *clone;
+	bool deferred_xmit;
 };
 
 struct __dsa_skb_cb {
@@ -205,6 +206,10 @@ struct dsa_port {
 	struct net_device	*bridge_dev;
 	struct devlink_port	devlink_port;
 	struct phylink		*pl;
+
+	struct work_struct	xmit_work;
+	struct sk_buff_head	xmit_queue;
+
 	/*
 	 * Original copy of the master netdev ethtool_ops
 	 */
@@ -539,6 +544,12 @@ struct dsa_switch_ops {
 				 struct sk_buff *clone, unsigned int type);
 	bool	(*port_rxtstamp)(struct dsa_switch *ds, int port,
 				 struct sk_buff *skb, unsigned int type);
+
+	/*
+	 * Deferred frame Tx
+	 */
+	netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port,
+					  struct sk_buff *skb);
 };
 
 struct dsa_switch_driver {
@@ -634,6 +645,7 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev,
 #define BRCM_TAG_GET_QUEUE(v)		((v) & 0xff)
 
 
+netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev);
 int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
 int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
 int dsa_port_get_phy_sset_count(struct dsa_port *dp);
-- 
cgit v1.2.3


From c362beb072e14b929eb657dc174d83ccdd9b0eed Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:26 +0300
Subject: net: dsa: Add a private structure pointer to dsa_port

This is supposed to share information between the driver and the tagger,
or used by the tagger to keep some state. Its use is optional.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0260b73938e2..e20be1ceb306 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -210,6 +210,12 @@ struct dsa_port {
 	struct work_struct	xmit_work;
 	struct sk_buff_head	xmit_queue;
 
+	/*
+	 * Give the switch driver somewhere to hang its per-port private data
+	 * structures (accessible from the tagger).
+	 */
+	void *priv;
+
 	/*
 	 * Original copy of the master netdev ethtool_ops
 	 */
-- 
cgit v1.2.3


From 227d07a07ef126272ea2eed97fd136cd7a803d81 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <olteanv@gmail.com>
Date: Sun, 5 May 2019 13:19:27 +0300
Subject: net: dsa: sja1105: Add support for traffic through standalone ports

In order to support this, we are creating a make-shift switch tag out of
a VLAN trunk configured on the CPU port. Termination of normal traffic
on switch ports only works when not under a vlan_filtering bridge.
Termination of management (PTP, BPDU) traffic works under all
circumstances because it uses a different tagging mechanism
(incl_srcpt). We are making use of the generic CONFIG_NET_DSA_TAG_8021Q
code and leveraging it from our own CONFIG_NET_DSA_TAG_SJA1105.

There are two types of traffic: regular and link-local.

The link-local traffic received on the CPU port is trapped from the
switch's regular forwarding decisions because it matched one of the two
DMAC filters for management traffic.

On transmission, the switch requires special massaging for these
link-local frames. Due to a weird implementation of the switching IP, by
default it drops link-local frames that originate on the CPU port.
It needs to be told where to forward them to, through an SPI command
("management route") that is valid for only a single frame.
So when we're sending link-local traffic, we are using the
dsa_defer_xmit mechanism.

Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/dsa/sja1105.h | 35 ++++++++++++++++++++++++++---------
 include/net/dsa.h           |  2 ++
 2 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h
index abf3977e34fd..603a02e5a8cb 100644
--- a/include/linux/dsa/sja1105.h
+++ b/include/linux/dsa/sja1105.h
@@ -2,22 +2,39 @@
  * Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
  */
 
-/* Included by drivers/net/dsa/sja1105/sja1105.h */
+/* Included by drivers/net/dsa/sja1105/sja1105.h and net/dsa/tag_sja1105.c */
 
 #ifndef _NET_DSA_SJA1105_H
 #define _NET_DSA_SJA1105_H
 
+#include <linux/skbuff.h>
 #include <linux/etherdevice.h>
+#include <net/dsa.h>
 
 #define ETH_P_SJA1105				ETH_P_DSA_8021Q
 
-/* The switch can only be convinced to stay in unmanaged mode and not trap any
- * link-local traffic by actually telling it to filter frames sent at the
- * 00:00:00:00:00:00 destination MAC.
- */
-#define SJA1105_LINKLOCAL_FILTER_A		0x000000000000ull
-#define SJA1105_LINKLOCAL_FILTER_A_MASK		0xFFFFFFFFFFFFull
-#define SJA1105_LINKLOCAL_FILTER_B		0x000000000000ull
-#define SJA1105_LINKLOCAL_FILTER_B_MASK		0xFFFFFFFFFFFFull
+/* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */
+#define SJA1105_LINKLOCAL_FILTER_A		0x0180C2000000ull
+#define SJA1105_LINKLOCAL_FILTER_A_MASK		0xFFFFFF000000ull
+/* IEEE 1588 Annex F: Transport of PTP over Ethernet (01:1B:19:xx:xx:xx) */
+#define SJA1105_LINKLOCAL_FILTER_B		0x011B19000000ull
+#define SJA1105_LINKLOCAL_FILTER_B_MASK		0xFFFFFF000000ull
+
+enum sja1105_frame_type {
+	SJA1105_FRAME_TYPE_NORMAL = 0,
+	SJA1105_FRAME_TYPE_LINK_LOCAL,
+};
+
+struct sja1105_skb_cb {
+	enum sja1105_frame_type type;
+};
+
+#define SJA1105_SKB_CB(skb) \
+	((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb))
+
+struct sja1105_port {
+	struct dsa_port *dp;
+	int mgmt_slot;
+};
 
 #endif /* _NET_DSA_SJA1105_H */
diff --git a/include/net/dsa.h b/include/net/dsa.h
index e20be1ceb306..6aaaadd6a413 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -43,6 +43,7 @@ struct phylink_link_state;
 #define DSA_TAG_PROTO_QCA_VALUE			10
 #define DSA_TAG_PROTO_TRAILER_VALUE		11
 #define DSA_TAG_PROTO_8021Q_VALUE		12
+#define DSA_TAG_PROTO_SJA1105_VALUE		13
 
 enum dsa_tag_protocol {
 	DSA_TAG_PROTO_NONE		= DSA_TAG_PROTO_NONE_VALUE,
@@ -58,6 +59,7 @@ enum dsa_tag_protocol {
 	DSA_TAG_PROTO_QCA		= DSA_TAG_PROTO_QCA_VALUE,
 	DSA_TAG_PROTO_TRAILER		= DSA_TAG_PROTO_TRAILER_VALUE,
 	DSA_TAG_PROTO_8021Q		= DSA_TAG_PROTO_8021Q_VALUE,
+	DSA_TAG_PROTO_SJA1105		= DSA_TAG_PROTO_SJA1105_VALUE,
 };
 
 struct packet_type;
-- 
cgit v1.2.3


From d6787147e15dffa7b7f3116a5bc3cbe0670bd74f Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Mon, 6 May 2019 17:24:21 -0700
Subject: net/sched: remove block pointer from common offload structure

Based on feedback from Jiri, avoid carrying a pointer to the tcf_block
structure in the tc_cls_common_offload structure. Instead, store
a flag in driver private data which indicates whether offloads apply
to a shared block at block binding time.

Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index eed98f8fcb5e..514e3c80ecc1 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -629,7 +629,6 @@ struct tc_cls_common_offload {
 	u32 chain_index;
 	__be16 protocol;
 	u32 prio;
-	struct tcf_block *block;
 	struct netlink_ext_ack *extack;
 };
 
@@ -731,13 +730,11 @@ static inline bool tc_in_hw(u32 flags)
 static inline void
 tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common,
 			   const struct tcf_proto *tp, u32 flags,
-			   struct tcf_block *block,
 			   struct netlink_ext_ack *extack)
 {
 	cls_common->chain_index = tp->chain->index;
 	cls_common->protocol = tp->protocol;
 	cls_common->prio = tp->prio;
-	cls_common->block = block;
 	if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE)
 		cls_common->extack = extack;
 }
-- 
cgit v1.2.3