From e227300c8395dffaa7614ce7c7666a82180ebc60 Mon Sep 17 00:00:00 2001
From: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
Date: Wed, 12 Oct 2016 18:25:35 +0530
Subject: cfg80211: pass struct to interface combination check/iter

Move the growing parameter list to a structure for the interface
combination check and iteration functions in cfg80211 and mac80211
to make the code easier to understand.

Signed-off-by: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
[edit commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index fe78f02a242e..ea108541e1e0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -771,6 +771,26 @@ struct cfg80211_csa_settings {
 	u8 count;
 };
 
+/**
+ * struct iface_combination_params - input parameters for interface combinations
+ *
+ * Used to pass interface combination parameters
+ *
+ * @num_different_channels: the number of different channels we want
+ *	to use for verification
+ * @radar_detect: a bitmap where each bit corresponds to a channel
+ *	width where radar detection is needed, as in the definition of
+ *	&struct ieee80211_iface_combination.@radar_detect_widths
+ * @iftype_num: array with the number of interfaces of each interface
+ *	type.  The index is the interface type as specified in &enum
+ *	nl80211_iftype.
+ */
+struct iface_combination_params {
+	int num_different_channels;
+	u8 radar_detect;
+	int iftype_num[NUM_NL80211_IFTYPES];
+};
+
 /**
  * enum station_parameters_apply_mask - station parameter values to apply
  * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
@@ -5575,36 +5595,20 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy);
  * cfg80211_check_combinations - check interface combinations
  *
  * @wiphy: the wiphy
- * @num_different_channels: the number of different channels we want
- *	to use for verification
- * @radar_detect: a bitmap where each bit corresponds to a channel
- *	width where radar detection is needed, as in the definition of
- *	&struct ieee80211_iface_combination.@radar_detect_widths
- * @iftype_num: array with the numbers of interfaces of each interface
- *	type.  The index is the interface type as specified in &enum
- *	nl80211_iftype.
+ * @params: the interface combinations parameter
  *
  * This function can be called by the driver to check whether a
  * combination of interfaces and their types are allowed according to
  * the interface combinations.
  */
 int cfg80211_check_combinations(struct wiphy *wiphy,
-				const int num_different_channels,
-				const u8 radar_detect,
-				const int iftype_num[NUM_NL80211_IFTYPES]);
+				struct iface_combination_params *params);
 
 /**
  * cfg80211_iter_combinations - iterate over matching combinations
  *
  * @wiphy: the wiphy
- * @num_different_channels: the number of different channels we want
- *	to use for verification
- * @radar_detect: a bitmap where each bit corresponds to a channel
- *	width where radar detection is needed, as in the definition of
- *	&struct ieee80211_iface_combination.@radar_detect_widths
- * @iftype_num: array with the numbers of interfaces of each interface
- *	type.  The index is the interface type as specified in &enum
- *	nl80211_iftype.
+ * @params: the interface combinations parameter
  * @iter: function to call for each matching combination
  * @data: pointer to pass to iter function
  *
@@ -5613,9 +5617,7 @@ int cfg80211_check_combinations(struct wiphy *wiphy,
  * purposes.
  */
 int cfg80211_iter_combinations(struct wiphy *wiphy,
-			       const int num_different_channels,
-			       const u8 radar_detect,
-			       const int iftype_num[NUM_NL80211_IFTYPES],
+			       struct iface_combination_params *params,
 			       void (*iter)(const struct ieee80211_iface_combination *c,
 					    void *data),
 			       void *data);
-- 
cgit v1.2.3


From 0c317a02ca982ca093e71bf07cb562265ba40032 Mon Sep 17 00:00:00 2001
From: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
Date: Wed, 12 Oct 2016 18:26:51 +0530
Subject: cfg80211: support virtual interfaces with different beacon intervals

This commit provides a mechanism for the host drivers to advertise the
support for different beacon intervals among the respective interface
combinations in a group, through NL80211_IFACE_COMB_BI_MIN_GCD (u32).

This value will be compared against GCD of all beaconing interfaces of
matching combinations.

If the driver doesn't advertise this value, the old behaviour where
all beacon intervals must be identical is retained.

If it is specified, then any beacon interval for an interface in the
interface combination as well as the GCD of all active beacon intervals
in the combination must be greater or equal to this value.

Signed-off-by: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
[change commit message, some variable names, small other things]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 15 +++++++++++++++
 include/uapi/linux/nl80211.h |  8 ++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ea108541e1e0..5000ec758eb3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -784,11 +784,19 @@ struct cfg80211_csa_settings {
  * @iftype_num: array with the number of interfaces of each interface
  *	type.  The index is the interface type as specified in &enum
  *	nl80211_iftype.
+ * @beacon_int_gcd: a value specifying GCD of all beaconing interfaces,
+ *	the GCD of a single value is considered the value itself, so for
+ *	a single interface this should be set to that interface's beacon
+ *	interval
+ * @beacon_int_different: a flag indicating whether or not all beacon
+ *	intervals (of beaconing interfaces) are different or not.
  */
 struct iface_combination_params {
 	int num_different_channels;
 	u8 radar_detect;
 	int iftype_num[NUM_NL80211_IFTYPES];
+	u32 beacon_int_gcd;
+	bool beacon_int_different;
 };
 
 /**
@@ -3100,6 +3108,12 @@ struct ieee80211_iface_limit {
  *	only in special cases.
  * @radar_detect_widths: bitmap of channel widths supported for radar detection
  * @radar_detect_regions: bitmap of regions supported for radar detection
+ * @beacon_int_min_gcd: This interface combination supports different
+ *	beacon intervals.
+ *	= 0 - all beacon intervals for different interface must be same.
+ *	> 0 - any beacon interval for the interface part of this combination AND
+ *	      *GCD* of all beacon intervals from beaconing interfaces of this
+ *	      combination must be greater or equal to this value.
  *
  * With this structure the driver can describe which interface
  * combinations it supports concurrently.
@@ -3158,6 +3172,7 @@ struct ieee80211_iface_combination {
 	bool beacon_int_infra_match;
 	u8 radar_detect_widths;
 	u8 radar_detect_regions;
+	u32 beacon_int_min_gcd;
 };
 
 struct ieee80211_txrx_stypes {
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 56368e9b4622..1362d24957b5 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4280,6 +4280,9 @@ enum nl80211_iface_limit_attrs {
  *	of supported channel widths for radar detection.
  * @NL80211_IFACE_COMB_RADAR_DETECT_REGIONS: u32 attribute containing the bitmap
  *	of supported regulatory regions for radar detection.
+ * @NL80211_IFACE_COMB_BI_MIN_GCD: u32 attribute specifying the minimum GCD of
+ *	different beacon intervals supported by all the interface combinations
+ *	in this group (if not present, all beacon intervals be identical).
  * @NUM_NL80211_IFACE_COMB: number of attributes
  * @MAX_NL80211_IFACE_COMB: highest attribute number
  *
@@ -4287,8 +4290,8 @@ enum nl80211_iface_limit_attrs {
  *	limits = [ #{STA} <= 1, #{AP} <= 1 ], matching BI, channels = 1, max = 2
  *	=> allows an AP and a STA that must match BIs
  *
- *	numbers = [ #{AP, P2P-GO} <= 8 ], channels = 1, max = 8
- *	=> allows 8 of AP/GO
+ *	numbers = [ #{AP, P2P-GO} <= 8 ], BI min gcd, channels = 1, max = 8,
+ *	=> allows 8 of AP/GO that can have BI gcd >= min gcd
  *
  *	numbers = [ #{STA} <= 2 ], channels = 2, max = 2
  *	=> allows two STAs on different channels
@@ -4314,6 +4317,7 @@ enum nl80211_if_combination_attrs {
 	NL80211_IFACE_COMB_NUM_CHANNELS,
 	NL80211_IFACE_COMB_RADAR_DETECT_WIDTHS,
 	NL80211_IFACE_COMB_RADAR_DETECT_REGIONS,
+	NL80211_IFACE_COMB_BI_MIN_GCD,
 
 	/* keep last */
 	NUM_NL80211_IFACE_COMB,
-- 
cgit v1.2.3


From 61e84623ace35ce48975e8f90bbbac7557c43d61 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 7 Oct 2016 22:04:33 -0400
Subject: net: centralize net_device min/max MTU checking

While looking into an MTU issue with sfc, I started noticing that almost
every NIC driver with an ndo_change_mtu function implemented almost
exactly the same range checks, and in many cases, that was the only
practical thing their ndo_change_mtu function was doing. Quite a few
drivers have either 68, 64, 60 or 46 as their minimum MTU value checked,
and then various sizes from 1500 to 65535 for their maximum MTU value. We
can remove a whole lot of redundant code here if we simple store min_mtu
and max_mtu in net_device, and check against those in net/core/dev.c's
dev_set_mtu().

In theory, there should be zero functional change with this patch, it just
puts the infrastructure in place. Subsequent patches will attempt to start
using said infrastructure, with theoretically zero change in
functionality.

CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 136ae6bbe81e..fbdf923af4d3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1506,6 +1506,8 @@ enum netdev_priv_flags {
  *	@if_port:	Selectable AUI, TP, ...
  *	@dma:		DMA channel
  *	@mtu:		Interface MTU value
+ *	@min_mtu:	Interface Minimum MTU value
+ *	@max_mtu:	Interface Maximum MTU value
  *	@type:		Interface hardware type
  *	@hard_header_len: Maximum hardware header length.
  *
@@ -1726,6 +1728,8 @@ struct net_device {
 	unsigned char		dma;
 
 	unsigned int		mtu;
+	unsigned int		min_mtu;
+	unsigned int		max_mtu;
 	unsigned short		type;
 	unsigned short		hard_header_len;
 
-- 
cgit v1.2.3


From a52ad514fdf3b8a57ca4322c92d2d8d5c6182485 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Fri, 7 Oct 2016 22:04:34 -0400
Subject: net: deprecate eth_change_mtu, remove usage

With centralized MTU checking, there's nothing productive done by
eth_change_mtu that isn't already done in dev_set_mtu, so mark it as
deprecated and remove all usage of it in the kernel. All callers have been
audited for calls to alloc_etherdev* or ether_setup directly, which means
they all have a valid dev->min_mtu and dev->max_mtu. Now eth_change_mtu
prints out a netdev_warn about being deprecated, for the benefit of
out-of-tree drivers that might be utilizing it.

Of note, dvb_net.c actually had dev->mtu = 4096, while using
eth_change_mtu, meaning that if you ever tried changing it's mtu, you
couldn't set it above 1500 anymore. It's now getting dev->max_mtu also set
to 4096 to remedy that.

v2: fix up lantiq_etop, missed breakage due to drive not compiling on x86

CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 117d02e0fc31..864d6f2b2cb0 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -35,6 +35,8 @@
 #define ETH_FRAME_LEN	1514		/* Max. octets in frame sans FCS */
 #define ETH_FCS_LEN	4		/* Octets in the FCS		 */
 
+#define ETH_MIN_MTU	68		/* Min IPv4 MTU per RFC791	*/
+
 /*
  *	These are the defined Ethernet Protocol ID's.
  */
-- 
cgit v1.2.3


From cc6ac9bccf6b9814d37932e86a92f8e6a92960dc Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 8 Oct 2016 11:36:05 +0800
Subject: sctp: reuse sent_count to avoid retransmitted chunks for RTT
 measurements

Now sctp uses chunk->resent to record if a chunk is retransmitted, for
RTT measurements with retransmitted DATA chunks. chunk->sent_count was
introduced to record how many times one chunk has been sent for prsctp
RTX policy before. We actually can know if one chunk is retransmitted
by checking chunk->sent_count is greater than 1.

This patch is to remove resent from sctp_chunk and reuse sent_count
to avoid retransmitted chunks for RTT measurements.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 11c3bf262a85..27a933e4c90e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -641,7 +641,6 @@ struct sctp_chunk {
 #define SCTP_NEED_FRTX 0x1
 #define SCTP_DONT_FRTX 0x2
 	__u16	rtt_in_progress:1,	/* This chunk used for RTT calc? */
-		resent:1,		/* Has this chunk ever been resent. */
 		has_tsn:1,		/* Does this chunk have a TSN yet? */
 		has_ssn:1,		/* Does this chunk have a SSN yet? */
 		singleton:1,		/* Only chunk in the packet? */
@@ -656,6 +655,7 @@ struct sctp_chunk {
 		fast_retransmit:2;	/* Is this chunk fast retransmitted? */
 };
 
+#define sctp_chunk_retransmitted(chunk)	(chunk->sent_count > 1)
 void sctp_chunk_hold(struct sctp_chunk *);
 void sctp_chunk_put(struct sctp_chunk *);
 int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len,
-- 
cgit v1.2.3


From 8ae808eb853e3789b81b8a502cdf22bb01b76880 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 8 Oct 2016 11:40:16 +0800
Subject: sctp: remove the old ttl expires policy

The prsctp polices include ttl expires policy already, we should remove
the old ttl expires codes, and just adjust the new polices' codes to be
compatible with the old one for users.

This patch is to remove all the old expires codes, and if prsctp polices
are not set, it will still set msg's expires_at and check the expires in
sctp_check_abandoned.

Note that asoc->prsctp_enable is set by default, so users can't feel any
difference even if they use the old expires api in userspace.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 27a933e4c90e..bd4a3ded7c87 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -530,7 +530,6 @@ struct sctp_datamsg {
 	/* Did the messenge fail to send? */
 	int send_error;
 	u8 send_failed:1,
-	   can_abandon:1,   /* can chunks from this message can be abandoned. */
 	   can_delay;	    /* should this message be Nagle delayed */
 };
 
-- 
cgit v1.2.3


From cf53b1da73bdf940f1523ec5a7d375d7056c759c Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Tue, 11 Oct 2016 13:04:09 -0700
Subject: Revert "net: Add driver helper functions to determine checksum
 offloadability"

This reverts commit 6ae23ad36253a8033c5714c52b691b84456487c5.

The code has been in kernel since 4.4 but there are no in tree
code that uses. Unused code is broken code, remove it.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 78 -----------------------------------------------
 1 file changed, 78 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fbdf923af4d3..bf341b65ca5e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2653,71 +2653,6 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb,
 	remcsum_unadjust((__sum16 *)ptr, grc->delta);
 }
 
-struct skb_csum_offl_spec {
-	__u16		ipv4_okay:1,
-			ipv6_okay:1,
-			encap_okay:1,
-			ip_options_okay:1,
-			ext_hdrs_okay:1,
-			tcp_okay:1,
-			udp_okay:1,
-			sctp_okay:1,
-			vlan_okay:1,
-			no_encapped_ipv6:1,
-			no_not_encapped:1;
-};
-
-bool __skb_csum_offload_chk(struct sk_buff *skb,
-			    const struct skb_csum_offl_spec *spec,
-			    bool *csum_encapped,
-			    bool csum_help);
-
-static inline bool skb_csum_offload_chk(struct sk_buff *skb,
-					const struct skb_csum_offl_spec *spec,
-					bool *csum_encapped,
-					bool csum_help)
-{
-	if (skb->ip_summed != CHECKSUM_PARTIAL)
-		return false;
-
-	return __skb_csum_offload_chk(skb, spec, csum_encapped, csum_help);
-}
-
-static inline bool skb_csum_offload_chk_help(struct sk_buff *skb,
-					     const struct skb_csum_offl_spec *spec)
-{
-	bool csum_encapped;
-
-	return skb_csum_offload_chk(skb, spec, &csum_encapped, true);
-}
-
-static inline bool skb_csum_off_chk_help_cmn(struct sk_buff *skb)
-{
-	static const struct skb_csum_offl_spec csum_offl_spec = {
-		.ipv4_okay = 1,
-		.ip_options_okay = 1,
-		.ipv6_okay = 1,
-		.vlan_okay = 1,
-		.tcp_okay = 1,
-		.udp_okay = 1,
-	};
-
-	return skb_csum_offload_chk_help(skb, &csum_offl_spec);
-}
-
-static inline bool skb_csum_off_chk_help_cmn_v4_only(struct sk_buff *skb)
-{
-	static const struct skb_csum_offl_spec csum_offl_spec = {
-		.ipv4_okay = 1,
-		.ip_options_okay = 1,
-		.tcp_okay = 1,
-		.udp_okay = 1,
-		.vlan_okay = 1,
-	};
-
-	return skb_csum_offload_chk_help(skb, &csum_offl_spec);
-}
-
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
 				  const void *daddr, const void *saddr,
@@ -3961,19 +3896,6 @@ static inline bool can_checksum_protocol(netdev_features_t features,
 	}
 }
 
-/* Map an ethertype into IP protocol if possible */
-static inline int eproto_to_ipproto(int eproto)
-{
-	switch (eproto) {
-	case htons(ETH_P_IP):
-		return IPPROTO_IP;
-	case htons(ETH_P_IPV6):
-		return IPPROTO_IPV6;
-	default:
-		return -1;
-	}
-}
-
 #ifdef CONFIG_BUG
 void netdev_rx_csum_fault(struct net_device *dev);
 #else
-- 
cgit v1.2.3


From 4f58e6dceb0e44ca8f21568ed81e1df24e55964c Mon Sep 17 00:00:00 2001
From: "Allan W. Nielsen" <allan.nielsen@microsemi.com>
Date: Wed, 12 Oct 2016 15:47:51 +0200
Subject: net: phy: Cleanup the Edge-Rate feature in Microsemi PHYs.

Edge-Rate cleanup include the following:
- Updated device tree bindings documentation for edge-rate
- The edge-rate is now specified as a "slowdown", meaning that it is now
  being specified as positive values instead of negative (both
  documentation and implementation wise).
- Only explicitly documented values for "vsc8531,vddmac" and
  "vsc8531,edge-slowdown" are accepted by the device driver.
- Deleted include/dt-bindings/net/mscc-phy-vsc8531.h as it was not needed.
- Read/validate devicetree settings in probe instead of init

Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
Signed-off-by: Raju Lakkaraju <raju.lakkaraju@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/net/mscc-phy-vsc8531.h | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 include/dt-bindings/net/mscc-phy-vsc8531.h

(limited to 'include')

diff --git a/include/dt-bindings/net/mscc-phy-vsc8531.h b/include/dt-bindings/net/mscc-phy-vsc8531.h
deleted file mode 100644
index 2383dd20ff43..000000000000
--- a/include/dt-bindings/net/mscc-phy-vsc8531.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Device Tree constants for Microsemi VSC8531 PHY
- *
- * Author: Nagaraju Lakkaraju
- *
- * License: Dual MIT/GPL
- * Copyright (c) 2016 Microsemi Corporation
- */
-
-#ifndef _DT_BINDINGS_MSCC_VSC8531_H
-#define _DT_BINDINGS_MSCC_VSC8531_H
-
-/* MAC interface Edge rate control VDDMAC in milli Volts */
-#define MSCC_VDDMAC_3300		 3300
-#define MSCC_VDDMAC_2500		 2500
-#define MSCC_VDDMAC_1800		 1800
-#define MSCC_VDDMAC_1500		 1500
-#define MSCC_VDDMAC_MAX			 4
-#define MSCC_SLOWDOWN_MAX		 8
-
-#endif
-- 
cgit v1.2.3


From 165779231ff9e9c4ac7baaee84eff91d589f3e22 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Thu, 13 Oct 2016 09:06:41 +0300
Subject: net/sched: act_mirred: Rename tcfm_ok_push to tcfm_mac_header_xmit
 and make it a bool

'tcfm_ok_push' specifies whether a mac_len sized push is needed upon
egress to the target device (if action is performed at ingress).

Rename it to 'tcfm_mac_header_xmit' as this is actually an attribute of
the target device (and use a bool instead of int).

This allows to decouple the attribute from the action to be taken.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_mirred.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 62770add15bd..95431092c4b6 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -8,7 +8,7 @@ struct tcf_mirred {
 	struct tc_action	common;
 	int			tcfm_eaction;
 	int			tcfm_ifindex;
-	int			tcfm_ok_push;
+	bool			tcfm_mac_header_xmit;
 	struct net_device __rcu	*tcfm_dev;
 	struct list_head	tcfm_list;
 };
-- 
cgit v1.2.3


From 5724b8b5694794829a071c6da7dd0bc146df0756 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Thu, 13 Oct 2016 09:06:43 +0300
Subject: net/sched: tc_mirred: Rename public predicates
 'is_tcf_mirred_redirect' and 'is_tcf_mirred_mirror'

These accessors are used in various drivers that support tc offloading,
to detect properties of a given 'tc_action'.

'is_tcf_mirred_redirect' tests that the action is TCA_EGRESS_REDIR.
'is_tcf_mirred_mirror' tests that the action is TCA_EGRESS_MIRROR.

As a prep towards supporting INGRESS redir/mirror, rename these
predicates to reflect their true meaning:
  s/is_tcf_mirred_redirect/is_tcf_mirred_egress_redirect/
  s/is_tcf_mirred_mirror/is_tcf_mirred_egress_mirror/

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: Hariprasad S <hariprasad@chelsio.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Ido Schimmel <idosch@mellanox.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_mirred.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 95431092c4b6..604bc31e23ab 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -14,7 +14,7 @@ struct tcf_mirred {
 };
 #define to_mirred(a) ((struct tcf_mirred *)a)
 
-static inline bool is_tcf_mirred_redirect(const struct tc_action *a)
+static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
@@ -23,7 +23,7 @@ static inline bool is_tcf_mirred_redirect(const struct tc_action *a)
 	return false;
 }
 
-static inline bool is_tcf_mirred_mirror(const struct tc_action *a)
+static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
-- 
cgit v1.2.3


From c3aaa403840a5ccd305fb5e73f3cbfac6453b5e5 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Date: Fri, 14 Oct 2016 05:19:17 -0400
Subject: qed: Pass MAC hints to VFs

Some hypervisors can support MAC hints to their VFs.
Even though we don't have such a hypervisor API in linux, we add
sufficient logic for the VF to be able to receive such hints and
set the mac accordingly - as long as the VF has not been set with
a MAC already.

Signed-off-by: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_eth_if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 33c24ebc9b7f..1c779486c30d 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -129,7 +129,7 @@ struct qed_tunn_params {
 
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
-	void (*force_mac) (void *dev, u8 *mac);
+	void (*force_mac) (void *dev, u8 *mac, bool forced);
 };
 
 #ifdef CONFIG_DCB
-- 
cgit v1.2.3


From 7b7e70f979e34ed84d725eab8ea42921ab6f42e3 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Date: Fri, 14 Oct 2016 05:19:20 -0400
Subject: qed*: Allow unicast filtering

Apparently qede fails to set IFF_UNICAST_FLT, and as a result is not
actually performing unicast MAC filtering.
While we're at it - relax a hard-coded limitation that limits each
interface into using at most 15 unicast MAC addresses before turning
promiscuous. Instead utilize the HW resources to their limit.

Signed-off-by: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_eth_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 1c779486c30d..15130805d792 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -23,6 +23,7 @@ struct qed_dev_eth_info {
 
 	u8	port_mac[ETH_ALEN];
 	u8	num_vlan_filters;
+	u16	num_mac_filters;
 
 	/* Legacy VF - this affects the datapath, so qede has to know */
 	bool is_legacy;
-- 
cgit v1.2.3


From 1104d9ba443a3972052ea4eaa01e51f9ee084652 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 14 Oct 2016 11:25:36 -0700
Subject: lwtunnel: Add destroy state operation

Users of lwt tunnels may set up some secondary state in build_state
function. Add a corresponding destroy_state function to allow users to
clean up state. This destroy state function is called from lwstate_free.
Also, we now free lwstate using kfree_rcu so user can assume structure
is not freed before rcu.

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index ea3f80f58fd6..67d235f43202 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -29,6 +29,7 @@ struct lwtunnel_state {
 	int		(*orig_input)(struct sk_buff *);
 	int             len;
 	__u16		headroom;
+	struct		rcu_head rcu;
 	__u8            data[0];
 };
 
@@ -36,6 +37,7 @@ struct lwtunnel_encap_ops {
 	int (*build_state)(struct net_device *dev, struct nlattr *encap,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts);
+	void (*destroy_state)(struct lwtunnel_state *lws);
 	int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int (*input)(struct sk_buff *skb);
 	int (*fill_encap)(struct sk_buff *skb,
@@ -46,10 +48,7 @@ struct lwtunnel_encap_ops {
 };
 
 #ifdef CONFIG_LWTUNNEL
-static inline void lwtstate_free(struct lwtunnel_state *lws)
-{
-	kfree(lws);
-}
+void lwtstate_free(struct lwtunnel_state *lws);
 
 static inline struct lwtunnel_state *
 lwtstate_get(struct lwtunnel_state *lws)
-- 
cgit v1.2.3


From a3e2f4b6ed9de85086850fe49801f9b00adb6ae1 Mon Sep 17 00:00:00 2001
From: Michael Braun <michael-dev@fami-braun.de>
Date: Sat, 15 Oct 2016 13:28:19 +0200
Subject: mac80211: fix A-MSDU outer SA/DA

According to IEEE 802.11-2012 section 8.3.2 table 8-19, the outer SA/DA
of A-MSDU frames need to be changed depending on FromDS/ToDS values.

Signed-off-by: Michael Braun <michael-dev@fami-braun.de>
[use ether_addr_copy and add alignment annotations]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a810dfcb83c2..e50c9e02889a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1438,7 +1438,7 @@ enum ieee80211_vif_flags {
 struct ieee80211_vif {
 	enum nl80211_iftype type;
 	struct ieee80211_bss_conf bss_conf;
-	u8 addr[ETH_ALEN];
+	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
 	bool csa_active;
 	bool mu_mimo_owner;
-- 
cgit v1.2.3


From 664fcf123a30edf16b47d2ce1f610d654ba917b2 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Sun, 16 Oct 2016 19:56:51 +0200
Subject: net: phy: Threaded interrupts allow some simplification

The PHY interrupts are now handled in a threaded interrupt handler,
which can sleep. The work queue is no longer needed, phy_change() can
be called directly. phy_mac_interrupt() still needs to be safe to call
in interrupt context, so keep the work queue, and use a helper to call
phy_change().

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index e25f1830fbcf..c47378c93607 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -343,7 +343,7 @@ struct phy_c45_device_ids {
  * giving up on the current attempt at acquiring a link
  * irq: IRQ number of the PHY's interrupt (-1 if none)
  * phy_timer: The timer for handling the state machine
- * phy_queue: A work_queue for the interrupt
+ * phy_queue: A work_queue for the phy_mac_interrupt
  * attached_dev: The attached enet driver's device instance ptr
  * adjust_link: Callback for the enet controller to respond to
  * changes in the link state.
@@ -802,7 +802,8 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner);
 int phy_drivers_register(struct phy_driver *new_driver, int n,
 			 struct module *owner);
 void phy_state_machine(struct work_struct *work);
-void phy_change(struct work_struct *work);
+void phy_change(struct phy_device *phydev);
+void phy_change_work(struct work_struct *work);
 void phy_mac_interrupt(struct phy_device *phydev, int new_link);
 void phy_start_machine(struct phy_device *phydev);
 void phy_stop_machine(struct phy_device *phydev);
-- 
cgit v1.2.3


From 1a3f060c1a47dba4e12ac21ce62b57666b9c4e95 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 17 Oct 2016 19:15:44 -0700
Subject: net: Introduce new api for walking upper and lower devices

This patch introduces netdev_walk_all_upper_dev_rcu,
netdev_walk_all_lower_dev and netdev_walk_all_lower_dev_rcu. These
functions recursively walk the adj_list of devices to determine all upper
and lower devices.

The functions take a callback function that is invoked for each device
in the list. If the callback returns non-0, the walk is terminated and
the functions return that code back to callers.

v3
- simplified netdev_has_upper_dev_all_rcu and __netdev_has_upper_dev and
  removed typecast as suggested by Stephen

v2
- fixed definition of netdev_next_lower_dev_rcu to mirror the upper_dev
  version.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf341b65ca5e..a5902d995907 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3778,6 +3778,14 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 	     updev; \
 	     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
 
+int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *upper_dev,
+					    void *data),
+				  void *data);
+
+bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
+				  struct net_device *upper_dev);
+
 void *netdev_lower_get_next_private(struct net_device *dev,
 				    struct list_head **iter);
 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
@@ -3821,6 +3829,15 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 	     ldev; \
 	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
 
+int netdev_walk_all_lower_dev(struct net_device *dev,
+			      int (*fn)(struct net_device *lower_dev,
+					void *data),
+			      void *data);
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+				  int (*fn)(struct net_device *lower_dev,
+					    void *data),
+				  void *data);
+
 void *netdev_adjacent_get_private(struct list_head *adj_list);
 void *netdev_lower_get_first_private_rcu(struct net_device *dev);
 struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
-- 
cgit v1.2.3


From f1170fd462c67c4ae2f20734566d94e0f8f62f69 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Mon, 17 Oct 2016 19:15:51 -0700
Subject: net: Remove all_adj_list and its references

Only direct adjacencies are maintained. All upper or lower devices can
be learned via the new walk API which recursively walks the adj_list for
upper devices or lower devices.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a5902d995907..458c87631e7f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1456,7 +1456,6 @@ enum netdev_priv_flags {
  *	@ptype_specific: Device-specific, protocol-specific packet handlers
  *
  *	@adj_list:	Directly linked devices, like slaves for bonding
- *	@all_adj_list:	All linked devices, *including* neighbours
  *	@features:	Currently active device features
  *	@hw_features:	User-changeable features
  *
@@ -1675,11 +1674,6 @@ struct net_device {
 		struct list_head lower;
 	} adj_list;
 
-	struct {
-		struct list_head upper;
-		struct list_head lower;
-	} all_adj_list;
-
 	netdev_features_t	features;
 	netdev_features_t	hw_features;
 	netdev_features_t	wanted_features;
@@ -3771,13 +3765,6 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
 	     updev; \
 	     updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))
 
-/* iterate through upper list, must be called under RCU read lock */
-#define netdev_for_each_all_upper_dev_rcu(dev, updev, iter) \
-	for (iter = &(dev)->all_adj_list.upper, \
-	     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)); \
-	     updev; \
-	     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
-
 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
 				  int (*fn)(struct net_device *upper_dev,
 					    void *data),
@@ -3817,18 +3804,6 @@ struct net_device *netdev_all_lower_get_next(struct net_device *dev,
 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 						 struct list_head **iter);
 
-#define netdev_for_each_all_lower_dev(dev, ldev, iter) \
-	for (iter = (dev)->all_adj_list.lower.next, \
-	     ldev = netdev_all_lower_get_next(dev, &(iter)); \
-	     ldev; \
-	     ldev = netdev_all_lower_get_next(dev, &(iter)))
-
-#define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \
-	for (iter = (dev)->all_adj_list.lower.next, \
-	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \
-	     ldev; \
-	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
-
 int netdev_walk_all_lower_dev(struct net_device *dev,
 			      int (*fn)(struct net_device *lower_dev,
 					void *data),
-- 
cgit v1.2.3


From 1f9127caece42514a47011326b83ad93d95cd5d7 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@ni.com>
Date: Mon, 17 Oct 2016 10:49:54 -0500
Subject: net: phy: Create phy_supported_speeds function which lists speeds
 currently supported by a phydevice

phy_supported_speeds provides a means to get a list of all the speeds a
phy device currently supports.

Signed-off-by: Zach Brown <zach.brown@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index c47378c93607..4b6c246c63bb 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -84,6 +84,21 @@ typedef enum {
 	PHY_INTERFACE_MODE_MAX,
 } phy_interface_t;
 
+/**
+ * phy_supported_speeds - return all speeds currently supported by a phy device
+ * @phy: The phy device to return supported speeds of.
+ * @speeds: buffer to store supported speeds in.
+ * @size: size of speeds buffer.
+ *
+ * Description: Returns the number of supported speeds, and
+ * fills the speeds * buffer with the supported speeds. If speeds buffer is
+ * too small to contain * all currently supported speeds, will return as
+ * many speeds as can fit.
+ */
+unsigned int phy_supported_speeds(struct phy_device *phy,
+				      unsigned int *speeds,
+				      unsigned int size);
+
 /**
  * It maps 'enum phy_interface_t' found in include/linux/phy.h
  * into the device tree binding of 'phy-mode', so that Ethernet
-- 
cgit v1.2.3


From 2e0bc452f4721520502575362a9cd3c1248d2337 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@ni.com>
Date: Mon, 17 Oct 2016 10:49:55 -0500
Subject: net: phy: leds: add support for led triggers on phy link state change

Create an option CONFIG_LED_TRIGGER_PHY (default n), which will create a
set of led triggers for each instantiated PHY device. There is one LED
trigger per link-speed, per-phy.
The triggers are registered during phy_attach and unregistered during
phy_detach.

This allows for a user to configure their system to allow a set of LEDs
not controlled by the phy to represent link state changes on the phy.
LEDS controlled by the phy are unaffected.

For example, we have a board where some of the leds in the
RJ45 socket are controlled by the phy, but others are not. Using the
triggers provided by this patch the leds not controlled by the phy can
be configured to show the current speed of the ethernet connection. The
leds controlled by the phy are unaffected.

Signed-off-by: Josh Cartwright <josh.cartwright@ni.com>
Signed-off-by: Nathan Sullivan <nathan.sullivan@ni.com>
Signed-off-by: Zach Brown <zach.brown@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h              |  7 ++++++
 include/linux/phy_led_triggers.h | 51 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 include/linux/phy_led_triggers.h

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4b6c246c63bb..e7e1fd382564 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/mod_devicetable.h>
+#include <linux/phy_led_triggers.h>
 
 #include <linux/atomic.h>
 
@@ -420,6 +421,12 @@ struct phy_device {
 
 	int link_timeout;
 
+#ifdef CONFIG_LED_TRIGGER_PHY
+	struct phy_led_trigger *phy_led_triggers;
+	unsigned int phy_num_led_triggers;
+	struct phy_led_trigger *last_triggered;
+#endif
+
 	/*
 	 * Interrupt number for this PHY
 	 * -1 means no interrupt
diff --git a/include/linux/phy_led_triggers.h b/include/linux/phy_led_triggers.h
new file mode 100644
index 000000000000..a2daea0a37d2
--- /dev/null
+++ b/include/linux/phy_led_triggers.h
@@ -0,0 +1,51 @@
+/* Copyright (C) 2016 National Instruments Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef __PHY_LED_TRIGGERS
+#define __PHY_LED_TRIGGERS
+
+struct phy_device;
+
+#ifdef CONFIG_LED_TRIGGER_PHY
+
+#include <linux/leds.h>
+
+#define PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE	10
+#define PHY_MII_BUS_ID_SIZE	(20 - 3)
+
+#define PHY_LINK_LED_TRIGGER_NAME_SIZE (PHY_MII_BUS_ID_SIZE + \
+				       FIELD_SIZEOF(struct mdio_device, addr)+\
+				       PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE)
+
+struct phy_led_trigger {
+	struct led_trigger trigger;
+	char name[PHY_LINK_LED_TRIGGER_NAME_SIZE];
+	unsigned int speed;
+};
+
+
+extern int phy_led_triggers_register(struct phy_device *phy);
+extern void phy_led_triggers_unregister(struct phy_device *phy);
+extern void phy_led_trigger_change_speed(struct phy_device *phy);
+
+#else
+
+static inline int phy_led_triggers_register(struct phy_device *phy)
+{
+	return 0;
+}
+static inline void phy_led_triggers_unregister(struct phy_device *phy) { }
+static inline void phy_led_trigger_change_speed(struct phy_device *phy) { }
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From 9a97434215819872b054c3d0c067e5e4fa768b0e Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Mon, 17 Oct 2016 21:45:29 +0200
Subject: ARM: pxa: enhance smc91x platform data

Instead of having the smc91x driver relying on machine_is_*() calls,
provide this data through platform data, ie. idp, mainstone and
stargate.

This way, the driver doesn't need anymore machine_is_*() calls, which
wouldn't work anymore with a device-tree build.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/smc91x.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/smc91x.h b/include/linux/smc91x.h
index e302c447e057..129bc674dcf5 100644
--- a/include/linux/smc91x.h
+++ b/include/linux/smc91x.h
@@ -39,6 +39,7 @@ struct smc91x_platdata {
 	unsigned long flags;
 	unsigned char leda;
 	unsigned char ledb;
+	bool pxa_u16_align4;	/* PXA buggy u16 writes on 4*n+2 addresses */
 };
 
 #endif /* __SMC91X_H__ */
-- 
cgit v1.2.3


From a1264c3d6c04f0e4e9d447caaa249d6288b01520 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 18 Oct 2016 23:12:07 +0300
Subject: wireless: radiotap: fix timestamp sampling position values

The values don't match the radiotap spec, corrected that.

Reported-by: Oz Shalev <oz.shalev@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index ba07b9d8ed63..d0e7e3f8e67a 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -333,9 +333,9 @@ enum ieee80211_radiotap_type {
 #define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_NS			0x0003
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_MASK			0x00F0
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_BEGIN_MDPU		0x0000
-#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU		0x0010
+#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ		0x0010
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_PPDU		0x0020
-#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ		0x0030
+#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU		0x0030
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_UNKNOWN		0x00F0
 
 #define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT			0x00
-- 
cgit v1.2.3


From 0aa419ec6e7b98d485f6c66a62a90965eda3c1bb Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Tue, 18 Oct 2016 23:12:10 +0300
Subject: mac80211: allow the driver not to pass the tid to
 ieee80211_sta_uapsd_trigger

iwlwifi will check internally that the tid maps to an AC
that is trigger enabled, but can't know what tid exactly.
Allow the driver to pass a generic tid and make mac80211
assume that a trigger frame was received.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e50c9e02889a..f3dbadafe16e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4087,6 +4087,10 @@ void ieee80211_sta_pspoll(struct ieee80211_sta *sta);
  * This must be used in conjunction with ieee80211_sta_ps_transition()
  * and possibly ieee80211_sta_pspoll(); calls to all three must be
  * serialized.
+ * %IEEE80211_NUM_TIDS can be passed as the tid if the tid is unknown.
+ * In this case, mac80211 will not check that this tid maps to an AC
+ * that is trigger enabled and assume that the caller did the proper
+ * checks.
  */
 void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid);
 
-- 
cgit v1.2.3


From f3fe4e93dd6346c01fd4070ae02ec746fbae73bb Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 18 Oct 2016 23:12:11 +0300
Subject: mac80211: add a HW flag for supporting HW TX fragmentation

Currently mac80211 determines whether HW does fragmentation
by checking whether the set_frag_threshold callback is set
or not.
However, some drivers may want to set the HW fragmentation
capability depending on HW generation.
Allow this by checking a HW flag instead of checking the
callback.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
[added the flag to ath10k and wlcore]
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f3dbadafe16e..a1a27021f452 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2025,6 +2025,10 @@ struct ieee80211_txq {
  *	drivers, mac80211 packet loss mechanism will not be triggered and driver
  *	is completely depending on firmware event for station kickout.
  *
+ * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself.
+ *	The stack will not do fragmentation.
+ *	The callback for @set_frag_threshold should be set as well.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2066,6 +2070,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_TX_AMSDU,
 	IEEE80211_HW_TX_FRAG_LIST,
 	IEEE80211_HW_REPORTS_LOW_ACK,
+	IEEE80211_HW_SUPPORTS_TX_FRAG,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -3093,8 +3098,9 @@ enum ieee80211_reconfig_type {
  *	The callback must be atomic.
  *
  * @set_frag_threshold: Configuration of fragmentation threshold. Assign this
- *	if the device does fragmentation by itself; if this callback is
- *	implemented then the stack will not do fragmentation.
+ *	if the device does fragmentation by itself. Note that to prevent the
+ *	stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG
+ *	should be set as well.
  *	The callback can sleep.
  *
  * @set_rts_threshold: Configuration of RTS threshold (if device needs it)
-- 
cgit v1.2.3


From f438ceb81d424cb90a5a1aad569056bd7c2ab4c5 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Tue, 18 Oct 2016 23:12:12 +0300
Subject: mac80211: uapsd_queues is in QoS IE order

The uapsd_queue field is in QoS IE order and not in
IEEE80211_AC_*'s order.
This means that mac80211 would get confused between
BK and BE which is certainly not such a big deal but
needs to be fixed.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 ++-
 include/net/mac80211.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5000ec758eb3..10a26f0fbafe 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4574,7 +4574,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
  *	moves to cfg80211 in this call
  * @buf: authentication frame (header + body)
  * @len: length of the frame data
- * @uapsd_queues: bitmap of ACs configured to uapsd. -1 if n/a.
+ * @uapsd_queues: bitmap of queues configured for uapsd. Same format
+ *	as the AC bitmap in the QoS info field
  *
  * After being asked to associate via cfg80211_ops::assoc() the driver must
  * call either this function or cfg80211_auth_timeout().
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a1a27021f452..b9b24abd9103 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1745,7 +1745,8 @@ struct ieee80211_sta_rates {
  * @drv_priv: data area for driver use, will always be aligned to
  *	sizeof(void *), size is determined in hw information.
  * @uapsd_queues: bitmap of queues configured for uapsd. Only valid
- *	if wme is supported.
+ *	if wme is supported. The bits order is like in
+ *	IEEE80211_WMM_IE_STA_QOSINFO_AC_*.
  * @max_sp: max Service Period. Only valid if wme is supported.
  * @bandwidth: current bandwidth the station can receive with
  * @rx_nss: in HT/VHT, the maximum number of spatial streams the
-- 
cgit v1.2.3


From 0711d638786941ec02551dd9b4aa0d8341f7db5b Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Tue, 18 Oct 2016 23:12:13 +0300
Subject: cfg80211: allow aborting in-progress connection atttempts

On a disconnect request from userspace, cfg80211 currently calls
called rdev_disconnect() only in case that 'current_bss' was set,
i.e. connection had been established.

Change this to allow the userspace call to succeed and call the
driver's disconnect() method also while the connection attempt is
in progress, to be able to abort attempts.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
[change commit subject/message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 10a26f0fbafe..2bbbcc3eecac 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2564,9 +2564,10 @@ struct cfg80211_nan_func {
  *	cases, the result of roaming is indicated with a call to
  *	cfg80211_roamed() or cfg80211_roamed_bss().
  *	(invoked with the wireless_dev mutex held)
- * @disconnect: Disconnect from the BSS/ESS. Once done, call
- *	cfg80211_disconnected().
- *	(invoked with the wireless_dev mutex held)
+ * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
+ *      connection is in progress. Once done, call cfg80211_disconnected() in
+ *      case connection was already established (invoked with the
+ *      wireless_dev mutex held), otherwise call cfg80211_connect_timeout().
  *
  * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call
  *	cfg80211_ibss_joined(), also call that function when changing BSSID due
-- 
cgit v1.2.3


From 57a09bf0a416700676e77102c28f9cfcb48267e0 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 18 Oct 2016 19:51:19 +0200
Subject: bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers

A BPF program is required to check the return register of a
map_elem_lookup() call before accessing memory. The verifier keeps
track of this by converting the type of the result register from
PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional
jump ensures safety. This check is currently exclusively performed
for the result register 0.

In the event the compiler reorders instructions, BPF_MOV64_REG
instructions may be moved before the conditional jump which causes
them to keep their type PTR_TO_MAP_VALUE_OR_NULL to which the
verifier objects when the register is accessed:

0: (b7) r1 = 10
1: (7b) *(u64 *)(r10 -8) = r1
2: (bf) r2 = r10
3: (07) r2 += -8
4: (18) r1 = 0x59c00000
6: (85) call 1
7: (bf) r4 = r0
8: (15) if r0 == 0x0 goto pc+1
 R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp
9: (7a) *(u64 *)(r4 +0) = 0
R4 invalid mem access 'map_value_or_null'

This commit extends the verifier to keep track of all identical
PTR_TO_MAP_VALUE_OR_NULL registers after a map_elem_lookup() by
assigning them an ID and then marking them all when the conditional
jump is observed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7035b997aaa5..ac5b393ee6b2 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -23,13 +23,13 @@ struct bpf_reg_state {
 	 * result in a bad access.
 	 */
 	u64 min_value, max_value;
+	u32 id;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
 		s64 imm;
 
 		/* valid when type == PTR_TO_PACKET* */
 		struct {
-			u32 id;
 			u16 off;
 			u16 range;
 		};
-- 
cgit v1.2.3


From d894be57ca92c8a8819ab544d550809e8731137b Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Thu, 20 Oct 2016 13:55:16 -0400
Subject: ethernet: use net core MTU range checking in more drivers

Somehow, I missed a healthy number of ethernet drivers in the last pass.
Most of these drivers either were in need of an updated max_mtu to make
jumbo frames possible to enable again. In a few cases, also setting a
different min_mtu to match previous lower bounds. There are also a few
drivers that had no upper bounds checking, so they're getting a brand new
ETH_MAX_MTU that is identical to IP_MAX_MTU, but accessible by includes
all ethernet and ethernet-like drivers all have already.

acenic:
- min_mtu = 0, max_mtu = 9000

amazon/ena:
- min_mtu = 128, max_mtu = adapter->max_mtu

amd/xgbe:
- min_mtu = 0, max_mtu = 9000

sb1250:
- min_mtu = 0, max_mtu = 1518

cxgb3:
- min_mtu = 81, max_mtu = 65535

cxgb4:
- min_mtu = 81, max_mtu = 9600

cxgb4vf:
- min_mtu = 81, max_mtu = 65535

benet:
- min_mtu = 256, max_mtu = 9000

ibmveth:
- min_mtu = 68, max_mtu = 65535

ibmvnic:
- min_mtu = adapter->min_mtu, max_mtu = adapter->max_mtu
- remove now redundant ibmvnic_change_mtu

jme:
- min_mtu = 1280, max_mtu = 9202

mv643xx_eth:
- min_mtu = 64, max_mtu = 9500

mlxsw:
- min_mtu = 0, max_mtu = 65535
- Basically bypassing the core checks, and instead relying on dynamic
  checks in the respective switch drivers' ndo_change_mtu functions

ns83820:
- min_mtu = 0
- remove redundant ns83820_change_mtu, only checked for mtu > 1500

netxen:
- min_mtu = 0, max_mtu = 8000 (P2), max_mtu = 9600 (P3)

qlge:
- min_mtu = 1500, max_mtu = 9000
- driver only supports setting mtu to 1500 or 9000, so the core check only
  rules out < 1500 and > 9000, qlge_change_mtu still needs to check that
  the value is 1500 or 9000

qualcomm/emac:
- min_mtu = 46, max_mtu = 9194

xilinx_axienet:
- min_mtu = 64, max_mtu = 9000

Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking")
CC: netdev@vger.kernel.org
CC: Jes Sorensen <jes@trained-monkey.org>
CC: Netanel Belgazal <netanel@annapurnalabs.com>
CC: Tom Lendacky <thomas.lendacky@amd.com>
CC: Santosh Raspatur <santosh@chelsio.com>
CC: Hariprasad S <hariprasad@chelsio.com>
CC: Sathya Perla <sathya.perla@broadcom.com>
CC: Ajit Khaparde <ajit.khaparde@broadcom.com>
CC: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
CC: Somnath Kotur <somnath.kotur@broadcom.com>
CC: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
CC: John Allen <jallen@linux.vnet.ibm.com>
CC: Guo-Fu Tseng <cooldavid@cooldavid.org>
CC: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
CC: Jiri Pirko <jiri@mellanox.com>
CC: Ido Schimmel <idosch@mellanox.com>
CC: Manish Chopra <manish.chopra@qlogic.com>
CC: Sony Chacko <sony.chacko@qlogic.com>
CC: Rajesh Borundia <rajesh.borundia@qlogic.com>
CC: Timur Tabi <timur@codeaurora.org>
CC: Anirudha Sarangi <anirudh@xilinx.com>
CC: John Linn <John.Linn@xilinx.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 864d6f2b2cb0..3e5185e9ef03 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -36,6 +36,7 @@
 #define ETH_FCS_LEN	4		/* Octets in the FCS		 */
 
 #define ETH_MIN_MTU	68		/* Min IPv4 MTU per RFC791	*/
+#define ETH_MAX_MTU	0xFFFFU		/* 65535, same as IP_MAX_MTU	*/
 
 /*
  *	These are the defined Ethernet Protocol ID's.
-- 
cgit v1.2.3


From 8b6b4135e4fb2b537f33b811c13f77bee25ca8d3 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Thu, 20 Oct 2016 13:55:19 -0400
Subject: net: use core MTU range checking in WAN drivers

- set min/max_mtu in all hdlc drivers, remove hdlc_change_mtu
- sent max_mtu in lec driver, remove lec_change_mtu
- set min/max_mtu in x25_asy driver

CC: netdev@vger.kernel.org
CC: Krzysztof Halasa <khc@pm.waw.pl>
CC: Krzysztof Halasa <khalasa@piap.pl>
CC: Jan "Yenya" Kasprzak <kas@fi.muni.cz>
CC: Francois Romieu <romieu@fr.zoreil.com>
CC: Kevin Curtis <kevin.curtis@farsite.co.uk>
CC: Zhao Qiang <qiang.zhao@nxp.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/hdlc.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index e31bcd4c7859..97585d9679f3 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -93,8 +93,6 @@ static __inline__ void debug_frame(const struct sk_buff *skb)
 int hdlc_open(struct net_device *dev);
 /* Must be called by hardware driver when HDLC device is being closed */
 void hdlc_close(struct net_device *dev);
-/* May be used by hardware driver */
-int hdlc_change_mtu(struct net_device *dev, int new_mtu);
 /* Must be pointed to by hw driver's dev->netdev_ops->ndo_start_xmit */
 netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
-- 
cgit v1.2.3


From b3e3893e1253692c3d2b8e8ebd5a26183ed30443 Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Thu, 20 Oct 2016 13:55:22 -0400
Subject: net: use core MTU range checking in misc drivers

firewire-net:
- set min/max_mtu
- remove fwnet_change_mtu

nes:
- set max_mtu
- clean up nes_netdev_change_mtu

xpnet:
- set min/max_mtu
- remove xpnet_dev_change_mtu

hippi:
- set min/max_mtu
- remove hippi_change_mtu

batman-adv:
- set max_mtu
- remove batadv_interface_change_mtu
- initialization is a little async, not 100% certain that max_mtu is set
  in the optimal place, don't have hardware to test with

rionet:
- set min/max_mtu
- remove rionet_change_mtu

slip:
- set min/max_mtu
- streamline sl_change_mtu

um/net_kern:
- remove pointless ndo_change_mtu

hsi/clients/ssi_protocol:
- use core MTU range checking
- remove now redundant ssip_pn_set_mtu

ipoib:
- set a default max MTU value
- Note: ipoib's actual max MTU can vary, depending on if the device is in
  connected mode or not, so we'll just set the max_mtu value to the max
  possible, and let the ndo_change_mtu function continue to validate any new
  MTU change requests with checks for CM or not. Note that ipoib has no
  min_mtu set, and thus, the network core's mtu > 0 check is the only lower
  bounds here.

mptlan:
- use net core MTU range checking
- remove now redundant mpt_lan_change_mtu

fddi:
- min_mtu = 21, max_mtu = 4470
- remove now redundant fddi_change_mtu (including export)

fjes:
- min_mtu = 8192, max_mtu = 65536
- The max_mtu value is actually one over IP_MAX_MTU here, but the idea is to
  get past the core net MTU range checks so fjes_change_mtu can validate a
  new MTU against what it supports (see fjes_support_mtu in fjes_hw.c)

hsr:
- min_mtu = 0 (calls ether_setup, max_mtu is 1500)

f_phonet:
- min_mtu = 6, max_mtu = 65541

u_ether:
- min_mtu = 14, max_mtu = 15412

phonet/pep-gprs:
- min_mtu = 576, max_mtu = 65530
- remove redundant gprs_set_mtu

CC: netdev@vger.kernel.org
CC: linux-rdma@vger.kernel.org
CC: Stefan Richter <stefanr@s5r6.in-berlin.de>
CC: Faisal Latif <faisal.latif@intel.com>
CC: linux-rdma@vger.kernel.org
CC: Cliff Whickman <cpw@sgi.com>
CC: Robin Holt <robinmholt@gmail.com>
CC: Jes Sorensen <jes@trained-monkey.org>
CC: Marek Lindner <mareklindner@neomailbox.ch>
CC: Simon Wunderlich <sw@simonwunderlich.de>
CC: Antonio Quartulli <a@unstable.cc>
CC: Sathya Prakash <sathya.prakash@broadcom.com>
CC: Chaitra P B <chaitra.basappa@broadcom.com>
CC: Suganath Prabu Subramani <suganath-prabu.subramani@broadcom.com>
CC: MPT-FusionLinux.pdl@broadcom.com
CC: Sebastian Reichel <sre@kernel.org>
CC: Felipe Balbi <balbi@kernel.org>
CC: Arvid Brodin <arvid.brodin@alten.se>
CC: Remi Denis-Courmont <courmisch@gmail.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/fddidevice.h  | 1 -
 include/linux/hippidevice.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fddidevice.h b/include/linux/fddidevice.h
index 9a79f0106da1..32c22cfb238b 100644
--- a/include/linux/fddidevice.h
+++ b/include/linux/fddidevice.h
@@ -26,7 +26,6 @@
 
 #ifdef __KERNEL__
 __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev);
-int fddi_change_mtu(struct net_device *dev, int new_mtu);
 struct net_device *alloc_fddidev(int sizeof_priv);
 #endif
 
diff --git a/include/linux/hippidevice.h b/include/linux/hippidevice.h
index 8ec23fb0b412..402f99e328d4 100644
--- a/include/linux/hippidevice.h
+++ b/include/linux/hippidevice.h
@@ -32,7 +32,6 @@ struct hippi_cb {
 };
 
 __be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev);
-int hippi_change_mtu(struct net_device *dev, int new_mtu);
 int hippi_mac_addr(struct net_device *dev, void *p);
 int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p);
 struct net_device *alloc_hippi_dev(int sizeof_priv);
-- 
cgit v1.2.3


From f8c3bf00d440df2bc2c3f669d460868d9ba67226 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 21 Oct 2016 13:55:45 +0200
Subject: net/socket: factor out helpers for memory and queue manipulation

Basic sock operations that udp code can use with its own
memory accounting schema. No functional change is introduced
in the existing APIs.

v4 -> v5:
  - avoid whitespace changes

v2 -> v4:
  - avoid exporting __sock_enqueue_skb

v1 -> v2:
  - avoid export sock_rmem_free

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index ebf75db08e06..276489553338 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1274,7 +1274,9 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
 /*
  * Functions for memory accounting
  */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
 int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
 #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
@@ -1950,6 +1952,8 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+			unsigned int flags);
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
-- 
cgit v1.2.3


From f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 21 Oct 2016 13:55:46 +0200
Subject: udp: implement memory accounting helpers

Avoid using the generic helpers.
Use the receive queue spin lock to protect the memory
accounting operation, both on enqueue and on dequeue.

On dequeue perform partial memory reclaiming, trying to
leave a quantum of forward allocated memory.

On enqueue use a custom helper, to allow some optimizations:
- use a plain spin_lock() variant instead of the slightly
  costly spin_lock_irqsave(),
- avoid dst_force check, since the calling code has already
  dropped the skb dst
- avoid orphaning the skb, since skb_steal_sock() already did
  the work for us

The above needs custom memory reclaiming on shutdown, provided
by the udp_destruct_sock().

v5 -> v6:
  - don't orphan the skb on enqueue

v4 -> v5:
  - replace the mem_lock with the receive queue spin lock
  - ensure that the bh is always allowed to enqueue at least
    a skb, even if sk_rcvbuf is exceeded

v3 -> v4:
  - reworked memory accunting, simplifying the schema
  - provide an helper for both memory scheduling and enqueuing

v1 -> v2:
  - use a udp specific destrctor to perform memory reclaiming
  - remove a couple of helpers, unneeded after the above cleanup
  - do not reclaim memory on dequeue if not under memory
    pressure
  - reworked the fwd accounting schema to avoid potential
    integer overflow

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index ea53a87d880f..18f1e6b91927 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -246,6 +246,9 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 }
 
 /* net/ipv4/udp.c */
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
@@ -258,6 +261,7 @@ void udp_flush_pending_frames(struct sock *sk);
 void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int udp_init_sock(struct sock *sk);
 int udp_disconnect(struct sock *sk, int flags);
 unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
-- 
cgit v1.2.3


From 2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 Oct 2016 12:46:33 +0200
Subject: bpf: add helper for retrieving current numa node id

Use case is mainly for soreuseport to select sockets for the local
numa node, but since generic, lets also add this for other networking
and tracing program types.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 1 +
 include/uapi/linux/bpf.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c201017b5730..edcd96ded8aa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,6 +319,7 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
+extern const struct bpf_func_proto bpf_get_numa_node_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
 extern const struct bpf_func_proto bpf_ktime_get_ns_proto;
 extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f09c70b97eca..374ef582ae18 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -426,6 +426,12 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_set_hash_invalid,
 
+	/**
+	 * bpf_get_numa_node_id()
+	 * Returns the id of the current NUMA node.
+	 */
+	BPF_FUNC_get_numa_node_id,
+
 	__BPF_FUNC_MAX_ID,
 };
 
-- 
cgit v1.2.3


From f76a9db351f8beb3259c4ba38058de0058ab8000 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 21 Oct 2016 16:10:22 +0200
Subject: lwt: Remove unused len field

The field is initialized by ILA and MPLS but never used. Remove it.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 67d235f43202..82e76fe1c1f7 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -24,11 +24,10 @@ enum {
 struct lwtunnel_state {
 	__u16		type;
 	__u16		flags;
+	__u16		headroom;
 	atomic_t	refcnt;
 	int		(*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int		(*orig_input)(struct sk_buff *);
-	int             len;
-	__u16		headroom;
 	struct		rcu_head rcu;
 	__u8            data[0];
 };
-- 
cgit v1.2.3


From 432490f9d455fb842d70219f22d9d2c812371676 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 21 Oct 2016 13:03:44 +0300
Subject: net: ip, diag -- Add diag interface for raw sockets

In criu we are actively using diag interface to collect sockets
present in the system when dumping applications. And while for
unix, tcp, udp[lite], packet, netlink it works as expected,
the raw sockets do not have. Thus add it.

v2:
 - add missing sock_put calls in raw_diag_dump_one (by eric.dumazet@)
 - implement @destroy for diag requests (by dsa@)

v3:
 - add export of raw_abort for IPv6 (by dsa@)
 - pass net-admin flag into inet_sk_diag_fill due to
   changes in net-next branch (by dsa@)

v4:
 - use @pad in struct inet_diag_req_v2 for raw socket
   protocol specification: raw module carries sockets
   which may have custom protocol passed from socket()
   syscall and sole @sdiag_protocol is not enough to
   match underlied ones
 - start reporting protocol specifed in socket() call
   when sockets are raw ones for the same reason: user
   space tools like ss may parse this attribute and use
   it for socket matching

v5 (by eric.dumazet@):
 - use sock_hold in raw_sock_get instead of atomic_inc,
   we're holding (raw_v4_hashinfo|raw_v6_hashinfo)->lock
   when looking up so counter won't be zero here.

v6:
 - use sdiag_raw_protocol() helper which will access @pad
   structure used for raw sockets protocol specification:
   we can't simply rename this member without breaking uapi

v7:
 - sine sdiag_raw_protocol() helper is not suitable for
   uapi lets rather make an alias structure with proper
   names. __check_inet_diag_req_raw helper will catch
   if any of structure unintentionally changed.

CC: David S. Miller <davem@davemloft.net>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: David Ahern <dsa@cumulusnetworks.com>
CC: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
CC: James Morris <jmorris@namei.org>
CC: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
CC: Patrick McHardy <kaber@trash.net>
CC: Andrey Vagin <avagin@openvz.org>
CC: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h              |  6 ++++++
 include/net/rawv6.h            |  7 +++++++
 include/uapi/linux/inet_diag.h | 17 +++++++++++++++++
 3 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/include/net/raw.h b/include/net/raw.h
index 3e789008394d..57c33dd22ec4 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -23,6 +23,12 @@
 
 extern struct proto raw_prot;
 
+extern struct raw_hashinfo raw_v4_hashinfo;
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, __be32 raddr,
+			     __be32 laddr, int dif);
+
+int raw_abort(struct sock *sk, int err);
 void raw_icmp_error(struct sk_buff *, int, u32);
 int raw_local_deliver(struct sk_buff *, int);
 
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index 87783dea0791..cbe4e9de1894 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,6 +3,13 @@
 
 #include <net/protocol.h>
 
+extern struct raw_hashinfo raw_v6_hashinfo;
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, const struct in6_addr *loc_addr,
+			     const struct in6_addr *rmt_addr, int dif);
+
+int raw_abort(struct sock *sk, int err);
+
 void raw6_icmp_error(struct sk_buff *, int nexthdr,
 		u8 type, u8 code, int inner_offset, __be32);
 bool raw6_local_deliver(struct sk_buff *, int);
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 509cd961068d..bbe201047df6 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -43,6 +43,23 @@ struct inet_diag_req_v2 {
 	struct inet_diag_sockid id;
 };
 
+/*
+ * SOCK_RAW sockets require the underlied protocol to be
+ * additionally specified so we can use @pad member for
+ * this, but we can't rename it because userspace programs
+ * still may depend on this name. Instead lets use another
+ * structure definition as an alias for struct
+ * @inet_diag_req_v2.
+ */
+struct inet_diag_req_raw {
+	__u8	sdiag_family;
+	__u8	sdiag_protocol;
+	__u8	idiag_ext;
+	__u8	sdiag_raw_protocol;
+	__u32	idiag_states;
+	struct inet_diag_sockid id;
+};
+
 enum {
 	INET_DIAG_REQ_NONE,
 	INET_DIAG_REQ_BYTECODE,
-- 
cgit v1.2.3


From 3cf25904fe467aebeaa77d402b6cf3c6c5d6303b Mon Sep 17 00:00:00 2001
From: Xo Wang <xow@google.com>
Date: Fri, 21 Oct 2016 10:20:12 -0700
Subject: net: phy: broadcom: Update Auxiliary Control Register macros

Add the RXD-to-RXC skew (delay) time bit in the Miscellaneous Control
shadow register and a mask for the shadow selector field.

Remove a re-definition of MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL.

Signed-off-by: Xo Wang <xow@google.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index e3354b74286c..22c4421c916c 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -105,11 +105,12 @@
 #define MII_BCM54XX_AUXCTL_ACTL_SMDSP_ENA	0x0800
 
 #define MII_BCM54XX_AUXCTL_MISC_WREN	0x8000
+#define MII_BCM54XX_AUXCTL_MISC_RXD_RXC_SKEW	0x0100
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 
-#define MII_BCM54XX_AUXCTL_SHDWSEL_AUXCTL	0x0000
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
 /*
  * Broadcom LED source encodings.  These are used in BCM5461, BCM5481,
-- 
cgit v1.2.3


From d92ead16be405b6d52ff7b366d1c9865ccc684bd Mon Sep 17 00:00:00 2001
From: Xo Wang <xow@google.com>
Date: Fri, 21 Oct 2016 10:20:13 -0700
Subject: net: phy: broadcom: Add support for BCM54612E

This PHY has internal delays enabled after reset. This clears the
internal delay enables unless the interface specifically requests them.

Signed-off-by: Xo Wang <xow@google.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 22c4421c916c..60def78c4e12 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -18,6 +18,7 @@
 #define PHY_ID_BCM5421			0x002060e0
 #define PHY_ID_BCM5464			0x002060b0
 #define PHY_ID_BCM5461			0x002060c0
+#define PHY_ID_BCM54612E		0x03625e60
 #define PHY_ID_BCM54616S		0x03625d10
 #define PHY_ID_BCM57780			0x03625d90
 
-- 
cgit v1.2.3


From 73c7da3dae1e7cd8febeab13767b2698b84dfa15 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Thu, 20 Oct 2016 20:08:22 +0100
Subject: cfg80211: add generic helper to check interface is running

Add a helper using wdev to check if interface is running. This
deals with both non-netdev and netdev interfaces. In struct
wireless_dev replace 'p2p_started' and 'nan_started' by
'is_running' as those are mutually exclusive anyway, and unify
all the code to use wdev_running().

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2bbbcc3eecac..ec39f891b7a3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3781,8 +3781,8 @@ struct cfg80211_cached_keys;
  * @beacon_interval: beacon interval used on this device for transmitting
  *	beacons, 0 when not valid
  * @address: The address for this device, valid only if @netdev is %NULL
- * @p2p_started: true if this is a P2P Device that has been started
- * @nan_started: true if this is a NAN interface that has been started
+ * @is_running: true if this is a non-netdev device that has been started, e.g.
+ *	the P2P Device.
  * @cac_started: true if DFS channel availability check has been started
  * @cac_start_time: timestamp (jiffies) when the dfs state was entered.
  * @cac_time_ms: CAC time in ms
@@ -3814,7 +3814,7 @@ struct wireless_dev {
 
 	struct mutex mtx;
 
-	bool use_4addr, p2p_started, nan_started;
+	bool use_4addr, is_running;
 
 	u8 address[ETH_ALEN] __aligned(sizeof(u16));
 
@@ -3871,6 +3871,13 @@ static inline u8 *wdev_address(struct wireless_dev *wdev)
 	return wdev->address;
 }
 
+static inline bool wdev_running(struct wireless_dev *wdev)
+{
+	if (wdev->netdev)
+		return netif_running(wdev->netdev);
+	return wdev->is_running;
+}
+
 /**
  * wdev_priv - return wiphy priv from wireless_dev
  *
-- 
cgit v1.2.3


From 4c8dea638c16141adb046fd2e0cab51dfe43650c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 21 Oct 2016 14:25:13 +0200
Subject: cfg80211: validate beacon int as part of iface combinations

Remove the pointless checking against interface combinations in
the initial basic beacon interval validation, that currently isn't
taking into account radar detection or channels properly. Instead,
just validate the basic range there, and then delay real checking
to the interface combination validation that drivers must do.

This means that drivers wanting to use the beacon_int_min_gcd will
now have to pass the new_beacon_int when validating the AP/mesh
start.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ec39f891b7a3..d1ffbc3a8e55 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -784,19 +784,15 @@ struct cfg80211_csa_settings {
  * @iftype_num: array with the number of interfaces of each interface
  *	type.  The index is the interface type as specified in &enum
  *	nl80211_iftype.
- * @beacon_int_gcd: a value specifying GCD of all beaconing interfaces,
- *	the GCD of a single value is considered the value itself, so for
- *	a single interface this should be set to that interface's beacon
- *	interval
- * @beacon_int_different: a flag indicating whether or not all beacon
- *	intervals (of beaconing interfaces) are different or not.
+ * @new_beacon_int: set this to the beacon interval of a new interface
+ *	that's not operating yet, if such is to be checked as part of
+ *	the verification
  */
 struct iface_combination_params {
 	int num_different_channels;
 	u8 radar_detect;
 	int iftype_num[NUM_NL80211_IFTYPES];
-	u32 beacon_int_gcd;
-	bool beacon_int_different;
+	u32 new_beacon_int;
 };
 
 /**
-- 
cgit v1.2.3


From 11b6b5a4ced2f2c76073b97ee08ca0eab8358fde Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:41:58 +0300
Subject: cfg80211: Rename SAE_DATA to more generic AUTH_DATA

This adds defines and nl80211 extensions to allow FILS Authentication to
be implemented similarly to SAE. FILS does not need the special rules
for the Authentication transaction number and Status code fields, but it
does need to add non-IE fields. The previously used
NL80211_ATTR_SAE_DATA can be reused for this to avoid having to
duplicate that implementation. Rename that attribute to more generic
NL80211_ATTR_AUTH_DATA (with backwards compatibility define for
NL80211_SAE_DATA).

Also document the special rules related to the Authentication
transaction number and Status code fiels.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 12 +++++++-----
 include/uapi/linux/nl80211.h | 15 ++++++++++++---
 2 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d1ffbc3a8e55..dffc265a4fd6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1785,9 +1785,11 @@ const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie);
  * @key_len: length of WEP key for shared key authentication
  * @key_idx: index of WEP key for shared key authentication
  * @key: WEP key for shared key authentication
- * @sae_data: Non-IE data to use with SAE or %NULL. This starts with
- *	Authentication transaction sequence number field.
- * @sae_data_len: Length of sae_data buffer in octets
+ * @auth_data: Fields and elements in Authentication frames. This contains
+ *	the authentication frame body (non-IE and IE data), excluding the
+ *	Authentication algorithm number, i.e., starting at the Authentication
+ *	transaction sequence number field.
+ * @auth_data_len: Length of auth_data buffer in octets
  */
 struct cfg80211_auth_request {
 	struct cfg80211_bss *bss;
@@ -1796,8 +1798,8 @@ struct cfg80211_auth_request {
 	enum nl80211_auth_type auth_type;
 	const u8 *key;
 	u8 key_len, key_idx;
-	const u8 *sae_data;
-	size_t sae_data_len;
+	const u8 *auth_data;
+	size_t auth_data_len;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 1362d24957b5..18bcf44899aa 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1638,8 +1638,16 @@ enum nl80211_commands {
  *	the connection request from a station. nl80211_connect_failed_reason
  *	enum has different reasons of connection failure.
  *
- * @NL80211_ATTR_SAE_DATA: SAE elements in Authentication frames. This starts
- *	with the Authentication transaction sequence number field.
+ * @NL80211_ATTR_AUTH_DATA: Fields and elements in Authentication frames.
+ *	This contains the authentication frame body (non-IE and IE data),
+ *	excluding the Authentication algorithm number, i.e., starting at the
+ *	Authentication transaction sequence number field. It is used with
+ *	authentication algorithms that need special fields to be added into
+ *	the frames (SAE and FILS). Currently, only the SAE cases use the
+ *	initial two fields (Authentication transaction sequence number and
+ *	Status code). However, those fields are included in the attribute data
+ *	for all authentication algorithms to keep the attribute definition
+ *	consistent.
  *
  * @NL80211_ATTR_VHT_CAPABILITY: VHT Capability information element (from
  *	association request when used with NL80211_CMD_NEW_STATION)
@@ -2195,7 +2203,7 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_CONN_FAILED_REASON,
 
-	NL80211_ATTR_SAE_DATA,
+	NL80211_ATTR_AUTH_DATA,
 
 	NL80211_ATTR_VHT_CAPABILITY,
 
@@ -2347,6 +2355,7 @@ enum nl80211_attrs {
 #define NL80211_ATTR_SCAN_GENERATION NL80211_ATTR_GENERATION
 #define	NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG
 #define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER
+#define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA
 
 /*
  * Allow user space programs to use #ifdef on new attributes by defining them
-- 
cgit v1.2.3


From 60b8084e844814631b57da3d35f272e0ff799ab2 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:00 +0300
Subject: cfg80211: Add feature flag for Fast Initial Link Setup (FILS) as STA

This defines a feature flag that drivers can use to indicate that they
support FILS authentication/association (IEEE 802.11ai) when using user
space SME (NL80211_CMD_AUTHENTICATE) in station mode.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 18bcf44899aa..7825fd4db19e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -4647,6 +4647,8 @@ enum nl80211_feature_flags {
  *	configuration (AP/mesh) with HT rates.
  * @NL80211_EXT_FEATURE_BEACON_RATE_VHT: Driver supports beacon rate
  *	configuration (AP/mesh) with VHT rates.
+ * @NL80211_EXT_FEATURE_FILS_STA: This driver supports Fast Initial Link Setup
+ *	with user space SME (NL80211_CMD_AUTHENTICATE) in station mode.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
@@ -4661,6 +4663,7 @@ enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_BEACON_RATE_LEGACY,
 	NL80211_EXT_FEATURE_BEACON_RATE_HT,
 	NL80211_EXT_FEATURE_BEACON_RATE_VHT,
+	NL80211_EXT_FEATURE_FILS_STA,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
-- 
cgit v1.2.3


From 3f817fe718c6cb3ddcc2ab04ba86faecc20ef8fe Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:01 +0300
Subject: cfg80211: Define IEEE P802.11ai (FILS) information elements

Define the Element IDs and Element ID Extensions from IEEE
P802.11ai/D11.0. In addition, add a new cfg80211_find_ext_ie() wrapper
to make it easier to find information elements that used the Element ID
Extension field.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 20 ++++++++++++++++++++
 include/net/cfg80211.h    | 21 +++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a80516fd65c8..d428adf51446 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1960,6 +1960,26 @@ enum ieee80211_eid {
 
 	WLAN_EID_VENDOR_SPECIFIC = 221,
 	WLAN_EID_QOS_PARAMETER = 222,
+	WLAN_EID_CAG_NUMBER = 237,
+	WLAN_EID_AP_CSN = 239,
+	WLAN_EID_FILS_INDICATION = 240,
+	WLAN_EID_DILS = 241,
+	WLAN_EID_FRAGMENT = 242,
+	WLAN_EID_EXTENSION = 255
+};
+
+/* Element ID Extensions for Element ID 255 */
+enum ieee80211_eid_ext {
+	WLAN_EID_EXT_ASSOC_DELAY_INFO = 1,
+	WLAN_EID_EXT_FILS_REQ_PARAMS = 2,
+	WLAN_EID_EXT_FILS_KEY_CONFIRM = 3,
+	WLAN_EID_EXT_FILS_SESSION = 4,
+	WLAN_EID_EXT_FILS_HLP_CONTAINER = 5,
+	WLAN_EID_EXT_FILS_IP_ADDR_ASSIGN = 6,
+	WLAN_EID_EXT_KEY_DELIVERY = 7,
+	WLAN_EID_EXT_FILS_WRAPPED_DATA = 8,
+	WLAN_EID_EXT_FILS_PUBLIC_KEY = 12,
+	WLAN_EID_EXT_FILS_NONCE = 13,
 };
 
 /* Action category code */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index dffc265a4fd6..8ca2e9f354f7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4180,6 +4180,27 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
 	return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0);
 }
 
+/**
+ * cfg80211_find_ext_ie - find information element with EID Extension in data
+ *
+ * @ext_eid: element ID Extension
+ * @ies: data consisting of IEs
+ * @len: length of data
+ *
+ * Return: %NULL if the extended element ID could not be found or if
+ * the element is invalid (claims to be longer than the given
+ * data), or a pointer to the first byte of the requested
+ * element, that is the byte containing the element ID.
+ *
+ * Note: There are no checks on the element length other than
+ * having to fit into the given data.
+ */
+static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
+{
+	return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len,
+				      &ext_eid, 1, 2);
+}
+
 /**
  * cfg80211_find_vendor_ie - find vendor specific information element in data
  *
-- 
cgit v1.2.3


From 631810603a20874554b2f17adf42b72d0f15eda5 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:02 +0300
Subject: cfg80211: Add Fast Initial Link Setup (FILS) auth algs

This defines authentication algorithms for FILS (IEEE 802.11ai).

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    | 3 +++
 include/uapi/linux/nl80211.h | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index d428adf51446..793a0174ba29 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1576,6 +1576,9 @@ struct ieee80211_vht_operation {
 #define WLAN_AUTH_SHARED_KEY 1
 #define WLAN_AUTH_FT 2
 #define WLAN_AUTH_SAE 3
+#define WLAN_AUTH_FILS_SK 4
+#define WLAN_AUTH_FILS_SK_PFS 5
+#define WLAN_AUTH_FILS_PK 6
 #define WLAN_AUTH_LEAP 128
 
 #define WLAN_AUTH_CHALLENGE_LEN 128
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7825fd4db19e..4dc21265cd12 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -3669,6 +3669,9 @@ enum nl80211_bss_status {
  * @NL80211_AUTHTYPE_FT: Fast BSS Transition (IEEE 802.11r)
  * @NL80211_AUTHTYPE_NETWORK_EAP: Network EAP (some Cisco APs and mainly LEAP)
  * @NL80211_AUTHTYPE_SAE: Simultaneous authentication of equals
+ * @NL80211_AUTHTYPE_FILS_SK: Fast Initial Link Setup shared key
+ * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS
+ * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key
  * @__NL80211_AUTHTYPE_NUM: internal
  * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm
  * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by
@@ -3681,6 +3684,9 @@ enum nl80211_auth_type {
 	NL80211_AUTHTYPE_FT,
 	NL80211_AUTHTYPE_NETWORK_EAP,
 	NL80211_AUTHTYPE_SAE,
+	NL80211_AUTHTYPE_FILS_SK,
+	NL80211_AUTHTYPE_FILS_SK_PFS,
+	NL80211_AUTHTYPE_FILS_PK,
 
 	/* keep last */
 	__NL80211_AUTHTYPE_NUM,
-- 
cgit v1.2.3


From 348bd456699801920a309c66e382380809fbdf41 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:03 +0300
Subject: cfg80211: Add KEK/nonces for FILS association frames

The new nl80211 attributes can be used to provide KEK and nonces to
allow the driver to encrypt and decrypt FILS (Re)Association
Request/Response frames in station mode.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h    | 3 +++
 include/net/cfg80211.h       | 9 +++++++++
 include/uapi/linux/nl80211.h | 8 ++++++++
 3 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 793a0174ba29..fe849329511a 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2096,6 +2096,9 @@ enum ieee80211_key_len {
 #define IEEE80211_GCMP_MIC_LEN		16
 #define IEEE80211_GCMP_PN_LEN		6
 
+#define FILS_NONCE_LEN			16
+#define FILS_MAX_KEK_LEN		64
+
 /* Public action codes */
 enum ieee80211_pub_actioncode {
 	WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4,
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8ca2e9f354f7..738b4d8a4666 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1840,6 +1840,12 @@ enum cfg80211_assoc_req_flags {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa: VHT capability override
  * @vht_capa_mask: VHT capability mask indicating which fields to use
+ * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or
+ *	%NULL if FILS is not used.
+ * @fils_kek_len: Length of fils_kek in octets
+ * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association
+ *	Request/Response frame or %NULL if FILS is not used. This field starts
+ *	with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
  */
 struct cfg80211_assoc_request {
 	struct cfg80211_bss *bss;
@@ -1851,6 +1857,9 @@ struct cfg80211_assoc_request {
 	struct ieee80211_ht_cap ht_capa;
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa, vht_capa_mask;
+	const u8 *fils_kek;
+	size_t fils_kek_len;
+	const u8 *fils_nonces;
 };
 
 /**
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 4dc21265cd12..a268a009528a 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1944,6 +1944,11 @@ enum nl80211_commands {
  *	attribute.
  * @NL80211_ATTR_NAN_MATCH: used to report a match. This is a nested attribute.
  *	See &enum nl80211_nan_match_attributes.
+ * @NL80211_ATTR_FILS_KEK: KEK for FILS (Re)Association Request/Response frame
+ *	protection.
+ * @NL80211_ATTR_FILS_NONCES: Nonces (part of AAD) for FILS (Re)Association
+ *	Request/Response frame protection. This attribute contains the 16 octet
+ *	STA Nonce followed by 16 octets of AP Nonce.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2344,6 +2349,9 @@ enum nl80211_attrs {
 	NL80211_ATTR_NAN_FUNC,
 	NL80211_ATTR_NAN_MATCH,
 
+	NL80211_ATTR_FILS_KEK,
+	NL80211_ATTR_FILS_NONCES,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
-- 
cgit v1.2.3


From ce0ce13a1c89ff8b94b7f8fb32eb4c43e111c82e Mon Sep 17 00:00:00 2001
From: Michael Braun <michael-dev@fami-braun.de>
Date: Mon, 10 Oct 2016 19:12:22 +0200
Subject: cfg80211: configure multicast to unicast for AP interfaces

Add the ability to configure if an AP (and associated VLANs) will
do multicast-to-unicast conversion for ARP, IPv4 and IPv6 frames
(possibly within 802.1Q). If enabled, such frames are to be sent
to each station separately, with the DA replaced by their own MAC
address rather than the group address.

Note that this may break certain expectations of the receiver,
such as the ability to drop unicast IP packets received within
multicast L2 frames, or the ability to not send ICMP destination
unreachable messages for packets received in L2 multicast (which
is required, but the receiver can't tell the difference if this
new option is enabled.)

This also doesn't implement the 802.11 DMS (directed multicast
service).

Signed-off-by: Michael Braun <michael-dev@fami-braun.de>
[fix disabling, add better documentation & commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       |  6 ++++++
 include/uapi/linux/nl80211.h | 21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 738b4d8a4666..41ae3f500957 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2742,6 +2742,8 @@ struct cfg80211_nan_func {
  * @nan_change_conf: changes NAN configuration. The changed parameters must
  *	be specified in @changes (using &enum cfg80211_nan_conf_changes);
  *	All other parameters must be ignored.
+ *
+ * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3018,6 +3020,10 @@ struct cfg80211_ops {
 				   struct wireless_dev *wdev,
 				   struct cfg80211_nan_conf *conf,
 				   u32 changes);
+
+	int	(*set_multicast_to_unicast)(struct wiphy *wiphy,
+					    struct net_device *dev,
+					    const bool enabled);
 };
 
 /*
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index a268a009528a..e21d23dcb588 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -600,6 +600,20 @@
  *
  * @NL80211_CMD_SET_WDS_PEER: Set the MAC address of the peer on a WDS interface.
  *
+ * @NL80211_CMD_SET_MULTICAST_TO_UNICAST: Configure if this AP should perform
+ *	multicast to unicast conversion. When enabled, all multicast packets
+ *	with ethertype ARP, IPv4 or IPv6 (possibly within an 802.1Q header)
+ *	will be sent out to each station once with the destination (multicast)
+ *	MAC address replaced by the station's MAC address. Note that this may
+ *	break certain expectations of the receiver, e.g. the ability to drop
+ *	unicast IP packets encapsulated in multicast L2 frames, or the ability
+ *	to not send destination unreachable messages in such cases.
+ *	This can only be toggled per BSS. Configure this on an interface of
+ *	type %NL80211_IFTYPE_AP. It applies to all its VLAN interfaces
+ *	(%NL80211_IFTYPE_AP_VLAN), except for those in 4addr (WDS) mode.
+ *	If %NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED is not present with this
+ *	command, the feature is disabled.
+ *
  * @NL80211_CMD_JOIN_MESH: Join a mesh. The mesh ID must be given, and initial
  *	mesh config parameters may be given.
  * @NL80211_CMD_LEAVE_MESH: Leave the mesh network -- no special arguments, the
@@ -1069,6 +1083,8 @@ enum nl80211_commands {
 	NL80211_CMD_CHANGE_NAN_CONFIG,
 	NL80211_CMD_NAN_MATCH,
 
+	NL80211_CMD_SET_MULTICAST_TO_UNICAST,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
@@ -1950,6 +1966,9 @@ enum nl80211_commands {
  *	Request/Response frame protection. This attribute contains the 16 octet
  *	STA Nonce followed by 16 octets of AP Nonce.
  *
+ * @NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED: Indicates whether or not multicast
+ *	packets should be send out as unicast to all stations (flag attribute).
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2352,6 +2371,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_FILS_KEK,
 	NL80211_ATTR_FILS_NONCES,
 
+	NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
-- 
cgit v1.2.3


From 088e8df82f91a24728d49d9532cab7ebdee5117f Mon Sep 17 00:00:00 2001
From: vamsi krishna <vamsin@qti.qualcomm.com>
Date: Thu, 27 Oct 2016 16:51:11 +0300
Subject: cfg80211: Add support to update connection parameters

Add functionality to update the connection parameters when in connected
state, so that driver/firmware uses the updated parameters for
subsequent roaming. This is for drivers that support internal BSS
selection and roaming. The new command does not change the current
association state, i.e., it can be used to update IE contents for future
(re)associations without causing an immediate disassociation or
reassociation with the current BSS.

This commit implements the required functionality for updating IEs for
(Re)Association Request frame only. Other parameters can be added in
future when required.

Signed-off-by: vamsi krishna <vamsin@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h       | 24 ++++++++++++++++++++++++
 include/uapi/linux/nl80211.h |  8 ++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 41ae3f500957..c575583b50fb 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2050,6 +2050,18 @@ struct cfg80211_connect_params {
 	const u8 *prev_bssid;
 };
 
+/**
+ * enum cfg80211_connect_params_changed - Connection parameters being updated
+ *
+ * This enum provides information of all connect parameters that
+ * have to be updated as part of update_connect_params() call.
+ *
+ * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated
+ */
+enum cfg80211_connect_params_changed {
+	UPDATE_ASSOC_IES		= BIT(0),
+};
+
 /**
  * enum wiphy_params_flags - set_wiphy_params bitfield values
  * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed
@@ -2571,6 +2583,14 @@ struct cfg80211_nan_func {
  *	cases, the result of roaming is indicated with a call to
  *	cfg80211_roamed() or cfg80211_roamed_bss().
  *	(invoked with the wireless_dev mutex held)
+ * @update_connect_params: Update the connect parameters while connected to a
+ *	BSS. The updated parameters can be used by driver/firmware for
+ *	subsequent BSS selection (roaming) decisions and to form the
+ *	Authentication/(Re)Association Request frames. This call does not
+ *	request an immediate disassociation or reassociation with the current
+ *	BSS, i.e., this impacts only subsequent (re)associations. The bits in
+ *	changed are defined in &enum cfg80211_connect_params_changed.
+ *	(invoked with the wireless_dev mutex held)
  * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
  *      connection is in progress. Once done, call cfg80211_disconnected() in
  *      case connection was already established (invoked with the
@@ -2858,6 +2878,10 @@ struct cfg80211_ops {
 
 	int	(*connect)(struct wiphy *wiphy, struct net_device *dev,
 			   struct cfg80211_connect_params *sme);
+	int	(*update_connect_params)(struct wiphy *wiphy,
+					 struct net_device *dev,
+					 struct cfg80211_connect_params *sme,
+					 u32 changed);
 	int	(*disconnect)(struct wiphy *wiphy, struct net_device *dev,
 			      u16 reason_code);
 
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e21d23dcb588..259c9c77fdc1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -888,6 +888,12 @@
  *	This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and
  *	%NL80211_ATTR_COOKIE.
  *
+ * @NL80211_CMD_UPDATE_CONNECT_PARAMS: Update one or more connect parameters
+ *	for subsequent roaming cases if the driver or firmware uses internal
+ *	BSS selection. This command can be issued only while connected and it
+ *	does not result in a change for the current association. Currently,
+ *	only the %NL80211_ATTR_IE data is used and updated with this command.
+ *
  * @NL80211_CMD_MAX: highest used command number
  * @__NL80211_CMD_AFTER_LAST: internal use
  */
@@ -1085,6 +1091,8 @@ enum nl80211_commands {
 
 	NL80211_CMD_SET_MULTICAST_TO_UNICAST,
 
+	NL80211_CMD_UPDATE_CONNECT_PARAMS,
+
 	/* add new commands above here */
 
 	/* used to define NL80211_CMD_MAX below */
-- 
cgit v1.2.3


From 4fe77d82ef80c77031c9c6f8554cd0dee2aa423a Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Mon, 24 Oct 2016 20:32:57 +0800
Subject: skbedit: allow the user to specify bitmask for mark

The user may want to use only some bits of the skb mark in
his skbedit rules because the remaining part might be used by
something else.

Introduce the "mask" parameter to the skbedit actor in order
to implement such functionality.

When the mask is specified, only those bits selected by the
latter are altered really changed by the actor, while the
rest is left untouched.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_skbedit.h        | 1 +
 include/uapi/linux/tc_act/tc_skbedit.h | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 5767e9dbcf92..19cd3d345804 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -27,6 +27,7 @@ struct tcf_skbedit {
 	u32		flags;
 	u32		priority;
 	u32		mark;
+	u32		mask;
 	u16		queue_mapping;
 	u16		ptype;
 };
diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index a4d00c608d8f..2884425738ce 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -28,6 +28,7 @@
 #define SKBEDIT_F_QUEUE_MAPPING		0x2
 #define SKBEDIT_F_MARK			0x4
 #define SKBEDIT_F_PTYPE			0x8
+#define SKBEDIT_F_MASK			0x10
 
 struct tc_skbedit {
 	tc_gen;
@@ -42,6 +43,7 @@ enum {
 	TCA_SKBEDIT_MARK,
 	TCA_SKBEDIT_PAD,
 	TCA_SKBEDIT_PTYPE,
+	TCA_SKBEDIT_MASK,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
-- 
cgit v1.2.3


From c90c39dab3e02ce45427a214746711f33ad13be6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:01 +0200
Subject: genetlink: introduce and use genl_family_attrbuf()

This helper function allows family implementations to access
their family's attrbuf. This gets rid of the attrbuf usage
in families, and also adds locking validation, since it's not
valid to use the attrbuf with parallel_ops or outside of the
dumpit callback.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 8d4608ce8716..ef9defb3f5bc 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -73,6 +73,8 @@ struct genl_family {
 	struct module		*module;
 };
 
+struct nlattr **genl_family_attrbuf(struct genl_family *family);
+
 /**
  * struct genl_info - receiving information
  * @snd_seq: sending sequence number
-- 
cgit v1.2.3


From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:02 +0200
Subject: genetlink: no longer support using static family IDs

Static family IDs have never really been used, the only
use case was the workaround I introduced for those users
that assumed their family ID was also their multicast
group ID.

Additionally, because static family IDs would never be
reserved by the generic netlink code, using a relatively
low ID would only work for built-in families that can be
registered immediately after generic netlink is started,
which is basically only the control family (apart from
the workaround code, which I also had to add code for so
it would reserve those IDs)

Thus, anything other than GENL_ID_GENERATE is flawed and
luckily not used except in the cases I mentioned. Move
those workarounds into a few lines of code, and then get
rid of GENL_ID_GENERATE entirely, making it more robust.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/genl_magic_func.h | 1 -
 include/net/genetlink.h         | 7 ++-----
 include/uapi/linux/genetlink.h  | 1 -
 3 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 667c31101b8b..7c070c1fe457 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -260,7 +260,6 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
  */
 #define ZZZ_genl_family		CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
 static struct genl_family ZZZ_genl_family __read_mostly = {
-	.id = GENL_ID_GENERATE,
 	.name = __stringify(GENL_MAGIC_FAMILY),
 	.version = GENL_MAGIC_VERSION,
 #ifdef GENL_MAGIC_FAMILY_HDRSZ
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index ef9defb3f5bc..43a5c3975a2f 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -20,7 +20,7 @@ struct genl_info;
 
 /**
  * struct genl_family - generic netlink family
- * @id: protocol family idenfitier
+ * @id: protocol family identifier (private)
  * @hdrsize: length of user specific header in bytes
  * @name: name of family
  * @version: protocol version
@@ -48,7 +48,7 @@ struct genl_info;
  * @n_ops: number of operations supported by this family (private)
  */
 struct genl_family {
-	unsigned int		id;
+	unsigned int		id;		/* private */
 	unsigned int		hdrsize;
 	char			name[GENL_NAMSIZ];
 	unsigned int		version;
@@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family)
  * Registers the specified family and operations from the specified table.
  * Only one family may be registered with the same family name or identifier.
  *
- * The family id may equal GENL_ID_GENERATE causing an unique id to
- * be automatically generated and assigned.
- *
  * Either a doit or dumpit callback must be specified for every registered
  * operation or the function will fail. Only one operation structure per
  * command identifier may be registered.
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index 5512c90af7e3..d9b2db4a29c6 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -26,7 +26,6 @@ struct genlmsghdr {
 /*
  * List of reserved static generic netlink identifiers:
  */
-#define GENL_ID_GENERATE	0
 #define GENL_ID_CTRL		NLMSG_MIN_TYPE
 #define GENL_ID_VFS_DQUOT	(NLMSG_MIN_TYPE + 1)
 #define GENL_ID_PMCRAID		(NLMSG_MIN_TYPE + 2)
-- 
cgit v1.2.3


From 489111e5c25b93be80340c3113d71903d7c82136 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:03 +0200
Subject: genetlink: statically initialize families

Instead of providing macros/inline functions to initialize
the families, make all users initialize them statically and
get rid of the macros.

This reduces the kernel code size by about 1.6k on x86-64
(with allyesconfig).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/drbd_genl.h       |  2 +-
 include/linux/genl_magic_func.h | 28 +++++++++-------
 include/net/genetlink.h         | 71 +++++++----------------------------------
 3 files changed, 29 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index c934d3a96b5e..2896f93808ae 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -67,7 +67,7 @@
  *	genl_magic_func.h
  *		generates an entry in the static genl_ops array,
  *		and static register/unregister functions to
- *		genl_register_family_with_ops().
+ *		genl_register_family().
  *
  *	flags and handler:
  *		GENL_op_init( .doit = x, .dumpit = y, .flags = something)
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 7c070c1fe457..40c2e39362c8 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -259,15 +259,7 @@ static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
  *									{{{2
  */
 #define ZZZ_genl_family		CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
-static struct genl_family ZZZ_genl_family __read_mostly = {
-	.name = __stringify(GENL_MAGIC_FAMILY),
-	.version = GENL_MAGIC_VERSION,
-#ifdef GENL_MAGIC_FAMILY_HDRSZ
-	.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
-#endif
-	.maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
-};
-
+static struct genl_family ZZZ_genl_family;
 /*
  * Magic: define multicast groups
  * Magic: define multicast group registration helper
@@ -301,11 +293,23 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 #undef GENL_mc_group
 #define GENL_mc_group(group)
 
+static struct genl_family ZZZ_genl_family __read_mostly = {
+	.name = __stringify(GENL_MAGIC_FAMILY),
+	.version = GENL_MAGIC_VERSION,
+#ifdef GENL_MAGIC_FAMILY_HDRSZ
+	.hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
+#endif
+	.maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
+	.ops = ZZZ_genl_ops,
+	.n_ops = ARRAY_SIZE(ZZZ_genl_ops),
+	.mcgrps = ZZZ_genl_mcgrps,
+	.n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps),
+	.module = THIS_MODULE,
+};
+
 int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
 {
-	return genl_register_family_with_ops_groups(&ZZZ_genl_family,	\
-						    ZZZ_genl_ops,	\
-						    ZZZ_genl_mcgrps);
+	return genl_register_family(&ZZZ_genl_family);
 }
 
 void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void)
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 43a5c3975a2f..2298b50cee34 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -39,13 +39,14 @@ struct genl_info;
  *	Note that unbind() will not be called symmetrically if the
  *	generic netlink family is removed while there are still open
  *	sockets.
- * @attrbuf: buffer to store parsed attributes
- * @family_list: family list
- * @mcgrps: multicast groups used by this family (private)
- * @n_mcgrps: number of multicast groups (private)
+ * @attrbuf: buffer to store parsed attributes (private)
+ * @family_list: family list (private)
+ * @mcgrps: multicast groups used by this family
+ * @n_mcgrps: number of multicast groups
  * @mcgrp_offset: starting number of multicast group IDs in this family
- * @ops: the operations supported by this family (private)
- * @n_ops: number of operations supported by this family (private)
+ *	(private)
+ * @ops: the operations supported by this family
+ * @n_ops: number of operations supported by this family
  */
 struct genl_family {
 	unsigned int		id;		/* private */
@@ -64,10 +65,10 @@ struct genl_family {
 	int			(*mcast_bind)(struct net *net, int group);
 	void			(*mcast_unbind)(struct net *net, int group);
 	struct nlattr **	attrbuf;	/* private */
-	const struct genl_ops *	ops;		/* private */
-	const struct genl_multicast_group *mcgrps; /* private */
-	unsigned int		n_ops;		/* private */
-	unsigned int		n_mcgrps;	/* private */
+	const struct genl_ops *	ops;
+	const struct genl_multicast_group *mcgrps;
+	unsigned int		n_ops;
+	unsigned int		n_mcgrps;
 	unsigned int		mcgrp_offset;	/* private */
 	struct list_head	family_list;	/* private */
 	struct module		*module;
@@ -132,55 +133,7 @@ struct genl_ops {
 	u8			flags;
 };
 
-int __genl_register_family(struct genl_family *family);
-
-static inline int genl_register_family(struct genl_family *family)
-{
-	family->module = THIS_MODULE;
-	return __genl_register_family(family);
-}
-
-/**
- * genl_register_family_with_ops - register a generic netlink family with ops
- * @family: generic netlink family
- * @ops: operations to be registered
- * @n_ops: number of elements to register
- *
- * Registers the specified family and operations from the specified table.
- * Only one family may be registered with the same family name or identifier.
- *
- * Either a doit or dumpit callback must be specified for every registered
- * operation or the function will fail. Only one operation structure per
- * command identifier may be registered.
- *
- * See include/net/genetlink.h for more documenation on the operations
- * structure.
- *
- * Return 0 on success or a negative error code.
- */
-static inline int
-_genl_register_family_with_ops_grps(struct genl_family *family,
-				    const struct genl_ops *ops, size_t n_ops,
-				    const struct genl_multicast_group *mcgrps,
-				    size_t n_mcgrps)
-{
-	family->module = THIS_MODULE;
-	family->ops = ops;
-	family->n_ops = n_ops;
-	family->mcgrps = mcgrps;
-	family->n_mcgrps = n_mcgrps;
-	return __genl_register_family(family);
-}
-
-#define genl_register_family_with_ops(family, ops)			\
-	_genl_register_family_with_ops_grps((family),			\
-					    (ops), ARRAY_SIZE(ops),	\
-					    NULL, 0)
-#define genl_register_family_with_ops_groups(family, ops, grps)	\
-	_genl_register_family_with_ops_grps((family),			\
-					    (ops), ARRAY_SIZE(ops),	\
-					    (grps), ARRAY_SIZE(grps))
-
+int genl_register_family(struct genl_family *family);
 int genl_unregister_family(struct genl_family *family);
 void genl_notify(struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
-- 
cgit v1.2.3


From 2ae0f17df1cd52aafd1ab0415ea1f1dd56dc0e2a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:04 +0200
Subject: genetlink: use idr to track families

Since generic netlink family IDs are small integers, allocated
densely, IDR is an ideal match for lookups. Replace the existing
hand-written hash-table with IDR for allocation and lookup.

This lets the families only be written to once, during register,
since the list_head can be removed and removal of a family won't
cause any writes.

It also slightly reduces the code size (by about 1.3k on x86-64).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h        | 31 +++++++++++++++----------------
 include/uapi/linux/genetlink.h |  2 ++
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 2298b50cee34..3ec87bacc0f5 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -40,7 +40,6 @@ struct genl_info;
  *	generic netlink family is removed while there are still open
  *	sockets.
  * @attrbuf: buffer to store parsed attributes (private)
- * @family_list: family list (private)
  * @mcgrps: multicast groups used by this family
  * @n_mcgrps: number of multicast groups
  * @mcgrp_offset: starting number of multicast group IDs in this family
@@ -70,11 +69,10 @@ struct genl_family {
 	unsigned int		n_ops;
 	unsigned int		n_mcgrps;
 	unsigned int		mcgrp_offset;	/* private */
-	struct list_head	family_list;	/* private */
 	struct module		*module;
 };
 
-struct nlattr **genl_family_attrbuf(struct genl_family *family);
+struct nlattr **genl_family_attrbuf(const struct genl_family *family);
 
 /**
  * struct genl_info - receiving information
@@ -134,12 +132,12 @@ struct genl_ops {
 };
 
 int genl_register_family(struct genl_family *family);
-int genl_unregister_family(struct genl_family *family);
-void genl_notify(struct genl_family *family, struct sk_buff *skb,
+int genl_unregister_family(const struct genl_family *family);
+void genl_notify(const struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
 
 void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
-		  struct genl_family *family, int flags, u8 cmd);
+		  const struct genl_family *family, int flags, u8 cmd);
 
 /**
  * genlmsg_nlhdr - Obtain netlink header from user specified header
@@ -148,8 +146,8 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
  *
  * Returns pointer to netlink header.
  */
-static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr,
-					     struct genl_family *family)
+static inline struct nlmsghdr *
+genlmsg_nlhdr(void *user_hdr, const struct genl_family *family)
 {
 	return (struct nlmsghdr *)((char *)user_hdr -
 				   family->hdrsize -
@@ -185,7 +183,7 @@ static inline int genlmsg_parse(const struct nlmsghdr *nlh,
  */
 static inline void genl_dump_check_consistent(struct netlink_callback *cb,
 					      void *user_hdr,
-					      struct genl_family *family)
+					      const struct genl_family *family)
 {
 	nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family));
 }
@@ -202,7 +200,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb,
  */
 static inline void *genlmsg_put_reply(struct sk_buff *skb,
 				      struct genl_info *info,
-				      struct genl_family *family,
+				      const struct genl_family *family,
 				      int flags, u8 cmd)
 {
 	return genlmsg_put(skb, info->snd_portid, info->snd_seq, family,
@@ -239,7 +237,7 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
  * @group: offset of multicast group in groups array
  * @flags: allocation flags
  */
-static inline int genlmsg_multicast_netns(struct genl_family *family,
+static inline int genlmsg_multicast_netns(const struct genl_family *family,
 					  struct net *net, struct sk_buff *skb,
 					  u32 portid, unsigned int group, gfp_t flags)
 {
@@ -257,7 +255,7 @@ static inline int genlmsg_multicast_netns(struct genl_family *family,
  * @group: offset of multicast group in groups array
  * @flags: allocation flags
  */
-static inline int genlmsg_multicast(struct genl_family *family,
+static inline int genlmsg_multicast(const struct genl_family *family,
 				    struct sk_buff *skb, u32 portid,
 				    unsigned int group, gfp_t flags)
 {
@@ -275,7 +273,7 @@ static inline int genlmsg_multicast(struct genl_family *family,
  *
  * This function must hold the RTNL or rcu_read_lock().
  */
-int genlmsg_multicast_allns(struct genl_family *family,
+int genlmsg_multicast_allns(const struct genl_family *family,
 			    struct sk_buff *skb, u32 portid,
 			    unsigned int group, gfp_t flags);
 
@@ -359,8 +357,9 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
  * This function returns the number of broadcast listeners that have set the
  * NETLINK_RECV_NO_ENOBUFS socket option.
  */
-static inline int genl_set_err(struct genl_family *family, struct net *net,
-			       u32 portid, u32 group, int code)
+static inline int genl_set_err(const struct genl_family *family,
+			       struct net *net, u32 portid,
+			       u32 group, int code)
 {
 	if (WARN_ON_ONCE(group >= family->n_mcgrps))
 		return -EINVAL;
@@ -368,7 +367,7 @@ static inline int genl_set_err(struct genl_family *family, struct net *net,
 	return netlink_set_err(net->genl_sock, portid, group, code);
 }
 
-static inline int genl_has_listeners(struct genl_family *family,
+static inline int genl_has_listeners(const struct genl_family *family,
 				     struct net *net, unsigned int group)
 {
 	if (WARN_ON_ONCE(group >= family->n_mcgrps))
diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index d9b2db4a29c6..adc899381e0d 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -29,6 +29,8 @@ struct genlmsghdr {
 #define GENL_ID_CTRL		NLMSG_MIN_TYPE
 #define GENL_ID_VFS_DQUOT	(NLMSG_MIN_TYPE + 1)
 #define GENL_ID_PMCRAID		(NLMSG_MIN_TYPE + 2)
+/* must be last reserved + 1 */
+#define GENL_START_ALLOC	(NLMSG_MIN_TYPE + 3)
 
 /**************************************************************************
  * Controller
-- 
cgit v1.2.3


From 56989f6d8568c21257dcec0f5e644d5570ba3281 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:05 +0200
Subject: genetlink: mark families as __ro_after_init

Now genl_register_family() is the only thing (other than the
users themselves, perhaps, but I didn't find any doing that)
writing to the family struct.

In all families that I found, genl_register_family() is only
called from __init functions (some indirectly, in which case
I've add __init annotations to clarifly things), so all can
actually be marked __ro_after_init.

This protects the data structure from accidental corruption.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/genl_magic_func.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index 40c2e39362c8..377257d8f7e3 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -293,7 +293,7 @@ static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)(	\
 #undef GENL_mc_group
 #define GENL_mc_group(group)
 
-static struct genl_family ZZZ_genl_family __read_mostly = {
+static struct genl_family ZZZ_genl_family __ro_after_init = {
 	.name = __stringify(GENL_MAGIC_FAMILY),
 	.version = GENL_MAGIC_VERSION,
 #ifdef GENL_MAGIC_FAMILY_HDRSZ
-- 
cgit v1.2.3


From b15ca182ed136087f6a2cb9ffe880c923f36a56e Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 26 Oct 2016 10:53:16 +0200
Subject: netlink: Add nla_memdup() to wrap kmemdup() use on nlattr

Wrap several common instances of:
	kmemdup(nla_data(attr), nla_len(attr), GFP_KERNEL);

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 254a0fc01800..a34f53acb6d6 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1190,6 +1190,16 @@ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
 	return tmp;
 }
 
+/**
+ * nla_memdup - duplicate attribute memory (kmemdup)
+ * @src: netlink attribute to duplicate from
+ * @gfp: GFP mask
+ */
+static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
+{
+	return kmemdup(nla_data(src), nla_len(src), gfp);
+}
+
 /**
  * nla_nest_start - Start a new level of nested attributes
  * @skb: socket buffer to add attributes to
-- 
cgit v1.2.3


From 5ea8ea2cb7f1d0db15762c9b0bb9e7330425a071 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 26 Oct 2016 09:27:57 -0700
Subject: tcp/dccp: drop SYN packets if accept queue is full

Per listen(fd, backlog) rules, there is really no point accepting a SYN,
sending a SYNACK, and dropping the following ACK packet if accept queue
is full, because application is not draining accept queue fast enough.

This behavior is fooling TCP clients that believe they established a
flow, while there is nothing at server side. They might then send about
10 MSS (if using IW10) that will be dropped anyway while server is under
stress.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 197a30d221e9..146054ceea8e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -289,11 +289,6 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 	return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
 }
 
-static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
-{
-	return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue);
-}
-
 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 {
 	return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
-- 
cgit v1.2.3


From b917783c7b350518f8c5d88bb5848aa8064408a6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 26 Oct 2016 18:49:46 +0200
Subject: flow_dissector: __skb_get_hash_symmetric arg can be const

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 601258f6e621..663fda2887f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1086,7 +1086,7 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
 }
 
 void __skb_get_hash(struct sk_buff *skb);
-u32 __skb_get_hash_symmetric(struct sk_buff *skb);
+u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
 u32 skb_get_poff(const struct sk_buff *skb);
 u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 		   const struct flow_keys *keys, int hlen);
-- 
cgit v1.2.3


From ebb676daa1a340ccef25eb769aefc09b79c01f8a Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 27 Oct 2016 11:23:51 +0200
Subject: bpf: Print function name in addition to function id

The verifier currently prints raw function ids when printing CALL
instructions or when complaining:

	5: (85) call 23
	unknown func 23

print a meaningful function name instead:

	5: (85) call bpf_redirect#23
	unknown func bpf_redirect#23

Moves the function documentation to a single comment and renames all
helpers names in the list to conform to the bpf_ prefix notation so
they can be greped in the kernel source.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 574 ++++++++++++++++++++++++-----------------------
 1 file changed, 289 insertions(+), 285 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 374ef582ae18..e2f38e0091b6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,297 +143,301 @@ union bpf_attr {
 	};
 } __attribute__((aligned(8)));
 
+/* BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(&map, &key)
+ *     Return: Map value or NULL
+ *
+ * int bpf_map_update_elem(&map, &key, &value, flags)
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_map_delete_elem(&map, &key)
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_probe_read(void *dst, int size, void *src)
+ *     Return: 0 on success or negative error
+ *
+ * u64 bpf_ktime_get_ns(void)
+ *     Return: current ktime
+ *
+ * int bpf_trace_printk(const char *fmt, int fmt_size, ...)
+ *     Return: length of buffer written or negative error
+ *
+ * u32 bpf_prandom_u32(void)
+ *     Return: random value
+ *
+ * u32 bpf_raw_smp_processor_id(void)
+ *     Return: SMP processor ID
+ *
+ * int bpf_skb_store_bytes(skb, offset, from, len, flags)
+ *     store bytes into packet
+ *     @skb: pointer to skb
+ *     @offset: offset within packet from skb->mac_header
+ *     @from: pointer where to copy bytes from
+ *     @len: number of bytes to store into packet
+ *     @flags: bit 0 - if true, recompute skb->csum
+ *             other bits - reserved
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_l3_csum_replace(skb, offset, from, to, flags)
+ *     recompute IP checksum
+ *     @skb: pointer to skb
+ *     @offset: offset within packet where IP checksum is located
+ *     @from: old value of header field
+ *     @to: new value of header field
+ *     @flags: bits 0-3 - size of header field
+ *             other bits - reserved
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_l4_csum_replace(skb, offset, from, to, flags)
+ *     recompute TCP/UDP checksum
+ *     @skb: pointer to skb
+ *     @offset: offset within packet where TCP/UDP checksum is located
+ *     @from: old value of header field
+ *     @to: new value of header field
+ *     @flags: bits 0-3 - size of header field
+ *             bit 4 - is pseudo header
+ *             other bits - reserved
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_tail_call(ctx, prog_array_map, index)
+ *     jump into another BPF program
+ *     @ctx: context pointer passed to next program
+ *     @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
+ *     @index: index inside array that selects specific program to run
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_clone_redirect(skb, ifindex, flags)
+ *     redirect to another netdev
+ *     @skb: pointer to skb
+ *     @ifindex: ifindex of the net device
+ *     @flags: bit 0 - if set, redirect to ingress instead of egress
+ *             other bits - reserved
+ *     Return: 0 on success or negative error
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ *     Return: current->tgid << 32 | current->pid
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ *     Return: current_gid << 32 | current_uid
+ *
+ * int bpf_get_current_comm(char *buf, int size_of_buf)
+ *     stores current->comm into buf
+ *     Return: 0 on success or negative error
+ *
+ * u32 bpf_get_cgroup_classid(skb)
+ *     retrieve a proc's classid
+ *     @skb: pointer to skb
+ *     Return: classid if != 0
+ *
+ * int bpf_skb_vlan_push(skb, vlan_proto, vlan_tci)
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_skb_vlan_pop(skb)
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_skb_get_tunnel_key(skb, key, size, flags)
+ * int bpf_skb_set_tunnel_key(skb, key, size, flags)
+ *     retrieve or populate tunnel metadata
+ *     @skb: pointer to skb
+ *     @key: pointer to 'struct bpf_tunnel_key'
+ *     @size: size of 'struct bpf_tunnel_key'
+ *     @flags: room for future extensions
+ *     Return: 0 on success or negative error
+ *
+ * u64 bpf_perf_event_read(&map, index)
+ *     Return: Number events read or error code
+ *
+ * int bpf_redirect(ifindex, flags)
+ *     redirect to another netdev
+ *     @ifindex: ifindex of the net device
+ *     @flags: bit 0 - if set, redirect to ingress instead of egress
+ *             other bits - reserved
+ *     Return: TC_ACT_REDIRECT
+ *
+ * u32 bpf_get_route_realm(skb)
+ *     retrieve a dst's tclassid
+ *     @skb: pointer to skb
+ *     Return: realm if != 0
+ *
+ * int bpf_perf_event_output(ctx, map, index, data, size)
+ *     output perf raw sample
+ *     @ctx: struct pt_regs*
+ *     @map: pointer to perf_event_array map
+ *     @index: index of event in the map
+ *     @data: data on stack to be output as raw data
+ *     @size: size of data
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_get_stackid(ctx, map, flags)
+ *     walk user or kernel stack and return id
+ *     @ctx: struct pt_regs*
+ *     @map: pointer to stack_trace map
+ *     @flags: bits 0-7 - numer of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 9 - compare stacks by hash only
+ *             bit 10 - if two different stacks hash into the same stackid
+ *                      discard old
+ *             other bits - reserved
+ *     Return: >= 0 stackid on success or negative error
+ *
+ * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
+ *     calculate csum diff
+ *     @from: raw from buffer
+ *     @from_size: length of from buffer
+ *     @to: raw to buffer
+ *     @to_size: length of to buffer
+ *     @seed: optional seed
+ *     Return: csum result or negative error code
+ *
+ * int bpf_skb_get_tunnel_opt(skb, opt, size)
+ *     retrieve tunnel options metadata
+ *     @skb: pointer to skb
+ *     @opt: pointer to raw tunnel option data
+ *     @size: size of @opt
+ *     Return: option size
+ *
+ * int bpf_skb_set_tunnel_opt(skb, opt, size)
+ *     populate tunnel options metadata
+ *     @skb: pointer to skb
+ *     @opt: pointer to raw tunnel option data
+ *     @size: size of @opt
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_skb_change_proto(skb, proto, flags)
+ *     Change protocol of the skb. Currently supported is v4 -> v6,
+ *     v6 -> v4 transitions. The helper will also resize the skb. eBPF
+ *     program is expected to fill the new headers via skb_store_bytes
+ *     and lX_csum_replace.
+ *     @skb: pointer to skb
+ *     @proto: new skb->protocol type
+ *     @flags: reserved
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_skb_change_type(skb, type)
+ *     Change packet type of skb.
+ *     @skb: pointer to skb
+ *     @type: new skb->pkt_type type
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_skb_under_cgroup(skb, map, index)
+ *     Check cgroup2 membership of skb
+ *     @skb: pointer to skb
+ *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+ *     @index: index of the cgroup in the bpf_map
+ *     Return:
+ *       == 0 skb failed the cgroup2 descendant test
+ *       == 1 skb succeeded the cgroup2 descendant test
+ *        < 0 error
+ *
+ * u32 bpf_get_hash_recalc(skb)
+ *     Retrieve and possibly recalculate skb->hash.
+ *     @skb: pointer to skb
+ *     Return: hash
+ *
+ * u64 bpf_get_current_task(void)
+ *     Returns current task_struct
+ *     Return: current
+ *
+ * int bpf_probe_write_user(void *dst, void *src, int len)
+ *     safely attempt to write to a location
+ *     @dst: destination address in userspace
+ *     @src: source address on stack
+ *     @len: number of bytes to copy
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_current_task_under_cgroup(map, index)
+ *     Check cgroup2 membership of current task
+ *     @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+ *     @index: index of the cgroup in the bpf_map
+ *     Return:
+ *       == 0 current failed the cgroup2 descendant test
+ *       == 1 current succeeded the cgroup2 descendant test
+ *        < 0 error
+ *
+ * int bpf_skb_change_tail(skb, len, flags)
+ *     The helper will resize the skb to the given new size, to be used f.e.
+ *     with control messages.
+ *     @skb: pointer to skb
+ *     @len: new skb length
+ *     @flags: reserved
+ *     Return: 0 on success or negative error
+ *
+ * int bpf_skb_pull_data(skb, len)
+ *     The helper will pull in non-linear data in case the skb is non-linear
+ *     and not all of len are part of the linear section. Only needed for
+ *     read/write with direct packet access.
+ *     @skb: pointer to skb
+ *     @len: len to make read/writeable
+ *     Return: 0 on success or negative error
+ *
+ * s64 bpf_csum_update(skb, csum)
+ *     Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
+ *     @skb: pointer to skb
+ *     @csum: csum to add
+ *     Return: csum on success or negative error
+ *
+ * void bpf_set_hash_invalid(skb)
+ *     Invalidate current skb->hash.
+ *     @skb: pointer to skb
+ *
+ * int bpf_get_numa_node_id()
+ *     Return: Id of current NUMA node.
+ */
+#define __BPF_FUNC_MAPPER(FN)		\
+	FN(unspec),			\
+	FN(map_lookup_elem),		\
+	FN(map_update_elem),		\
+	FN(map_delete_elem),		\
+	FN(probe_read),			\
+	FN(ktime_get_ns),		\
+	FN(trace_printk),		\
+	FN(get_prandom_u32),		\
+	FN(get_smp_processor_id),	\
+	FN(skb_store_bytes),		\
+	FN(l3_csum_replace),		\
+	FN(l4_csum_replace),		\
+	FN(tail_call),			\
+	FN(clone_redirect),		\
+	FN(get_current_pid_tgid),	\
+	FN(get_current_uid_gid),	\
+	FN(get_current_comm),		\
+	FN(get_cgroup_classid),		\
+	FN(skb_vlan_push),		\
+	FN(skb_vlan_pop),		\
+	FN(skb_get_tunnel_key),		\
+	FN(skb_set_tunnel_key),		\
+	FN(perf_event_read),		\
+	FN(redirect),			\
+	FN(get_route_realm),		\
+	FN(perf_event_output),		\
+	FN(skb_load_bytes),		\
+	FN(get_stackid),		\
+	FN(csum_diff),			\
+	FN(skb_get_tunnel_opt),		\
+	FN(skb_set_tunnel_opt),		\
+	FN(skb_change_proto),		\
+	FN(skb_change_type),		\
+	FN(skb_under_cgroup),		\
+	FN(get_hash_recalc),		\
+	FN(get_current_task),		\
+	FN(probe_write_user),		\
+	FN(current_task_under_cgroup),	\
+	FN(skb_change_tail),		\
+	FN(skb_pull_data),		\
+	FN(csum_update),		\
+	FN(set_hash_invalid),		\
+	FN(get_numa_node_id),
+
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
  */
+#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x
 enum bpf_func_id {
-	BPF_FUNC_unspec,
-	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
-	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
-	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
-	BPF_FUNC_probe_read,      /* int bpf_probe_read(void *dst, int size, void *src) */
-	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
-	BPF_FUNC_trace_printk,    /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */
-	BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */
-	BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */
-
-	/**
-	 * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet
-	 * @skb: pointer to skb
-	 * @offset: offset within packet from skb->mac_header
-	 * @from: pointer where to copy bytes from
-	 * @len: number of bytes to store into packet
-	 * @flags: bit 0 - if true, recompute skb->csum
-	 *         other bits - reserved
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_skb_store_bytes,
-
-	/**
-	 * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum
-	 * @skb: pointer to skb
-	 * @offset: offset within packet where IP checksum is located
-	 * @from: old value of header field
-	 * @to: new value of header field
-	 * @flags: bits 0-3 - size of header field
-	 *         other bits - reserved
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_l3_csum_replace,
-
-	/**
-	 * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum
-	 * @skb: pointer to skb
-	 * @offset: offset within packet where TCP/UDP checksum is located
-	 * @from: old value of header field
-	 * @to: new value of header field
-	 * @flags: bits 0-3 - size of header field
-	 *         bit 4 - is pseudo header
-	 *         other bits - reserved
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_l4_csum_replace,
-
-	/**
-	 * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
-	 * @ctx: context pointer passed to next program
-	 * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
-	 * @index: index inside array that selects specific program to run
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_tail_call,
-
-	/**
-	 * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev
-	 * @skb: pointer to skb
-	 * @ifindex: ifindex of the net device
-	 * @flags: bit 0 - if set, redirect to ingress instead of egress
-	 *         other bits - reserved
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_clone_redirect,
-
-	/**
-	 * u64 bpf_get_current_pid_tgid(void)
-	 * Return: current->tgid << 32 | current->pid
-	 */
-	BPF_FUNC_get_current_pid_tgid,
-
-	/**
-	 * u64 bpf_get_current_uid_gid(void)
-	 * Return: current_gid << 32 | current_uid
-	 */
-	BPF_FUNC_get_current_uid_gid,
-
-	/**
-	 * bpf_get_current_comm(char *buf, int size_of_buf)
-	 * stores current->comm into buf
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_get_current_comm,
-
-	/**
-	 * bpf_get_cgroup_classid(skb) - retrieve a proc's classid
-	 * @skb: pointer to skb
-	 * Return: classid if != 0
-	 */
-	BPF_FUNC_get_cgroup_classid,
-	BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */
-	BPF_FUNC_skb_vlan_pop,  /* bpf_skb_vlan_pop(skb) */
-
-	/**
-	 * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags)
-	 * retrieve or populate tunnel metadata
-	 * @skb: pointer to skb
-	 * @key: pointer to 'struct bpf_tunnel_key'
-	 * @size: size of 'struct bpf_tunnel_key'
-	 * @flags: room for future extensions
-	 * Retrun: 0 on success
-	 */
-	BPF_FUNC_skb_get_tunnel_key,
-	BPF_FUNC_skb_set_tunnel_key,
-	BPF_FUNC_perf_event_read,	/* u64 bpf_perf_event_read(&map, index) */
-	/**
-	 * bpf_redirect(ifindex, flags) - redirect to another netdev
-	 * @ifindex: ifindex of the net device
-	 * @flags: bit 0 - if set, redirect to ingress instead of egress
-	 *         other bits - reserved
-	 * Return: TC_ACT_REDIRECT
-	 */
-	BPF_FUNC_redirect,
-
-	/**
-	 * bpf_get_route_realm(skb) - retrieve a dst's tclassid
-	 * @skb: pointer to skb
-	 * Return: realm if != 0
-	 */
-	BPF_FUNC_get_route_realm,
-
-	/**
-	 * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample
-	 * @ctx: struct pt_regs*
-	 * @map: pointer to perf_event_array map
-	 * @index: index of event in the map
-	 * @data: data on stack to be output as raw data
-	 * @size: size of data
-	 * Return: 0 on success
-	 */
-	BPF_FUNC_perf_event_output,
-	BPF_FUNC_skb_load_bytes,
-
-	/**
-	 * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
-	 * @ctx: struct pt_regs*
-	 * @map: pointer to stack_trace map
-	 * @flags: bits 0-7 - numer of stack frames to skip
-	 *         bit 8 - collect user stack instead of kernel
-	 *         bit 9 - compare stacks by hash only
-	 *         bit 10 - if two different stacks hash into the same stackid
-	 *                  discard old
-	 *         other bits - reserved
-	 * Return: >= 0 stackid on success or negative error
-	 */
-	BPF_FUNC_get_stackid,
-
-	/**
-	 * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff
-	 * @from: raw from buffer
-	 * @from_size: length of from buffer
-	 * @to: raw to buffer
-	 * @to_size: length of to buffer
-	 * @seed: optional seed
-	 * Return: csum result
-	 */
-	BPF_FUNC_csum_diff,
-
-	/**
-	 * bpf_skb_[gs]et_tunnel_opt(skb, opt, size)
-	 * retrieve or populate tunnel options metadata
-	 * @skb: pointer to skb
-	 * @opt: pointer to raw tunnel option data
-	 * @size: size of @opt
-	 * Return: 0 on success for set, option size for get
-	 */
-	BPF_FUNC_skb_get_tunnel_opt,
-	BPF_FUNC_skb_set_tunnel_opt,
-
-	/**
-	 * bpf_skb_change_proto(skb, proto, flags)
-	 * Change protocol of the skb. Currently supported is
-	 * v4 -> v6, v6 -> v4 transitions. The helper will also
-	 * resize the skb. eBPF program is expected to fill the
-	 * new headers via skb_store_bytes and lX_csum_replace.
-	 * @skb: pointer to skb
-	 * @proto: new skb->protocol type
-	 * @flags: reserved
-	 * Return: 0 on success or negative error
-	 */
-	BPF_FUNC_skb_change_proto,
-
-	/**
-	 * bpf_skb_change_type(skb, type)
-	 * Change packet type of skb.
-	 * @skb: pointer to skb
-	 * @type: new skb->pkt_type type
-	 * Return: 0 on success or negative error
-	 */
-	BPF_FUNC_skb_change_type,
-
-	/**
-	 * bpf_skb_under_cgroup(skb, map, index) - Check cgroup2 membership of skb
-	 * @skb: pointer to skb
-	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
-	 * @index: index of the cgroup in the bpf_map
-	 * Return:
-	 *   == 0 skb failed the cgroup2 descendant test
-	 *   == 1 skb succeeded the cgroup2 descendant test
-	 *    < 0 error
-	 */
-	BPF_FUNC_skb_under_cgroup,
-
-	/**
-	 * bpf_get_hash_recalc(skb)
-	 * Retrieve and possibly recalculate skb->hash.
-	 * @skb: pointer to skb
-	 * Return: hash
-	 */
-	BPF_FUNC_get_hash_recalc,
-
-	/**
-	 * u64 bpf_get_current_task(void)
-	 * Returns current task_struct
-	 * Return: current
-	 */
-	BPF_FUNC_get_current_task,
-
-	/**
-	 * bpf_probe_write_user(void *dst, void *src, int len)
-	 * safely attempt to write to a location
-	 * @dst: destination address in userspace
-	 * @src: source address on stack
-	 * @len: number of bytes to copy
-	 * Return: 0 on success or negative error
-	 */
-	BPF_FUNC_probe_write_user,
-
-	/**
-	 * bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task
-	 * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
-	 * @index: index of the cgroup in the bpf_map
-	 * Return:
-	 *   == 0 current failed the cgroup2 descendant test
-	 *   == 1 current succeeded the cgroup2 descendant test
-	 *    < 0 error
-	 */
-	BPF_FUNC_current_task_under_cgroup,
-
-	/**
-	 * bpf_skb_change_tail(skb, len, flags)
-	 * The helper will resize the skb to the given new size,
-	 * to be used f.e. with control messages.
-	 * @skb: pointer to skb
-	 * @len: new skb length
-	 * @flags: reserved
-	 * Return: 0 on success or negative error
-	 */
-	BPF_FUNC_skb_change_tail,
-
-	/**
-	 * bpf_skb_pull_data(skb, len)
-	 * The helper will pull in non-linear data in case the
-	 * skb is non-linear and not all of len are part of the
-	 * linear section. Only needed for read/write with direct
-	 * packet access.
-	 * @skb: pointer to skb
-	 * @len: len to make read/writeable
-	 * Return: 0 on success or negative error
-	 */
-	BPF_FUNC_skb_pull_data,
-
-	/**
-	 * bpf_csum_update(skb, csum)
-	 * Adds csum into skb->csum in case of CHECKSUM_COMPLETE.
-	 * @skb: pointer to skb
-	 * @csum: csum to add
-	 * Return: csum on success or negative error
-	 */
-	BPF_FUNC_csum_update,
-
-	/**
-	 * bpf_set_hash_invalid(skb)
-	 * Invalidate current skb>hash.
-	 * @skb: pointer to skb
-	 */
-	BPF_FUNC_set_hash_invalid,
-
-	/**
-	 * bpf_get_numa_node_id()
-	 * Returns the id of the current NUMA node.
-	 */
-	BPF_FUNC_get_numa_node_id,
-
+	__BPF_FUNC_MAPPER(__BPF_ENUM_FN)
 	__BPF_FUNC_MAX_ID,
 };
+#undef __BPF_ENUM_FN
 
 /* All flags used by eBPF helper functions, placed here. */
 
-- 
cgit v1.2.3


From 5579e1519bad43b874922dbe87c74fdcbd97a7db Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Wed, 31 Aug 2016 05:17:54 +0000
Subject: net/mlx5: Update struct mlx5_ifc_xrqc_bits

Update struct mlx5_ifc_xrqc_bits according to last specification

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6045d4d58065..12f72e45a3f0 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -2844,7 +2844,7 @@ struct mlx5_ifc_xrqc_bits {
 
 	struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context;
 
-	u8         reserved_at_180[0x200];
+	u8         reserved_at_180[0x880];
 
 	struct mlx5_ifc_wq_bits wq;
 };
-- 
cgit v1.2.3


From dd257efb1e0f8875ed7e42b88837a8dada0d0e41 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Wed, 31 Aug 2016 05:29:58 +0000
Subject: net/mlx5: Ensure SRQ physical address structure endianness

SRQ physical address structure field should be in big-endian format.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/srq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h
index 33c97dc900f8..1cde0fd53f90 100644
--- a/include/linux/mlx5/srq.h
+++ b/include/linux/mlx5/srq.h
@@ -55,7 +55,7 @@ struct mlx5_srq_attr {
 	u32 lwm;
 	u32 user_index;
 	u64 db_record;
-	u64 *pas;
+	__be64 *pas;
 };
 
 struct mlx5_core_dev;
-- 
cgit v1.2.3


From 813f854053c26204e2723c498def4c7870dcc7f4 Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Thu, 11 Aug 2016 11:21:39 +0300
Subject: net/mlx5: Introduce TSAR manipulation firmware commands

TSAR (stands for Transmit Scheduling ARbiter) is a hardware component
that is responsible for selecting the next entity to serve on the
transmit path.
The arbitration defines the QoS policy between the agents connected to
the TSAR.
The TSAR is a consist two main features:
1) BW Allocation between agents:
The TSAR implements a defecit weighted round robin between the agents.
Each agent attached to the TSAR is assigned with a weight and it is
awarded transmission tokens according to this weight.
2) Rate limer per agent:
Each agent attached to the TSAR is (optionally) assigned with a rate
limit.
TSAR will not allow scheduling for an agent exceeding its defined rate
limit.

In this patch we implement the API of manipulating the TSAR.

Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/mlx5_ifc.h | 199 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 12f72e45a3f0..2632cb2caf10 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -145,6 +145,12 @@ enum {
 	MLX5_CMD_OP_QUERY_Q_COUNTER               = 0x773,
 	MLX5_CMD_OP_SET_RATE_LIMIT                = 0x780,
 	MLX5_CMD_OP_QUERY_RATE_LIMIT              = 0x781,
+	MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT      = 0x782,
+	MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT     = 0x783,
+	MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT       = 0x784,
+	MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT      = 0x785,
+	MLX5_CMD_OP_CREATE_QOS_PARA_VPORT         = 0x786,
+	MLX5_CMD_OP_DESTROY_QOS_PARA_VPORT        = 0x787,
 	MLX5_CMD_OP_ALLOC_PD                      = 0x800,
 	MLX5_CMD_OP_DEALLOC_PD                    = 0x801,
 	MLX5_CMD_OP_ALLOC_UAR                     = 0x802,
@@ -537,13 +543,27 @@ struct mlx5_ifc_e_switch_cap_bits {
 
 struct mlx5_ifc_qos_cap_bits {
 	u8         packet_pacing[0x1];
-	u8         reserved_0[0x1f];
-	u8         reserved_1[0x20];
+	u8         esw_scheduling[0x1];
+	u8         reserved_at_2[0x1e];
+
+	u8         reserved_at_20[0x20];
+
 	u8         packet_pacing_max_rate[0x20];
+
 	u8         packet_pacing_min_rate[0x20];
-	u8         reserved_2[0x10];
+
+	u8         reserved_at_80[0x10];
 	u8         packet_pacing_rate_table_size[0x10];
-	u8         reserved_3[0x760];
+
+	u8         esw_element_type[0x10];
+	u8         esw_tsar_type[0x10];
+
+	u8         reserved_at_c0[0x10];
+	u8         max_qos_para_vport[0x10];
+
+	u8         max_tsar_bw_share[0x20];
+
+	u8         reserved_at_100[0x700];
 };
 
 struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
@@ -2333,6 +2353,30 @@ struct mlx5_ifc_sqc_bits {
 	struct mlx5_ifc_wq_bits wq;
 };
 
+enum {
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR = 0x0,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT = 0x1,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC = 0x2,
+	SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC = 0x3,
+};
+
+struct mlx5_ifc_scheduling_context_bits {
+	u8         element_type[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         element_attributes[0x20];
+
+	u8         parent_element_id[0x20];
+
+	u8         reserved_at_60[0x40];
+
+	u8         bw_share[0x20];
+
+	u8         max_average_bw[0x20];
+
+	u8         reserved_at_e0[0x120];
+};
+
 struct mlx5_ifc_rqtc_bits {
 	u8         reserved_at_0[0xa0];
 
@@ -2920,6 +2964,29 @@ struct mlx5_ifc_register_loopback_control_bits {
 	u8         reserved_at_20[0x60];
 };
 
+struct mlx5_ifc_vport_tc_element_bits {
+	u8         traffic_class[0x4];
+	u8         reserved_at_4[0xc];
+	u8         vport_number[0x10];
+};
+
+struct mlx5_ifc_vport_element_bits {
+	u8         reserved_at_0[0x10];
+	u8         vport_number[0x10];
+};
+
+enum {
+	TSAR_ELEMENT_TSAR_TYPE_DWRR = 0x0,
+	TSAR_ELEMENT_TSAR_TYPE_ROUND_ROBIN = 0x1,
+	TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2,
+};
+
+struct mlx5_ifc_tsar_element_bits {
+	u8         reserved_at_0[0x8];
+	u8         tsar_type[0x8];
+	u8         reserved_at_10[0x10];
+};
+
 struct mlx5_ifc_teardown_hca_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -3540,6 +3607,39 @@ struct mlx5_ifc_query_special_contexts_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_query_scheduling_element_out_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0xc0];
+
+	struct mlx5_ifc_scheduling_context_bits scheduling_context;
+
+	u8         reserved_at_300[0x100];
+};
+
+enum {
+	SCHEDULING_HIERARCHY_E_SWITCH = 0x2,
+};
+
+struct mlx5_ifc_query_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 struct mlx5_ifc_query_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -4725,6 +4825,43 @@ struct mlx5_ifc_modify_sq_in_bits {
 	struct mlx5_ifc_sqc_bits ctx;
 };
 
+struct mlx5_ifc_modify_scheduling_element_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x1c0];
+};
+
+enum {
+	MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE = 0x1,
+	MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW = 0x2,
+};
+
+struct mlx5_ifc_modify_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_80[0x20];
+
+	u8         modify_bitmask[0x20];
+
+	u8         reserved_at_c0[0x40];
+
+	struct mlx5_ifc_scheduling_context_bits scheduling_context;
+
+	u8         reserved_at_300[0x100];
+};
+
 struct mlx5_ifc_modify_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -5390,6 +5527,30 @@ struct mlx5_ifc_destroy_sq_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_destroy_scheduling_element_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x1c0];
+};
+
+struct mlx5_ifc_destroy_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_80[0x180];
+};
+
 struct mlx5_ifc_destroy_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -6017,6 +6178,36 @@ struct mlx5_ifc_create_sq_in_bits {
 	struct mlx5_ifc_sqc_bits ctx;
 };
 
+struct mlx5_ifc_create_scheduling_element_out_bits {
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+
+	u8         scheduling_element_id[0x20];
+
+	u8         reserved_at_a0[0x160];
+};
+
+struct mlx5_ifc_create_scheduling_element_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         scheduling_hierarchy[0x8];
+	u8         reserved_at_48[0x18];
+
+	u8         reserved_at_60[0xa0];
+
+	struct mlx5_ifc_scheduling_context_bits scheduling_context;
+
+	u8         reserved_at_300[0x100];
+};
+
 struct mlx5_ifc_create_rqt_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From 74491de937125d0c98c9b9c9208b4105717a3caa Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb@mellanox.com>
Date: Wed, 31 Aug 2016 11:24:25 +0000
Subject: net/mlx5: Add multi dest support

Currently when calling mlx5_add_flow_rule we accept
only one flow destination, this commit allows to pass
multiple destinations.

This change forces us to change the return structure to a more
flexible one. We introduce a flow handle (struct mlx5_flow_handle),
it holds internally the number for rules created and holds an array
where each cell points the to a flow rule.

From the consumers (of mlx5_add_flow_rule) point of view this
change is only cosmetic and requires only to change the type
of the returned value they store.

From the core point of view, we now need to use a loop when
allocating and deleting rules (e.g given to us a flow handler).

Signed-off-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/linux/mlx5/fs.h | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 93ebc5e21334..0dcd287f4bd0 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -69,8 +69,8 @@ enum mlx5_flow_namespace_type {
 
 struct mlx5_flow_table;
 struct mlx5_flow_group;
-struct mlx5_flow_rule;
 struct mlx5_flow_namespace;
+struct mlx5_flow_handle;
 
 struct mlx5_flow_spec {
 	u8   match_criteria_enable;
@@ -127,18 +127,20 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg);
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
  */
-struct mlx5_flow_rule *
-mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		   u32 action,
-		   u32 flow_tag,
-		   struct mlx5_flow_destination *dest);
-void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
-
-int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-				 struct mlx5_flow_destination *dest);
-
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
+struct mlx5_flow_handle *
+mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		    struct mlx5_flow_spec *spec,
+		    u32 action,
+		    u32 flow_tag,
+		    struct mlx5_flow_destination *dest,
+		    int dest_num);
+void mlx5_del_flow_rules(struct mlx5_flow_handle *fr);
+
+int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
+				 struct mlx5_flow_destination *new_dest,
+				 struct mlx5_flow_destination *old_dest);
+
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handler);
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
-- 
cgit v1.2.3


From c62cce2caee558e18aa05c01c2fd3b40f07174f2 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Mon, 24 Oct 2016 18:29:13 -0700
Subject: net: add an ioctl to get a socket network namespace

Each socket operates in a network namespace where it has been created,
so if we want to dump and restore a socket, we have to know its network
namespace.

We have a socket_diag to get information about sockets, it doesn't
report sockets which are not bound or connected.

This patch introduces a new socket ioctl, which is called SIOCGSKNS
and used to get a file descriptor for a socket network namespace.

A task must have CAP_NET_ADMIN in a target network namespace to
use this ioctl.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/proc_fs.h      | 4 ++++
 include/uapi/linux/sockios.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index b97bf2ef996e..368c7ad06ae5 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -82,4 +82,8 @@ static inline struct proc_dir_entry *proc_net_mkdir(
 	return proc_mkdir_data(name, 0, parent, net);
 }
 
+struct ns_common;
+int open_related_ns(struct ns_common *ns,
+		   struct ns_common *(*get_ns)(struct ns_common *ns));
+
 #endif /* _LINUX_PROC_FS_H */
diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h
index 8e7890b26d9a..83cc54ce6081 100644
--- a/include/uapi/linux/sockios.h
+++ b/include/uapi/linux/sockios.h
@@ -84,6 +84,7 @@
 #define SIOCWANDEV	0x894A		/* get/set netdev parameters	*/
 
 #define SIOCOUTQNSD	0x894B		/* output queue size (not sent only) */
+#define SIOCGSKNS	0x894C		/* get socket network namespace */
 
 /* ARP cache control calls. */
 		    /*  0x8950 - 0x8952  * obsolete calls, don't re-use */
-- 
cgit v1.2.3


From 20861f26e33d76a4f3587bcc866fa1dab3e01094 Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Thu, 27 Oct 2016 09:05:22 +0800
Subject: driver: tun: Use new macro SOCK_IOC_TYPE instead of literal number
 0x89

The current codes use _IOC_TYPE(cmd) == 0x89 to check if the cmd is one
socket ioctl command like SIOCGIFHWADDR. But the literal number 0x89 may
confuse readers. So create one macro SOCK_IOC_TYPE to enhance the readability.

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/sockios.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h
index 83cc54ce6081..79d029d25310 100644
--- a/include/uapi/linux/sockios.h
+++ b/include/uapi/linux/sockios.h
@@ -24,6 +24,8 @@
 #define SIOCINQ		FIONREAD
 #define SIOCOUTQ	TIOCOUTQ        /* output queue size (not sent + not acked) */
 
+#define SOCK_IOC_TYPE	0x89
+
 /* Routing table calls. */
 #define SIOCADDRT	0x890B		/* add routing table entry	*/
 #define SIOCDELRT	0x890C		/* delete routing table entry	*/
-- 
cgit v1.2.3


From 9cf1f6a8c4cbb7836b838b51b3b02ddf32c6c6a0 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 28 Oct 2016 11:43:20 -0400
Subject: net: Move functions for configuring traffic classes out of inline
 headers

The functions for configuring the traffic class to queue mappings have
other effects that need to be addressed.  Instead of trying to export a
bunch of new functions just relocate the functions so that we can
instrument them directly with the functionality they will need.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 31 +++----------------------------
 1 file changed, 3 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 20ce8df115ac..e05ab3bd48d2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1920,34 +1920,9 @@ int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
 	return 0;
 }
 
-static inline
-void netdev_reset_tc(struct net_device *dev)
-{
-	dev->num_tc = 0;
-	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
-	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
-}
-
-static inline
-int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
-{
-	if (tc >= dev->num_tc)
-		return -EINVAL;
-
-	dev->tc_to_txq[tc].count = count;
-	dev->tc_to_txq[tc].offset = offset;
-	return 0;
-}
-
-static inline
-int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
-{
-	if (num_tc > TC_MAX_QUEUE)
-		return -EINVAL;
-
-	dev->num_tc = num_tc;
-	return 0;
-}
+void netdev_reset_tc(struct net_device *dev);
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc);
 
 static inline
 int netdev_get_num_tc(struct net_device *dev)
-- 
cgit v1.2.3


From 8d059b0f6f5b1d3acf829454e1087818ad660058 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 28 Oct 2016 11:43:49 -0400
Subject: net: Add sysfs value to determine queue traffic class

Add a sysfs attribute for a Tx queue that allows us to determine the
traffic class for a given queue.  This will allow us to more easily
determine this in the future.  It is needed as XPS will take the traffic
class for a group of queues into account in order to avoid pulling traffic
from one traffic class into another.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e05ab3bd48d2..d91a41860614 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1920,6 +1920,7 @@ int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
 	return 0;
 }
 
+int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
 void netdev_reset_tc(struct net_device *dev);
 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
 int netdev_set_num_tc(struct net_device *dev, u8 num_tc);
-- 
cgit v1.2.3


From 184c449f91fef521042970cca46bd5cdfc0e3a37 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 28 Oct 2016 11:50:13 -0400
Subject: net: Add support for XPS with QoS via traffic classes

This patch adds support for setting and using XPS when QoS via traffic
classes is enabled.  With this change we will factor in the priority and
traffic class mapping of the packet and use that information to correctly
select the queue.

This allows us to define a set of queues for a given traffic class via
mqprio and then configure the XPS mapping for those queues so that the
traffic flows can avoid head-of-line blocking between the individual CPUs
if so desired.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d91a41860614..66fd61c681d9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -732,8 +732,8 @@ struct xps_dev_maps {
 	struct rcu_head rcu;
 	struct xps_map __rcu *cpu_map[0];
 };
-#define XPS_DEV_MAPS_SIZE (sizeof(struct xps_dev_maps) +		\
-    (nr_cpu_ids * sizeof(struct xps_map *)))
+#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +		\
+	(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
 #endif /* CONFIG_XPS */
 
 #define TC_MAX_QUEUE	16
-- 
cgit v1.2.3


From 0fefbfbaad298162737d5418eb85065879f99b3e Mon Sep 17 00:00:00 2001
From: Sudarsana Kalluru <Sudarsana.Kalluru@cavium.com>
Date: Mon, 31 Oct 2016 07:14:21 +0200
Subject: qed*: Management firmware - notifications and defaults

Management firmware is interested in various tidbits about
the driver - including the driver state & several configuration
related fields [MTU, primtary MAC, etc.].
This adds the necessray logic to update MFW with such configurations,
some of which are passed directly via qed while for others APIs
are provide so that qede would be able to later configure if needed.

This also introduces a new default configuration for MTU which would
replace the default inherited by being an ethernet device.

Signed-off-by: Sudarsana Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'include')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 8978a60371f4..5c909cd02764 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -267,6 +267,7 @@ struct qed_dev_info {
 	u8		mf_mode;
 	bool		tx_switching;
 	bool		rdma_supported;
+	u16		mtu;
 };
 
 enum qed_sb_type {
@@ -554,6 +555,33 @@ struct qed_common_ops {
  */
 	int (*set_led)(struct qed_dev *cdev,
 		       enum qed_led_mode mode);
+
+/**
+ * @brief update_drv_state - API to inform the change in the driver state.
+ *
+ * @param cdev
+ * @param active
+ *
+ */
+	int (*update_drv_state)(struct qed_dev *cdev, bool active);
+
+/**
+ * @brief update_mac - API to inform the change in the mac address
+ *
+ * @param cdev
+ * @param mac
+ *
+ */
+	int (*update_mac)(struct qed_dev *cdev, u8 *mac);
+
+/**
+ * @brief update_mtu - API to inform the change in the mtu
+ *
+ * @param cdev
+ * @param mtu
+ *
+ */
+	int (*update_mtu)(struct qed_dev *cdev, u16 mtu);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 7a4b21b7d1f0644456501e33d3917c9aaee76a75 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Mon, 31 Oct 2016 07:14:22 +0200
Subject: qed: Add nvram selftest

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 5c909cd02764..ffc2d2f5e88f 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -402,6 +402,15 @@ struct qed_selftest_ops {
  * @return 0 on success, error otherwise.
  */
 	int (*selftest_clock)(struct qed_dev *cdev);
+
+/**
+ * @brief selftest_nvram - Perform nvram test
+ *
+ * @param cdev
+ *
+ * @return 0 on success, error otherwise.
+ */
+	int (*selftest_nvram) (struct qed_dev *cdev);
 };
 
 struct qed_common_ops {
-- 
cgit v1.2.3


From 14d39648cbfc6289e3f873d30f282b9517ebe860 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Mon, 31 Oct 2016 07:14:23 +0200
Subject: qed*: Add support for WoL

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index ffc2d2f5e88f..ea095b4893aa 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -268,6 +268,8 @@ struct qed_dev_info {
 	bool		tx_switching;
 	bool		rdma_supported;
 	u16		mtu;
+
+	bool wol_support;
 };
 
 enum qed_sb_type {
@@ -591,6 +593,14 @@ struct qed_common_ops {
  *
  */
 	int (*update_mtu)(struct qed_dev *cdev, u16 mtu);
+
+/**
+ * @brief update_wol - update of changes in the WoL configuration
+ *
+ * @param cdev
+ * @param enabled - true iff WoL should be enabled.
+ */
+	int (*update_wol) (struct qed_dev *cdev, bool enabled);
 };
 
 #define MASK_FIELD(_name, _value) \
-- 
cgit v1.2.3


From 2edbff8dcb5da324fd4c4fe953629e4f6ca73c99 Mon Sep 17 00:00:00 2001
From: Tomer Tayar <Tomer.Tayar@cavium.com>
Date: Mon, 31 Oct 2016 07:14:27 +0200
Subject: qed: Learn resources from management firmware

Currently, each interfaces assumes it receives an equal portion
of HW/FW resources, but this is wasteful - different partitions
[and specifically, parititions exposing different protocol support]
might require different resources.

Implement a new resource learning scheme where the information is
received directly from the management firmware [which has knowledge
of all of the functions and can serve as arbiter].

Signed-off-by: Tomer Tayar <Tomer.Tayar@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_eth_if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 15130805d792..9755a3feb52e 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -22,7 +22,7 @@ struct qed_dev_eth_info {
 	u8	num_tc;
 
 	u8	port_mac[ETH_ALEN];
-	u8	num_vlan_filters;
+	u16	num_vlan_filters;
 	u16	num_mac_filters;
 
 	/* Legacy VF - this affects the datapath, so qede has to know */
-- 
cgit v1.2.3


From 556d299fcb4af8f2e8eacf311c4eee352c746788 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 31 Oct 2016 13:21:02 +0100
Subject: net: pim: add common pimhdr struct and helpers

Add the common pimhdr structure and helpers to access it, also cleanup the
format of the header file.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pim.h | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/pim.h b/include/linux/pim.h
index e1d756f81348..354235a2691b 100644
--- a/include/linux/pim.h
+++ b/include/linux/pim.h
@@ -1,6 +1,7 @@
 #ifndef __LINUX_PIM_H
 #define __LINUX_PIM_H
 
+#include <linux/skbuff.h>
 #include <asm/byteorder.h>
 
 /* Message types - V1 */
@@ -13,20 +14,47 @@
 
 #define PIM_NULL_REGISTER	cpu_to_be32(0x40000000)
 
-static inline bool ipmr_pimsm_enabled(void)
-{
-	return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2);
-}
+/* RFC7761, sec 4.9:
+ * The PIM header common to all PIM messages is:
+ *   0                   1                   2                   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  |PIM Ver| Type  |   Reserved    |           Checksum            |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct pimhdr {
+	__u8	type;
+	__u8	reserved;
+	__be16	csum;
+};
 
 /* PIMv2 register message header layout (ietf-draft-idmr-pimvsm-v2-00.ps */
-struct pimreghdr
-{
+struct pimreghdr {
 	__u8	type;
 	__u8	reserved;
 	__be16	csum;
 	__be32	flags;
 };
 
-struct sk_buff;
-extern int pim_rcv_v1(struct sk_buff *);
+int pim_rcv_v1(struct sk_buff *skb);
+
+static inline bool ipmr_pimsm_enabled(void)
+{
+	return IS_BUILTIN(CONFIG_IP_PIMSM_V1) || IS_BUILTIN(CONFIG_IP_PIMSM_V2);
+}
+
+static inline struct pimhdr *pim_hdr(const struct sk_buff *skb)
+{
+	return (struct pimhdr *)skb_transport_header(skb);
+}
+
+static inline u8 pim_hdr_version(const struct pimhdr *pimhdr)
+{
+	return pimhdr->type >> 4;
+}
+
+static inline u8 pim_hdr_type(const struct pimhdr *pimhdr)
+{
+	return pimhdr->type & 0xf;
+}
 #endif
-- 
cgit v1.2.3


From 20bb6ce9879e19eee7539329eaa2408d12b00306 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 31 Oct 2016 13:21:03 +0100
Subject: net: pim: add a helper to check for IPv4 all pim routers address

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pim.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/pim.h b/include/linux/pim.h
index 354235a2691b..1b6c0dbba94e 100644
--- a/include/linux/pim.h
+++ b/include/linux/pim.h
@@ -57,4 +57,10 @@ static inline u8 pim_hdr_type(const struct pimhdr *pimhdr)
 {
 	return pimhdr->type & 0xf;
 }
+
+/* check if the address is 224.0.0.13, RFC7761 sec 4.3.1 */
+static inline bool pim_ipv4_all_pim_routers(__be32 addr)
+{
+	return addr == htonl(0xE000000D);
+}
 #endif
-- 
cgit v1.2.3


From 56245cae19f5ccb371fa63b09bb6b9ce7c0f1266 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 31 Oct 2016 13:21:04 +0100
Subject: net: pim: add all RFC7761 message types

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pim.h | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pim.h b/include/linux/pim.h
index 1b6c0dbba94e..0e81b2778ae0 100644
--- a/include/linux/pim.h
+++ b/include/linux/pim.h
@@ -10,7 +10,36 @@
 
 /* Message types - V2 */
 #define PIM_VERSION		2
-#define PIM_REGISTER		1
+
+/* RFC7761, sec 4.9:
+ *  Type
+ *        Types for specific PIM messages.  PIM Types are:
+ *
+ *  Message Type                          Destination
+ *  ---------------------------------------------------------------------
+ *  0 = Hello                             Multicast to ALL-PIM-ROUTERS
+ *  1 = Register                          Unicast to RP
+ *  2 = Register-Stop                     Unicast to source of Register
+ *                                        packet
+ *  3 = Join/Prune                        Multicast to ALL-PIM-ROUTERS
+ *  4 = Bootstrap                         Multicast to ALL-PIM-ROUTERS
+ *  5 = Assert                            Multicast to ALL-PIM-ROUTERS
+ *  6 = Graft (used in PIM-DM only)       Unicast to RPF'(S)
+ *  7 = Graft-Ack (used in PIM-DM only)   Unicast to source of Graft
+ *                                        packet
+ *  8 = Candidate-RP-Advertisement        Unicast to Domain's BSR
+ */
+enum {
+	PIM_TYPE_HELLO,
+	PIM_TYPE_REGISTER,
+	PIM_TYPE_REGISTER_STOP,
+	PIM_TYPE_JOIN_PRUNE,
+	PIM_TYPE_BOOTSTRAP,
+	PIM_TYPE_ASSERT,
+	PIM_TYPE_GRAFT,
+	PIM_TYPE_GRAFT_ACK,
+	PIM_TYPE_CANDIDATE_RP_ADV
+};
 
 #define PIM_NULL_REGISTER	cpu_to_be32(0x40000000)
 
-- 
cgit v1.2.3


From bd68a2a854ad5a85f0c8d0a9c8048ca3f6391efb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 31 Oct 2016 13:32:55 -0700
Subject: net: set SK_MEM_QUANTUM to 4096

Systems with large pages (64KB pages for example) do not always have
huge quantity of memory.

A big SK_MEM_QUANTUM value leads to fewer interactions with the
global counters (like tcp_memory_allocated) but might trigger
memory pressure much faster, giving suboptimal TCP performance
since windows are lowered to ridiculous values.

Note that sysctl_mem units being in pages and in ABI, we also need
to change sk_prot_mem_limits() accordingly.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index f13ac87a8015..93331a1492db 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1162,11 +1162,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
 	sk->sk_prot->enter_memory_pressure(sk);
 }
 
-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
-	return sk->sk_prot->sysctl_mem[index];
-}
-
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
@@ -1281,11 +1276,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
 void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+/* We used to have PAGE_SIZE here, but systems with 64KB pages
+ * do not necessarily have 16x time more memory than 4KB ones.
+ */
+#define SK_MEM_QUANTUM 4096
 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND	0
 #define SK_MEM_RECV	1
 
+/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+	long val = sk->sk_prot->sysctl_mem[index];
+
+#if PAGE_SIZE > SK_MEM_QUANTUM
+	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+#elif PAGE_SIZE < SK_MEM_QUANTUM
+	val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
+#endif
+	return val;
+}
+
 static inline int sk_mem_pages(int amt)
 {
 	return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
-- 
cgit v1.2.3


From f6d0cbcf09c506b9b022df8f9d7693a7cec3c732 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 24 Oct 2016 16:56:40 +0200
Subject: netfilter: nf_tables: add fib expression

Add FIB expression, supported for ipv4, ipv6 and inet family (the latter
just dispatches to ipv4 or ipv6 one based on nfproto).

Currently supports fetching output interface index/name and the
rtm_type associated with an address.

This can be used for adding path filtering. rtm_type is useful
to e.g. enforce a strong-end host model where packets
are only accepted if daddr is configured on the interface the
packet arrived on.

The fib expression is a native nftables alternative to the
xtables addrtype and rp_filter matches.

FIB result order for oif/oifname retrieval is as follows:
 - if packet is local (skb has rtable, RTF_LOCAL set, this
   will also catch looped-back multicast packets), set oif to
   the loopback interface.
 - if fib lookup returns an error, or result points to local,
   store zero result.  This means '--local' option of -m rpfilter
   is not supported. It is possible to use 'fib type local' or add
   explicit saddr/daddr matching rules to create exceptions if this
   is really needed.
 - store result in the destination register.
   In case of multiple routes, search set for desired oif in case
   strict matching is requested.

ipv4 and ipv6 behave fib expressions are supposed to behave the same.

[ I have collapsed Arnd Bergmann's ("netfilter: nf_tables: fib warnings")

	http://patchwork.ozlabs.org/patch/688615/

  to address fallout from this patch after rebasing nf-next, that was
  posted to address compilation warnings. --pablo ]

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_fib.h          | 31 +++++++++++++++++++++++++++
 include/uapi/linux/netfilter/nf_tables.h | 36 ++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)
 create mode 100644 include/net/netfilter/nft_fib.h

(limited to 'include')

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
new file mode 100644
index 000000000000..cbedda077db2
--- /dev/null
+++ b/include/net/netfilter/nft_fib.h
@@ -0,0 +1,31 @@
+#ifndef _NFT_FIB_H_
+#define _NFT_FIB_H_
+
+struct nft_fib {
+	enum nft_registers	dreg:8;
+	u8			result;
+	u32			flags;
+};
+
+extern const struct nla_policy nft_fib_policy[];
+
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr);
+int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		 const struct nlattr * const tb[]);
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		     const struct nft_data **data);
+
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+			const struct nft_pktinfo *pkt);
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+		   const struct nft_pktinfo *pkt);
+
+void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+			const struct nft_pktinfo *pkt);
+void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
+		   const struct nft_pktinfo *pkt);
+
+void nft_fib_store_result(void *reg, enum nft_fib_result r,
+			  const struct nft_pktinfo *pkt, int index);
+#endif
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index c6c4477c136b..a054ad2c8853 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1109,6 +1109,42 @@ enum nft_gen_attributes {
 };
 #define NFTA_GEN_MAX		(__NFTA_GEN_MAX - 1)
 
+/*
+ * enum nft_fib_attributes - nf_tables fib expression netlink attributes
+ *
+ * @NFTA_FIB_DREG: destination register (NLA_U32)
+ * @NFTA_FIB_RESULT: desired result (NLA_U32)
+ * @NFTA_FIB_FLAGS: flowi fields to initialize when querying the FIB (NLA_U32)
+ *
+ * The FIB expression performs a route lookup according
+ * to the packet data.
+ */
+enum nft_fib_attributes {
+	NFTA_FIB_UNSPEC,
+	NFTA_FIB_DREG,
+	NFTA_FIB_RESULT,
+	NFTA_FIB_FLAGS,
+	__NFTA_FIB_MAX
+};
+#define NFTA_FIB_MAX (__NFTA_FIB_MAX - 1)
+
+enum nft_fib_result {
+	NFT_FIB_RESULT_UNSPEC,
+	NFT_FIB_RESULT_OIF,
+	NFT_FIB_RESULT_OIFNAME,
+	NFT_FIB_RESULT_ADDRTYPE,
+	__NFT_FIB_RESULT_MAX
+};
+#define NFT_FIB_RESULT_MAX	(__NFT_FIB_RESULT_MAX - 1)
+
+enum nft_fib_flags {
+	NFTA_FIB_F_SADDR	= 1 << 0,	/* look up src */
+	NFTA_FIB_F_DADDR	= 1 << 1,	/* look up dst */
+	NFTA_FIB_F_MARK		= 1 << 2,	/* use skb->mark */
+	NFTA_FIB_F_IIF		= 1 << 3,	/* restrict to iif */
+	NFTA_FIB_F_OIF		= 1 << 4,	/* restrict to oif */
+};
+
 /**
  * enum nft_trace_attributes - nf_tables trace netlink attributes
  *
-- 
cgit v1.2.3


From 1fddf4bad0ac9f4d32c74af286fc1eec2a03c82c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 27 Oct 2016 19:49:42 +0100
Subject: netfilter: nf_log: add packet logging for netdev family

Move layer 2 packet logging into nf_log_l2packet() that resides in
nf_log_common.c, so this can be shared by both bridge and netdev
families.

This patch adds the boiler plate code to register the netdev logging
family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 309cd267be4f..a559aa41253c 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -109,5 +109,10 @@ void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
 			       const struct net_device *out,
 			       const struct nf_loginfo *loginfo,
 			       const char *prefix);
+void nf_log_l2packet(struct net *net, u_int8_t pf, unsigned int hooknum,
+		     const struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     const struct nf_loginfo *loginfo, const char *prefix);
 
 #endif /* _NF_LOG_H */
-- 
cgit v1.2.3


From 8db4c5be88f62ffd7a552f70687a10c614dc697b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 27 Oct 2016 19:49:48 +0100
Subject: netfilter: move socket lookup infrastructure to nf_socket_ipv{4,6}.c

We need this split to reuse existing codebase for the upcoming nf_tables
socket expression.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_socket.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 include/net/netfilter/nf_socket.h

(limited to 'include')

diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h
new file mode 100644
index 000000000000..f2fc39c97d43
--- /dev/null
+++ b/include/net/netfilter/nf_socket.h
@@ -0,0 +1,27 @@
+#ifndef _NF_SOCK_H_
+#define _NF_SOCK_H_
+
+struct net_device;
+struct sk_buff;
+struct sock;
+struct net;
+
+static inline bool nf_sk_is_transparent(struct sock *sk)
+{
+	switch (sk->sk_state) {
+	case TCP_TIME_WAIT:
+		return inet_twsk(sk)->tw_transparent;
+	case TCP_NEW_SYN_RECV:
+		return inet_rsk(inet_reqsk(sk))->no_srccheck;
+	default:
+		return inet_sk(sk)->transparent;
+	}
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+				  const struct net_device *indev);
+
+struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
+				  const struct net_device *indev);
+
+#endif
-- 
cgit v1.2.3


From 2fa841938c648fe4359691f41e8e1f37ff1a3aa2 Mon Sep 17 00:00:00 2001
From: "Anders K. Pedersen" <akp@cohaesio.com>
Date: Fri, 28 Oct 2016 05:54:15 +0000
Subject: netfilter: nf_tables: introduce routing expression

Introduces an nftables rt expression for routing related data with support
for nexthop (i.e. the directly connected IP address that an outgoing packet
is sent to), which can be used either for matching or accounting, eg.

 # nft add rule filter postrouting \
	ip daddr 192.168.1.0/24 rt nexthop != 192.168.0.1 drop

This will drop any traffic to 192.168.1.0/24 that is not routed via
192.168.0.1.

 # nft add rule filter postrouting \
	flow table acct { rt nexthop timeout 600s counter }
 # nft add rule ip6 filter postrouting \
	flow table acct { rt nexthop timeout 600s counter }

These rules count outgoing traffic per nexthop. Note that the timeout
releases an entry if no traffic is seen for this nexthop within 10 minutes.

 # nft add rule inet filter postrouting \
	ether type ip \
	flow table acct { rt nexthop timeout 600s counter }
 # nft add rule inet filter postrouting \
	ether type ip6 \
	flow table acct { rt nexthop timeout 600s counter }

Same as above, but via the inet family, where the ether type must be
specified explicitly.

"rt classid" is also implemented identical to "meta rtclassid", since it
is more logical to have this match in the routing expression going forward.

Signed-off-by: Anders K. Pedersen <akp@cohaesio.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a054ad2c8853..14e5f619167e 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -758,6 +758,19 @@ enum nft_meta_keys {
 	NFT_META_PRANDOM,
 };
 
+/**
+ * enum nft_rt_keys - nf_tables routing expression keys
+ *
+ * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid)
+ * @NFT_RT_NEXTHOP4: routing nexthop for IPv4
+ * @NFT_RT_NEXTHOP6: routing nexthop for IPv6
+ */
+enum nft_rt_keys {
+	NFT_RT_CLASSID,
+	NFT_RT_NEXTHOP4,
+	NFT_RT_NEXTHOP6,
+};
+
 /**
  * enum nft_hash_attributes - nf_tables hash expression netlink attributes
  *
@@ -796,6 +809,20 @@ enum nft_meta_attributes {
 };
 #define NFTA_META_MAX		(__NFTA_META_MAX - 1)
 
+/**
+ * enum nft_rt_attributes - nf_tables routing expression netlink attributes
+ *
+ * @NFTA_RT_DREG: destination register (NLA_U32)
+ * @NFTA_RT_KEY: routing data item to load (NLA_U32: nft_rt_keys)
+ */
+enum nft_rt_attributes {
+	NFTA_RT_UNSPEC,
+	NFTA_RT_DREG,
+	NFTA_RT_KEY,
+	__NFTA_RT_MAX
+};
+#define NFTA_RT_MAX		(__NFTA_RT_MAX - 1)
+
 /**
  * enum nft_ct_keys - nf_tables ct expression keys
  *
-- 
cgit v1.2.3


From bc8ee596afe8f35b379f87575c46d800dd8e7e68 Mon Sep 17 00:00:00 2001
From: Philippe Reynes <tremyfr@gmail.com>
Date: Tue, 1 Nov 2016 16:32:25 +0100
Subject: net: mii: add generic function to support ksetting support

The old ethtool api (get_setting and set_setting) has generic mii
functions mii_ethtool_sset and mii_ethtool_gset.

To support the new ethtool api ({get|set}_link_ksettings), we add
two generics mii function mii_ethtool_{get|set}_link_ksettings_get.

Signed-off-by: Philippe Reynes <tremyfr@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mii.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/mii.h b/include/linux/mii.h
index 47492c9631b3..1629a0c32679 100644
--- a/include/linux/mii.h
+++ b/include/linux/mii.h
@@ -31,7 +31,11 @@ struct mii_if_info {
 extern int mii_link_ok (struct mii_if_info *mii);
 extern int mii_nway_restart (struct mii_if_info *mii);
 extern int mii_ethtool_gset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
+extern int mii_ethtool_get_link_ksettings(
+	struct mii_if_info *mii, struct ethtool_link_ksettings *cmd);
 extern int mii_ethtool_sset(struct mii_if_info *mii, struct ethtool_cmd *ecmd);
+extern int mii_ethtool_set_link_ksettings(
+	struct mii_if_info *mii, const struct ethtool_link_ksettings *cmd);
 extern int mii_check_gmii_support(struct mii_if_info *mii);
 extern void mii_check_link (struct mii_if_info *mii);
 extern unsigned int mii_check_media (struct mii_if_info *mii,
-- 
cgit v1.2.3


From 1610a73c4175e7d63985316b52ac932b65a4dc90 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:12 +0100
Subject: netfilter: kill NF_HOOK_THRESH() and state->tresh

Patch c5136b15ea36 ("netfilter: bridge: add and use br_nf_hook_thresh")
introduced br_nf_hook_thresh().

Replace NF_HOOK_THRESH() by br_nf_hook_thresh from
br_nf_forward_finish(), so we have no more callers for this macro.

As a result, state->thresh and explicit thresh parameter in the hook
state structure is not required anymore. And we can get rid of
skip-hook-under-thresh loop in nf_iterate() in the core path that is
only used by br_netfilter to search for the filter hook.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         | 50 ++++++++++-----------------------------
 include/linux/netfilter_ingress.h |  2 +-
 2 files changed, 14 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index abc7fdcb9eb1..e0d000f6c9bf 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -49,7 +49,6 @@ struct sock;
 
 struct nf_hook_state {
 	unsigned int hook;
-	int thresh;
 	u_int8_t pf;
 	struct net_device *in;
 	struct net_device *out;
@@ -84,7 +83,7 @@ struct nf_hook_entry {
 static inline void nf_hook_state_init(struct nf_hook_state *p,
 				      struct nf_hook_entry *hook_entry,
 				      unsigned int hook,
-				      int thresh, u_int8_t pf,
+				      u_int8_t pf,
 				      struct net_device *indev,
 				      struct net_device *outdev,
 				      struct sock *sk,
@@ -92,7 +91,6 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
 				      int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	p->hook = hook;
-	p->thresh = thresh;
 	p->pf = pf;
 	p->in = indev;
 	p->out = outdev;
@@ -155,20 +153,16 @@ extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
 
 /**
- *	nf_hook_thresh - call a netfilter hook
+ *	nf_hook - call a netfilter hook
  *
  *	Returns 1 if the hook has allowed the packet to pass.  The function
  *	okfn must be invoked by the caller in this case.  Any other return
  *	value indicates the packet has been consumed by the hook.
  */
-static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
-				 struct net *net,
-				 struct sock *sk,
-				 struct sk_buff *skb,
-				 struct net_device *indev,
-				 struct net_device *outdev,
-				 int (*okfn)(struct net *, struct sock *, struct sk_buff *),
-				 int thresh)
+static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
+			  struct sock *sk, struct sk_buff *skb,
+			  struct net_device *indev, struct net_device *outdev,
+			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct nf_hook_entry *hook_head;
 	int ret = 1;
@@ -185,8 +179,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 	if (hook_head) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, hook_head, hook, thresh,
-				   pf, indev, outdev, sk, net, okfn);
+		nf_hook_state_init(&state, hook_head, hook, pf, indev, outdev,
+				   sk, net, okfn);
 
 		ret = nf_hook_slow(skb, &state);
 	}
@@ -195,14 +189,6 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
 	return ret;
 }
 
-static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
-			  struct sock *sk, struct sk_buff *skb,
-			  struct net_device *indev, struct net_device *outdev,
-			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
-{
-	return nf_hook_thresh(pf, hook, net, sk, skb, indev, outdev, okfn, INT_MIN);
-}
-                   
 /* Activate hook; either okfn or kfree_skb called, unless a hook
    returns NF_STOLEN (in which case, it's up to the hook to deal with
    the consequences).
@@ -220,19 +206,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
    coders :)
 */
 
-static inline int
-NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
-	       struct sk_buff *skb, struct net_device *in,
-	       struct net_device *out,
-	       int (*okfn)(struct net *, struct sock *, struct sk_buff *),
-	       int thresh)
-{
-	int ret = nf_hook_thresh(pf, hook, net, sk, skb, in, out, okfn, thresh);
-	if (ret == 1)
-		ret = okfn(net, sk, skb);
-	return ret;
-}
-
 static inline int
 NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	     struct sk_buff *skb, struct net_device *in, struct net_device *out,
@@ -242,7 +215,7 @@ NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 	int ret;
 
 	if (!cond ||
-	    ((ret = nf_hook_thresh(pf, hook, net, sk, skb, in, out, okfn, INT_MIN)) == 1))
+	    ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1))
 		ret = okfn(net, sk, skb);
 	return ret;
 }
@@ -252,7 +225,10 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct
 	struct net_device *in, struct net_device *out,
 	int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
-	return NF_HOOK_THRESH(pf, hook, net, sk, skb, in, out, okfn, INT_MIN);
+	int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+	if (ret == 1)
+		ret = okfn(net, sk, skb);
+	return ret;
 }
 
 /* Call setsockopt() */
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 33e37fb41d5d..fd44e4131710 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -26,7 +26,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	if (unlikely(!e))
 		return 0;
 
-	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN,
+	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
 	return nf_hook_slow(skb, &state);
-- 
cgit v1.2.3


From 06fd3a392bb36ff162d10cb7d5794185b94edb2f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:17 +0100
Subject: netfilter: deprecate NF_STOP

NF_STOP is only used by br_netfilter these days, and it can be emulated
with a combination of NF_STOLEN plus explicit call to the ->okfn()
function as Florian suggests.

To retain binary compatibility with userspace nf_queue application, we
have to keep NF_STOP around, so libnetfilter_queue userspace userspace
applications still work if they use NF_STOP for some exotic reason.

Out of tree modules using NF_STOP would break, but we don't care about
those.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h
index d93f949d1d9a..7550e9176a54 100644
--- a/include/uapi/linux/netfilter.h
+++ b/include/uapi/linux/netfilter.h
@@ -13,7 +13,7 @@
 #define NF_STOLEN 2
 #define NF_QUEUE 3
 #define NF_REPEAT 4
-#define NF_STOP 5
+#define NF_STOP 5	/* Deprecated, for userspace nf_queue compatibility. */
 #define NF_MAX_VERDICT NF_STOP
 
 /* we overload the higher bits for encoding auxiliary data such as the queue
-- 
cgit v1.2.3


From 613dbd95723aee7abd16860745691b6c7bda20dc Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:21 +0100
Subject: netfilter: x_tables: move hook state into xt_action_param structure

Place pointer to hook state in xt_action_param structure instead of
copying the fields that we need. After this change xt_action_param fits
into one cacheline.

This patch also adds a set of new wrapper functions to fetch relevant
hook state structure fields.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 48 ++++++++++++++++++++++++++++++--------
 include/net/netfilter/nf_tables.h  | 11 +++++----
 2 files changed, 44 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 2ad1a2b289b5..cd4eaf8df445 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -4,6 +4,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/static_key.h>
+#include <linux/netfilter.h>
 #include <uapi/linux/netfilter/x_tables.h>
 
 /* Test a struct->invflags and a boolean for inequality */
@@ -17,14 +18,9 @@
  * @target:	the target extension
  * @matchinfo:	per-match data
  * @targetinfo:	per-target data
- * @net		network namespace through which the action was invoked
- * @in:		input netdevice
- * @out:	output netdevice
+ * @state:	pointer to hook state this packet came from
  * @fragoff:	packet is a fragment, this is the data offset
  * @thoff:	position of transport header relative to skb->data
- * @hook:	hook number given packet came from
- * @family:	Actual NFPROTO_* through which the function is invoked
- * 		(helpful when match->family == NFPROTO_UNSPEC)
  *
  * Fields written to by extensions:
  *
@@ -38,15 +34,47 @@ struct xt_action_param {
 	union {
 		const void *matchinfo, *targinfo;
 	};
-	struct net *net;
-	const struct net_device *in, *out;
+	const struct nf_hook_state *state;
 	int fragoff;
 	unsigned int thoff;
-	unsigned int hooknum;
-	u_int8_t family;
 	bool hotdrop;
 };
 
+static inline struct net *xt_net(const struct xt_action_param *par)
+{
+	return par->state->net;
+}
+
+static inline struct net_device *xt_in(const struct xt_action_param *par)
+{
+	return par->state->in;
+}
+
+static inline const char *xt_inname(const struct xt_action_param *par)
+{
+	return par->state->in->name;
+}
+
+static inline struct net_device *xt_out(const struct xt_action_param *par)
+{
+	return par->state->out;
+}
+
+static inline const char *xt_outname(const struct xt_action_param *par)
+{
+	return par->state->out->name;
+}
+
+static inline unsigned int xt_hooknum(const struct xt_action_param *par)
+{
+	return par->state->hook;
+}
+
+static inline u_int8_t xt_family(const struct xt_action_param *par)
+{
+	return par->state->pf;
+}
+
 /**
  * struct xt_mtchk_param - parameters for match extensions'
  * checkentry functions
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5031e072567b..44060344f958 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -30,11 +30,12 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 				   const struct nf_hook_state *state)
 {
 	pkt->skb = skb;
-	pkt->net = pkt->xt.net = state->net;
-	pkt->in = pkt->xt.in = state->in;
-	pkt->out = pkt->xt.out = state->out;
-	pkt->hook = pkt->xt.hooknum = state->hook;
-	pkt->pf = pkt->xt.family = state->pf;
+	pkt->net = state->net;
+	pkt->in = state->in;
+	pkt->out = state->out;
+	pkt->hook = state->hook;
+	pkt->pf = state->pf;
+	pkt->xt.state = state;
 }
 
 static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt,
-- 
cgit v1.2.3


From 0e5a1c7eb3fc705c4cc6c1e058e81d1f2e721c72 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:26 +0100
Subject: netfilter: nf_tables: use hook state from xt_action_param structure

Don't copy relevant fields from hook state structure, instead use the
one that is already available in struct xt_action_param.

This patch also adds a set of new wrapper functions to fetch relevant
hook state structure fields.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 44060344f958..3295fb85bff6 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -14,27 +14,42 @@
 
 struct nft_pktinfo {
 	struct sk_buff			*skb;
-	struct net			*net;
-	const struct net_device		*in;
-	const struct net_device		*out;
-	u8				pf;
-	u8				hook;
 	bool				tprot_set;
 	u8				tprot;
 	/* for x_tables compatibility */
 	struct xt_action_param		xt;
 };
 
+static inline struct net *nft_net(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->net;
+}
+
+static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->hook;
+}
+
+static inline u8 nft_pf(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->pf;
+}
+
+static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->in;
+}
+
+static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->out;
+}
+
 static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 				   struct sk_buff *skb,
 				   const struct nf_hook_state *state)
 {
 	pkt->skb = skb;
-	pkt->net = state->net;
-	pkt->in = state->in;
-	pkt->out = state->out;
-	pkt->hook = state->hook;
-	pkt->pf = state->pf;
 	pkt->xt.state = state;
 }
 
-- 
cgit v1.2.3


From 01886bd91f1ba418ce669dfe97a06ca9504e482a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:35 +0100
Subject: netfilter: remove hook_entries field from nf_hook_state

This field is only useful for nf_queue, so store it in the
nf_queue_entry structure instead, away from the core path. Pass
hook_head to nf_hook_slow().

Since we always have a valid entry on the first iteration in
nf_iterate(), we can use 'do { ... } while (entry)' loop instead.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h         | 10 ++++------
 include/linux/netfilter_ingress.h |  4 ++--
 include/net/netfilter/nf_queue.h  |  1 +
 3 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index e0d000f6c9bf..69230140215b 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -54,7 +54,6 @@ struct nf_hook_state {
 	struct net_device *out;
 	struct sock *sk;
 	struct net *net;
-	struct nf_hook_entry __rcu *hook_entries;
 	int (*okfn)(struct net *, struct sock *, struct sk_buff *);
 };
 
@@ -81,7 +80,6 @@ struct nf_hook_entry {
 };
 
 static inline void nf_hook_state_init(struct nf_hook_state *p,
-				      struct nf_hook_entry *hook_entry,
 				      unsigned int hook,
 				      u_int8_t pf,
 				      struct net_device *indev,
@@ -96,7 +94,6 @@ static inline void nf_hook_state_init(struct nf_hook_state *p,
 	p->out = outdev;
 	p->sk = sk;
 	p->net = net;
-	RCU_INIT_POINTER(p->hook_entries, hook_entry);
 	p->okfn = okfn;
 }
 
@@ -150,7 +147,8 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
 extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
 #endif
 
-int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state);
+int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
+		 struct nf_hook_entry *entry);
 
 /**
  *	nf_hook - call a netfilter hook
@@ -179,10 +177,10 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 	if (hook_head) {
 		struct nf_hook_state state;
 
-		nf_hook_state_init(&state, hook_head, hook, pf, indev, outdev,
+		nf_hook_state_init(&state, hook, pf, indev, outdev,
 				   sk, net, okfn);
 
-		ret = nf_hook_slow(skb, &state);
+		ret = nf_hook_slow(skb, &state, hook_head);
 	}
 	rcu_read_unlock();
 
diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index fd44e4131710..2dc3b49b804a 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -26,10 +26,10 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	if (unlikely(!e))
 		return 0;
 
-	nf_hook_state_init(&state, e, NF_NETDEV_INGRESS,
+	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
-	return nf_hook_slow(skb, &state);
+	return nf_hook_slow(skb, &state, e);
 }
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 2280cfe86c56..09948d10e38e 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -12,6 +12,7 @@ struct nf_queue_entry {
 	unsigned int		id;
 
 	struct nf_hook_state	state;
+	struct nf_hook_entry	*hook;
 	u16			size; /* sizeof(entry) + saved route keys */
 
 	/* extra space to store route keys */
-- 
cgit v1.2.3


From 70ecc24841326396a827deb55c3fefac582a729d Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 2 Nov 2016 11:02:16 -0400
Subject: ipv4: add IP_RECVFRAGSIZE cmsg

The IP stack records the largest fragment of a reassembled packet
in IPCB(skb)->frag_max_size. When reading a datagram or raw packet
that arrived fragmented, expose the value to allow applications to
estimate receive path MTU.

Tested:
  Sent data over a veth pair of which the source has a small mtu.
  Sent data using netcat, received using a dedicated process.

  Verified that the cmsg IP_RECVFRAGSIZE is returned only when
  data arrives fragmented, and in that cases matches the veth mtu.

    ip link add veth0 type veth peer name veth1

    ip netns add from
    ip netns add to

    ip link set dev veth1 netns to
    ip netns exec to ip addr add dev veth1 192.168.10.1/24
    ip netns exec to ip link set dev veth1 up

    ip link set dev veth0 netns from
    ip netns exec from ip addr add dev veth0 192.168.10.2/24
    ip netns exec from ip link set dev veth0 up
    ip netns exec from ip link set dev veth0 mtu 1300
    ip netns exec from ethtool -K veth0 ufo off

    dd if=/dev/zero bs=1 count=1400 2>/dev/null > payload

    ip netns exec to ./recv_cmsg_recvfragsize -4 -u -p 6000 &
    ip netns exec from nc -q 1 -u 192.168.10.1 6000 < payload

  using github.com/wdebruij/kerneltools/blob/master/tests/recvfragsize.c

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 1 +
 include/uapi/linux/in.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 236a81034fef..c9cff977a7fb 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -228,6 +228,7 @@ struct inet_sock {
 #define IP_CMSG_PASSSEC		BIT(5)
 #define IP_CMSG_ORIGDSTADDR	BIT(6)
 #define IP_CMSG_CHECKSUM	BIT(7)
+#define IP_CMSG_RECVFRAGSIZE	BIT(8)
 
 /**
  * sk_to_full_sk - Access to a full socket
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h
index eaf94919291a..4e557f4e9553 100644
--- a/include/uapi/linux/in.h
+++ b/include/uapi/linux/in.h
@@ -117,6 +117,7 @@ struct in_addr {
 #define IP_NODEFRAG     22
 #define IP_CHECKSUM	23
 #define IP_BIND_ADDRESS_NO_PORT	24
+#define IP_RECVFRAGSIZE	25
 
 /* IP_MTU_DISCOVER values */
 #define IP_PMTUDISC_DONT		0	/* Never send DF frames */
-- 
cgit v1.2.3


From 0cc0aa614b4c24b21b2492c0a1753035ee8c6edb Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 2 Nov 2016 11:02:17 -0400
Subject: ipv6: add IPV6_RECVFRAGSIZE cmsg

When reading a datagram or raw packet that arrived fragmented, expose
the maximum fragment size if recorded to allow applications to
estimate receive path MTU.

At this point, the field is only recorded when ipv6 connection
tracking is enabled. A follow-up patch will record this field also
in the ipv6 input path.

Tested using the test for IP_RECVFRAGSIZE plus

  ip netns exec to ip addr add dev veth1 fc07::1/64
  ip netns exec from ip addr add dev veth0 fc07::2/64

  ip netns exec to ./recv_cmsg_recvfragsize -6 -u -p 6000 &
  ip netns exec from nc -q 1 -u fc07::1 6000 < payload

Both with and without enabling connection tracking

  ip6tables -A INPUT -m state --state NEW -p udp -j LOG

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h     | 5 +++--
 include/uapi/linux/in6.h | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ca1ad9ebbc92..1afb6e8d35c3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -229,8 +229,9 @@ struct ipv6_pinfo {
                                 rxflow:1,
 				rxtclass:1,
 				rxpmtu:1,
-				rxorigdstaddr:1;
-				/* 2 bits hole */
+				rxorigdstaddr:1,
+				recvfragsize:1;
+				/* 1 bits hole */
 		} bits;
 		__u16		all;
 	} rxopt;
diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index b39ea4f2e701..46444f8fbee4 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -283,6 +283,7 @@ struct in6_flowlabel_req {
 #define IPV6_RECVORIGDSTADDR    IPV6_ORIGDSTADDR
 #define IPV6_TRANSPARENT        75
 #define IPV6_UNICAST_IF         76
+#define IPV6_RECVFRAGSIZE	77
 
 /*
  * Multicast Routing:
-- 
cgit v1.2.3


From 5976c5f45c40588b90dda173ded9010917f8f45e Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Thu, 3 Nov 2016 13:24:21 +0100
Subject: net/sched: cls_flower: Support matching on SCTP ports

Support matching on SCTP ports in the same way that matching
on TCP and UDP ports is already supported.

Example usage:

tc qdisc add dev eth0 ingress

tc filter add dev eth0 protocol ip parent ffff: \
        flower indev eth0 ip_proto sctp dst_port 80 \
        action drop

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 8fd715f806a2..eb94781757ee 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -447,6 +447,11 @@ enum {
 	TCA_FLOWER_KEY_TCP_DST_MASK,	/* be16 */
 	TCA_FLOWER_KEY_UDP_SRC_MASK,	/* be16 */
 	TCA_FLOWER_KEY_UDP_DST_MASK,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_DST_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_SCTP_SRC,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_DST,	/* be16 */
 	__TCA_FLOWER_MAX,
 };
 
-- 
cgit v1.2.3


From 86741ec25462e4c8cdce6df2f41ead05568c7d5e Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 4 Nov 2016 02:23:41 +0900
Subject: net: core: Add a UID field to struct sock.

Protocol sockets (struct sock) don't have UIDs, but most of the
time, they map 1:1 to userspace sockets (struct socket) which do.

Various operations such as the iptables xt_owner match need
access to the "UID of a socket", and do so by following the
backpointer to the struct socket. This involves taking
sk_callback_lock and doesn't work when there is no socket
because userspace has already called close().

Simplify this by adding a sk_uid field to struct sock whose value
matches the UID of the corresponding struct socket. The semantics
are as follows:

1. Whenever sk_socket is non-null: sk_uid is the same as the UID
   in sk_socket, i.e., matches the return value of sock_i_uid.
   Specifically, the UID is set when userspace calls socket(),
   fchown(), or accept().
2. When sk_socket is NULL, sk_uid is defined as follows:
   - For a socket that no longer has a sk_socket because
     userspace has called close(): the previous UID.
   - For a cloned socket (e.g., an incoming connection that is
     established but on which userspace has not yet called
     accept): the UID of the socket it was cloned from.
   - For a socket that has never had an sk_socket: UID 0 inside
     the user namespace corresponding to the network namespace
     the socket belongs to.

Kernel sockets created by sock_create_kern are a special case
of #1 and sk_uid is the user that created them. For kernel
sockets created at network namespace creation time, such as the
per-processor ICMP and TCP sockets, this is the user that created
the network namespace.

Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 93331a1492db..cf617ee16723 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,6 +419,7 @@ struct sock {
 	u32			sk_max_ack_backlog;
 	__u32			sk_priority;
 	__u32			sk_mark;
+	kuid_t			sk_uid;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
@@ -1664,6 +1665,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	sk->sk_wq = parent->wq;
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
+	sk->sk_uid = SOCK_INODE(parent)->i_uid;
 	security_sock_graft(sk, parent);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1671,6 +1673,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+	return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
 	u32 v = prandom_u32();
-- 
cgit v1.2.3


From 622ec2c9d52405973c9f1ca5116eb1c393adfc7d Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 4 Nov 2016 02:23:42 +0900
Subject: net: core: add UID to flows, rules, and routes

- Define a new FIB rule attributes, FRA_UID_RANGE, to describe a
  range of UIDs.
- Define a RTA_UID attribute for per-UID route lookups and dumps.
- Support passing these attributes to and from userspace via
  rtnetlink. The value INVALID_UID indicates no UID was
  specified.
- Add a UID field to the flow structures.

Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h        | 9 ++++++++-
 include/net/flow.h             | 5 +++++
 include/uapi/linux/fib_rules.h | 6 ++++++
 include/uapi/linux/rtnetlink.h | 1 +
 4 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 456e4a6006ab..8dbfdf728cd8 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -8,6 +8,11 @@
 #include <net/flow.h>
 #include <net/rtnetlink.h>
 
+struct fib_kuid_range {
+	kuid_t start;
+	kuid_t end;
+};
+
 struct fib_rule {
 	struct list_head	list;
 	int			iifindex;
@@ -30,6 +35,7 @@ struct fib_rule {
 	int			suppress_prefixlen;
 	char			iifname[IFNAMSIZ];
 	char			oifname[IFNAMSIZ];
+	struct fib_kuid_range	uid_range;
 	struct rcu_head		rcu;
 };
 
@@ -92,7 +98,8 @@ struct fib_rules_ops {
 	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }, \
-	[FRA_L3MDEV]	= { .type = NLA_U8 }
+	[FRA_L3MDEV]	= { .type = NLA_U8 }, \
+	[FRA_UID_RANGE]	= { .len = sizeof(struct fib_rule_uid_range) }
 
 static inline void fib_rule_get(struct fib_rule *rule)
 {
diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa7716967..51373f3a5e31 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -11,6 +11,7 @@
 #include <linux/in6.h>
 #include <linux/atomic.h>
 #include <net/flow_dissector.h>
+#include <linux/uidgid.h>
 
 /*
  * ifindex generation is per-net namespace, and loopback is
@@ -37,6 +38,7 @@ struct flowi_common {
 #define FLOWI_FLAG_SKIP_NH_OIF		0x04
 	__u32	flowic_secid;
 	struct flowi_tunnel flowic_tun_key;
+	kuid_t  flowic_uid;
 };
 
 union flowi_uli {
@@ -74,6 +76,7 @@ struct flowi4 {
 #define flowi4_flags		__fl_common.flowic_flags
 #define flowi4_secid		__fl_common.flowic_secid
 #define flowi4_tun_key		__fl_common.flowic_tun_key
+#define flowi4_uid		__fl_common.flowic_uid
 
 	/* (saddr,daddr) must be grouped, same order as in IP header */
 	__be32			saddr;
@@ -131,6 +134,7 @@ struct flowi6 {
 #define flowi6_flags		__fl_common.flowic_flags
 #define flowi6_secid		__fl_common.flowic_secid
 #define flowi6_tun_key		__fl_common.flowic_tun_key
+#define flowi6_uid		__fl_common.flowic_uid
 	struct in6_addr		daddr;
 	struct in6_addr		saddr;
 	/* Note: flowi6_tos is encoded in flowlabel, too. */
@@ -176,6 +180,7 @@ struct flowi {
 #define flowi_flags	u.__fl_common.flowic_flags
 #define flowi_secid	u.__fl_common.flowic_secid
 #define flowi_tun_key	u.__fl_common.flowic_tun_key
+#define flowi_uid	u.__fl_common.flowic_uid
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 14404b3ebb89..bbf02a63a011 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -29,6 +29,11 @@ struct fib_rule_hdr {
 	__u32		flags;
 };
 
+struct fib_rule_uid_range {
+	__u32		start;
+	__u32		end;
+};
+
 enum {
 	FRA_UNSPEC,
 	FRA_DST,	/* destination address */
@@ -51,6 +56,7 @@ enum {
 	FRA_OIFNAME,
 	FRA_PAD,
 	FRA_L3MDEV,	/* iif or oif is l3mdev goto its table */
+	FRA_UID_RANGE,	/* UID range */
 	__FRA_MAX
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 5a78be518101..e14377f2ec27 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -318,6 +318,7 @@ enum rtattr_type_t {
 	RTA_ENCAP,
 	RTA_EXPIRES,
 	RTA_PAD,
+	RTA_UID,
 	__RTA_MAX
 };
 
-- 
cgit v1.2.3


From e2d118a1cb5e60d077131a09db1d81b90a5295fe Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 4 Nov 2016 02:23:43 +0900
Subject: net: inet: Support UID-based routing in IP protocols.

- Use the UID in routing lookups made by protocol connect() and
  sendmsg() functions.
- Make sure that routing lookups triggered by incoming packets
  (e.g., Path MTU discovery) take the UID of the socket into
  account.
- For packets not associated with a userspace socket, (e.g., ping
  replies) use UID 0 inside the user namespace corresponding to
  the network namespace the socket belongs to. This allows
  all namespaces to apply routing and iptables rules to
  kernel-originated traffic in that namespaces by matching UID 0.
  This is better than using the UID of the kernel socket that is
  sending the traffic, because the UID of kernel sockets created
  at namespace creation time (e.g., the per-processor ICMP and
  TCP sockets) is the UID of the user that created the socket,
  which might not be mapped in the namespace.

Tested: compiles allnoconfig, allyesconfig, allmodconfig
Tested: https://android-review.googlesource.com/253302
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h      | 4 +++-
 include/net/ip.h        | 1 +
 include/net/ip6_route.h | 5 +++--
 include/net/route.h     | 5 +++--
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/flow.h b/include/net/flow.h
index 51373f3a5e31..6bbbca8af8e3 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -96,7 +96,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 				      __u32 mark, __u8 tos, __u8 scope,
 				      __u8 proto, __u8 flags,
 				      __be32 daddr, __be32 saddr,
-				      __be16 dport, __be16 sport)
+				      __be16 dport, __be16 sport,
+				      kuid_t uid)
 {
 	fl4->flowi4_oif = oif;
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
@@ -107,6 +108,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 	fl4->flowi4_flags = flags;
 	fl4->flowi4_secid = 0;
 	fl4->flowi4_tun_key.tun_id = 0;
+	fl4->flowi4_uid = uid;
 	fl4->daddr = daddr;
 	fl4->saddr = saddr;
 	fl4->fl4_dport = dport;
diff --git a/include/net/ip.h b/include/net/ip.h
index 5413883ac47f..55cdaac02957 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -179,6 +179,7 @@ struct ip_reply_arg {
 				/* -1 if not needed */ 
 	int	    bound_dev_if;
 	u8  	    tos;
+	kuid_t	    uid;
 }; 
 
 #define IP_REPLY_ARG_NOSRCCHECK 1
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f83e78d071a3..9dc2c182a263 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -140,9 +140,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 		  const struct in6_addr *gwaddr);
 
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
-		     u32 mark);
+		     u32 mark, kuid_t uid);
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark);
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+		  kuid_t uid);
 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
 			    u32 mark);
 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
diff --git a/include/net/route.h b/include/net/route.h
index 0429d47cad25..c0874c87c173 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -153,7 +153,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
 	flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
 			   RT_SCOPE_UNIVERSE, proto,
 			   sk ? inet_sk_flowi_flags(sk) : 0,
-			   daddr, saddr, dport, sport);
+			   daddr, saddr, dport, sport, sock_net_uid(net, sk));
 	if (sk)
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
 	return ip_route_output_flow(net, fl4, sk);
@@ -269,7 +269,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
 	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags, dst, src, dport, sport);
+			   protocol, flow_flags, dst, src, dport, sport,
+			   sk->sk_uid);
 }
 
 static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
-- 
cgit v1.2.3


From 68f929ff2654bced015ccb9b5555667f46f88dfa Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 3 Nov 2016 17:12:06 +0000
Subject: debugfs: constify argument to debugfs_real_fops()

seq_file users can only access const version of file pointer,
because the ->file member of struct seq_operations is marked
as such.  Make parameter to debugfs_real_fops() const.

CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CC: Nicolai Stange <nicstange@gmail.com>
CC: Christian Lamparter <chunkeey@gmail.com>
CC: LKML <linux-kernel@vger.kernel.org>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/debugfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4d3f0d1aec73..bf1907d96097 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -52,7 +52,8 @@ extern struct srcu_struct debugfs_srcu;
  * Must only be called under the protection established by
  * debugfs_use_file_start().
  */
-static inline const struct file_operations *debugfs_real_fops(struct file *filp)
+static inline const struct file_operations *
+debugfs_real_fops(const struct file *filp)
 	__must_hold(&debugfs_srcu)
 {
 	/*
-- 
cgit v1.2.3


From 5b4e2900512321435a5cd7dd77f58f23f3109950 Mon Sep 17 00:00:00 2001
From: Jon Mason <jon.mason@broadcom.com>
Date: Fri, 4 Nov 2016 01:10:56 -0400
Subject: net: phy: broadcom: add bcm54xx_auxctl_read

Add a helper function to read the AUXCTL register for the BCM54xx.  This
mirrors the bcm54xx_auxctl_write function already present in the code.

Signed-off-by: Jon Mason <jon.mason@broadcom.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 60def78c4e12..0ed66914b61c 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -110,6 +110,7 @@
 #define MII_BCM54XX_AUXCTL_MISC_FORCE_AMDIX	0x0200
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
+#define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
-- 
cgit v1.2.3


From b14995ac2527b43a75c9190fbd4efd43fb1f4562 Mon Sep 17 00:00:00 2001
From: Jon Mason <jon.mason@broadcom.com>
Date: Fri, 4 Nov 2016 01:10:58 -0400
Subject: net: phy: broadcom: Add BCM54810 PHY entry

The BCM54810 PHY requires some semi-unique configuration, which results
in some additional configuration in addition to the standard config.
Also, some users of the BCM54810 require the PHY lanes to be swapped.
Since there is no way to detect this, add a device tree query to see if
it is applicable.

Inspired-by: Vikas Soni <vsoni@broadcom.com>
Signed-off-by: Jon Mason <jon.mason@broadcom.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 0ed66914b61c..848dc508ef57 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -13,6 +13,7 @@
 #define PHY_ID_BCM5241			0x0143bc30
 #define PHY_ID_BCMAC131			0x0143bc70
 #define PHY_ID_BCM5481			0x0143bca0
+#define PHY_ID_BCM54810			0x03625d00
 #define PHY_ID_BCM5482			0x0143bcb0
 #define PHY_ID_BCM5411			0x00206070
 #define PHY_ID_BCM5421			0x002060e0
@@ -56,6 +57,7 @@
 #define PHY_BRCM_EXT_IBND_TX_ENABLE	0x00002000
 #define PHY_BRCM_CLEAR_RGMII_MODE	0x00004000
 #define PHY_BRCM_DIS_TXCRXC_NOENRGY	0x00008000
+
 /* Broadcom BCM7xxx specific workarounds */
 #define PHY_BRCM_7XXX_REV(x)		(((x) >> 8) & 0xff)
 #define PHY_BRCM_7XXX_PATCH(x)		((x) & 0xff)
@@ -111,6 +113,7 @@
 #define MII_BCM54XX_AUXCTL_MISC_RDSEL_MISC	0x7000
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 #define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	(1 << 8)
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
@@ -192,6 +195,12 @@
 #define BCM5482_SSD_SGMII_SLAVE_EN	0x0002	/* Slave mode enable */
 #define BCM5482_SSD_SGMII_SLAVE_AD	0x0001	/* Slave auto-detection */
 
+/* BCM54810 Registers */
+#define BCM54810_EXP_BROADREACH_LRE_MISC_CTL	(MII_BCM54XX_EXP_SEL_ER + 0x90)
+#define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN	(1 << 0)
+#define BCM54810_SHD_CLK_CTL			0x3
+#define BCM54810_SHD_CLK_CTL_GTXCLK_EN		(1 << 9)
+
 
 /*****************************************************************************/
 /* Fast Ethernet Transceiver definitions. */
-- 
cgit v1.2.3


From ad959036a70890bea121403c6a4e373dff5b7311 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 Nov 2016 11:28:58 +0100
Subject: net/sock: add an explicit sk argument for ip_cmsg_recv_offset()

So that we can use it even after orphaining the skbuff.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip.h b/include/net/ip.h
index 55cdaac02957..f48c67cab222 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -579,7 +579,8 @@ int ip_options_rcv_srr(struct sk_buff *skb);
  */
 
 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset);
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+			 struct sk_buff *skb, int tlen, int offset);
 int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
 		 struct ipcm_cookie *ipc, bool allow_ipv6);
 int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
@@ -601,7 +602,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
 
 static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
-	ip_cmsg_recv_offset(msg, skb, 0, 0);
+	ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
 }
 
 bool icmp_global_allow(void);
-- 
cgit v1.2.3


From 7c13f97ffde63cc792c49ec1513f3974f2f05229 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 Nov 2016 11:28:59 +0100
Subject: udp: do fwd memory scheduling on dequeue

A new argument is added to __skb_recv_datagram to provide
an explicit skb destructor, invoked under the receive queue
lock.
The UDP protocol uses such argument to perform memory
reclaiming on dequeue, so that the UDP protocol does not
set anymore skb->desctructor.
Instead explicit memory reclaiming is performed at close() time and
when skbs are removed from the receive queue.
The in kernel UDP protocol users now need to call a
skb_recv_udp() variant instead of skb_recv_datagram() to
properly perform memory accounting on dequeue.

Overall, this allows acquiring only once the receive queue
lock on dequeue.

Tested using pktgen with random src port, 64 bytes packet,
wire-speed on a 10G link as sender and udp_sink as the receiver,
using an l4 tuple rxhash to stress the contention, and one or more
udp_sink instances with reuseport.

nr sinks	vanilla		patched
1		440		560
3		2150		2300
6		3650		3800
9		4450		4600
12		6250		6450

v1 -> v2:
 - do rmem and allocated memory scheduling under the receive lock
 - do bulk scheduling in first_packet_length() and in udp_destruct_sock()
 - avoid the typdef for the dequeue callback

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  4 ++++
 include/net/udp.h      | 15 +++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cc6e23eaac91..a4aeeca7e805 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3033,9 +3033,13 @@ static inline void skb_frag_list_init(struct sk_buff *skb)
 int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
 				const struct sk_buff *skb);
 struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags,
+					void (*destructor)(struct sock *sk,
+							   struct sk_buff *skb),
 					int *peeked, int *off, int *err,
 					struct sk_buff **last);
 struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
+				    void (*destructor)(struct sock *sk,
+						       struct sk_buff *skb),
 				    int *peeked, int *off, int *err);
 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock,
 				  int *err);
diff --git a/include/net/udp.h b/include/net/udp.h
index 6134f37ba3ab..e6e4e19be387 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -248,6 +248,21 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 /* net/ipv4/udp.c */
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
+static inline struct sk_buff *
+__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *peeked,
+	       int *off, int *err)
+{
+	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				   udp_skb_destructor, peeked, off, err);
+}
+static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
+					   int noblock, int *err)
+{
+	int peeked, off = 0;
+
+	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
+}
 
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
-- 
cgit v1.2.3


From d0a81f67cd6286d32f42a167d19c7a387c23db79 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 3 Nov 2016 14:56:01 +0100
Subject: net: make default TX queue length a defined constant

The default TX queue length of Ethernet devices have been a magic
constant of 1000, ever since the initial git import.

Looking back in historical trees[1][2] the value used to be 100,
with the same comment "Ethernet wants good queues". The commit[3]
that changed this from 100 to 1000 didn't describe why, but from
conversations with Robert Olsson it seems that it was changed
when Ethernet devices went from 100Mbit/s to 1Gbit/s, because the
link speed increased x10 the queue size were also adjusted.  This
value later caused much heartache for the bufferbloat community.

This patch merely moves the value into a defined constant.

[1] https://git.kernel.org/cgit/linux/kernel/git/davem/netdev-vger-cvs.git/
[2] https://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/
[3] https://git.kernel.org/tglx/history/c/98921832c232

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index cd334c9584e9..f1b76b8e6d2d 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -6,6 +6,8 @@
 #include <linux/if_vlan.h>
 #include <net/sch_generic.h>
 
+#define DEFAULT_TX_QUEUE_LEN	1000
+
 struct qdisc_walker {
 	int	stop;
 	int	skip;
-- 
cgit v1.2.3


From 67db3e4bfbc90657c7be840aad5585be46240d6f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 4 Nov 2016 11:54:32 -0700
Subject: tcp: no longer hold ehash lock while calling tcp_get_info()

We had various problems in the past in tcp_get_info() and used
specific synchronization to avoid deadlocks.

We would like to add more instrumentation points for TCP, and
avoiding grabing socket lock in tcp_getinfo() was too costly.

Being able to lock the socket allows to provide consistent set
of fields.

inet_diag_dump_icsk() can make sure ehash locks are not
held any more when tcp_get_info() is called.

We can remove syncp added in commit d654976cbf85
("tcp: fix a potential deadlock in tcp_get_info()"), but we need
to use lock_sock_fast() instead of spin_lock_bh() since TCP input
path can now be run from process context.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a17ae7b85218..32a7c7e35b71 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -176,8 +176,6 @@ struct tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
-	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
-
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
-- 
cgit v1.2.3


From 5a3c7805c444d9d55f302a4b3930e8758be13fab Mon Sep 17 00:00:00 2001
From: Joachim Eastwood <manabian@gmail.com>
Date: Sat, 5 Nov 2016 14:04:52 +0100
Subject: Revert "net: stmmac: allow to split suspend/resume from init/exit
 callbacks"

Instead of adding hooks inside stmmac_platform it is better to just use
the standard PM callbacks within the specific dwmac-driver. This only
used by the dwmac-rk driver.

This reverts commit cecbc5563a02 ("stmmac: allow to split suspend/resume
from init/exit callbacks").

Signed-off-by: Joachim Eastwood <manabian@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/stmmac.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 705840e0438f..3537fb33cc90 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -135,8 +135,6 @@ struct plat_stmmacenet_data {
 	void (*bus_setup)(void __iomem *ioaddr);
 	int (*init)(struct platform_device *pdev, void *priv);
 	void (*exit)(struct platform_device *pdev, void *priv);
-	void (*suspend)(struct platform_device *pdev, void *priv);
-	void (*resume)(struct platform_device *pdev, void *priv);
 	void *bsp_priv;
 	struct stmmac_axi *axi;
 	int has_gmac4;
-- 
cgit v1.2.3


From 9ce183b4c4d24559467d7712e313f2b3f9277437 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:36 +0200
Subject: net/sched: act_tunnel_key: add helper inlines to access
 tcf_tunnel_key

Needed for drivers to pick the relevant action when offloading tunnel
key act.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_tunnel_key.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'include')

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index 253f8da6c2a6..efef0b4b1b2b 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -12,6 +12,8 @@
 #define __NET_TC_TUNNEL_KEY_H
 
 #include <net/act_api.h>
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/dst_metadata.h>
 
 struct tcf_tunnel_key_params {
 	struct rcu_head		rcu;
@@ -27,4 +29,39 @@ struct tcf_tunnel_key {
 
 #define to_tunnel_key(a) ((struct tcf_tunnel_key *)a)
 
+static inline bool is_tcf_tunnel_set(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+
+	if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
+		return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET;
+#endif
+	return false;
+}
+
+static inline bool is_tcf_tunnel_release(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+
+	if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
+		return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE;
+#endif
+	return false;
+}
+
+static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+
+	return &params->tcft_enc_metadata->u.tun_info;
+#else
+	return NULL;
+#endif
+}
 #endif /* __NET_TC_TUNNEL_KEY_H */
-- 
cgit v1.2.3


From 9ba6a9a9f7a42673e9fc08ff3594f64caae64d3c Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:37 +0200
Subject: flow_dissector: Add enums for encapsulation keys

New encapsulation keys were added to the flower classifier, which allow
classification according to outer (encapsulation) headers attributes
such as key and IP addresses.
In order to expose those attributes outside flower, add
corresponding enums in the flow dissector.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d9534927d93b..4e7cf38a7750 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -128,6 +128,10 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
 	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
 	FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
+	FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
+	FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
-- 
cgit v1.2.3


From f4d997fd613001e612543339e0275c037f94ffe9 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:39 +0200
Subject: net/sched: cls_flower: Add UDP port to tunnel parameters

The current IP tunneling classification supports only IP addresses and key.
Enhance UDP based IP tunneling classification parameters by adding UDP
src and dst port.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 1 +
 include/uapi/linux/pkt_cls.h | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 4e7cf38a7750..c4f31666afd2 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -132,6 +132,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
 	FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 	FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
+	FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index eb94781757ee..86786d45ee66 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -452,6 +452,11 @@ enum {
 
 	TCA_FLOWER_KEY_SCTP_SRC,	/* be16 */
 	TCA_FLOWER_KEY_SCTP_DST,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_DST_PORT,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,	/* be16 */
 	__TCA_FLOWER_MAX,
 };
 
-- 
cgit v1.2.3


From 24ba898d43e87f9ac87353c7a13eef4ee726cab7 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:40 +0200
Subject: net/dst: Add dst port to dst_metadata utility functions

Add dst port parameter to __ip_tun_set_dst and __ipv6_tun_set_dst
utility functions.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_metadata.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 6965c8f68ade..701fc814d0af 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -115,6 +115,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb
 static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
 						    __be32 daddr,
 						    __u8 tos, __u8 ttl,
+						    __be16 tp_dst,
 						    __be16 flags,
 						    __be64 tunnel_id,
 						    int md_size)
@@ -127,7 +128,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
 
 	ip_tunnel_key_init(&tun_dst->u.tun_info.key,
 			   saddr, daddr, tos, ttl,
-			   0, 0, 0, tunnel_id, flags);
+			   0, 0, tp_dst, tunnel_id, flags);
 	return tun_dst;
 }
 
@@ -139,12 +140,13 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 	const struct iphdr *iph = ip_hdr(skb);
 
 	return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
-				flags, tunnel_id, md_size);
+				0, flags, tunnel_id, md_size);
 }
 
 static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr,
 						      const struct in6_addr *daddr,
 						      __u8 tos, __u8 ttl,
+						      __be16 tp_dst,
 						      __be32 label,
 						      __be16 flags,
 						      __be64 tunnel_id,
@@ -162,7 +164,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad
 	info->key.tun_flags = flags;
 	info->key.tun_id = tunnel_id;
 	info->key.tp_src = 0;
-	info->key.tp_dst = 0;
+	info->key.tp_dst = tp_dst;
 
 	info->key.u.ipv6.src = *saddr;
 	info->key.u.ipv6.dst = *daddr;
@@ -183,7 +185,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
 
 	return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr,
 				  ipv6_get_dsfield(ip6h), ip6h->hop_limit,
-				  ip6_flowlabel(ip6h), flags, tunnel_id,
+				  0, ip6_flowlabel(ip6h), flags, tunnel_id,
 				  md_size);
 }
 #endif /* __NET_DST_METADATA_H */
-- 
cgit v1.2.3


From 75bfbca01e48d2d62e8321609ae32aaf6c6fab0e Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:41 +0200
Subject: net/sched: act_tunnel_key: Add UDP dst port option

The current tunnel set action supports only IP addresses and key
options. Add UDP dst port option.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tc_act/tc_tunnel_key.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
index 890106ff16e6..84ea55e1076b 100644
--- a/include/uapi/linux/tc_act/tc_tunnel_key.h
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -33,6 +33,7 @@ enum {
 	TCA_TUNNEL_KEY_ENC_IPV6_DST,	/* struct in6_addr */
 	TCA_TUNNEL_KEY_ENC_KEY_ID,	/* be64 */
 	TCA_TUNNEL_KEY_PAD,
+	TCA_TUNNEL_KEY_ENC_DST_PORT,	/* be16 */
 	__TCA_TUNNEL_KEY_MAX,
 };
 
-- 
cgit v1.2.3


From c9f1b073d0d750ccf8b30b272d1d76479f4cccbc Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:44 +0200
Subject: net/mlx5: Add creation flags when adding new flow table

When creating flow tables, allow the caller to specify creation flags.
Currently no flags are used and as such this patch doesn't add any new
functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/fs.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 0dcd287f4bd0..ab1a5fd2e995 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -42,6 +42,10 @@ enum {
 	MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO	= 1 << 16,
 };
 
+enum {
+	MLX5_FLOW_TABLE_TUNNEL_EN = BIT(0),
+};
+
 #define LEFTOVERS_RULE_NUM	 2
 static inline void build_leftovers_ft_param(int *priority,
 					    int *n_ent,
@@ -97,13 +101,15 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
 				    int prio,
 				    int num_flow_table_entries,
 				    int max_num_groups,
-				    u32 level);
+				    u32 level,
+				    u32 flags);
 
 struct mlx5_flow_table *
 mlx5_create_flow_table(struct mlx5_flow_namespace *ns,
 		       int prio,
 		       int num_flow_table_entries,
-		       u32 level);
+		       u32 level,
+		       u32 flags);
 struct mlx5_flow_table *
 mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns,
 			     int prio,
-- 
cgit v1.2.3


From 66958ed906b87816314c0517f05fe0b5766ec7fe Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:45 +0200
Subject: net/mlx5: Support encap id when setting new steering entry

In order to support steering rules which add encapsulation headers,
encap_id parameter is needed.

Add new mlx5_flow_act struct which holds action related parameter:
action, flow_tag and encap_id. Use mlx5_flow_act struct when adding a new
steering rule.
This patch doesn't change any functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/fs.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index ab1a5fd2e995..949b24b6c479 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -130,14 +130,19 @@ struct mlx5_flow_group *
 mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in);
 void mlx5_destroy_flow_group(struct mlx5_flow_group *fg);
 
+struct mlx5_flow_act {
+	u32 action;
+	u32 flow_tag;
+	u32 encap_id;
+};
+
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
  */
 struct mlx5_flow_handle *
 mlx5_add_flow_rules(struct mlx5_flow_table *ft,
 		    struct mlx5_flow_spec *spec,
-		    u32 action,
-		    u32 flow_tag,
+		    struct mlx5_flow_act *flow_act,
 		    struct mlx5_flow_destination *dest,
 		    int dest_num);
 void mlx5_del_flow_rules(struct mlx5_flow_handle *fr);
-- 
cgit v1.2.3


From 4e24877e61e8507c0843e4bddbc6ecccbfd2e87d Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sun, 6 Nov 2016 21:15:51 +0800
Subject: netfilter: nf_tables: simplify the basic expressions' init routine

Some basic expressions are built into nf_tables.ko, such as nft_cmp,
nft_lookup, nft_range and so on. But these basic expressions' init
routine is a little ugly, too many goto errX labels, and we forget
to call nft_range_module_exit in the exit routine, although it is
harmless.

Acctually, the init and exit routines of these basic expressions
are same, i.e. do nft_register_expr in the init routine and do
nft_unregister_expr in the exit routine.

So it's better to arrange them into an array and deal with them
together.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 00f4f6b1b1ba..862373d4ea9d 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -1,12 +1,18 @@
 #ifndef _NET_NF_TABLES_CORE_H
 #define _NET_NF_TABLES_CORE_H
 
+extern struct nft_expr_type nft_imm_type;
+extern struct nft_expr_type nft_cmp_type;
+extern struct nft_expr_type nft_lookup_type;
+extern struct nft_expr_type nft_bitwise_type;
+extern struct nft_expr_type nft_byteorder_type;
+extern struct nft_expr_type nft_payload_type;
+extern struct nft_expr_type nft_dynset_type;
+extern struct nft_expr_type nft_range_type;
+
 int nf_tables_core_module_init(void);
 void nf_tables_core_module_exit(void);
 
-int nft_immediate_module_init(void);
-void nft_immediate_module_exit(void);
-
 struct nft_cmp_fast_expr {
 	u32			data;
 	enum nft_registers	sreg:8;
@@ -25,24 +31,6 @@ static inline u32 nft_cmp_fast_mask(unsigned int len)
 
 extern const struct nft_expr_ops nft_cmp_fast_ops;
 
-int nft_cmp_module_init(void);
-void nft_cmp_module_exit(void);
-
-int nft_range_module_init(void);
-void nft_range_module_exit(void);
-
-int nft_lookup_module_init(void);
-void nft_lookup_module_exit(void);
-
-int nft_dynset_module_init(void);
-void nft_dynset_module_exit(void);
-
-int nft_bitwise_module_init(void);
-void nft_bitwise_module_exit(void);
-
-int nft_byteorder_module_init(void);
-void nft_byteorder_module_exit(void);
-
 struct nft_payload {
 	enum nft_payload_bases	base:8;
 	u8			offset;
@@ -62,7 +50,4 @@ struct nft_payload_set {
 extern const struct nft_expr_ops nft_payload_fast_ops;
 extern struct static_key_false nft_trace_enabled;
 
-int nft_payload_module_init(void);
-void nft_payload_module_exit(void);
-
 #endif /* _NET_NF_TABLES_CORE_H */
-- 
cgit v1.2.3


From 0e54d2179f650bac80d89a9def429dbdbed58c11 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 7 Nov 2016 18:31:17 +0100
Subject: netfilter: conntrack: simplify init/uninit of L4 protocol trackers

modify registration and deregistration of layer-4 protocol trackers to
facilitate inclusion of new elements into the current list of builtin
protocols. Both builtin (TCP, UDP, ICMP) and non-builtin (DCCP, GRE, SCTP,
UDPlite) layer-4 protocol trackers usually register/deregister themselves
using consecutive calls to nf_ct_l4proto_{,pernet}_{,un}register(...).
This sequence is interrupted and rolled back in case of error; in order to
simplify addition of builtin protocols, the input of the above functions
has been modified to allow registering/unregistering multiple protocols.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index de629f1520df..2152b70626d5 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -125,14 +125,24 @@ struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto,
 void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p);
 
 /* Protocol pernet registration. */
+int nf_ct_l4proto_pernet_register_one(struct net *net,
+				      struct nf_conntrack_l4proto *proto);
+void nf_ct_l4proto_pernet_unregister_one(struct net *net,
+					 struct nf_conntrack_l4proto *proto);
 int nf_ct_l4proto_pernet_register(struct net *net,
-				  struct nf_conntrack_l4proto *proto);
+				  struct nf_conntrack_l4proto *proto[],
+				  unsigned int num_proto);
 void nf_ct_l4proto_pernet_unregister(struct net *net,
-				     struct nf_conntrack_l4proto *proto);
+				     struct nf_conntrack_l4proto *proto[],
+				     unsigned int num_proto);
 
 /* Protocol global registration. */
-int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto);
-void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto);
+int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto);
+void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *proto);
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[],
+			   unsigned int num_proto);
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[],
+			      unsigned int num_proto);
 
 /* Generic netlink helpers */
 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
-- 
cgit v1.2.3


From 3f11ec045fecf2c0fb21f08f68ebc9237bd1d03c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= <asbjorn@asbjorn.st>
Date: Mon, 7 Nov 2016 20:39:24 +0000
Subject: net: l2tp: change L2TP_ATTR_UDP_ZERO_CSUM6_{RX, TX} attribute types

The attributes L2TP_ATTR_UDP_ZERO_CSUM6_RX and
L2TP_ATTR_UDP_ZERO_CSUM6_TX are used as flags,
but is defined as a u8 in a comment.

This patch redocuments them as flags.

Adding nla_policy entries would break API, so not doing that.

CC: Tom Herbert <therbert@google.com>
Signed-off-by: Asbjoern Sloth Toennesen <asbjorn@asbjorn.st>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 4bd27d0270a2..5daa48e2571e 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -124,8 +124,8 @@ enum {
 	L2TP_ATTR_STATS,		/* nested */
 	L2TP_ATTR_IP6_SADDR,		/* struct in6_addr */
 	L2TP_ATTR_IP6_DADDR,		/* struct in6_addr */
-	L2TP_ATTR_UDP_ZERO_CSUM6_TX,	/* u8 */
-	L2TP_ATTR_UDP_ZERO_CSUM6_RX,	/* u8 */
+	L2TP_ATTR_UDP_ZERO_CSUM6_TX,	/* flag */
+	L2TP_ATTR_UDP_ZERO_CSUM6_RX,	/* flag */
 	L2TP_ATTR_PAD,
 	__L2TP_ATTR_MAX,
 };
-- 
cgit v1.2.3


From 1ababeba4a21f3dba3da3523c670b207fb2feb62 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:39 +0100
Subject: ipv6: implement dataplane support for rthdr type 4 (Segment Routing
 Header)

Implement minimal support for processing of SR-enabled packets
as described in
https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-02.

This patch implements the following operations:
- Intermediate segment endpoint: incrementation of active segment and rerouting.
- Egress for SR-encapsulated packets: decapsulation of outer IPv6 header + SRH
  and routing of inner packet.
- Cleanup flag support for SR-inlined packets: removal of SRH if we are the
  penultimate segment endpoint.

A per-interface sysctl seg6_enabled is provided, to accept/deny SR-enabled
packets. Default is deny.

This patch does not provide support for HMAC-signed packets.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      |  1 +
 include/linux/seg6.h      |  6 ++++++
 include/net/seg6.h        | 36 +++++++++++++++++++++++++++++++
 include/uapi/linux/ipv6.h |  2 ++
 include/uapi/linux/seg6.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 99 insertions(+)
 create mode 100644 include/linux/seg6.h
 create mode 100644 include/net/seg6.h
 create mode 100644 include/uapi/linux/seg6.h

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1afb6e8d35c3..68d3f71f0abf 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -64,6 +64,7 @@ struct ipv6_devconf {
 	} stable_secret;
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
+	__s32		seg6_enabled;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/linux/seg6.h b/include/linux/seg6.h
new file mode 100644
index 000000000000..7a66d2b4c5a6
--- /dev/null
+++ b/include/linux/seg6.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_H
+#define _LINUX_SEG6_H
+
+#include <uapi/linux/seg6.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
new file mode 100644
index 000000000000..4dd52a7e95f1
--- /dev/null
+++ b/include/net/seg6.h
@@ -0,0 +1,36 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_H
+#define _NET_SEG6_H
+
+static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
+				     __be32 to)
+{
+	__be32 diff[] = { ~from, to };
+
+	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
+}
+
+static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
+				      __be32 *to)
+{
+	__be32 diff[] = {
+		~from[0], ~from[1], ~from[2], ~from[3],
+		to[0], to[1], to[2], to[3],
+	};
+
+	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
+}
+
+#endif
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 8c2772340c3f..7ff1d654e333 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -39,6 +39,7 @@ struct in6_ifreq {
 #define IPV6_SRCRT_STRICT	0x01	/* Deprecated; will be removed */
 #define IPV6_SRCRT_TYPE_0	0	/* Deprecated; will be removed */
 #define IPV6_SRCRT_TYPE_2	2	/* IPv6 type 2 Routing Header	*/
+#define IPV6_SRCRT_TYPE_4	4	/* Segment Routing with IPv6 */
 
 /*
  *	routing header
@@ -178,6 +179,7 @@ enum {
 	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
+	DEVCONF_SEG6_ENABLED,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/seg6.h b/include/uapi/linux/seg6.h
new file mode 100644
index 000000000000..c396a8052f73
--- /dev/null
+++ b/include/uapi/linux/seg6.h
@@ -0,0 +1,54 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_H
+#define _UAPI_LINUX_SEG6_H
+
+/*
+ * SRH
+ */
+struct ipv6_sr_hdr {
+	__u8	nexthdr;
+	__u8	hdrlen;
+	__u8	type;
+	__u8	segments_left;
+	__u8	first_segment;
+	__u8	flag_1;
+	__u8	flag_2;
+	__u8	reserved;
+
+	struct in6_addr segments[0];
+};
+
+#define SR6_FLAG1_CLEANUP	(1 << 7)
+#define SR6_FLAG1_PROTECTED	(1 << 6)
+#define SR6_FLAG1_OAM		(1 << 5)
+#define SR6_FLAG1_ALERT		(1 << 4)
+#define SR6_FLAG1_HMAC		(1 << 3)
+
+#define SR6_TLV_INGRESS		1
+#define SR6_TLV_EGRESS		2
+#define SR6_TLV_OPAQUE		3
+#define SR6_TLV_PADDING		4
+#define SR6_TLV_HMAC		5
+
+#define sr_has_cleanup(srh) ((srh)->flag_1 & SR6_FLAG1_CLEANUP)
+#define sr_has_hmac(srh) ((srh)->flag_1 & SR6_FLAG1_HMAC)
+
+struct sr6_tlv {
+	__u8 type;
+	__u8 len;
+	__u8 data[0];
+};
+
+#endif
-- 
cgit v1.2.3


From 915d7e5e5930b4f01d0971d93b9b25ed17d221aa Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:40 +0100
Subject: ipv6: sr: add code base for control plane support of SR-IPv6

This patch adds the necessary hooks and structures to provide support
for SR-IPv6 control plane, essentially the Generic Netlink commands
that will be used for userspace control over the Segment Routing
kernel structures.

The genetlink commands provide control over two different structures:
tunnel source and HMAC data. The tunnel source is the source address
that will be used by default when encapsulating packets into an
outer IPv6 header + SRH. If the tunnel source is set to :: then an
address of the outgoing interface will be selected as the source.

The HMAC commands currently just return ENOTSUPP and will be implemented
in a future patch.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/seg6_genl.h      |  6 ++++++
 include/net/netns/ipv6.h       |  1 +
 include/net/seg6.h             | 16 ++++++++++++++++
 include/uapi/linux/seg6_genl.h | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+)
 create mode 100644 include/linux/seg6_genl.h
 create mode 100644 include/uapi/linux/seg6_genl.h

(limited to 'include')

diff --git a/include/linux/seg6_genl.h b/include/linux/seg6_genl.h
new file mode 100644
index 000000000000..d6c3fb4f3734
--- /dev/null
+++ b/include/linux/seg6_genl.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_GENL_H
+#define _LINUX_SEG6_GENL_H
+
+#include <uapi/linux/seg6_genl.h>
+
+#endif
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 10d0848f5b8a..de7745e2edcc 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -85,6 +85,7 @@ struct netns_ipv6 {
 #endif
 	atomic_t		dev_addr_genid;
 	atomic_t		fib6_sernum;
+	struct seg6_pernet_data *seg6_data;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/include/net/seg6.h b/include/net/seg6.h
index 4dd52a7e95f1..7c7b8ed39661 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -14,6 +14,9 @@
 #ifndef _NET_SEG6_H
 #define _NET_SEG6_H
 
+#include <linux/net.h>
+#include <linux/ipv6.h>
+
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
 {
@@ -33,4 +36,17 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
 	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
 }
 
+struct seg6_pernet_data {
+	struct mutex lock;
+	struct in6_addr __rcu *tun_src;
+};
+
+static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
+{
+	return net->ipv6.seg6_data;
+}
+
+extern int seg6_init(void);
+extern void seg6_exit(void);
+
 #endif
diff --git a/include/uapi/linux/seg6_genl.h b/include/uapi/linux/seg6_genl.h
new file mode 100644
index 000000000000..fcf1c60d7df3
--- /dev/null
+++ b/include/uapi/linux/seg6_genl.h
@@ -0,0 +1,32 @@
+#ifndef _UAPI_LINUX_SEG6_GENL_H
+#define _UAPI_LINUX_SEG6_GENL_H
+
+#define SEG6_GENL_NAME		"SEG6"
+#define SEG6_GENL_VERSION	0x1
+
+enum {
+	SEG6_ATTR_UNSPEC,
+	SEG6_ATTR_DST,
+	SEG6_ATTR_DSTLEN,
+	SEG6_ATTR_HMACKEYID,
+	SEG6_ATTR_SECRET,
+	SEG6_ATTR_SECRETLEN,
+	SEG6_ATTR_ALGID,
+	SEG6_ATTR_HMACINFO,
+	__SEG6_ATTR_MAX,
+};
+
+#define SEG6_ATTR_MAX (__SEG6_ATTR_MAX - 1)
+
+enum {
+	SEG6_CMD_UNSPEC,
+	SEG6_CMD_SETHMAC,
+	SEG6_CMD_DUMPHMAC,
+	SEG6_CMD_SET_TUNSRC,
+	SEG6_CMD_GET_TUNSRC,
+	__SEG6_CMD_MAX,
+};
+
+#define SEG6_CMD_MAX (__SEG6_CMD_MAX - 1)
+
+#endif
-- 
cgit v1.2.3


From 6c8702c60b88651072460f3f4026c7dfe2521d12 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:41 +0100
Subject: ipv6: sr: add support for SRH encapsulation and injection with
 lwtunnels

This patch creates a new type of interfaceless lightweight tunnel (SEG6),
enabling the encapsulation and injection of SRH within locally emitted
packets and forwarded packets.

>From a configuration viewpoint, a seg6 tunnel would be configured as follows:

  ip -6 ro ad fc00::1/128 encap seg6 mode encap segs fc42::1,fc42::2,fc42::3 dev eth0

Any packet whose destination address is fc00::1 would thus be encapsulated
within an outer IPv6 header containing the SRH with three segments, and would
actually be routed to the first segment of the list. If `mode inline' was
specified instead of `mode encap', then the SRH would be directly inserted
after the IPv6 header without outer encapsulation.

The inline mode is only available if CONFIG_IPV6_SEG6_INLINE is enabled. This
feature was made configurable because direct header insertion may break
several mechanisms such as PMTUD or IPSec AH.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/seg6_iptunnel.h      |  6 ++++++
 include/net/seg6.h                 |  6 ++++++
 include/uapi/linux/lwtunnel.h      |  1 +
 include/uapi/linux/seg6_iptunnel.h | 44 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 57 insertions(+)
 create mode 100644 include/linux/seg6_iptunnel.h
 create mode 100644 include/uapi/linux/seg6_iptunnel.h

(limited to 'include')

diff --git a/include/linux/seg6_iptunnel.h b/include/linux/seg6_iptunnel.h
new file mode 100644
index 000000000000..5377cf6a5a02
--- /dev/null
+++ b/include/linux/seg6_iptunnel.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_IPTUNNEL_H
+#define _LINUX_SEG6_IPTUNNEL_H
+
+#include <uapi/linux/seg6_iptunnel.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
index 7c7b8ed39661..ff5da0ce83e9 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -16,6 +16,8 @@
 
 #include <linux/net.h>
 #include <linux/ipv6.h>
+#include <net/lwtunnel.h>
+#include <linux/seg6.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
@@ -48,5 +50,9 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
 
 extern int seg6_init(void);
 extern void seg6_exit(void);
+extern int seg6_iptunnel_init(void);
+extern void seg6_iptunnel_exit(void);
+
+extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
 
 #endif
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index a478fe80e203..453cc6215bfd 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -9,6 +9,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_IP,
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
+	LWTUNNEL_ENCAP_SEG6,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h
new file mode 100644
index 000000000000..0f7dbd280a9c
--- /dev/null
+++ b/include/uapi/linux/seg6_iptunnel.h
@@ -0,0 +1,44 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_SEG6_IPTUNNEL_H
+#define _UAPI_LINUX_SEG6_IPTUNNEL_H
+
+enum {
+	SEG6_IPTUNNEL_UNSPEC,
+	SEG6_IPTUNNEL_SRH,
+	__SEG6_IPTUNNEL_MAX,
+};
+#define SEG6_IPTUNNEL_MAX (__SEG6_IPTUNNEL_MAX - 1)
+
+struct seg6_iptunnel_encap {
+	int mode;
+	struct ipv6_sr_hdr srh[0];
+};
+
+#define SEG6_IPTUN_ENCAP_SIZE(x) ((sizeof(*x)) + (((x)->srh->hdrlen + 1) << 3))
+
+enum {
+	SEG6_IPTUN_MODE_INLINE,
+	SEG6_IPTUN_MODE_ENCAP,
+};
+
+static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
+{
+	int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP);
+
+	return ((tuninfo->srh->hdrlen + 1) << 3) +
+	       (encap * sizeof(struct ipv6hdr));
+}
+
+#endif
-- 
cgit v1.2.3


From bf355b8d2c30a289232042cacc1cfaea4923936c Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:42 +0100
Subject: ipv6: sr: add core files for SR HMAC support

This patch adds the necessary functions to compute and check the HMAC signature
of an SR-enabled packet. Two HMAC algorithms are supported: hmac(sha1) and
hmac(sha256).

In order to avoid dynamic memory allocation for each HMAC computation,
a per-cpu ring buffer is allocated for this purpose.

A new per-interface sysctl called seg6_require_hmac is added, allowing a
user-defined policy for processing HMAC-signed SR-enabled packets.
A value of -1 means that the HMAC field will always be ignored.
A value of 0 means that if an HMAC field is present, its validity will
be enforced (the packet is dropped is the signature is incorrect).
Finally, a value of 1 means that any SR-enabled packet that does not
contain an HMAC signature or whose signature is incorrect will be dropped.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h           |  3 ++
 include/linux/seg6_hmac.h      |  6 ++++
 include/net/seg6.h             |  4 +++
 include/net/seg6_hmac.h        | 62 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/ipv6.h      |  1 +
 include/uapi/linux/seg6_hmac.h | 21 ++++++++++++++
 6 files changed, 97 insertions(+)
 create mode 100644 include/linux/seg6_hmac.h
 create mode 100644 include/net/seg6_hmac.h
 create mode 100644 include/uapi/linux/seg6_hmac.h

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 68d3f71f0abf..93756585521f 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -65,6 +65,9 @@ struct ipv6_devconf {
 	__s32		use_oif_addrs_only;
 	__s32		keep_addr_on_down;
 	__s32		seg6_enabled;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	__s32		seg6_require_hmac;
+#endif
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/linux/seg6_hmac.h b/include/linux/seg6_hmac.h
new file mode 100644
index 000000000000..da437ebdc6cd
--- /dev/null
+++ b/include/linux/seg6_hmac.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_SEG6_HMAC_H
+#define _LINUX_SEG6_HMAC_H
+
+#include <uapi/linux/seg6_hmac.h>
+
+#endif
diff --git a/include/net/seg6.h b/include/net/seg6.h
index ff5da0ce83e9..4e0357517d79 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -18,6 +18,7 @@
 #include <linux/ipv6.h>
 #include <net/lwtunnel.h>
 #include <linux/seg6.h>
+#include <linux/rhashtable.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
@@ -41,6 +42,9 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
 struct seg6_pernet_data {
 	struct mutex lock;
 	struct in6_addr __rcu *tun_src;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	struct rhashtable hmac_infos;
+#endif
 };
 
 static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h
new file mode 100644
index 000000000000..69c3a106056b
--- /dev/null
+++ b/include/net/seg6_hmac.h
@@ -0,0 +1,62 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_HMAC_H
+#define _NET_SEG6_HMAC_H
+
+#include <net/flow.h>
+#include <net/ip6_fib.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <net/seg6.h>
+#include <linux/seg6_hmac.h>
+#include <linux/rhashtable.h>
+
+#define SEG6_HMAC_MAX_DIGESTSIZE	160
+#define SEG6_HMAC_RING_SIZE		256
+
+struct seg6_hmac_info {
+	struct rhash_head node;
+	struct rcu_head rcu;
+
+	u32 hmackeyid;
+	char secret[SEG6_HMAC_SECRET_LEN];
+	u8 slen;
+	u8 alg_id;
+};
+
+struct seg6_hmac_algo {
+	u8 alg_id;
+	char name[64];
+	struct crypto_shash * __percpu *tfms;
+	struct shash_desc * __percpu *shashs;
+};
+
+extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo,
+			     struct ipv6_sr_hdr *hdr, struct in6_addr *saddr,
+			     u8 *output);
+extern struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key);
+extern int seg6_hmac_info_add(struct net *net, u32 key,
+			      struct seg6_hmac_info *hinfo);
+extern int seg6_hmac_info_del(struct net *net, u32 key);
+extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+			  struct ipv6_sr_hdr *srh);
+extern bool seg6_hmac_validate_skb(struct sk_buff *skb);
+extern int seg6_hmac_init(void);
+extern void seg6_hmac_exit(void);
+extern int seg6_hmac_net_init(struct net *net);
+extern void seg6_hmac_net_exit(struct net *net);
+
+#endif
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 7ff1d654e333..53561be1ac21 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -180,6 +180,7 @@ enum {
 	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
 	DEVCONF_SEG6_ENABLED,
+	DEVCONF_SEG6_REQUIRE_HMAC,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/seg6_hmac.h b/include/uapi/linux/seg6_hmac.h
new file mode 100644
index 000000000000..b652dfd51bc5
--- /dev/null
+++ b/include/uapi/linux/seg6_hmac.h
@@ -0,0 +1,21 @@
+#ifndef _UAPI_LINUX_SEG6_HMAC_H
+#define _UAPI_LINUX_SEG6_HMAC_H
+
+#include <linux/seg6.h>
+
+#define SEG6_HMAC_SECRET_LEN	64
+#define SEG6_HMAC_FIELD_LEN	32
+
+struct sr6_tlv_hmac {
+	struct sr6_tlv tlvhdr;
+	__u16 reserved;
+	__be32 hmackeyid;
+	__u8 hmac[SEG6_HMAC_FIELD_LEN];
+};
+
+enum {
+	SEG6_HMAC_ALGO_SHA1 = 1,
+	SEG6_HMAC_ALGO_SHA256 = 2,
+};
+
+#endif
-- 
cgit v1.2.3


From 613fa3ca9e9e6af57927dab238121010c510fe4c Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:59:20 +0100
Subject: ipv6: add source address argument for ipv6_push_nfrag_opts

This patch prepares for insertion of SRH through setsockopt().
The new source address argument is used when an HMAC field is
present in the SRH, which must be filled. The HMAC signature
process requires the source address as input text.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8fed1cd78658..0a3622bf086f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -932,7 +932,8 @@ int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
  */
 
 void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
-			  u8 *proto, struct in6_addr **daddr_p);
+			  u8 *proto, struct in6_addr **daddr_p,
+			  struct in6_addr *saddr);
 void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
 			 u8 *proto);
 
-- 
cgit v1.2.3


From f41cd11d64b2b21012eb4abffbe579bc0b90467f Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Tue, 8 Nov 2016 17:24:03 +0200
Subject: tc_act: Remove tcf_act macro

tc_act macro addressed a non existing field, and was not used in the
kernel source.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 82f3c912a5b1..d8eae87ea778 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -42,7 +42,6 @@ struct tc_action {
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 };
-#define tcf_act		common.tcfa_act
 #define tcf_head	common.tcfa_head
 #define tcf_index	common.tcfa_index
 #define tcf_refcnt	common.tcfa_refcnt
-- 
cgit v1.2.3


From 149d6ad83663b4820ca09c9d40b1eea7f5c22c2b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 8 Nov 2016 11:07:28 -0800
Subject: net: napi_hash_add() is no longer exported

There are no more users except from net/core/dev.c
napi_hash_add() can now be static.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 66fd61c681d9..d64135a0ab71 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -467,17 +467,6 @@ static inline void napi_complete(struct napi_struct *n)
 	return napi_complete_done(n, 0);
 }
 
-/**
- *	napi_hash_add - add a NAPI to global hashtable
- *	@napi: NAPI context
- *
- * Generate a new napi_id and store a @napi under it in napi_hash.
- * Used for busy polling (CONFIG_NET_RX_BUSY_POLL).
- * Note: This is normally automatically done from netif_napi_add(),
- * so might disappear in a future Linux version.
- */
-void napi_hash_add(struct napi_struct *napi);
-
 /**
  *	napi_hash_del - remove a NAPI from global table
  *	@napi: NAPI context
-- 
cgit v1.2.3


From d8d26354191399627bac9cf0da0667b0f5178686 Mon Sep 17 00:00:00 2001
From: Richard Cochran <richardcochran@gmail.com>
Date: Tue, 8 Nov 2016 22:49:16 +0100
Subject: ptp: Introduce a high resolution frequency adjustment method.

The internal PTP Hardware Clock (PHC) interface limits the resolution for
frequency adjustments to one part per billion.  However, some hardware
devices allow finer adjustment, and making use of the increased resolution
improves synchronization measurably on such devices.

This patch adds an alternative method that allows finer frequency tuning
by passing the scaled ppm value to PHC drivers.  This value comes from
user space, and it has a resolution of about 0.015 ppb.  We also deprecate
the older method, anticipating its removal once existing drivers have been
converted over.

Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Suggested-by: Ulrik De Bie <ulrik.debie-os@e2big.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_clock_kernel.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 5ad54fc66cf0..b76d47aba564 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -58,7 +58,14 @@ struct system_device_crosststamp;
  *
  * clock operations
  *
+ * @adjfine:  Adjusts the frequency of the hardware clock.
+ *            parameter scaled_ppm: Desired frequency offset from
+ *            nominal frequency in parts per million, but with a
+ *            16 bit binary fractional field.
+ *
  * @adjfreq:  Adjusts the frequency of the hardware clock.
+ *            This method is deprecated.  New drivers should implement
+ *            the @adjfine method instead.
  *            parameter delta: Desired frequency offset from nominal frequency
  *            in parts per billion
  *
@@ -108,6 +115,7 @@ struct ptp_clock_info {
 	int n_pins;
 	int pps;
 	struct ptp_pin_desc *pin_config;
+	int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm);
 	int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta);
 	int (*adjtime)(struct ptp_clock_info *ptp, s64 delta);
 	int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts);
-- 
cgit v1.2.3


From 2da16a6948ca8f025e2c226ea4fc32baa6b90f27 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 11:17:25 +0100
Subject: netfilter: ipset: Remove extra whitespaces in ip_set.h

Remove unnecessary whitespaces.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 83b9a2e0d8d4..5b1fd090f34b 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -336,14 +336,15 @@ ip_set_update_counter(struct ip_set_counter *counter,
 
 static inline void
 ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
-		      const struct ip_set_ext *ext,
-		      struct ip_set_ext *mext, u32 flags)
+		   const struct ip_set_ext *ext,
+		   struct ip_set_ext *mext, u32 flags)
 {
-		mext->skbmark = skbinfo->skbmark;
-		mext->skbmarkmask = skbinfo->skbmarkmask;
-		mext->skbprio = skbinfo->skbprio;
-		mext->skbqueue = skbinfo->skbqueue;
+	mext->skbmark = skbinfo->skbmark;
+	mext->skbmarkmask = skbinfo->skbmarkmask;
+	mext->skbprio = skbinfo->skbprio;
+	mext->skbqueue = skbinfo->skbqueue;
 }
+
 static inline bool
 ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
 {
-- 
cgit v1.2.3


From da9fbfa76f32a031cb70b11e9fa650e30c85d040 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 11:24:15 +0100
Subject: netfilter: ipset: Mark some helper args as const.

Mark some of the helpers arguments as const.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         | 4 ++--
 include/linux/netfilter/ipset/ip_set_comment.h | 2 +-
 include/linux/netfilter/ipset/ip_set_timeout.h | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 5b1fd090f34b..524467f933bf 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -346,7 +346,7 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 static inline bool
-ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
 {
 	/* Send nonzero parameters only */
 	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
@@ -373,7 +373,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 }
 
 static inline bool
-ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter)
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
 {
 	return nla_put_net64(skb, IPSET_ATTR_BYTES,
 			     cpu_to_be64(ip_set_get_bytes(counter)),
diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index 8d0248525957..bae5c7609be2 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -43,7 +43,7 @@ ip_set_init_comment(struct ip_set_comment *comment,
 
 /* Used only when dumping a set, protected by rcu_read_lock_bh() */
 static inline int
-ip_set_put_comment(struct sk_buff *skb, struct ip_set_comment *comment)
+ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
 {
 	struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c);
 
diff --git a/include/linux/netfilter/ipset/ip_set_timeout.h b/include/linux/netfilter/ipset/ip_set_timeout.h
index 1d6a935c1ac5..bfb3531fd88a 100644
--- a/include/linux/netfilter/ipset/ip_set_timeout.h
+++ b/include/linux/netfilter/ipset/ip_set_timeout.h
@@ -40,7 +40,7 @@ ip_set_timeout_uget(struct nlattr *tb)
 }
 
 static inline bool
-ip_set_timeout_expired(unsigned long *t)
+ip_set_timeout_expired(const unsigned long *t)
 {
 	return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t);
 }
@@ -63,7 +63,7 @@ ip_set_timeout_set(unsigned long *timeout, u32 value)
 }
 
 static inline u32
-ip_set_timeout_get(unsigned long *timeout)
+ip_set_timeout_get(const unsigned long *timeout)
 {
 	return *timeout == IPSET_ELEM_PERMANENT ? 0 :
 		jiffies_to_msecs(*timeout - jiffies)/MSEC_PER_SEC;
-- 
cgit v1.2.3


From 7ffea37957b900422ce8b82e9651f7a0a6fac733 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 11:31:03 +0100
Subject: netfilter: ipset: Headers file cleanup

Group counter helper functions together.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 42 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 524467f933bf..1ea28e30a6dd 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -334,6 +334,27 @@ ip_set_update_counter(struct ip_set_counter *counter,
 	}
 }
 
+static inline bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+	return nla_put_net64(skb, IPSET_ATTR_BYTES,
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
+	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
+}
+
+static inline void
+ip_set_init_counter(struct ip_set_counter *counter,
+		    const struct ip_set_ext *ext)
+{
+	if (ext->bytes != ULLONG_MAX)
+		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
+	if (ext->packets != ULLONG_MAX)
+		atomic64_set(&(counter)->packets, (long long)(ext->packets));
+}
+
 static inline void
 ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
 		   const struct ip_set_ext *ext,
@@ -372,27 +393,6 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 	skbinfo->skbqueue = ext->skbqueue;
 }
 
-static inline bool
-ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
-{
-	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter)),
-			     IPSET_ATTR_PAD) ||
-	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)),
-			     IPSET_ATTR_PAD);
-}
-
-static inline void
-ip_set_init_counter(struct ip_set_counter *counter,
-		    const struct ip_set_ext *ext)
-{
-	if (ext->bytes != ULLONG_MAX)
-		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
-	if (ext->packets != ULLONG_MAX)
-		atomic64_set(&(counter)->packets, (long long)(ext->packets));
-}
-
 /* Netlink CB args */
 enum {
 	IPSET_CB_NET = 0,	/* net namespace */
-- 
cgit v1.2.3


From bec810d973003b30bc477146904af6bd93fd2df8 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Tue, 5 May 2015 17:13:28 +0200
Subject: netfilter: ipset: Improve skbinfo get/init helpers

Use struct ip_set_skbinfo in struct ip_set_ext instead of open
coded fields and assign structure members in get/init helpers
instead of copying members one by one. Explicitly note that
struct ip_set_skbinfo must be padded to prevent non-aligned
access in the extension blob.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 1ea28e30a6dd..780262124632 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -92,17 +92,6 @@ struct ip_set_ext_type {
 
 extern const struct ip_set_ext_type ip_set_extensions[];
 
-struct ip_set_ext {
-	u64 packets;
-	u64 bytes;
-	u32 timeout;
-	u32 skbmark;
-	u32 skbmarkmask;
-	u32 skbprio;
-	u16 skbqueue;
-	char *comment;
-};
-
 struct ip_set_counter {
 	atomic64_t bytes;
 	atomic64_t packets;
@@ -122,6 +111,15 @@ struct ip_set_skbinfo {
 	u32 skbmarkmask;
 	u32 skbprio;
 	u16 skbqueue;
+	u16 __pad;
+};
+
+struct ip_set_ext {
+	struct ip_set_skbinfo skbinfo;
+	u64 packets;
+	u64 bytes;
+	char *comment;
+	u32 timeout;
 };
 
 struct ip_set;
@@ -360,10 +358,7 @@ ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
 		   const struct ip_set_ext *ext,
 		   struct ip_set_ext *mext, u32 flags)
 {
-	mext->skbmark = skbinfo->skbmark;
-	mext->skbmarkmask = skbinfo->skbmarkmask;
-	mext->skbprio = skbinfo->skbprio;
-	mext->skbqueue = skbinfo->skbqueue;
+	mext->skbinfo = *skbinfo;
 }
 
 static inline bool
@@ -387,10 +382,7 @@ static inline void
 ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
 		    const struct ip_set_ext *ext)
 {
-	skbinfo->skbmark = ext->skbmark;
-	skbinfo->skbmarkmask = ext->skbmarkmask;
-	skbinfo->skbprio = ext->skbprio;
-	skbinfo->skbqueue = ext->skbqueue;
+	*skbinfo = ext->skbinfo;
 }
 
 /* Netlink CB args */
-- 
cgit v1.2.3


From 1d0d6bd61d495d271b9774a15fbea93e4875474b Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Wed, 6 May 2015 07:27:28 +0200
Subject: netfilter: ipset: Use kmalloc() in comment extension helper

Allocate memory with kmalloc() rather than kzalloc(): the string
is immediately initialized so it is unnecessary to zero out
the allocated memory area.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set_comment.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index bae5c7609be2..5444b1bbe656 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -34,7 +34,7 @@ ip_set_init_comment(struct ip_set_comment *comment,
 		return;
 	if (unlikely(len > IPSET_MAX_COMMENT_SIZE))
 		len = IPSET_MAX_COMMENT_SIZE;
-	c = kzalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
+	c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC);
 	if (unlikely(!c))
 		return;
 	strlcpy(c->str, ext->comment, len + 1);
-- 
cgit v1.2.3


From 57982edc2739b4473868e7579c0185270468bae1 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Oct 2016 21:34:56 +0200
Subject: netfilter: ipset: Split extensions into separate files

Cleanup to separate all extensions into individual files.

Ported from a patch proposed by Sergey Popovich <popovich_sergei@mail.ua>.

Suggested-by: Sergey Popovich <popovich_sergei@mail.ua>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         | 95 +-------------------------
 include/linux/netfilter/ipset/ip_set_counter.h | 75 ++++++++++++++++++++
 include/linux/netfilter/ipset/ip_set_skbinfo.h | 46 +++++++++++++
 3 files changed, 123 insertions(+), 93 deletions(-)
 create mode 100644 include/linux/netfilter/ipset/ip_set_counter.h
 create mode 100644 include/linux/netfilter/ipset/ip_set_skbinfo.h

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 780262124632..b5bd0fb3d07b 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -292,99 +292,6 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set)
 	return nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(cadt_flags));
 }
 
-static inline void
-ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
-{
-	atomic64_add((long long)bytes, &(counter)->bytes);
-}
-
-static inline void
-ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
-{
-	atomic64_add((long long)packets, &(counter)->packets);
-}
-
-static inline u64
-ip_set_get_bytes(const struct ip_set_counter *counter)
-{
-	return (u64)atomic64_read(&(counter)->bytes);
-}
-
-static inline u64
-ip_set_get_packets(const struct ip_set_counter *counter)
-{
-	return (u64)atomic64_read(&(counter)->packets);
-}
-
-static inline void
-ip_set_update_counter(struct ip_set_counter *counter,
-		      const struct ip_set_ext *ext,
-		      struct ip_set_ext *mext, u32 flags)
-{
-	if (ext->packets != ULLONG_MAX &&
-	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
-		ip_set_add_bytes(ext->bytes, counter);
-		ip_set_add_packets(ext->packets, counter);
-	}
-	if (flags & IPSET_FLAG_MATCH_COUNTERS) {
-		mext->packets = ip_set_get_packets(counter);
-		mext->bytes = ip_set_get_bytes(counter);
-	}
-}
-
-static inline bool
-ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
-{
-	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter)),
-			     IPSET_ATTR_PAD) ||
-	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)),
-			     IPSET_ATTR_PAD);
-}
-
-static inline void
-ip_set_init_counter(struct ip_set_counter *counter,
-		    const struct ip_set_ext *ext)
-{
-	if (ext->bytes != ULLONG_MAX)
-		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
-	if (ext->packets != ULLONG_MAX)
-		atomic64_set(&(counter)->packets, (long long)(ext->packets));
-}
-
-static inline void
-ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
-		   const struct ip_set_ext *ext,
-		   struct ip_set_ext *mext, u32 flags)
-{
-	mext->skbinfo = *skbinfo;
-}
-
-static inline bool
-ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
-{
-	/* Send nonzero parameters only */
-	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
-		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
-			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
-					  skbinfo->skbmarkmask),
-			      IPSET_ATTR_PAD)) ||
-	       (skbinfo->skbprio &&
-		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
-			      cpu_to_be32(skbinfo->skbprio))) ||
-	       (skbinfo->skbqueue &&
-		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
-			     cpu_to_be16(skbinfo->skbqueue)));
-}
-
-static inline void
-ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
-		    const struct ip_set_ext *ext)
-{
-	*skbinfo = ext->skbinfo;
-}
-
 /* Netlink CB args */
 enum {
 	IPSET_CB_NET = 0,	/* net namespace */
@@ -539,6 +446,8 @@ bitmap_bytes(u32 a, u32 b)
 
 #include <linux/netfilter/ipset/ip_set_timeout.h>
 #include <linux/netfilter/ipset/ip_set_comment.h>
+#include <linux/netfilter/ipset/ip_set_counter.h>
+#include <linux/netfilter/ipset/ip_set_skbinfo.h>
 
 int
 ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
diff --git a/include/linux/netfilter/ipset/ip_set_counter.h b/include/linux/netfilter/ipset/ip_set_counter.h
new file mode 100644
index 000000000000..bb6fba480118
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_counter.h
@@ -0,0 +1,75 @@
+#ifndef _IP_SET_COUNTER_H
+#define _IP_SET_COUNTER_H
+
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef __KERNEL__
+
+static inline void
+ip_set_add_bytes(u64 bytes, struct ip_set_counter *counter)
+{
+	atomic64_add((long long)bytes, &(counter)->bytes);
+}
+
+static inline void
+ip_set_add_packets(u64 packets, struct ip_set_counter *counter)
+{
+	atomic64_add((long long)packets, &(counter)->packets);
+}
+
+static inline u64
+ip_set_get_bytes(const struct ip_set_counter *counter)
+{
+	return (u64)atomic64_read(&(counter)->bytes);
+}
+
+static inline u64
+ip_set_get_packets(const struct ip_set_counter *counter)
+{
+	return (u64)atomic64_read(&(counter)->packets);
+}
+
+static inline void
+ip_set_update_counter(struct ip_set_counter *counter,
+		      const struct ip_set_ext *ext,
+		      struct ip_set_ext *mext, u32 flags)
+{
+	if (ext->packets != ULLONG_MAX &&
+	    !(flags & IPSET_FLAG_SKIP_COUNTER_UPDATE)) {
+		ip_set_add_bytes(ext->bytes, counter);
+		ip_set_add_packets(ext->packets, counter);
+	}
+	if (flags & IPSET_FLAG_MATCH_COUNTERS) {
+		mext->packets = ip_set_get_packets(counter);
+		mext->bytes = ip_set_get_bytes(counter);
+	}
+}
+
+static inline bool
+ip_set_put_counter(struct sk_buff *skb, const struct ip_set_counter *counter)
+{
+	return nla_put_net64(skb, IPSET_ATTR_BYTES,
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
+	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
+}
+
+static inline void
+ip_set_init_counter(struct ip_set_counter *counter,
+		    const struct ip_set_ext *ext)
+{
+	if (ext->bytes != ULLONG_MAX)
+		atomic64_set(&(counter)->bytes, (long long)(ext->bytes));
+	if (ext->packets != ULLONG_MAX)
+		atomic64_set(&(counter)->packets, (long long)(ext->packets));
+}
+
+#endif /* __KERNEL__ */
+#endif /* _IP_SET_COUNTER_H */
diff --git a/include/linux/netfilter/ipset/ip_set_skbinfo.h b/include/linux/netfilter/ipset/ip_set_skbinfo.h
new file mode 100644
index 000000000000..29d7ef2bc3fa
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_skbinfo.h
@@ -0,0 +1,46 @@
+#ifndef _IP_SET_SKBINFO_H
+#define _IP_SET_SKBINFO_H
+
+/* Copyright (C) 2015 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifdef __KERNEL__
+
+static inline void
+ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo,
+		   const struct ip_set_ext *ext,
+		   struct ip_set_ext *mext, u32 flags)
+{
+	mext->skbinfo = *skbinfo;
+}
+
+static inline bool
+ip_set_put_skbinfo(struct sk_buff *skb, const struct ip_set_skbinfo *skbinfo)
+{
+	/* Send nonzero parameters only */
+	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
+		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
+			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
+					  skbinfo->skbmarkmask),
+			      IPSET_ATTR_PAD)) ||
+	       (skbinfo->skbprio &&
+		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
+			      cpu_to_be32(skbinfo->skbprio))) ||
+	       (skbinfo->skbqueue &&
+		nla_put_net16(skb, IPSET_ATTR_SKBQUEUE,
+			      cpu_to_be16(skbinfo->skbqueue)));
+}
+
+static inline void
+ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo,
+		    const struct ip_set_ext *ext)
+{
+	*skbinfo = ext->skbinfo;
+}
+
+#endif /* __KERNEL__ */
+#endif /* _IP_SET_SKBINFO_H */
-- 
cgit v1.2.3


From 837a90eab67edfa464dcc0ddef193449d23da408 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Oct 2016 21:52:51 +0200
Subject: netfilter: ipset: Regroup ip_set_put_extensions and add extern

Cleanup: group ip_set_put_extensions and ip_set_get_extensions
together and add missing extern.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index b5bd0fb3d07b..7a218eb74887 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -331,6 +331,8 @@ extern size_t ip_set_elem_len(struct ip_set *set, struct nlattr *tb[],
 			      size_t len, size_t align);
 extern int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
 				 struct ip_set_ext *ext);
+extern int ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
+				 const void *e, bool active);
 
 static inline int
 ip_set_get_hostipaddr4(struct nlattr *nla, u32 *ipaddr)
@@ -449,10 +451,6 @@ bitmap_bytes(u32 a, u32 b)
 #include <linux/netfilter/ipset/ip_set_counter.h>
 #include <linux/netfilter/ipset/ip_set_skbinfo.h>
 
-int
-ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set,
-		      const void *e, bool active);
-
 #define IP_SET_INIT_KEXT(skb, opt, set)			\
 	{ .bytes = (skb)->len, .packets = 1,		\
 	  .timeout = ip_set_adt_opt_timeout(opt, set) }
-- 
cgit v1.2.3


From 702b71e7c666a1c9be9d49e8cd173f0d4d1e859f Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Mon, 10 Oct 2016 22:07:41 +0200
Subject: netfilter: ipset: Add element count to all set types header

It is better to list the set elements for all set types, thus the
header information is uniform. Element counts are therefore added
to the bitmap and list types.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h        | 2 ++
 include/linux/netfilter/ipset/ip_set_bitmap.h | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 7a218eb74887..4671d740610f 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -250,6 +250,8 @@ struct ip_set {
 	u8 flags;
 	/* Default timeout value, if enabled */
 	u32 timeout;
+	/* Number of elements (vs timeout) */
+	u32 elements;
 	/* Element data size */
 	size_t dsize;
 	/* Offsets to extensions in elements */
diff --git a/include/linux/netfilter/ipset/ip_set_bitmap.h b/include/linux/netfilter/ipset/ip_set_bitmap.h
index 5e4662a71e01..366d6c0ea04f 100644
--- a/include/linux/netfilter/ipset/ip_set_bitmap.h
+++ b/include/linux/netfilter/ipset/ip_set_bitmap.h
@@ -6,8 +6,8 @@
 #define IPSET_BITMAP_MAX_RANGE	0x0000FFFF
 
 enum {
+	IPSET_ADD_STORE_PLAIN_TIMEOUT = -1,
 	IPSET_ADD_FAILED = 1,
-	IPSET_ADD_STORE_PLAIN_TIMEOUT,
 	IPSET_ADD_START_STORED_TIMEOUT,
 };
 
-- 
cgit v1.2.3


From 9e41f26a505cca04b7122e65053cf6447007ea79 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 10 Nov 2016 12:05:34 +0100
Subject: netfilter: ipset: Count non-static extension memory for userspace

Non-static (i.e. comment) extension was not counted into the memory
size. A new internal counter is introduced for this. In the case of
the hash types the sizes of the arrays are counted there as well so
that we can avoid to scan the whole set when just the header data
is requested.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
---
 include/linux/netfilter/ipset/ip_set.h         | 8 ++++++--
 include/linux/netfilter/ipset/ip_set_comment.h | 7 +++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 4671d740610f..8e42253e5d4d 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -79,10 +79,12 @@ enum ip_set_ext_id {
 	IPSET_EXT_ID_MAX,
 };
 
+struct ip_set;
+
 /* Extension type */
 struct ip_set_ext_type {
 	/* Destroy extension private data (can be NULL) */
-	void (*destroy)(void *ext);
+	void (*destroy)(struct ip_set *set, void *ext);
 	enum ip_set_extension type;
 	enum ipset_cadt_flags flag;
 	/* Size and minimal alignment */
@@ -252,6 +254,8 @@ struct ip_set {
 	u32 timeout;
 	/* Number of elements (vs timeout) */
 	u32 elements;
+	/* Size of the dynamic extensions (vs timeout) */
+	size_t ext_size;
 	/* Element data size */
 	size_t dsize;
 	/* Offsets to extensions in elements */
@@ -268,7 +272,7 @@ ip_set_ext_destroy(struct ip_set *set, void *data)
 	 */
 	if (SET_WITH_COMMENT(set))
 		ip_set_extensions[IPSET_EXT_ID_COMMENT].destroy(
-			ext_comment(data, set));
+			set, ext_comment(data, set));
 }
 
 static inline int
diff --git a/include/linux/netfilter/ipset/ip_set_comment.h b/include/linux/netfilter/ipset/ip_set_comment.h
index 5444b1bbe656..8e2bab1e8e90 100644
--- a/include/linux/netfilter/ipset/ip_set_comment.h
+++ b/include/linux/netfilter/ipset/ip_set_comment.h
@@ -20,13 +20,14 @@ ip_set_comment_uget(struct nlattr *tb)
  * The kadt functions don't use the comment extensions in any way.
  */
 static inline void
-ip_set_init_comment(struct ip_set_comment *comment,
+ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment,
 		    const struct ip_set_ext *ext)
 {
 	struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1);
 	size_t len = ext->comment ? strlen(ext->comment) : 0;
 
 	if (unlikely(c)) {
+		set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
 		kfree_rcu(c, rcu);
 		rcu_assign_pointer(comment->c, NULL);
 	}
@@ -38,6 +39,7 @@ ip_set_init_comment(struct ip_set_comment *comment,
 	if (unlikely(!c))
 		return;
 	strlcpy(c->str, ext->comment, len + 1);
+	set->ext_size += sizeof(*c) + strlen(c->str) + 1;
 	rcu_assign_pointer(comment->c, c);
 }
 
@@ -58,13 +60,14 @@ ip_set_put_comment(struct sk_buff *skb, const struct ip_set_comment *comment)
  * of the set data anymore.
  */
 static inline void
-ip_set_comment_free(struct ip_set_comment *comment)
+ip_set_comment_free(struct ip_set *set, struct ip_set_comment *comment)
 {
 	struct ip_set_comment_rcu *c;
 
 	c = rcu_dereference_protected(comment->c, 1);
 	if (unlikely(!c))
 		return;
+	set->ext_size -= sizeof(*c) + strlen(c->str) + 1;
 	kfree_rcu(c, rcu);
 	rcu_assign_pointer(comment->c, NULL);
 }
-- 
cgit v1.2.3


From c540594f864bb4645573c2c0a304919fabb3d7ea Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Nov 2016 22:02:34 +0100
Subject: bpf, mlx4: fix prog refcount in mlx4_en_try_alloc_resources error
 path

Commit 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings
scheme") added a bug in that the prog's reference count is not dropped
in the error path when mlx4_en_try_alloc_resources() is failing from
mlx4_xdp_set().

We previously took bpf_prog_add(prog, priv->rx_ring_num - 1), that we
need to release again. Earlier in the call path, dev_change_xdp_fd()
itself holds a reference to the prog as well (hence the '- 1' in the
bpf_prog_add()), so a simple atomic_sub() is safe to use here. When
an error is propagated, then bpf_prog_put() is called eventually from
dev_change_xdp_fd()

Fixes: 67f8b1dcb9ee ("net/mlx4_en: Refactor the XDP forwarding rings scheme")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index edcd96ded8aa..01c1487277b2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -234,6 +234,7 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
+void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
@@ -303,6 +304,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
+{
+}
+
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
-- 
cgit v1.2.3


From 91820da6ae85904d95ed53bf3a83f9ec44a6b80a Mon Sep 17 00:00:00 2001
From: Jiri Benc <jbenc@redhat.com>
Date: Thu, 10 Nov 2016 16:28:23 +0100
Subject: openvswitch: add Ethernet push and pop actions

It's not allowed to push Ethernet header in front of another Ethernet
header.

It's not allowed to pop Ethernet header if there's a vlan tag. This
preserves the invariant that L3 packet never has a vlan tag.

Based on previous versions by Lorand Jakab and Simon Horman.

Signed-off-by: Lorand Jakab <lojakab@cisco.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/openvswitch.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 59ed3992c760..375d812fea36 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -705,6 +705,15 @@ enum ovs_nat_attr {
 
 #define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
 
+/*
+ * struct ovs_action_push_eth - %OVS_ACTION_ATTR_PUSH_ETH action argument.
+ * @addresses: Source and destination MAC addresses.
+ * @eth_type: Ethernet type
+ */
+struct ovs_action_push_eth {
+	struct ovs_key_ethernet addresses;
+};
+
 /**
  * enum ovs_action_attr - Action types.
  *
@@ -738,6 +747,10 @@ enum ovs_nat_attr {
  * is no MPLS label stack, as determined by ethertype, no action is taken.
  * @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related
  * entries in the flow key.
+ * @OVS_ACTION_ATTR_PUSH_ETH: Push a new outermost Ethernet header onto the
+ * packet.
+ * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
+ * packet.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -765,6 +778,8 @@ enum ovs_action_attr {
 				       * bits. */
 	OVS_ACTION_ATTR_CT,           /* Nested OVS_CT_ATTR_* . */
 	OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
+	OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
+	OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
-- 
cgit v1.2.3


From 372788f964c95a6fa0f677c43d6153c27896ef42 Mon Sep 17 00:00:00 2001
From: "Lendacky, Thomas" <Thomas.Lendacky@amd.com>
Date: Thu, 10 Nov 2016 17:10:46 -0600
Subject: net: phy: expose phy_aneg_done API for use by drivers

Make phy_aneg_done() available to drivers so that the result of the
auto-negotiation initiated by phy_start_aneg() can be determined.

Remove the local implementation of phy_aneg_done() from the Aeroflex
driver and use the phy library version.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index e7e1fd382564..9880d73a2c3d 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -786,6 +786,7 @@ void phy_detach(struct phy_device *phydev);
 void phy_start(struct phy_device *phydev);
 void phy_stop(struct phy_device *phydev);
 int phy_start_aneg(struct phy_device *phydev);
+int phy_aneg_done(struct phy_device *phydev);
 
 int phy_stop_interrupts(struct phy_device *phydev);
 
-- 
cgit v1.2.3


From 98e4321b97cc790c6aac3fb7c92bbf13212452ee Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 13 Nov 2016 12:14:59 -0500
Subject: genetlink: Make family a signed integer.

The idr_alloc(), idr_remove(), et al. routines all expect IDs to be
signed integers.  Therefore make the genl_family member 'id' signed
too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 3ec87bacc0f5..a34275be3600 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -48,7 +48,7 @@ struct genl_info;
  * @n_ops: number of operations supported by this family
  */
 struct genl_family {
-	unsigned int		id;		/* private */
+	int			id;		/* private */
 	unsigned int		hdrsize;
 	char			name[GENL_NAMSIZ];
 	unsigned int		version;
-- 
cgit v1.2.3


From 7e416ad7416307e22871a0ef0f0f14e2bb66a0d1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 10 Nov 2016 14:17:01 +0100
Subject: netfilter: conntrack: remove unused netns_ct member

since 23014011ba420 ('netfilter: conntrack: support a fixed size of 128 distinct labels')
this isn't needed anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index e469e85de3f9..3d06d94d2e52 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -91,7 +91,6 @@ struct netns_ct {
 	struct nf_ip_net	nf_ct_proto;
 #if defined(CONFIG_NF_CONNTRACK_LABELS)
 	unsigned int		labels_used;
-	u8			label_words;
 #endif
 };
 #endif
-- 
cgit v1.2.3


From d9dc8b0f8b4ec8cdc48ad5a20a3105387138be82 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Fri, 11 Nov 2016 10:20:50 -0800
Subject: net: fix sleeping for sk_wait_event()

Similar to commit 14135f30e33c ("inet: fix sleeping inside inet_wait_for_connect()"),
sk_wait_event() needs to fix too, because release_sock() is blocking,
it changes the process state back to running after sleep, which breaks
the previous prepare_to_wait().

Switch to the new wait API.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index cf617ee16723..9d905ed0cd25 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -915,14 +915,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
 #endif
 }
 
-#define sk_wait_event(__sk, __timeo, __condition)			\
+#define sk_wait_event(__sk, __timeo, __condition, __wait)		\
 	({	int __rc;						\
 		release_sock(__sk);					\
 		__rc = __condition;					\
 		if (!__rc) {						\
-			*(__timeo) = schedule_timeout(*(__timeo));	\
+			*(__timeo) = wait_woken(__wait,			\
+						TASK_INTERRUPTIBLE,	\
+						*(__timeo));		\
 		}							\
-		sched_annotate_sleep();						\
+		sched_annotate_sleep();					\
 		lock_sock(__sk);					\
 		__rc = __condition;					\
 		__rc;							\
-- 
cgit v1.2.3


From 29ba732acbeece1e34c68483d1ec1f3720fa1bb3 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 11 Nov 2016 10:55:09 -0800
Subject: bpf: Add BPF_MAP_TYPE_LRU_HASH

Provide a LRU version of the existing BPF_MAP_TYPE_HASH.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e2f38e0091b6..ed8c6799fb14 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -85,6 +85,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERCPU_ARRAY,
 	BPF_MAP_TYPE_STACK_TRACE,
 	BPF_MAP_TYPE_CGROUP_ARRAY,
+	BPF_MAP_TYPE_LRU_HASH,
 };
 
 enum bpf_prog_type {
@@ -106,6 +107,13 @@ enum bpf_prog_type {
 #define BPF_EXIST	2 /* update existing element */
 
 #define BPF_F_NO_PREALLOC	(1U << 0)
+/* Instead of having one common LRU list in the
+ * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list
+ * which can scale and perform better.
+ * Note, the LRU nodes (including free nodes) cannot be moved
+ * across different LRU lists.
+ */
+#define BPF_F_NO_COMMON_LRU	(1U << 1)
 
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
-- 
cgit v1.2.3


From 8f8449384ec364ba2a654f11f94e754e4ff719e0 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 11 Nov 2016 10:55:10 -0800
Subject: bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH

Provide a LRU version of the existing BPF_MAP_TYPE_PERCPU_HASH

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ed8c6799fb14..7d9b2832c280 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -86,6 +86,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_STACK_TRACE,
 	BPF_MAP_TYPE_CGROUP_ARRAY,
 	BPF_MAP_TYPE_LRU_HASH,
+	BPF_MAP_TYPE_LRU_PERCPU_HASH,
 };
 
 enum bpf_prog_type {
@@ -108,7 +109,7 @@ enum bpf_prog_type {
 
 #define BPF_F_NO_PREALLOC	(1U << 0)
 /* Instead of having one common LRU list in the
- * BPF_MAP_TYPE_LRU_HASH map, use a percpu LRU list
+ * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
  * which can scale and perform better.
  * Note, the LRU nodes (including free nodes) cannot be moved
  * across different LRU lists.
-- 
cgit v1.2.3


From c915fe13cbaae5c7aa7b44f367d05addd60c9008 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 Nov 2016 16:37:53 +0100
Subject: udplite: fix NULL pointer dereference

The commit 850cbaddb52d ("udp: use it's own memory accounting schema")
assumes that the socket proto has memory accounting enabled,
but this is not the case for UDPLITE.
Fix it enabling memory accounting for UDPLITE and performing
fwd allocated memory reclaiming on socket shutdown.
UDP and UDPLITE share now the same memory accounting limits.
Also drop the backlog receive operation, since is no more needed.

Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema")
Reported-by: Andrei Vagin <avagin@gmail.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h     | 1 +
 include/net/udplite.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/udp.h b/include/net/udp.h
index e6e4e19be387..1661791e8ca1 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -246,6 +246,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 }
 
 /* net/ipv4/udp.c */
+void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/udplite.h b/include/net/udplite.h
index 80761938b9a7..36097d388219 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -27,6 +27,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 static inline int udplite_sk_init(struct sock *sk)
 {
 	udp_sk(sk)->pcflag = UDPLITE_BIT;
+	sk->sk_destruct = udp_destruct_sock;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4a4f86cc7d6bc74522f581341a2cae3119d5a0f5 Mon Sep 17 00:00:00 2001
From: pravin shelar <pshelar@ovn.org>
Date: Sun, 13 Nov 2016 20:43:52 -0800
Subject: vxlan: avoid vlan processing in vxlan device.

VxLan device does not have special handling for vlan taging on egress.
Therefore it does not make sense to expose vlan offloading feature.
This patch does not change vxlan functinality.

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index 3319d97d789d..8d5fcd6284ce 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -399,22 +399,6 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
 		skb->vlan_tci = 0;
 	return skb;
 }
-/*
- * vlan_hwaccel_push_inside - pushes vlan tag to the payload
- * @skb: skbuff to tag
- *
- * Checks is tag is present in @skb->vlan_tci and if it is, it pushes the
- * VLAN tag from @skb->vlan_tci inside to the payload.
- *
- * Following the skb_unshare() example, in case of error, the calling function
- * doesn't have to worry about freeing the original skb.
- */
-static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb)
-{
-	if (skb_vlan_tag_present(skb))
-		skb = __vlan_hwaccel_push_inside(skb);
-	return skb;
-}
 
 /**
  * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
-- 
cgit v1.2.3


From 9efdb92d68c726e70066e5b4189c1186c9b6f90c Mon Sep 17 00:00:00 2001
From: pravin shelar <pshelar@ovn.org>
Date: Sun, 13 Nov 2016 20:43:58 -0800
Subject: vxlan: remove unsed vxlan_dev_dst_port()

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 308adc4154f4..49a59202f85e 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -281,16 +281,6 @@ struct vxlan_dev {
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
 
-static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan,
-					unsigned short family)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	if (family == AF_INET6)
-		return inet_sk(vxlan->vn6_sock->sock->sk)->inet_sport;
-#endif
-	return inet_sk(vxlan->vn4_sock->sock->sk)->inet_sport;
-}
-
 static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
 						     netdev_features_t features)
 {
-- 
cgit v1.2.3


From e86a8987e458a1826f509c41494b0b29a61144a7 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 Nov 2016 10:06:30 -0800
Subject: net: phy: Add phy_ethtool_nway_reset

This function just calls into genphy_restart_aneg() to perform an
autonegotation restart.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9880d73a2c3d..b9bd3b4f4ea1 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -860,6 +860,7 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev,
 				   struct ethtool_link_ksettings *cmd);
 int phy_ethtool_set_link_ksettings(struct net_device *ndev,
 				   const struct ethtool_link_ksettings *cmd);
+int phy_ethtool_nway_reset(struct net_device *ndev);
 
 int __init mdio_bus_init(void);
 void mdio_bus_exit(void);
-- 
cgit v1.2.3


From ff86aae3b4112b85d2231c23bccbc49589df1c06 Mon Sep 17 00:00:00 2001
From: Madalin Bucur <madalin.bucur@nxp.com>
Date: Tue, 15 Nov 2016 10:41:01 +0200
Subject: devres: add devm_alloc_percpu()

Introduce managed counterparts for alloc_percpu() and free_percpu().
Add devm_alloc_percpu() and devm_free_percpu() into the managed
interfaces list.

Signed-off-by: Madalin Bucur <madalin.bucur@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/device.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index bc41e87a969b..a00105cf795e 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -698,6 +698,25 @@ static inline int devm_add_action_or_reset(struct device *dev,
 	return ret;
 }
 
+/**
+ * devm_alloc_percpu - Resource-managed alloc_percpu
+ * @dev: Device to allocate per-cpu memory for
+ * @type: Type to allocate per-cpu memory for
+ *
+ * Managed alloc_percpu. Per-cpu memory allocated with this function is
+ * automatically freed on driver detach.
+ *
+ * RETURNS:
+ * Pointer to allocated memory on success, NULL on failure.
+ */
+#define devm_alloc_percpu(dev, type)      \
+	((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
+						      __alignof__(type)))
+
+void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
+				   size_t align);
+void devm_free_percpu(struct device *dev, void __percpu *pdata);
+
 struct device_dma_parameters {
 	/*
 	 * a low level driver may set these to teach IOMMU code about
-- 
cgit v1.2.3


From 217f6974368188fd8bd7804bf5a036aa5762c5e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:11 -0800
Subject: net: busy-poll: allow preemption in sk_busy_loop()

After commit 4cd13c21b207 ("softirq: Let ksoftirqd do its job"),
sk_busy_loop() needs a bit of care :
softirqs might be delayed since we do not allow preemption yet.

This patch adds preemptiom points in sk_busy_loop(),
and makes sure no unnecessary cache line dirtying
or atomic operations are done while looping.

A new flag is added into napi->state : NAPI_STATE_IN_BUSY_POLL

This prevents napi_complete_done() from clearing NAPIF_STATE_SCHED,
so that sk_busy_loop() does not have to grab it again.

Similarly, netpoll_poll_lock() is done one time.

This gives about 10 to 20 % improvement in various busy polling
tests, especially when many threads are busy polling in
configurations with large number of NIC queues.

This should allow experimenting with bigger delays without
hurting overall latencies.

Tested:
 On a 40Gb mlx4 NIC, 32 RX/TX queues.

 echo 70 >/proc/sys/net/core/busy_read
 for i in `seq 1 40`; do echo -n $i: ; ./super_netperf $i -H lpaa24 -t UDP_RR -- -N -n; done

    Before:      After:
 1:   90072   92819
 2:  157289  184007
 3:  235772  213504
 4:  344074  357513
 5:  394755  458267
 6:  461151  487819
 7:  549116  625963
 8:  544423  716219
 9:  720460  738446
10:  794686  837612
11:  915998  923960
12:  937507  925107
13: 1019677  971506
14: 1046831 1113650
15: 1114154 1148902
16: 1105221 1179263
17: 1266552 1299585
18: 1258454 1383817
19: 1341453 1312194
20: 1363557 1488487
21: 1387979 1501004
22: 1417552 1601683
23: 1550049 1642002
24: 1568876 1601915
25: 1560239 1683607
26: 1640207 1745211
27: 1706540 1723574
28: 1638518 1722036
29: 1734309 1757447
30: 1782007 1855436
31: 1724806 1888539
32: 1717716 1944297
33: 1778716 1869118
34: 1805738 1983466
35: 1815694 2020758
36: 1893059 2035632
37: 1843406 2034653
38: 1888830 2086580
39: 1972827 2143567
40: 1877729 2181851

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 86bacf6a64f0..e71de66e3792 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -334,6 +334,16 @@ enum {
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
 	NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
+	NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+};
+
+enum {
+	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
+	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
-- 
cgit v1.2.3


From 21cb84c48ca0619181106f0f44f3802a989de024 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:12 -0800
Subject: net: busy-poll: remove need_resched() from sk_can_busy_loop()

Now sk_busy_loop() can schedule by itself, we can remove
need_resched() check from sk_can_busy_loop()

Also add a const to its struct sock parameter.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 2fbeb1313c0f..965e52b9b5a3 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void)
 	return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll);
 }
 
-static inline bool sk_can_busy_loop(struct sock *sk)
+static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return sk->sk_ll_usec && sk->sk_napi_id &&
-	       !need_resched() && !signal_pending(current);
+	return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current);
 }
 
 
-- 
cgit v1.2.3


From 364b6055738b4c752c30ccaaf25c624e69d76195 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:13 -0800
Subject: net: busy-poll: return busypolling status to drivers

NAPI drivers use napi_complete_done() or napi_complete() when
they drained RX ring and right before re-enabling device interrupts.

In busy polling, we can avoid interrupts being delivered since
we are polling RX ring in a controlled loop.

Drivers can chose to use napi_complete_done() return value
to reduce interrupts overhead while busy polling is active.

This is optional, legacy drivers should work fine even
if not updated.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e71de66e3792..bcddf951ccee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -463,16 +463,17 @@ static inline bool napi_reschedule(struct napi_struct *napi)
 	return false;
 }
 
-void __napi_complete(struct napi_struct *n);
-void napi_complete_done(struct napi_struct *n, int work_done);
+bool __napi_complete(struct napi_struct *n);
+bool napi_complete_done(struct napi_struct *n, int work_done);
 /**
  *	napi_complete - NAPI processing complete
  *	@n: NAPI context
  *
  * Mark NAPI processing as complete.
  * Consider using napi_complete_done() instead.
+ * Return false if device should avoid rearming interrupts.
  */
-static inline void napi_complete(struct napi_struct *n)
+static inline bool napi_complete(struct napi_struct *n)
 {
 	return napi_complete_done(n, 0);
 }
-- 
cgit v1.2.3


From a23a8f5bc17e6209eefadcd1cb3d3e28a2cdf530 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Wed, 16 Nov 2016 10:05:46 +0100
Subject: lwtunnel: subtract tunnel headroom from mtu on output redirect

This patch changes the lwtunnel_headroom() function which is called
in ipv4_mtu() and ip6_mtu(), to also return the correct headroom
value when the lwtunnel state is OUTPUT_REDIRECT.

This patch enables e.g. SR-IPv6 encapsulations to work without
manually setting the route mtu.

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 82e76fe1c1f7..d4c1c75b8862 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -94,7 +94,8 @@ static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
 static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
 					     unsigned int mtu)
 {
-	if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu)
+	if ((lwtunnel_xmit_redirect(lwtstate) ||
+	     lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu)
 		return lwtstate->headroom;
 
 	return 0;
-- 
cgit v1.2.3


From 89c4b442b78bdba388337cc746fe63caba85f46c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Nov 2016 14:54:50 -0800
Subject: netpoll: more efficient locking

Callers of netpoll_poll_lock() own NAPI_STATE_SCHED

Callers of netpoll_poll_unlock() have BH blocked between
the NAPI_STATE_SCHED being cleared and poll_lock is released.

We can avoid the spinlock which has no contention, and use cmpxchg()
on poll_owner which we need to set anyway.

This removes a possible lockdep violation after the cited commit,
since sk_busy_loop() re-enables BH before calling busy_poll_stop()

Fixes: 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 -
 include/linux/netpoll.h   | 13 +++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bcddf951ccee..e84800edd249 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -316,7 +316,6 @@ struct napi_struct {
 	unsigned int		gro_count;
 	int			(*poll)(struct napi_struct *, int);
 #ifdef CONFIG_NETPOLL
-	spinlock_t		poll_lock;
 	int			poll_owner;
 #endif
 	struct net_device	*dev;
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index b25ee9ffdbe6..1828900c9411 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -78,8 +78,11 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi)
 	struct net_device *dev = napi->dev;
 
 	if (dev && dev->npinfo) {
-		spin_lock(&napi->poll_lock);
-		napi->poll_owner = smp_processor_id();
+		int owner = smp_processor_id();
+
+		while (cmpxchg(&napi->poll_owner, -1, owner) != -1)
+			cpu_relax();
+
 		return napi;
 	}
 	return NULL;
@@ -89,10 +92,8 @@ static inline void netpoll_poll_unlock(void *have)
 {
 	struct napi_struct *napi = have;
 
-	if (napi) {
-		napi->poll_owner = -1;
-		spin_unlock(&napi->poll_lock);
-	}
+	if (napi)
+		smp_store_release(&napi->poll_owner, -1);
 }
 
 static inline bool netpoll_tx_running(struct net_device *dev)
-- 
cgit v1.2.3


From 7fda702f9315e6f4a74fee155c540750788a2d66 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 15 Nov 2016 23:23:11 +0800
Subject: sctp: use new rhlist interface on sctp transport rhashtable

Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key
to hash a node to one chain. If in one host thousands of assocs connect
to one server with the same lport and different laddrs (although it's
not a normal case), all the transports would be hashed into the same
chain.

It may cause to keep returning -EBUSY when inserting a new node, as the
chain is too long and sctp inserts a transport node in a loop, which
could even lead to system hangs there.

The new rhlist interface works for this case that there are many nodes
with the same key in one chain. It puts them into a list then makes this
list be as a node of the chain.

This patch is to replace rhashtable_ interface with rhltable_ interface.
Since a chain would not be too long and it would not return -EBUSY with
this fix when inserting a node, the reinsert loop is also removed here.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    | 2 +-
 include/net/sctp/structs.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 31acc3f4f132..f0dcaebebddb 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -164,7 +164,7 @@ void sctp_backlog_migrate(struct sctp_association *assoc,
 			  struct sock *oldsk, struct sock *newsk);
 int sctp_transport_hashtable_init(void);
 void sctp_transport_hashtable_destroy(void);
-void sctp_hash_transport(struct sctp_transport *t);
+int sctp_hash_transport(struct sctp_transport *t);
 void sctp_unhash_transport(struct sctp_transport *t);
 struct sctp_transport *sctp_addrs_lookup_transport(
 				struct net *net,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index bd4a3ded7c87..92daabdc007d 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -124,7 +124,7 @@ extern struct sctp_globals {
 	/* This is the sctp port control hash.	*/
 	struct sctp_bind_hashbucket *port_hashtable;
 	/* This is the hash of all transports. */
-	struct rhashtable transport_hashtable;
+	struct rhltable transport_hashtable;
 
 	/* Sizes of above hashtables. */
 	int ep_hashsize;
@@ -761,7 +761,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet)
 struct sctp_transport {
 	/* A list of transports. */
 	struct list_head transports;
-	struct rhash_head node;
+	struct rhlist_head node;
 
 	/* Reference counting. */
 	atomic_t refcnt;
-- 
cgit v1.2.3


From e68b6e50fa359cc5aad4d2f8ac2bdbc1a8f4fd59 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Nov 2016 09:10:42 -0800
Subject: udp: enable busy polling for all sockets

UDP busy polling is restricted to connected UDP sockets.

This is because sk_busy_loop() only takes care of one NAPI context.

There are cases where it could be extended.

1) Some hosts receive traffic on a single NIC, with one RX queue.

2) Some applications use SO_REUSEPORT and associated BPF filter
   to split the incoming traffic on one UDP socket per RX
queue/thread/cpu

3) Some UDP sockets are used to send/receive traffic for one flow, but
they do not bother with connect()

This patch records the napi_id of first received skb, giving more
reach to busy polling.

Tested:

lpaa23:~# echo 70 >/proc/sys/net/core/busy_read
lpaa24:~# echo 70 >/proc/sys/net/core/busy_read

lpaa23:~# for f in `seq 1 10`; do ./super_netperf 1 -H lpaa24 -t UDP_RR -l 5; done

Before patch :
   27867   28870   37324   41060   41215
   36764   36838   44455   41282   43843
After patch :
   73920   73213   70147   74845   71697
   68315   68028   75219   70082   73707

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 965e52b9b5a3..d73b849e29a6 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -80,11 +80,6 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 	skb->napi_id = napi->napi_id;
 }
 
-/* used in the protocol hanlder to propagate the napi_id to the socket */
-static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb)
-{
-	sk->sk_napi_id = skb->napi_id;
-}
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
 static inline unsigned long net_busy_loop_on(void)
@@ -107,10 +102,6 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 {
 }
 
-static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb)
-{
-}
-
 static inline bool busy_loop_timeout(unsigned long end_time)
 {
 	return true;
@@ -122,4 +113,23 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
 }
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/* used in the protocol hanlder to propagate the napi_id to the socket */
+static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	sk->sk_napi_id = skb->napi_id;
+#endif
+}
+
+/* variant used for unconnected sockets */
+static inline void sk_mark_napi_id_once(struct sock *sk,
+					const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (!sk->sk_napi_id)
+		sk->sk_napi_id = skb->napi_id;
+#endif
+}
+
 #endif /* _LINUX_NET_BUSY_POLL_H */
-- 
cgit v1.2.3


From c7d03a00b56fc23c3a01a8353789ad257363e281 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 17 Nov 2016 04:58:21 +0300
Subject: netns: make struct pernet_operations::id unsigned int

Make struct pernet_operations::id unsigned.

There are 2 reasons to do so:

1)
This field is really an index into an zero based array and
thus is unsigned entity. Using negative value is out-of-bound
access by definition.

2)
On x86_64 unsigned 32-bit data which are mixed with pointers
via array indexing or offsets added or subtracted to pointers
are preffered to signed 32-bit data.

"int" being used as an array index needs to be sign-extended
to 64-bit before being used.

	void f(long *p, int i)
	{
		g(p[i]);
	}

  roughly translates to

	movsx	rsi, esi
	mov	rdi, [rsi+...]
	call 	g

MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.

Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:

	static inline void *net_generic(const struct net *net, int id)
	{
		...
		ptr = ng->ptr[id - 1];
		...
	}

And this function is used a lot, so those sign extensions add up.

Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):

	add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)

Unfortunately some functions actually grow bigger.
This is a semmingly random artefact of code generation with register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]

However, overall balance is in negative direction:

	add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
	function                                     old     new   delta
	nfsd4_lock                                  3886    3959     +73
	tipc_link_build_proto_msg                   1096    1140     +44
	mac80211_hwsim_new_radio                    2776    2808     +32
	tipc_mon_rcv                                1032    1058     +26
	svcauth_gss_legacy_init                     1413    1429     +16
	tipc_bcbase_select_primary                   379     392     +13
	nfsd4_exchange_id                           1247    1260     +13
	nfsd4_setclientid_confirm                    782     793     +11
		...
	put_client_renew_locked                      494     480     -14
	ip_set_sockfn_get                            730     716     -14
	geneve_sock_add                              829     813     -16
	nfsd4_sequence_done                          721     703     -18
	nlmclnt_lookup_host                          708     686     -22
	nfsd4_lockt                                 1085    1063     -22
	nfs_get_client                              1077    1050     -27
	tcf_bpf_init                                1106    1076     -30
	nfsd4_encode_fattr                          5997    5930     -67
	Total: Before=154856051, After=154854321, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bonding.h                         | 2 +-
 include/net/ip_tunnels.h                      | 6 +++---
 include/net/net_namespace.h                   | 2 +-
 include/net/netfilter/nf_conntrack_l4proto.h  | 2 +-
 include/net/netfilter/nf_conntrack_synproxy.h | 2 +-
 include/net/netns/generic.h                   | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index f32f7ef8a23a..3c857778a6ca 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -681,7 +681,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
 }
 
 /* exported from bond_main.c */
-extern int bond_net_id;
+extern unsigned int bond_net_id;
 extern const struct bond_parm_tbl bond_lacp_tbl[];
 extern const struct bond_parm_tbl xmit_hashtype_tbl[];
 extern const struct bond_parm_tbl arp_validate_tbl[];
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 59557c07904b..e893fe43dd13 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -129,7 +129,7 @@ struct ip_tunnel {
 #endif
 	struct ip_tunnel_prl_entry __rcu *prl;	/* potential router list */
 	unsigned int		prl_count;	/* # of entries in PRL */
-	int			ip_tnl_net_id;
+	unsigned int		ip_tnl_net_id;
 	struct gro_cells	gro_cells;
 	bool			collect_md;
 	bool			ignore_df;
@@ -248,7 +248,7 @@ void ip_tunnel_uninit(struct net_device *dev);
 void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
 struct net *ip_tunnel_get_link_net(const struct net_device *dev);
 int ip_tunnel_get_iflink(const struct net_device *dev);
-int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
@@ -275,7 +275,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p);
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p);
-void ip_tunnel_setup(struct net_device *dev, int net_id);
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fc4f757107df..d7149e93a60a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -291,7 +291,7 @@ struct pernet_operations {
 	int (*init)(struct net *net);
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
-	int *id;
+	unsigned int *id;
 	size_t size;
 };
 
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 2152b70626d5..e7b836590f0b 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -98,7 +98,7 @@ struct nf_conntrack_l4proto {
 		const struct nla_policy *nla_policy;
 	} ctnl_timeout;
 #endif
-	int	*net_id;
+	unsigned int	*net_id;
 	/* Init l4proto pernet data */
 	int (*init_net)(struct net *net, u_int16_t proto);
 
diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
index e6937318546c..b0ca402c1f72 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -54,7 +54,7 @@ struct synproxy_net {
 	struct synproxy_stats __percpu	*stats;
 };
 
-extern int synproxy_net_id;
+extern unsigned int synproxy_net_id;
 static inline struct synproxy_net *synproxy_pernet(struct net *net)
 {
 	return net_generic(net, synproxy_net_id);
diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 70e158551704..d315786bcfd7 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -31,7 +31,7 @@ struct net_generic {
 	void *ptr[0];
 };
 
-static inline void *net_generic(const struct net *net, int id)
+static inline void *net_generic(const struct net *net, unsigned int id)
 {
 	struct net_generic *ng;
 	void *ptr;
-- 
cgit v1.2.3


From 0ac3ea70897fb9f84b620aeda074ecccf481629d Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:55 +0200
Subject: net/mlx5: Make the command interface cache more flexible

Add more cache command size sets and more entries for each set based on
the current commands set different sizes and commands frequency.

Fixes: e126ba97dba9 ('mlx5: Add driver for Mellanox Connect-IB adapters')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ecc451d89ccd..5e7dbbcf47f0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -208,7 +208,7 @@ struct mlx5_cmd_first {
 
 struct mlx5_cmd_msg {
 	struct list_head		list;
-	struct cache_ent	       *cache;
+	struct cmd_msg_cache	       *parent;
 	u32				len;
 	struct mlx5_cmd_first		first;
 	struct mlx5_cmd_mailbox	       *next;
@@ -228,17 +228,17 @@ struct mlx5_cmd_debug {
 	u16			outlen;
 };
 
-struct cache_ent {
+struct cmd_msg_cache {
 	/* protect block chain allocations
 	 */
 	spinlock_t		lock;
 	struct list_head	head;
+	unsigned int		max_inbox_size;
+	unsigned int		num_ent;
 };
 
-struct cmd_msg_cache {
-	struct cache_ent	large;
-	struct cache_ent	med;
-
+enum {
+	MLX5_NUM_COMMAND_CACHES = 5,
 };
 
 struct mlx5_cmd_stats {
@@ -281,7 +281,7 @@ struct mlx5_cmd {
 	struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS];
 	struct pci_pool *pool;
 	struct mlx5_cmd_debug dbg;
-	struct cmd_msg_cache cache;
+	struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES];
 	int checksum_disabled;
 	struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX];
 };
-- 
cgit v1.2.3


From 4ce3bf2fa8ba309b5ca19539fcc8671a0fc084f9 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:56 +0200
Subject: net/mlx5: Port module event hardware structures

Add hardware structures and constants definitions needed for module
events support.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h   | 11 +++++++++++
 include/linux/mlx5/mlx5_ifc.h |  3 ++-
 include/linux/mlx5/port.h     |  3 +++
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 58276144ba81..52b437431c6a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -277,6 +277,7 @@ enum mlx5_event {
 	MLX5_EVENT_TYPE_INTERNAL_ERROR	   = 0x08,
 	MLX5_EVENT_TYPE_PORT_CHANGE	   = 0x09,
 	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
+	MLX5_EVENT_TYPE_PORT_MODULE_EVENT  = 0x16,
 	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
 
 	MLX5_EVENT_TYPE_DB_BF_CONGESTION   = 0x1a,
@@ -552,6 +553,15 @@ struct mlx5_eqe_vport_change {
 	__be32		rsvd1[6];
 } __packed;
 
+struct mlx5_eqe_port_module {
+	u8        reserved_at_0[1];
+	u8        module;
+	u8        reserved_at_2[1];
+	u8        module_status;
+	u8        reserved_at_4[2];
+	u8        error_type;
+} __packed;
+
 union ev_data {
 	__be32				raw[7];
 	struct mlx5_eqe_cmd		cmd;
@@ -565,6 +575,7 @@ union ev_data {
 	struct mlx5_eqe_page_req	req_pages;
 	struct mlx5_eqe_page_fault	page_fault;
 	struct mlx5_eqe_vport_change	vport_change;
+	struct mlx5_eqe_port_module	port_module;
 } __packed;
 
 struct mlx5_eqe {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 2632cb2caf10..cd1d530ca368 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -824,7 +824,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8	   early_vf_enable[0x1];
 	u8         reserved_at_1a9[0x2];
 	u8         local_ca_ack_delay[0x5];
-	u8         reserved_at_1af[0x2];
+	u8         port_module_event[0x1];
+	u8         reserved_at_1b0[0x1];
 	u8         ports_check[0x1];
 	u8         reserved_at_1b2[0x1];
 	u8         disable_link_up[0x1];
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index b3065acd20b4..dde8c7ec5ff1 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -94,6 +94,9 @@ enum mlx5e_link_mode {
 
 #define MLX5E_PROT_MASK(link_mode) (1 << link_mode)
 
+#define PORT_MODULE_EVENT_MODULE_STATUS_MASK 0xF
+#define PORT_MODULE_EVENT_ERROR_TYPE_MASK         0xF
+
 int mlx5_set_port_caps(struct mlx5_core_dev *dev, u8 port_num, u32 caps);
 int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys,
 			 int ptys_size, int proto_mask, u8 local_port);
-- 
cgit v1.2.3


From d4eb4cd78b0774c7061db56844ed2ea7790cc77c Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:57 +0200
Subject: net/mlx5: Add handling for port module event

For each asynchronous port module event:
  1. print with ratelimit to the dmesg log
  2. increment the corresponding event counter

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5e7dbbcf47f0..7336c8e529d7 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -498,6 +498,31 @@ struct mlx5_rl_table {
 	struct mlx5_rl_entry   *rl_entry;
 };
 
+enum port_module_event_status_type {
+	MLX5_MODULE_STATUS_PLUGGED   = 0x1,
+	MLX5_MODULE_STATUS_UNPLUGGED = 0x2,
+	MLX5_MODULE_STATUS_ERROR     = 0x3,
+	MLX5_MODULE_STATUS_NUM       = 0x3,
+};
+
+enum  port_module_event_error_type {
+	MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED,
+	MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX_CABLE_MODULE,
+	MLX5_MODULE_EVENT_ERROR_BUS_STUCK,
+	MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT,
+	MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST,
+	MLX5_MODULE_EVENT_ERROR_UNKNOWN_IDENTIFIER,
+	MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE,
+	MLX5_MODULE_EVENT_ERROR_BAD_CABLE,
+	MLX5_MODULE_EVENT_ERROR_UNKNOWN,
+	MLX5_MODULE_EVENT_ERROR_NUM,
+};
+
+struct mlx5_port_module_event_stats {
+	u64 status_counters[MLX5_MODULE_STATUS_NUM];
+	u64 error_counters[MLX5_MODULE_EVENT_ERROR_NUM];
+};
+
 struct mlx5_priv {
 	char			name[MLX5_MAX_NAME_LEN];
 	struct mlx5_eq_table	eq_table;
@@ -559,6 +584,8 @@ struct mlx5_priv {
 	unsigned long		pci_dev_data;
 	struct mlx5_fc_stats		fc_stats;
 	struct mlx5_rl_table            rl_table;
+
+	struct mlx5_port_module_event_stats  pme_stats;
 };
 
 enum mlx5_device_state {
-- 
cgit v1.2.3


From 0dbc6fe09fbe5f5191bcc606f3bdc9a829f97066 Mon Sep 17 00:00:00 2001
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Thu, 17 Nov 2016 13:45:59 +0200
Subject: net/mlx5: Set driver version infrastructure

Add driver_version capability bit is enabled, and set driver
version command in mlx5_ifc firmware header.  The only purpose
of this command is to store a driver version/OS string in FW
to be reported and displayed in various management systems,
such as IPMI/BMC.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/mlx5_ifc.h | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cd1d530ca368..f08a06247fba 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -83,6 +83,7 @@ enum {
 	MLX5_CMD_OP_SET_HCA_CAP                   = 0x109,
 	MLX5_CMD_OP_QUERY_ISSI                    = 0x10a,
 	MLX5_CMD_OP_SET_ISSI                      = 0x10b,
+	MLX5_CMD_OP_SET_DRIVER_VERSION            = 0x10d,
 	MLX5_CMD_OP_CREATE_MKEY                   = 0x200,
 	MLX5_CMD_OP_QUERY_MKEY                    = 0x201,
 	MLX5_CMD_OP_DESTROY_MKEY                  = 0x202,
@@ -909,7 +910,7 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         log_pg_sz[0x8];
 
 	u8         bf[0x1];
-	u8         reserved_at_261[0x1];
+	u8         driver_version[0x1];
 	u8         pad_tx_eth_packet[0x1];
 	u8         reserved_at_263[0x8];
 	u8         log_bf_reg_size[0x5];
@@ -4005,6 +4006,25 @@ struct mlx5_ifc_query_issi_in_bits {
 	u8         reserved_at_40[0x40];
 };
 
+struct mlx5_ifc_set_driver_version_out_bits {
+	u8         status[0x8];
+	u8         reserved_0[0x18];
+
+	u8         syndrome[0x20];
+	u8         reserved_1[0x40];
+};
+
+struct mlx5_ifc_set_driver_version_in_bits {
+	u8         opcode[0x10];
+	u8         reserved_0[0x10];
+
+	u8         reserved_1[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_2[0x40];
+	u8         driver_version[64][0x8];
+};
+
 struct mlx5_ifc_query_hca_vport_pkey_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
-- 
cgit v1.2.3


From 7f503169cabd70c1f13b9279c50eca7dfb9a7d51 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Thu, 17 Nov 2016 13:46:01 +0200
Subject: net/mlx5: Add MPCNT register infrastructure

Add the needed infrastructure for future use of MPCNT register.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h   |  5 +++
 include/linux/mlx5/driver.h   |  1 +
 include/linux/mlx5/mlx5_ifc.h | 93 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 52b437431c6a..9f489365b3d3 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1071,6 +1071,11 @@ enum {
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
+enum {
+	MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP       = 0x0,
+	MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP = 0x2,
+};
+
 static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 {
 	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7336c8e529d7..ae1f451e8f89 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -121,6 +121,7 @@ enum {
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
+	MLX5_REG_MPCNT		 = 0x9051,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index f08a06247fba..a5f0fbedf1e7 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1757,6 +1757,80 @@ struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
 	u8         reserved_at_4c0[0x300];
 };
 
+struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits {
+	u8         life_time_counter_high[0x20];
+
+	u8         life_time_counter_low[0x20];
+
+	u8         rx_errors[0x20];
+
+	u8         tx_errors[0x20];
+
+	u8         l0_to_recovery_eieos[0x20];
+
+	u8         l0_to_recovery_ts[0x20];
+
+	u8         l0_to_recovery_framing[0x20];
+
+	u8         l0_to_recovery_retrain[0x20];
+
+	u8         crc_error_dllp[0x20];
+
+	u8         crc_error_tlp[0x20];
+
+	u8         reserved_at_140[0x680];
+};
+
+struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits {
+	u8         life_time_counter_high[0x20];
+
+	u8         life_time_counter_low[0x20];
+
+	u8         time_to_boot_image_start[0x20];
+
+	u8         time_to_link_image[0x20];
+
+	u8         calibration_time[0x20];
+
+	u8         time_to_first_perst[0x20];
+
+	u8         time_to_detect_state[0x20];
+
+	u8         time_to_l0[0x20];
+
+	u8         time_to_crs_en[0x20];
+
+	u8         time_to_plastic_image_start[0x20];
+
+	u8         time_to_iron_image_start[0x20];
+
+	u8         perst_handler[0x20];
+
+	u8         times_in_l1[0x20];
+
+	u8         times_in_l23[0x20];
+
+	u8         dl_down[0x20];
+
+	u8         config_cycle1usec[0x20];
+
+	u8         config_cycle2to7usec[0x20];
+
+	u8         config_cycle_8to15usec[0x20];
+
+	u8         config_cycle_16_to_63usec[0x20];
+
+	u8         config_cycle_64usec[0x20];
+
+	u8         correctable_err_msg_sent[0x20];
+
+	u8         non_fatal_err_msg_sent[0x20];
+
+	u8         fatal_err_msg_sent[0x20];
+
+	u8         reserved_at_2e0[0x4e0];
+};
+
 struct mlx5_ifc_cmd_inter_comp_event_bits {
 	u8         command_completion_vector[0x20];
 
@@ -2921,6 +2995,12 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
 	u8         reserved_at_0[0x7c0];
 };
 
+union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits {
+	struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout;
+	struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits pcie_tas_cntrs_grp_data_layout;
+	u8         reserved_at_0[0x7c0];
+};
+
 union mlx5_ifc_event_auto_bits {
 	struct mlx5_ifc_comp_event_bits comp_event;
 	struct mlx5_ifc_dct_events_bits dct_events;
@@ -7240,6 +7320,18 @@ struct mlx5_ifc_ppcnt_reg_bits {
 	union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
 };
 
+struct mlx5_ifc_mpcnt_reg_bits {
+	u8         reserved_at_0[0x8];
+	u8         pcie_index[0x8];
+	u8         reserved_at_10[0xa];
+	u8         grp[0x6];
+
+	u8         clr[0x1];
+	u8         reserved_at_21[0x1f];
+
+	union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set;
+};
+
 struct mlx5_ifc_ppad_reg_bits {
 	u8         reserved_at_0[0x3];
 	u8         single_mac[0x1];
@@ -7845,6 +7937,7 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
 	struct mlx5_ifc_ppad_reg_bits ppad_reg;
 	struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
+	struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
 	struct mlx5_ifc_pplm_reg_bits pplm_reg;
 	struct mlx5_ifc_pplr_reg_bits pplr_reg;
 	struct mlx5_ifc_ppsc_reg_bits ppsc_reg;
-- 
cgit v1.2.3


From 0d27f4e437e448c4ff440a31567b9729d1634d66 Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Thu, 17 Nov 2016 13:07:20 +0100
Subject: ethtool: (uapi) Add ETHTOOL_PHY_GTUNABLE and ETHTOOL_PHY_STUNABLE

Defines a generic API to get/set phy tunables. The API is using the
existing ethtool_tunable/tunable_type_id types which is already being used
for mac level tunables.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 8e547231c1b7..42f696f139ec 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -248,6 +248,16 @@ struct ethtool_tunable {
 	void	*data[0];
 };
 
+enum phy_tunable_id {
+	ETHTOOL_PHY_ID_UNSPEC,
+
+	/*
+	 * Add your fresh new phy tunable attribute above and remember to update
+	 * phy_tunable_strings[] in net/core/ethtool.c
+	 */
+	__ETHTOOL_PHY_TUNABLE_COUNT,
+};
+
 /**
  * struct ethtool_regs - hardware register dump
  * @cmd: Command number = %ETHTOOL_GREGS
@@ -548,6 +558,7 @@ struct ethtool_pauseparam {
  * @ETH_SS_FEATURES: Device feature names
  * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names
  * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
+ * @ETH_SS_PHY_TUNABLES: PHY tunable names
  */
 enum ethtool_stringset {
 	ETH_SS_TEST		= 0,
@@ -558,6 +569,7 @@ enum ethtool_stringset {
 	ETH_SS_RSS_HASH_FUNCS,
 	ETH_SS_TUNABLES,
 	ETH_SS_PHY_STATS,
+	ETH_SS_PHY_TUNABLES,
 };
 
 /**
@@ -1313,7 +1325,8 @@ struct ethtool_per_queue_op {
 
 #define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
 #define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
-
+#define ETHTOOL_PHY_GTUNABLE	0x0000004e /* Get PHY tunable configuration */
+#define ETHTOOL_PHY_STUNABLE	0x0000004f /* Set PHY tunable configuration */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
-- 
cgit v1.2.3


From 968ad9da7e0e333e25442950e10a1b631981ce84 Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Thu, 17 Nov 2016 13:07:21 +0100
Subject: ethtool: Implements ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE

Adding get_tunable/set_tunable function pointer to the phy_driver
structure, and uses these function pointers to implement the
ETHTOOL_PHY_GTUNABLE/ETHTOOL_PHY_STUNABLE ioctls.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index b9bd3b4f4ea1..edde28ce163a 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -611,6 +611,13 @@ struct phy_driver {
 	void (*get_strings)(struct phy_device *dev, u8 *data);
 	void (*get_stats)(struct phy_device *dev,
 			  struct ethtool_stats *stats, u64 *data);
+
+	/* Get and Set PHY tunables */
+	int (*get_tunable)(struct phy_device *dev,
+			   struct ethtool_tunable *tuna, void *data);
+	int (*set_tunable)(struct phy_device *dev,
+			    struct ethtool_tunable *tuna,
+			    const void *data);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),		\
 				      struct phy_driver, mdiodrv)
-- 
cgit v1.2.3


From 607c7029146790201e90b58c4235ddff0304d6e0 Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Thu, 17 Nov 2016 13:07:22 +0100
Subject: ethtool: (uapi) Add ETHTOOL_PHY_DOWNSHIFT to PHY tunables

For operation in cabling environments that are incompatible with
1000BASE-T, PHY device may provide an automatic link speed downshift
operation. When enabled, the device automatically changes its 1000BASE-T
auto-negotiation to the next slower speed after a configured number of
failed attempts at 1000BASE-T.  This feature is useful in setting up in
networks using older cable installations that include only pairs A and B,
and not pairs C and D.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microsemi.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 42f696f139ec..f0db7788f887 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -248,9 +248,12 @@ struct ethtool_tunable {
 	void	*data[0];
 };
 
+#define DOWNSHIFT_DEV_DEFAULT_COUNT	0xff
+#define DOWNSHIFT_DEV_DISABLE		0
+
 enum phy_tunable_id {
 	ETHTOOL_PHY_ID_UNSPEC,
-
+	ETHTOOL_PHY_DOWNSHIFT,
 	/*
 	 * Add your fresh new phy tunable attribute above and remember to update
 	 * phy_tunable_strings[] in net/core/ethtool.c
-- 
cgit v1.2.3


From 603ab57363a0ba66c77ca4b3027bc0b4505df504 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 17 Nov 2016 11:19:12 -0800
Subject: bus: mvebu-bus: Provide inline stub for mvebu_mbus_get_dram_win_info

In preparation for allowing CONFIG_MVNETA_BM to build with COMPILE_TEST,
provide an inline stub for mvebu_mbus_get_dram_win_info().

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mbus.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/mbus.h b/include/linux/mbus.h
index 2931aa43dab1..0d3f14fd2621 100644
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
@@ -82,6 +82,7 @@ static inline int mvebu_mbus_get_io_win_info(phys_addr_t phyaddr, u32 *size,
 }
 #endif
 
+#ifdef CONFIG_MVEBU_MBUS
 int mvebu_mbus_save_cpu_target(u32 __iomem *store_addr);
 void mvebu_mbus_get_pcie_mem_aperture(struct resource *res);
 void mvebu_mbus_get_pcie_io_aperture(struct resource *res);
@@ -97,5 +98,12 @@ int mvebu_mbus_init(const char *soc, phys_addr_t mbus_phys_base,
 		    size_t mbus_size, phys_addr_t sdram_phys_base,
 		    size_t sdram_size);
 int mvebu_mbus_dt_init(bool is_coherent);
+#else
+static inline int mvebu_mbus_get_dram_win_info(phys_addr_t phyaddr, u8 *target,
+					       u8 *attr)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_MVEBU_MBUS */
 
 #endif /* __LINUX_MBUS_H */
-- 
cgit v1.2.3


From d66016a77757b004b8637f44d87bedfc4a47b89c Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Fri, 18 Nov 2016 15:40:39 -0800
Subject: virtio_net.h: Fix comment.

Fix incorrent comment after the final #endif.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 1c912f85e041..74f1e3363506 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -98,4 +98,4 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 	return 0;
 }
 
-#endif /* _LINUX_VIRTIO_BYTEORDER */
+#endif /* _LINUX_VIRTIO_NET_H */
-- 
cgit v1.2.3


From 9403cd7cbb08aa3709c632decafa2014c8ed96e6 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Fri, 18 Nov 2016 15:40:40 -0800
Subject: virtio_net: Do not clear memory for struct virtio_net_hdr twice.

virtio_net_hdr_from_skb() clears the memory for the header, so there
is no point for the callers to do the same.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 74f1e3363506..66204007d7ac 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -58,7 +58,7 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
 					  bool little_endian)
 {
-	memset(hdr, 0, sizeof(*hdr));
+	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
 	if (skb_is_gso(skb)) {
 		struct skb_shared_info *sinfo = skb_shinfo(skb);
-- 
cgit v1.2.3


From 3b2c75d371740fb0dcd0c9eac545ab1dd28b4706 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 03:54:35 +0300
Subject: netlink: use "unsigned int" in nla_next()

->nla_len is unsigned entity (it's length after all) and u16,
thus it can't overflow when being aligned into int/unsigned int.

(nlmsg_next has the same code, but I didn't yet convince myself
it is correct to do so).

There is pointer arithmetic in this function and offset being
unsigned is better:

	add/remove: 0/0 grow/shrink: 1/64 up/down: 5/-309 (-304)
	function                                     old     new   delta
	nl80211_set_wiphy                           1444    1449      +5
	team_nl_cmd_options_set                      997     995      -2
	tcf_em_tree_validate                         872     870      -2
	switchdev_port_bridge_setlink                352     350      -2
	switchdev_port_br_afspec                     312     310      -2
	rtm_to_fib_config                            428     426      -2
	qla4xxx_sysfs_ddb_set_param                 2193    2191      -2
	qla4xxx_iface_set_param                     4470    4468      -2
	ovs_nla_free_flow_actions                    152     150      -2
	output_userspace                             518     516      -2
		...
	nl80211_set_reg                              654     649      -5
	validate_scan_freqs                          148     142      -6
	validate_linkmsg                             288     282      -6
	nl80211_parse_connkeys                       489     483      -6
	nlattr_set                                   231     224      -7
	nf_tables_delsetelem                         267     260      -7
	do_setlink                                  3416    3408      -8
	netlbl_cipsov4_add_std                      1672    1659     -13
	nl80211_parse_sched_scan                    2902    2888     -14
	nl80211_trigger_scan                        1738    1720     -18
	do_execute_actions                          2821    2738     -83
	Total: Before=154865355, After=154865051, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index a34f53acb6d6..d3938f11ae52 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -713,7 +713,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining)
  */
 static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 {
-	int totlen = NLA_ALIGN(nla->nla_len);
+	unsigned int totlen = NLA_ALIGN(nla->nla_len);
 
 	*remaining -= totlen;
 	return (struct nlattr *) ((char *) nla + totlen);
-- 
cgit v1.2.3


From c72d8cdaa5dbd3baf918046ee5149ab69330923e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 04:08:08 +0300
Subject: net: fix bogus cast in skb_pagelen() and use unsigned variables

1) cast to "int" is unnecessary:
   u8 will be promoted to int before decrementing,
   small positive numbers fit into "int", so their values won't be changed
   during promotion.

   Once everything is int including loop counters, signedness doesn't
   matter: 32-bit operations will stay 32-bit operations.

   But! Someone tried to make this loop smart by making everything of
   the same type apparently in an attempt to optimise it.
   Do the optimization, just differently.
   Do the cast where it matters. :^)

2) frag size is unsigned entity and sum of fragments sizes is also
   unsigned.

Make everything unsigned, leave no MOVSX instruction behind.

	add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-4 (-4)
	function                                     old     new   delta
	skb_cow_data                                 835     834      -1
	ip_do_fragment                              2549    2548      -1
	ip6_fragment                                3130    3128      -2
	Total: Before=154865032, After=154865028, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a4aeeca7e805..9c535fbccf2c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1799,11 +1799,11 @@ static inline unsigned int skb_headlen(const struct sk_buff *skb)
 	return skb->len - skb->data_len;
 }
 
-static inline int skb_pagelen(const struct sk_buff *skb)
+static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 {
-	int i, len = 0;
+	unsigned int i, len = 0;
 
-	for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--)
+	for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
 		len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
 	return len + skb_headlen(skb);
 }
-- 
cgit v1.2.3


From 6d67942dd0ebc3dddc86edf9208169d064a9b3d7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 19 Nov 2016 01:45:03 +0100
Subject: bpf: add __must_check attributes to refcount manipulating helpers

Helpers like bpf_prog_add(), bpf_prog_inc(), bpf_map_inc() can fail
with an error, so make sure the caller properly checks their return
value and not just ignores it, which could worst-case lead to use
after free.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 01c1487277b2..69d0a7f12a3b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -233,14 +233,14 @@ void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
-struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i);
+struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
+struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
@@ -299,7 +299,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
+static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
+							  int i)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
@@ -311,7 +312,8 @@ static inline void bpf_prog_sub(struct bpf_prog *prog, int i)
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
-static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+
+static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
-- 
cgit v1.2.3


From 5e9235853d652a295d5f56cb8652950b6b5bf56b Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 21 Nov 2016 13:03:24 +0100
Subject: bridge: mcast: add IGMPv3 query support

This patch adds basic support for IGMPv3 queries, the default is IGMPv2
as before. A new multicast option - multicast_igmp_version, adds the
ability to change it between 2 and 3 via netlink and sysfs. The option
struct member is in a 4 byte hole in net_bridge.

There also a few minor style adjustments in br_multicast_new_group and
br_multicast_add_group.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index b4fba662cd32..325d2601150d 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -275,6 +275,7 @@ enum {
 	IFLA_BR_PAD,
 	IFLA_BR_VLAN_STATS_ENABLED,
 	IFLA_BR_MCAST_STATS_ENABLED,
+	IFLA_BR_MCAST_IGMP_VERSION,
 	__IFLA_BR_MAX,
 };
 
-- 
cgit v1.2.3


From aa2ae3e71c74cc00ec22f133dc900b3817415785 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Mon, 21 Nov 2016 13:03:25 +0100
Subject: bridge: mcast: add MLDv2 querier support

This patch adds basic support for MLDv2 queries, the default is MLDv1
as before. A new multicast option - multicast_mld_version, adds the
ability to change it between 1 and 2 via netlink and sysfs.
The MLD option is disabled if CONFIG_IPV6 is disabled.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 325d2601150d..92b2d4928bf1 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -276,6 +276,7 @@ enum {
 	IFLA_BR_VLAN_STATS_ENABLED,
 	IFLA_BR_MCAST_STATS_ENABLED,
 	IFLA_BR_MCAST_IGMP_VERSION,
+	IFLA_BR_MCAST_MLD_VERSION,
 	__IFLA_BR_MAX,
 };
 
-- 
cgit v1.2.3


From e97991832a4ea4a5f47d65f068a4c966a2eb5730 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 21 Nov 2016 14:18:38 +0100
Subject: tcp: make undo_cwnd mandatory for congestion modules

The undo_cwnd fallback in the stack doubles cwnd based on ssthresh,
which un-does reno halving behaviour.

It seems more appropriate to let congctl algorithms pair .ssthresh
and .undo_cwnd properly. Add a 'tcp_reno_undo_cwnd' function and wire it
up for all congestion algorithms that used to rely on the fallback.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 123979fe12bf..7de80739adab 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -958,6 +958,7 @@ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
 u32 tcp_reno_ssthresh(struct sock *sk);
+u32 tcp_reno_undo_cwnd(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
-- 
cgit v1.2.3


From e22e996b72d47c7bf5bb6b17071b2e9d3db462d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <uwe@kleine-koenig.org>
Date: Tue, 22 Nov 2016 16:47:11 +0100
Subject: net/phy: add trace events for mdio accesses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make it possible to generate trace events for mdio read and write accesses.

Signed-off-by: Uwe Kleine-König <uwe@kleine-koenig.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/mdio.h | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 include/trace/events/mdio.h

(limited to 'include')

diff --git a/include/trace/events/mdio.h b/include/trace/events/mdio.h
new file mode 100644
index 000000000000..00d85f5f54e4
--- /dev/null
+++ b/include/trace/events/mdio.h
@@ -0,0 +1,42 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mdio
+
+#if !defined(_TRACE_MDIO_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MDIO_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT_CONDITION(mdio_access,
+
+	TP_PROTO(struct mii_bus *bus, char read,
+		 u8 addr, unsigned regnum, u16 val, int err),
+
+	TP_ARGS(bus, read, addr, regnum, val, err),
+
+	TP_CONDITION(err >= 0),
+
+	TP_STRUCT__entry(
+		__array(char, busid, MII_BUS_ID_SIZE)
+		__field(char, read)
+		__field(u8, addr)
+		__field(u16, val)
+		__field(unsigned, regnum)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->busid, bus->id, MII_BUS_ID_SIZE);
+		__entry->read = read;
+		__entry->addr = addr;
+		__entry->regnum = regnum;
+		__entry->val = val;
+	),
+
+	TP_printk("%s %-5s phy:0x%02hhx reg:0x%02x val:0x%04hx",
+		  __entry->busid, __entry->read ? "read" : "write",
+		  __entry->addr, __entry->regnum, __entry->val)
+);
+
+#endif /* if !defined(_TRACE_MDIO_H) || defined(TRACE_HEADER_MULTI_READ) */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From d06f78c4232d6a84b50839f61d9d7fbb222d8118 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 22 Nov 2016 11:40:55 -0800
Subject: net: phy: broadcom: Add support code for downshift/Wirespeed

Broadcom's Wirespeed feature allows us to configure how auto-negotiation
should behave with fewer working pairs of wires on a cable. Add support
code for retrieving and setting such downshift counters using the
recently added ethtool downshift tunables.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index 848dc508ef57..f9f8aaf9c943 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -114,6 +114,7 @@
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC	0x0007
 #define MII_BCM54XX_AUXCTL_SHDWSEL_READ_SHIFT	12
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_RGMII_SKEW_EN	(1 << 8)
+#define MII_BCM54XX_AUXCTL_SHDWSEL_MISC_WIRESPEED_EN	(1 << 4)
 
 #define MII_BCM54XX_AUXCTL_SHDWSEL_MASK	0x0007
 
@@ -130,6 +131,7 @@
 #define BCM_LED_SRC_INTR	0x6
 #define BCM_LED_SRC_QUALITY	0x7
 #define BCM_LED_SRC_RCVLED	0x8
+#define BCM_LED_SRC_WIRESPEED	0x9
 #define BCM_LED_SRC_MULTICOLOR1	0xa
 #define BCM_LED_SRC_OPENSHORT	0xb
 #define BCM_LED_SRC_OFF		0xe	/* Tied high */
@@ -141,6 +143,14 @@
  * Shadow values go into bits [14:10] of register 0x1c to select a shadow
  * register to access.
  */
+
+/* 00100: Reserved control register 2 */
+#define BCM54XX_SHD_SCR2		0x04
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_DIS	0x100
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_SHIFT	2
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_OFFSET	2
+#define  BCM54XX_SHD_SCR2_WSPD_RTRY_LMT_MASK	0x7
+
 /* 00101: Spare Control Register 3 */
 #define BCM54XX_SHD_SCR3		0x05
 #define  BCM54XX_SHD_SCR3_DEF_CLK125	0x0001
-- 
cgit v1.2.3


From 3df5b3c67546fb05266766b6abaf71563f82efe4 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 22 Nov 2016 23:09:54 +0200
Subject: net: Add net-device param to the get offloaded stats ndo

Some drivers would need to check few internal matters for
that. To be used in downstream mlx5 commit.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e84800edd249..ae32a27523f9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -925,7 +925,7 @@ struct netdev_xdp {
  *	3. Update dev->stats asynchronously and atomically, and define
  *	   neither operation.
  *
- * bool (*ndo_has_offload_stats)(int attr_id)
+ * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
  *	Return true if this device supports offload stats of this attr_id.
  *
  * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
@@ -1165,7 +1165,7 @@ struct net_device_ops {
 
 	struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
 						     struct rtnl_link_stats64 *storage);
-	bool			(*ndo_has_offload_stats)(int attr_id);
+	bool			(*ndo_has_offload_stats)(const struct net_device *dev, int attr_id);
 	int			(*ndo_get_offload_stats)(int attr_id,
 							 const struct net_device *dev,
 							 void *attr_data);
-- 
cgit v1.2.3


From 59bfde01fab0c4550778cd53e8d266f1dfddf7b7 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 22 Nov 2016 23:09:57 +0200
Subject: devlink: Add E-Switch inline mode control

Some HWs need the VF driver to put part of the packet headers on the
TX descriptor so the e-switch can do proper matching and steering.

The supported modes: none, link, network, transport.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h        | 2 ++
 include/uapi/linux/devlink.h | 8 ++++++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 211bd3c37028..d29e5fc82582 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -92,6 +92,8 @@ struct devlink_ops {
 
 	int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode);
 	int (*eswitch_mode_set)(struct devlink *devlink, u16 mode);
+	int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode);
+	int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode);
 };
 
 static inline void *devlink_priv(struct devlink *devlink)
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 915bfa74458c..9014c33d4e77 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -102,6 +102,13 @@ enum devlink_eswitch_mode {
 	DEVLINK_ESWITCH_MODE_SWITCHDEV,
 };
 
+enum devlink_eswitch_inline_mode {
+	DEVLINK_ESWITCH_INLINE_MODE_NONE,
+	DEVLINK_ESWITCH_INLINE_MODE_LINK,
+	DEVLINK_ESWITCH_INLINE_MODE_NETWORK,
+	DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT,
+};
+
 enum devlink_attr {
 	/* don't change the order or add anything between, this is ABI! */
 	DEVLINK_ATTR_UNSPEC,
@@ -133,6 +140,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_SB_OCC_CUR,		/* u32 */
 	DEVLINK_ATTR_SB_OCC_MAX,		/* u32 */
 	DEVLINK_ATTR_ESWITCH_MODE,		/* u16 */
+	DEVLINK_ATTR_ESWITCH_INLINE_MODE,	/* u8 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
-- 
cgit v1.2.3


From 34e4e99078667d30f71a50c1e5181e4270e9d8bb Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 22 Nov 2016 23:09:58 +0200
Subject: net/mlx5: Enable to query min inline for a specific vport

Also move the inline capablities enum to a shared header vport.h

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/vport.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h
index 451b0bde9083..ec35157ea725 100644
--- a/include/linux/mlx5/vport.h
+++ b/include/linux/mlx5/vport.h
@@ -36,6 +36,12 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/device.h>
 
+enum {
+	MLX5_CAP_INLINE_MODE_L2,
+	MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
+	MLX5_CAP_INLINE_MODE_NOT_REQUIRED,
+};
+
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				u16 vport);
@@ -43,8 +49,8 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
 				  u16 vport, u8 state);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 *addr);
-void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
-				     u8 *min_inline);
+int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
+				    u16 vport, u8 *min_inline);
 int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
 				     u16 vport, u8 min_inline);
 int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,
-- 
cgit v1.2.3


From 0e33661de493db325435d565a4a722120ae4cbf3 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 Nov 2016 16:52:25 +0100
Subject: bpf: add new prog type for cgroup socket filtering

This program type is similar to BPF_PROG_TYPE_SOCKET_FILTER, except that
it does not allow BPF_LD_[ABS|IND] instructions and hooks up the
bpf_skb_load_bytes() helper.

Programs of this type will be attached to cgroups for network filtering
and accounting.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7d9b2832c280..5ae679fac993 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -98,8 +98,17 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_TRACEPOINT,
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
+	BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+enum bpf_attach_type {
+	BPF_CGROUP_INET_INGRESS,
+	BPF_CGROUP_INET_EGRESS,
+	__MAX_BPF_ATTACH_TYPE
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
 #define BPF_PSEUDO_MAP_FD	1
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
-- 
cgit v1.2.3


From 3007098494bec614fb55dee7bc0410bb7db5ad18 Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 Nov 2016 16:52:26 +0100
Subject: cgroup: add support for eBPF programs

This patch adds two sets of eBPF program pointers to struct cgroup.
One for such that are directly pinned to a cgroup, and one for such
that are effective for it.

To illustrate the logic behind that, assume the following example
cgroup hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h  | 79 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cgroup-defs.h |  4 +++
 2 files changed, 83 insertions(+)
 create mode 100644 include/linux/bpf-cgroup.h

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 000000000000..ec80d0c0953e
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,79 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/bpf.h>
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+	/*
+	 * Store two sets of bpf_prog pointers, one for programs that are
+	 * pinned directly to this cgroup, and one for those that are effective
+	 * when this cgroup is accessed.
+	 */
+	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
+({									\
+	int __ret = 0;							\
+	if (cgroup_bpf_enabled)						\
+		__ret = __cgroup_bpf_run_filter(sk, skb,		\
+						BPF_CGROUP_INET_INGRESS); \
+									\
+	__ret;								\
+})
+
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
+({									\
+	int __ret = 0;							\
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
+		typeof(sk) __sk = sk_to_full_sk(sk);			\
+		if (sk_fullsock(__sk))					\
+			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
+						BPF_CGROUP_INET_EGRESS); \
+	}								\
+	__ret;								\
+})
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+				      struct cgroup *parent) {}
+
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5b17de62c962..861b4677fc5b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -300,6 +301,9 @@ struct cgroup {
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;
 
+	/* used to store eBPF programs */
+	struct cgroup_bpf bpf;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
-- 
cgit v1.2.3


From f4324551489e8781d838f941b7aee4208e52e8bf Mon Sep 17 00:00:00 2001
From: Daniel Mack <daniel@zonque.org>
Date: Wed, 23 Nov 2016 16:52:27 +0100
Subject: bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands

Extend the bpf(2) syscall by two new commands, BPF_PROG_ATTACH and
BPF_PROG_DETACH which allow attaching and detaching eBPF programs
to a target.

On the API level, the target could be anything that has an fd in
userspace, hence the name of the field in union bpf_attr is called
'target_fd'.

When called with BPF_ATTACH_TYPE_CGROUP_INET_{E,IN}GRESS, the target is
expected to be a valid file descriptor of a cgroup v2 directory which
has the bpf controller enabled. These are the only use-cases
implemented by this patch at this point, but more can be added.

If a program of the given type already exists in the given cgroup,
the program is swapped automically, so userspace does not have to drop
an existing program first before installing a new one, which would
otherwise leave a gap in which no program is attached.

For more information on the propagation logic to subcgroups, please
refer to the bpf cgroup controller implementation.

The API is guarded by CAP_NET_ADMIN.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5ae679fac993..1370a9d1456f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -73,6 +73,8 @@ enum bpf_cmd {
 	BPF_PROG_LOAD,
 	BPF_OBJ_PIN,
 	BPF_OBJ_GET,
+	BPF_PROG_ATTACH,
+	BPF_PROG_DETACH,
 };
 
 enum bpf_map_type {
@@ -159,6 +161,12 @@ union bpf_attr {
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
 	};
+
+	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+		__u32		target_fd;	/* container object to attach to */
+		__u32		attach_bpf_fd;	/* eBPF program to attach */
+		__u32		attach_type;
+	};
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
-- 
cgit v1.2.3


From 21fcf572716f61926f5d23a358e79d06a9d4d54f Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 5 Oct 2016 22:51:42 +0200
Subject: Bluetooth: __ variants of u8 and friends are not neccessary inside
 kernel

bluetooth.h is not part of user API, so __ variants are not neccessary
here.

Signed-off-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/bluetooth.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 0a1e21d7bce1..01487192f628 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -197,7 +197,7 @@ typedef struct {
 #define BDADDR_LE_PUBLIC	0x01
 #define BDADDR_LE_RANDOM	0x02
 
-static inline bool bdaddr_type_is_valid(__u8 type)
+static inline bool bdaddr_type_is_valid(u8 type)
 {
 	switch (type) {
 	case BDADDR_BREDR:
@@ -209,7 +209,7 @@ static inline bool bdaddr_type_is_valid(__u8 type)
 	return false;
 }
 
-static inline bool bdaddr_type_is_le(__u8 type)
+static inline bool bdaddr_type_is_le(u8 type)
 {
 	switch (type) {
 	case BDADDR_LE_PUBLIC:
@@ -279,15 +279,16 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock);
 
 /* Skb helpers */
 struct l2cap_ctrl {
-	__u8	sframe:1,
+	u8	sframe:1,
 		poll:1,
 		final:1,
 		fcs:1,
 		sar:2,
 		super:2;
-	__u16	reqseq;
-	__u16	txseq;
-	__u8	retries;
+
+	u16	reqseq;
+	u16	txseq;
+	u8	retries;
 	__le16  psm;
 	bdaddr_t bdaddr;
 	struct l2cap_chan *chan;
@@ -303,7 +304,7 @@ typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status,
 #define HCI_REQ_SKB	BIT(1)
 
 struct hci_ctrl {
-	__u16 opcode;
+	u16 opcode;
 	u8 req_flags;
 	u8 req_event;
 	union {
@@ -313,10 +314,10 @@ struct hci_ctrl {
 };
 
 struct bt_skb_cb {
-	__u8 pkt_type;
-	__u8 force_active;
-	__u16 expect;
-	__u8 incoming:1;
+	u8 pkt_type;
+	u8 force_active;
+	u16 expect;
+	u8 incoming:1;
 	union {
 		struct l2cap_ctrl l2cap;
 		struct hci_ctrl hci;
@@ -366,7 +367,7 @@ out:
 	return NULL;
 }
 
-int bt_to_errno(__u16 code);
+int bt_to_errno(u16 code);
 
 void hci_sock_set_flag(struct sock *sk, int nr);
 void hci_sock_clear_flag(struct sock *sk, int nr);
-- 
cgit v1.2.3


From 5a717f4f8f2830f297b5511022481bdc27b9d576 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 24 Nov 2016 07:04:08 +0200
Subject: netdevice: fix sparse warning for HARD_TX_LOCK

sparse warns about context imbalance in any code
that uses HARD_TX_LOCK/UNLOCK - this is because it's
unable to determine that flags don't change so
lock and unlock are paired.

Seems easy enough to fix by adding __acquire/__release
calls.

With this patch af_packet.c is now sparse-clean,

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ff57cd2eba3b..4ffcd874cc20 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3462,6 +3462,17 @@ static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
 	txq->xmit_lock_owner = cpu;
 }
 
+static inline bool __netif_tx_acquire(struct netdev_queue *txq)
+{
+	__acquire(&txq->_xmit_lock);
+	return true;
+}
+
+static inline void __netif_tx_release(struct netdev_queue *txq)
+{
+	__release(&txq->_xmit_lock);
+}
+
 static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
 {
 	spin_lock_bh(&txq->_xmit_lock);
@@ -3563,17 +3574,21 @@ static inline void netif_tx_unlock_bh(struct net_device *dev)
 #define HARD_TX_LOCK(dev, txq, cpu) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		__netif_tx_lock(txq, cpu);		\
+	} else {					\
+		__netif_tx_acquire(txq);		\
 	}						\
 }
 
 #define HARD_TX_TRYLOCK(dev, txq)			\
 	(((dev->features & NETIF_F_LLTX) == 0) ?	\
 		__netif_tx_trylock(txq) :		\
-		true )
+		__netif_tx_acquire(txq))
 
 #define HARD_TX_UNLOCK(dev, txq) {			\
 	if ((dev->features & NETIF_F_LLTX) == 0) {	\
 		__netif_tx_unlock(txq);			\
+	} else {					\
+		__netif_tx_release(txq);		\
 	}						\
 }
 
-- 
cgit v1.2.3


From 88575199cc65de99a156888629a68180c830eff2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 26 Nov 2016 01:28:04 +0100
Subject: bpf: drop unnecessary context cast from BPF_PROG_RUN

Since long already bpf_func is not only about struct sk_buff * as
input anymore. Make it generic as void *, so that callers don't
need to cast for it each time they call BPF_PROG_RUN().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c521adfe..7f246a281435 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -408,8 +408,8 @@ struct bpf_prog {
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
-	unsigned int		(*bpf_func)(const struct sk_buff *skb,
-					    const struct bpf_insn *filter);
+	unsigned int		(*bpf_func)(const void *ctx,
+					    const struct bpf_insn *insn);
 	/* Instructions for interpreter */
 	union {
 		struct sock_filter	insns[0];
@@ -504,7 +504,7 @@ static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	u32 ret;
 
 	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, (void *)xdp);
+	ret = BPF_PROG_RUN(prog, xdp);
 	rcu_read_unlock();
 
 	return ret;
-- 
cgit v1.2.3


From c491680f8f489926eebfdf2cd006767fc8bdaa49 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 26 Nov 2016 01:28:06 +0100
Subject: bpf: reuse dev_is_mac_header_xmit for redirect

Commit dcf800344a91 ("net/sched: act_mirred: Refactor detection whether
dev needs xmit at mac header") added dev_is_mac_header_xmit(); since it's
also useful elsewhere, move it to if_arp.h and reuse it for BPF.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_arp.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include')

diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h
index f563907ed776..3355efc89781 100644
--- a/include/linux/if_arp.h
+++ b/include/linux/if_arp.h
@@ -44,4 +44,20 @@ static inline int arp_hdr_len(struct net_device *dev)
 		return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2;
 	}
 }
+
+static inline bool dev_is_mac_header_xmit(const struct net_device *dev)
+{
+	switch (dev->type) {
+	case ARPHRD_TUNNEL:
+	case ARPHRD_TUNNEL6:
+	case ARPHRD_SIT:
+	case ARPHRD_IPGRE:
+	case ARPHRD_VOID:
+	case ARPHRD_NONE:
+		return false;
+	default:
+		return true;
+	}
+}
+
 #endif	/* _LINUX_IF_ARP_H */
-- 
cgit v1.2.3


From 3a6a931dfb8e49a7377825b465d84e110fe89f68 Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Sun, 27 Nov 2016 17:02:04 +0200
Subject: net/mlx5e: Support DCBX CEE API

Add DCBX CEE API interface for ConnectX-4. Configurations are stored in
a temporary structure and are applied to the card's firmware when
the CEE's setall callback function is called.

Note:
  priority group in CEE is equivalent to traffic class in ConnectX-4
  hardware spec.

  bw allocation per priority in CEE is not supported because ConnectX-4
  only supports bw allocation per traffic class.

  user priority in CEE does not have an equivalent term in ConnectX-4.
  Therefore, user priority to priority mapping in CEE is not supported.

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/port.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index dde8c7ec5ff1..bdee439f8cf3 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -141,8 +141,12 @@ int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx,
 int mlx5_max_tc(struct mlx5_core_dev *mdev);
 
 int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc);
+int mlx5_query_port_prio_tc(struct mlx5_core_dev *mdev,
+			    u8 prio, u8 *tc);
 int mlx5_set_port_tc_group(struct mlx5_core_dev *mdev, u8 *tc_group);
 int mlx5_set_port_tc_bw_alloc(struct mlx5_core_dev *mdev, u8 *tc_bw);
+int mlx5_query_port_tc_bw_alloc(struct mlx5_core_dev *mdev,
+				u8 tc, u8 *bw_pct);
 int mlx5_modify_port_ets_rate_limit(struct mlx5_core_dev *mdev,
 				    u8 *max_bw_value,
 				    u8 *max_bw_unit);
-- 
cgit v1.2.3


From 341c5ee2fb78420ffc441df36f93226be8069b0a Mon Sep 17 00:00:00 2001
From: Huy Nguyen <huyn@mellanox.com>
Date: Sun, 27 Nov 2016 17:02:06 +0200
Subject: net/mlx5: Add DCBX firmware commands support

Add set/query commands for DCBX_PARAM register

Signed-off-by: Huy Nguyen <huyn@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 7 +++++++
 include/linux/mlx5/port.h   | 2 ++
 2 files changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index ae1f451e8f89..68b85efc3908 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -104,6 +104,8 @@ enum {
 enum {
 	MLX5_REG_QETCR		 = 0x4005,
 	MLX5_REG_QTCT		 = 0x400a,
+	MLX5_REG_DCBX_PARAM      = 0x4020,
+	MLX5_REG_DCBX_APP        = 0x4021,
 	MLX5_REG_PCAP		 = 0x5001,
 	MLX5_REG_PMTU		 = 0x5003,
 	MLX5_REG_PTYS		 = 0x5004,
@@ -124,6 +126,11 @@ enum {
 	MLX5_REG_MPCNT		 = 0x9051,
 };
 
+enum mlx5_dcbx_oper_mode {
+	MLX5E_DCBX_PARAM_VER_OPER_HOST  = 0x0,
+	MLX5E_DCBX_PARAM_VER_OPER_AUTO  = 0x3,
+};
+
 enum {
 	MLX5_ATOMIC_OPS_CMP_SWAP	= 1 << 0,
 	MLX5_ATOMIC_OPS_FETCH_ADD	= 1 << 1,
diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h
index bdee439f8cf3..e527732fb31b 100644
--- a/include/linux/mlx5/port.h
+++ b/include/linux/mlx5/port.h
@@ -162,4 +162,6 @@ void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
 int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
 			     u16 offset, u16 size, u8 *data);
 
+int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out);
+int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in);
 #endif /* __MLX5_PORT_H__ */
-- 
cgit v1.2.3


From d853d145ea3e63387a2ac759aa41d5e43876e561 Mon Sep 17 00:00:00 2001
From: jbrunet <jbrunet@baylibre.com>
Date: Mon, 28 Nov 2016 10:46:46 +0100
Subject: net: phy: add an option to disable EEE advertisement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds an option to disable EEE advertisement in the generic PHY
by providing a mask of prohibited modes corresponding to the value found in
the MDIO_AN_EEE_ADV register.

On some platforms, PHY Low power idle seems to be causing issues, even
breaking the link some cases. The patch provides a convenient way for these
platforms to disable EEE advertisement and work around the issue.

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Tested-by: Yegor Yefremov <yegorslists@googlemail.com>
Tested-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index edde28ce163a..b53177fd38af 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -417,6 +417,9 @@ struct phy_device {
 	u32 advertising;
 	u32 lp_advertising;
 
+	/* Energy efficient ethernet modes which should be prohibited */
+	u32 eee_broken_modes;
+
 	int autoneg;
 
 	int link_timeout;
-- 
cgit v1.2.3


From 1fc31357ad194fb98691f3d122bcd47e59239e83 Mon Sep 17 00:00:00 2001
From: jbrunet <jbrunet@baylibre.com>
Date: Mon, 28 Nov 2016 10:46:47 +0100
Subject: dt-bindings: net: add EEE capability constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Tested-by: Yegor Yefremov <yegorslists@googlemail.com>
Tested-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/dt-bindings/net/mdio.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 include/dt-bindings/net/mdio.h

(limited to 'include')

diff --git a/include/dt-bindings/net/mdio.h b/include/dt-bindings/net/mdio.h
new file mode 100644
index 000000000000..99c6d903d439
--- /dev/null
+++ b/include/dt-bindings/net/mdio.h
@@ -0,0 +1,19 @@
+/*
+ * This header provides generic constants for ethernet MDIO bindings
+ */
+
+#ifndef _DT_BINDINGS_NET_MDIO_H
+#define _DT_BINDINGS_NET_MDIO_H
+
+/*
+ * EEE capability Advertisement
+ */
+
+#define MDIO_EEE_100TX		0x0002	/* 100TX EEE cap */
+#define MDIO_EEE_1000T		0x0004	/* 1000T EEE cap */
+#define MDIO_EEE_10GT		0x0008	/* 10GT EEE cap */
+#define MDIO_EEE_1000KX		0x0010	/* 1000KX EEE cap */
+#define MDIO_EEE_10GKX4		0x0020	/* 10G KX4 EEE cap */
+#define MDIO_EEE_10GKR		0x0040	/* 10G KR EEE cap */
+
+#endif
-- 
cgit v1.2.3


From 05b055e89121394058c75dc354e9a46e1e765579 Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:13 -0800
Subject: tcp: instrument tcp sender limits chronographs

This patch implements the skeleton of the TCP chronograph
instrumentation on sender side limits:

	1) idle (unspec)
	2) busy sending data other than 3-4 below
	3) rwnd-limited
	4) sndbuf-limited

The limits are enumerated 'tcp_chrono'. Since a connection in
theory can idle forever, we do not track the actual length of this
uninteresting idle period. For the rest we track how long the sender
spends in each limit. At any point during the life time of a
connection, the sender must be in one of the four states.

If there are multiple conditions worthy of tracking in a chronograph
then the highest priority enum takes precedence over
the other conditions. So that if something "more interesting"
starts happening, stop the previous chrono and start a new one.

The time unit is jiffy(u32) in order to save space in tcp_sock.
This implies application must sample the stats no longer than every
49 days of 1ms jiffy.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h |  7 +++++--
 include/net/tcp.h   | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 32a7c7e35b71..d5d3bd814338 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -211,8 +211,11 @@ struct tcp_sock {
 		u8 reord;    /* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS			*/
-	u8	rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:7;
+	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
+	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
+	u8	chrono_type:2,	/* current chronograph type */
+		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+		unused:5;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		thin_dupack : 1,/* Fast retransmit on first dupack      */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de80739adab..e5ff4083870d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1516,6 +1516,20 @@ struct tcp_fastopen_context {
 	struct rcu_head		rcu;
 };
 
+/* Latencies incurred by various limits for a sender. They are
+ * chronograph-like stats that are mutually exclusive.
+ */
+enum tcp_chrono {
+	TCP_CHRONO_UNSPEC,
+	TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
+	TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
+	TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
+	__TCP_CHRONO_MAX,
+};
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
-- 
cgit v1.2.3


From 0f87230d1a6c253681550c6064715d06a32be73d Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:14 -0800
Subject: tcp: instrument how long TCP is busy sending

This patch measures TCP busy time, which is defined as the period
of time when sender has data (or FIN) to send. The time starts when
data is buffered and stops when the write queue is flushed by ACKs
or error events.

Note the busy time does not include SYN time, unless data is
included in SYN (i.e. Fast Open). It does include FIN time even
if the FIN carries no payload. Excluding pure FIN is possible but
would incur one additional test in the fast path, which may not
be worth it.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e5ff4083870d..3e097e39d4d2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1535,6 +1535,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
+	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
 	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
 		sk_wmem_free_skb(sk, skb);
 	sk_mem_reclaim(sk);
@@ -1593,8 +1594,10 @@ static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-	if (sk->sk_send_head == skb_unlinked)
+	if (sk->sk_send_head == skb_unlinked) {
 		sk->sk_send_head = NULL;
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	}
 	if (tcp_sk(sk)->highest_sack == skb_unlinked)
 		tcp_sk(sk)->highest_sack = NULL;
 }
@@ -1616,6 +1619,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
 	/* Queue it, remembering where we must start sending. */
 	if (sk->sk_send_head == NULL) {
 		sk->sk_send_head = skb;
+		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 		if (tcp_sk(sk)->highest_sack == NULL)
 			tcp_sk(sk)->highest_sack = skb;
-- 
cgit v1.2.3


From efd90174167530c67a54273fd5d8369c87f9bd32 Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:17 -0800
Subject: tcp: export sender limits chronographs to TCP_INFO

This patch exports all the sender chronograph measurements collected
in the previous patches to TCP_INFO interface. Note that busy time
exported includes all the other sending limits (rwnd-limited,
sndbuf-limited). Internally the time unit is jiffy but externally
the measurements are in microseconds for future extensions.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 73ac0db487f8..2863b661d6e1 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -214,6 +214,10 @@ struct tcp_info {
 	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
 
 	__u64   tcpi_delivery_rate;
+
+	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */
+	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
+	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
 };
 
 /* for TCP_MD5SIG socket option */
-- 
cgit v1.2.3


From 1c885808e45601b2b6f68b30ac1d999e10b6f606 Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:18 -0800
Subject: tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING

This patch exports the sender chronograph stats via the socket
SO_TIMESTAMPING channel. Currently we can instrument how long a
particular application unit of data was queued in TCP by tracking
SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_SCHED. Having
these sender chronograph stats exported simultaneously along with
these timestamps allow further breaking down the various sender
limitation.  For example, a video server can tell if a particular
chunk of video on a connection takes a long time to deliver because
TCP was experiencing small receive window. It is not possible to
tell before this patch without packet traces.

To prepare these stats, the user needs to set
SOF_TIMESTAMPING_OPT_STATS and SOF_TIMESTAMPING_OPT_TSONLY flags
while requesting other SOF_TIMESTAMPING TX timestamps. When the
timestamps are available in the error queue, the stats are returned
in a separate control message of type SCM_TIMESTAMPING_OPT_STATS,
in a list of TLVs (struct nlattr) of types: TCP_NLA_BUSY_TIME,
TCP_NLA_RWND_LIMITED, TCP_NLA_SNDBUF_LIMITED. Unit is microsecond.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h               | 2 ++
 include/uapi/asm-generic/socket.h | 2 ++
 include/uapi/linux/net_tstamp.h   | 3 ++-
 include/uapi/linux/tcp.h          | 8 ++++++++
 4 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d5d3bd814338..00e0ee8f001f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -428,4 +428,6 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
 	tp->saved_syn = NULL;
 }
 
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
+
 #endif	/* _LINUX_TCP_H */
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 67d632f1743d..2c748ddad5f8 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -92,4 +92,6 @@
 
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 264e515de16f..464dcca5ed68 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -25,8 +25,9 @@ enum {
 	SOF_TIMESTAMPING_TX_ACK = (1<<9),
 	SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
 	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
+	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 2863b661d6e1..c53de2691cec 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -220,6 +220,14 @@ struct tcp_info {
 	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
 };
 
+/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+enum {
+	TCP_NLA_PAD,
+	TCP_NLA_BUSY,		/* Time (usec) busy sending data */
+	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
+	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
+};
+
 /* for TCP_MD5SIG socket option */
 #define TCP_MD5SIG_MAXKEYLEN	80
 
-- 
cgit v1.2.3


From 820ee17b8d3b2a57b1ea20b247cc6a1dddaf8b8d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 29 Nov 2016 09:57:17 -0800
Subject: net: phy: broadcom: Add support code for reading PHY counters

Broadcom PHYs expose a number of PHY error counters: receive errors,
false carrier sense, SerDes BER count, local and remote receive errors.
Add support code to allow retrieving these error counters. Since the
Broadcom PHY library code is used by several drivers, make it possible
for them to specify the storage for the software copy of the statistics.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/brcmphy.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h
index f9f8aaf9c943..4f7d8be9ddbf 100644
--- a/include/linux/brcmphy.h
+++ b/include/linux/brcmphy.h
@@ -244,6 +244,9 @@
 #define LPI_FEATURE_EN_DIG1000X		0x4000
 
 /* Core register definitions*/
+#define MII_BRCM_CORE_BASE12	0x12
+#define MII_BRCM_CORE_BASE13	0x13
+#define MII_BRCM_CORE_BASE14	0x14
 #define MII_BRCM_CORE_BASE1E	0x1E
 #define MII_BRCM_CORE_EXPB0	0xB0
 #define MII_BRCM_CORE_EXPB1	0xB1
-- 
cgit v1.2.3


From 85de8576a0b14aecc99136cfbf90e367fa2142cb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 28 Nov 2016 23:16:54 +0100
Subject: bpf, xdp: allow to pass flags to dev_change_xdp_fd

Add an IFLA_XDP_FLAGS attribute that can be passed for setting up
XDP along with IFLA_XDP_FD, which eventually allows user space to
implement typical add/replace/delete logic for programs. Right now,
calling into dev_change_xdp_fd() will always replace previous programs.

When passed XDP_FLAGS_UPDATE_IF_NOEXIST, we can handle this more
graceful when requested by returning -EBUSY in case we try to
attach a new program, but we find that another one is already
attached. This will be used by upcoming front-end for iproute2 as
well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    | 2 +-
 include/uapi/linux/if_link.h | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd874cc20..3755317cc6a9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3253,7 +3253,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, int fd);
+int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 92b2d4928bf1..6b13e591abc9 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -876,10 +876,14 @@ enum {
 
 /* XDP section */
 
+#define XDP_FLAGS_UPDATE_IF_NOEXIST	(1U << 0)
+#define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST)
+
 enum {
 	IFLA_XDP_UNSPEC,
 	IFLA_XDP_FD,
 	IFLA_XDP_ATTACHED,
+	IFLA_XDP_FLAGS,
 	__IFLA_XDP_MAX,
 };
 
-- 
cgit v1.2.3


From b634d30a79ecc2d28e61cbe5b1f4443952f37a8f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 30 Nov 2016 10:16:08 -0800
Subject: cgroup, bpf: remove unnecessary #include

this #include is unnecessary and brings whole set of
other headers into cgroup-defs.h. Remove it.

Fixes: 3007098494be ("cgroup: add support for eBPF programs")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Rami Rosen <roszenrami@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index ec80d0c0953e..0cf1adfadd2d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -1,7 +1,6 @@
 #ifndef _BPF_CGROUP_H
 #define _BPF_CGROUP_H
 
-#include <linux/bpf.h>
 #include <linux/jump_label.h>
 #include <uapi/linux/bpf.h>
 
-- 
cgit v1.2.3


From 6d937acfb3f166f6e10abd978fafafa120d6f0d7 Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Tue, 29 Nov 2016 16:47:01 +0200
Subject: qed: Optimize qed_chain datapath usage

The chain structure and functions are widely used by the qed* modules,
both for configuration and datapath.
E.g., qede's Tx has one such chain and its Rx has two.

Currently, the strucutre's fields which are required for datapath
related functions [produce/consume] are intertwined with fields which
are required only for configuration purposes [init/destroy/etc.].

This patch re-arranges the chain structure so that all the fields which
are required for datapath usage could reside in a single cacheline instead
of the two which are required today.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_chain.h | 144 +++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 72d88cf3ca25..37dfba101c6c 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -56,23 +56,6 @@ struct qed_chain_pbl_u32 {
 	u32 cons_page_idx;
 };
 
-struct qed_chain_pbl {
-	/* Base address of a pre-allocated buffer for pbl */
-	dma_addr_t	p_phys_table;
-	void		*p_virt_table;
-
-	/* Table for keeping the virtual addresses of the chain pages,
-	 * respectively to the physical addresses in the pbl table.
-	 */
-	void **pp_virt_addr_tbl;
-
-	/* Index to current used page by producer/consumer */
-	union {
-		struct qed_chain_pbl_u16 pbl16;
-		struct qed_chain_pbl_u32 pbl32;
-	} u;
-};
-
 struct qed_chain_u16 {
 	/* Cyclic index of next element to produce/consme */
 	u16 prod_idx;
@@ -86,46 +69,78 @@ struct qed_chain_u32 {
 };
 
 struct qed_chain {
-	void			*p_virt_addr;
-	dma_addr_t		p_phys_addr;
-	void			*p_prod_elem;
-	void			*p_cons_elem;
+	/* fastpath portion of the chain - required for commands such
+	 * as produce / consume.
+	 */
+	/* Point to next element to produce/consume */
+	void *p_prod_elem;
+	void *p_cons_elem;
+
+	/* Fastpath portions of the PBL [if exists] */
+	struct {
+		/* Table for keeping the virtual addresses of the chain pages,
+		 * respectively to the physical addresses in the pbl table.
+		 */
+		void **pp_virt_addr_tbl;
 
-	enum qed_chain_mode	mode;
-	enum qed_chain_use_mode intended_use; /* used to produce/consume */
-	enum qed_chain_cnt_type cnt_type;
+		union {
+			struct qed_chain_pbl_u16 u16;
+			struct qed_chain_pbl_u32 u32;
+		} c;
+	} pbl;
 
 	union {
 		struct qed_chain_u16 chain16;
 		struct qed_chain_u32 chain32;
 	} u;
 
+	/* Capacity counts only usable elements */
+	u32 capacity;
 	u32 page_cnt;
 
-	/* Number of elements - capacity is for usable elements only,
-	 * while size will contain total number of elements [for entire chain].
+	enum qed_chain_mode mode;
+
+	/* Elements information for fast calculations */
+	u16 elem_per_page;
+	u16 elem_per_page_mask;
+	u16 elem_size;
+	u16 next_page_mask;
+	u16 usable_per_page;
+	u8 elem_unusable;
+
+	u8 cnt_type;
+
+	/* Slowpath of the chain - required for initialization and destruction,
+	 * but isn't involved in regular functionality.
 	 */
-	u32 capacity;
+
+	/* Base address of a pre-allocated buffer for pbl */
+	struct {
+		dma_addr_t p_phys_table;
+		void *p_virt_table;
+	} pbl_sp;
+
+	/* Address of first page of the chain - the address is required
+	 * for fastpath operation [consume/produce] but only for the the SINGLE
+	 * flavour which isn't considered fastpath [== SPQ].
+	 */
+	void *p_virt_addr;
+	dma_addr_t p_phys_addr;
+
+	/* Total number of elements [for entire chain] */
 	u32 size;
 
-	/* Elements information for fast calculations */
-	u16			elem_per_page;
-	u16			elem_per_page_mask;
-	u16			elem_unusable;
-	u16			usable_per_page;
-	u16			elem_size;
-	u16			next_page_mask;
-	struct qed_chain_pbl	pbl;
+	u8 intended_use;
 };
 
 #define QED_CHAIN_PBL_ENTRY_SIZE        (8)
 #define QED_CHAIN_PAGE_SIZE             (0x1000)
 #define ELEMS_PER_PAGE(elem_size)       (QED_CHAIN_PAGE_SIZE / (elem_size))
 
-#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)     \
-	((mode == QED_CHAIN_MODE_NEXT_PTR) ?	     \
-	 (1 + ((sizeof(struct qed_chain_next) - 1) / \
-	       (elem_size))) : 0)
+#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)	 \
+	(((mode) == QED_CHAIN_MODE_NEXT_PTR) ?		 \
+	 (u8)(1 + ((sizeof(struct qed_chain_next) - 1) / \
+		   (elem_size))) : 0)
 
 #define USABLE_ELEMS_PER_PAGE(elem_size, mode) \
 	((u32)(ELEMS_PER_PAGE(elem_size) -     \
@@ -186,7 +201,7 @@ static inline u16 qed_chain_get_usable_per_page(struct qed_chain *p_chain)
 	return p_chain->usable_per_page;
 }
 
-static inline u16 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
+static inline u8 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
 {
 	return p_chain->elem_unusable;
 }
@@ -198,7 +213,7 @@ static inline u32 qed_chain_get_page_cnt(struct qed_chain *p_chain)
 
 static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 {
-	return p_chain->pbl.p_phys_table;
+	return p_chain->pbl_sp.p_phys_table;
 }
 
 /**
@@ -214,10 +229,10 @@ static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 static inline void
 qed_chain_advance_page(struct qed_chain *p_chain,
 		       void **p_next_elem, void *idx_to_inc, void *page_to_inc)
-
 {
 	struct qed_chain_next *p_next = NULL;
 	u32 page_index = 0;
+
 	switch (p_chain->mode) {
 	case QED_CHAIN_MODE_NEXT_PTR:
 		p_next = *p_next_elem;
@@ -305,7 +320,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain16.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl16.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u16.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -314,7 +329,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain32.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl32.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u32.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -378,7 +393,7 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain16.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl16.cons_page_idx;
+			p_cons_page_idx = &p_chain->pbl.c.u16.cons_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
@@ -387,8 +402,8 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain32.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl32.cons_page_idx;
-		qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
+			p_cons_page_idx = &p_chain->pbl.c.u32.cons_page_idx;
+			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
 		p_chain->u.chain32.cons_idx++;
@@ -429,25 +444,26 @@ static inline void qed_chain_reset(struct qed_chain *p_chain)
 		u32 reset_val = p_chain->page_cnt - 1;
 
 		if (is_chain_u16(p_chain)) {
-			p_chain->pbl.u.pbl16.prod_page_idx = (u16)reset_val;
-			p_chain->pbl.u.pbl16.cons_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.prod_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.cons_page_idx = (u16)reset_val;
 		} else {
-			p_chain->pbl.u.pbl32.prod_page_idx = reset_val;
-			p_chain->pbl.u.pbl32.cons_page_idx = reset_val;
+			p_chain->pbl.c.u32.prod_page_idx = reset_val;
+			p_chain->pbl.c.u32.cons_page_idx = reset_val;
 		}
 	}
 
 	switch (p_chain->intended_use) {
-	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
-	case QED_CHAIN_USE_TO_PRODUCE:
-		/* Do nothing */
-		break;
-
 	case QED_CHAIN_USE_TO_CONSUME:
 		/* produce empty elements */
 		for (i = 0; i < p_chain->capacity; i++)
 			qed_chain_recycle_consumed(p_chain);
 		break;
+
+	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
+	case QED_CHAIN_USE_TO_PRODUCE:
+	default:
+		/* Do nothing */
+		break;
 	}
 }
 
@@ -473,13 +489,13 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->p_virt_addr = NULL;
 	p_chain->p_phys_addr = 0;
 	p_chain->elem_size	= elem_size;
-	p_chain->intended_use = intended_use;
+	p_chain->intended_use = (u8)intended_use;
 	p_chain->mode		= mode;
-	p_chain->cnt_type = cnt_type;
+	p_chain->cnt_type = (u8)cnt_type;
 
-	p_chain->elem_per_page		= ELEMS_PER_PAGE(elem_size);
+	p_chain->elem_per_page = ELEMS_PER_PAGE(elem_size);
 	p_chain->usable_per_page = USABLE_ELEMS_PER_PAGE(elem_size, mode);
-	p_chain->elem_per_page_mask	= p_chain->elem_per_page - 1;
+	p_chain->elem_per_page_mask = p_chain->elem_per_page - 1;
 	p_chain->elem_unusable = UNUSABLE_ELEMS_PER_PAGE(elem_size, mode);
 	p_chain->next_page_mask = (p_chain->usable_per_page &
 				   p_chain->elem_per_page_mask);
@@ -488,8 +504,8 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->capacity = p_chain->usable_per_page * page_cnt;
 	p_chain->size = p_chain->elem_per_page * page_cnt;
 
-	p_chain->pbl.p_phys_table = 0;
-	p_chain->pbl.p_virt_table = NULL;
+	p_chain->pbl_sp.p_phys_table = 0;
+	p_chain->pbl_sp.p_virt_table = NULL;
 	p_chain->pbl.pp_virt_addr_tbl = NULL;
 }
 
@@ -530,8 +546,8 @@ static inline void qed_chain_init_pbl_mem(struct qed_chain *p_chain,
 					  dma_addr_t p_phys_pbl,
 					  void **pp_virt_addr_tbl)
 {
-	p_chain->pbl.p_phys_table = p_phys_pbl;
-	p_chain->pbl.p_virt_table = p_virt_pbl;
+	p_chain->pbl_sp.p_phys_table = p_phys_pbl;
+	p_chain->pbl_sp.p_virt_table = p_virt_pbl;
 	p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl;
 }
 
-- 
cgit v1.2.3


From 3da7a37ae6886cfba9ef35428eb976fc2ef561fa Mon Sep 17 00:00:00 2001
From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
Date: Tue, 29 Nov 2016 16:47:06 +0200
Subject: qed*: Handle-based L2-queues.

The driver needs to maintain several FW/HW-indices for each one of
its queues. Currently, that mapping is done by the QED where it uses
an rx/tx array of so-called hw-cids, populating them whenever a new
queue is opened and clearing them upon destruction of said queues.

This maintenance is far from ideal - there's no real reason why
QED needs to maintain such a data-structure. It becomes even worse
when considering the fact that the PF's queues and its child VFs' queues
are all mapped into the same data-structure.
As a by-product, the set of parameters an interface needs to supply for
queue APIs is non-trivial, and some of the variables in the API
structures have different meaning depending on their exact place
in the configuration flow.

This patch re-organizes the way L2 queues are configured and maintained.
In short:
  - Required parameters for queue init are now well-defined.
  - Qed would allocate a queue-cid based on parameters.
    Upon initialization success, it would return a handle to caller.
  - Queue-handle would be maintained by entity requesting queue-init,
    not necessarily qed.
  - All further queue-APIs [update, destroy] would use the opaque
    handle as reference for the queue instead of various indices.

The possible owners of such handles:
  - PF queues [qede] - complete handles based on provided configuration.
  - VF queues [qede] - fw-context-less handles, containing only relative
    information; Only the PF-side would need the absolute indices
    for configuration, so they're omitted here.
  - VF queues [qed, PF-side] - complete handles based on VF initialization.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_eth_if.h | 56 ++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 9755a3feb52e..7a52f7c58c37 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -15,6 +15,29 @@
 #include <linux/qed/qed_if.h>
 #include <linux/qed/qed_iov_if.h>
 
+struct qed_queue_start_common_params {
+	/* Should always be relative to entity sending this. */
+	u8 vport_id;
+	u16 queue_id;
+
+	/* Relative, but relevant only for PFs */
+	u8 stats_id;
+
+	/* These are always absolute */
+	u16 sb;
+	u8 sb_idx;
+};
+
+struct qed_rxq_start_ret_params {
+	void __iomem *p_prod;
+	void *p_handle;
+};
+
+struct qed_txq_start_ret_params {
+	void __iomem *p_doorbell;
+	void *p_handle;
+};
+
 struct qed_dev_eth_info {
 	struct qed_dev_info common;
 
@@ -56,18 +79,6 @@ struct qed_start_vport_params {
 	bool clear_stats;
 };
 
-struct qed_stop_rxq_params {
-	u8 rss_id;
-	u8 rx_queue_id;
-	u8 vport_id;
-	bool eq_completion_only;
-};
-
-struct qed_stop_txq_params {
-	u8 rss_id;
-	u8 tx_queue_id;
-};
-
 enum qed_filter_rx_mode_type {
 	QED_FILTER_RX_MODE_TYPE_REGULAR,
 	QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC,
@@ -112,15 +123,6 @@ struct qed_filter_params {
 	union qed_filter_type_params filter;
 };
 
-struct qed_queue_start_common_params {
-	u8 rss_id;
-	u8 queue_id;
-	u8 vport_id;
-	u16 sb;
-	u16 sb_idx;
-	u16 vf_qid;
-};
-
 struct qed_tunn_params {
 	u16 vxlan_port;
 	u8 update_vxlan_port;
@@ -220,24 +222,24 @@ struct qed_eth_ops {
 			    struct qed_update_vport_params *params);
 
 	int (*q_rx_start)(struct qed_dev *cdev,
+			  u8 rss_num,
 			  struct qed_queue_start_common_params *params,
 			  u16 bd_max_bytes,
 			  dma_addr_t bd_chain_phys_addr,
 			  dma_addr_t cqe_pbl_addr,
 			  u16 cqe_pbl_size,
-			  void __iomem **pp_prod);
+			  struct qed_rxq_start_ret_params *ret_params);
 
-	int (*q_rx_stop)(struct qed_dev *cdev,
-			 struct qed_stop_rxq_params *params);
+	int (*q_rx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
 	int (*q_tx_start)(struct qed_dev *cdev,
+			  u8 rss_num,
 			  struct qed_queue_start_common_params *params,
 			  dma_addr_t pbl_addr,
 			  u16 pbl_size,
-			  void __iomem **pp_doorbell);
+			  struct qed_txq_start_ret_params *ret_params);
 
-	int (*q_tx_stop)(struct qed_dev *cdev,
-			 struct qed_stop_txq_params *params);
+	int (*q_tx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
 	int (*filter_config)(struct qed_dev *cdev,
 			     struct qed_filter_params *params);
-- 
cgit v1.2.3


From f4ed2fe34fb793755ef8cfc3509e783c4709ffc1 Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Date: Tue, 29 Nov 2016 15:16:46 +0530
Subject: net: phy: add mdix_ctrl to hold the user configuration.

Add new parameter mdix_ctrl to hold the user configuration.
Existing mdix maintain the current status of MDI(X) crossover performed or
not.
mdix_ctrl can configure either ETH_TP_MDI or ETH_TP_MDI_X orETH_TP_MDI_AUTO.

Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microsemi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index b53177fd38af..feb8a98e8dd3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -450,6 +450,7 @@ struct phy_device {
 	struct net_device *attached_dev;
 
 	u8 mdix;
+	u8 mdix_ctrl;
 
 	void (*adjust_link)(struct net_device *dev);
 };
-- 
cgit v1.2.3


From 1c1b522808a18402f043c1418b4e48c7355480cc Mon Sep 17 00:00:00 2001
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 30 Nov 2016 17:59:37 +0200
Subject: net/mlx5e: Implement Fragmented Work Queue (WQ)

Add new type of struct mlx5_frag_buf which is used to allocate fragmented
buffers rather than contiguous, and make the Completion Queues (CQs) use
it as they are big (default of 2MB per CQ in Striding RQ).

This fixes the failures of type:
"mlx5e_open_locked: mlx5e_open_channels failed, -12"
due to dma_zalloc_coherent insufficient contiguous coherent memory to
satisfy the driver's request when the user tries to setup more or larger
rings.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Reported-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 68b85efc3908..0ae55361e674 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -318,6 +318,13 @@ struct mlx5_buf {
 	u8			page_shift;
 };
 
+struct mlx5_frag_buf {
+	struct mlx5_buf_list	*frags;
+	int			npages;
+	int			size;
+	u8			page_shift;
+};
+
 struct mlx5_eq_tasklet {
 	struct list_head list;
 	struct list_head process_list;
@@ -822,6 +829,9 @@ int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
 void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
+int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+			     struct mlx5_frag_buf *buf, int node);
+void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf);
 struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
 						      gfp_t flags, int npages);
 void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
@@ -866,6 +876,7 @@ void mlx5_unregister_debugfs(void);
 int mlx5_eq_init(struct mlx5_core_dev *dev);
 void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
 void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
+void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
 void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
 void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-- 
cgit v1.2.3


From 3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 30 Nov 2016 17:10:10 +0100
Subject: bpf: BPF for lightweight tunnel infrastructure

Registers new BPF program types which correspond to the LWT hooks:
  - BPF_PROG_TYPE_LWT_IN   => dst_input()
  - BPF_PROG_TYPE_LWT_OUT  => dst_output()
  - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()

The separate program types are required to differentiate between the
capabilities each LWT hook allows:

 * Programs attached to dst_input() or dst_output() are restricted and
   may only read the data of an skb. This prevent modification and
   possible invalidation of already validated packet headers on receive
   and the construction of illegal headers while the IP headers are
   still being assembled.

 * Programs attached to lwtunnel_xmit() are allowed to modify packet
   content as well as prepending an L2 header via a newly introduced
   helper bpf_skb_change_head(). This is safe as lwtunnel_xmit() is
   invoked after the IP header has been assembled completely.

All BPF programs receive an skb with L3 headers attached and may return
one of the following error codes:

 BPF_OK - Continue routing as per nexthop
 BPF_DROP - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid in lwtunnel_xmit() context)

The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h        |  2 +-
 include/uapi/linux/bpf.h      | 32 +++++++++++++++++++++++++++++++-
 include/uapi/linux/lwtunnel.h | 23 +++++++++++++++++++++++
 3 files changed, 55 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7f246a281435..7ba644626553 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -438,7 +438,7 @@ struct xdp_buff {
 };
 
 /* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
  */
 static inline void bpf_compute_data_end(struct sk_buff *skb)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d1456f..22ac82792687 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,9 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
 };
 
 enum bpf_attach_type {
@@ -409,6 +412,16 @@ union bpf_attr {
  *
  * int bpf_get_numa_node_id()
  *     Return: Id of current NUMA node.
+ *
+ * int bpf_skb_change_head()
+ *     Grows headroom of skb and adjusts MAC header offset accordingly.
+ *     Will extends/reallocae as required automatically.
+ *     May change skb data pointer and will thus invalidate any check
+ *     performed for direct packet access.
+ *     @skb: pointer to skb
+ *     @len: length of header to be pushed in front
+ *     @flags: Flags (unused for now)
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -453,7 +466,8 @@ union bpf_attr {
 	FN(skb_pull_data),		\
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
-	FN(get_numa_node_id),
+	FN(get_numa_node_id),		\
+	FN(skb_change_head),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -537,6 +551,22 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled seprately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 453cc6215bfd..92724cba1eba 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
 	LWTUNNEL_ENCAP_SEG6,
+	LWTUNNEL_ENCAP_BPF,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWT_BPF_PROG_UNSPEC,
+	LWT_BPF_PROG_FD,
+	LWT_BPF_PROG_NAME,
+	__LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+	LWT_BPF_UNSPEC,
+	LWT_BPF_IN,
+	LWT_BPF_OUT,
+	LWT_BPF_XMIT,
+	LWT_BPF_XMIT_HEADROOM,
+	__LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+#define LWT_BPF_MAX_HEADROOM 256
+
 #endif /* _UAPI_LWTUNNEL_H_ */
-- 
cgit v1.2.3


From 366cbf2f46048d70005c6c33dc289330f24b54b0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 30 Nov 2016 22:16:06 +0100
Subject: bpf, xdp: drop rcu_read_lock from bpf_prog_run_xdp and move to caller

After 326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
the rcu_read_lock() in bpf_prog_run_xdp() is superfluous, since callers
need to hold rcu_read_lock() already to make sure BPF program doesn't
get released in the background.

Thus, drop it from bpf_prog_run_xdp(), as it can otherwise be misleading.
Still keeping the bpf_prog_run_xdp() is useful as it allows for grepping
in XDP supported drivers and to keep the typecheck on the context intact.
For mlx4, this means we don't have a double rcu_read_lock() anymore. nfp can
just make use of bpf_prog_run_xdp(), too. For qede, just move rcu_read_lock()
out of the helper. When the driver gets atomic replace support, this will
move to call-sites eventually.

mlx5 needs actual fixing as it has the same issue as described already in
326fe02d1ed6 ("net/mlx4_en: protect ring->xdp_prog with rcu_read_lock"),
that is, we're under RCU bh at this time, BPF programs are released via
call_rcu(), and call_rcu() != call_rcu_bh(), so we need to properly mark
read side as programs can get xchg()'ed in mlx5e_xdp_set() without queue
reset.

Fixes: 86994156c736 ("net/mlx5e: XDP fast RX drop bpf programs support")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7ba644626553..97338134398f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -498,16 +498,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	return BPF_PROG_RUN(prog, skb);
 }
 
-static inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
-				   struct xdp_buff *xdp)
+static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
+					    struct xdp_buff *xdp)
 {
-	u32 ret;
-
-	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, xdp);
-	rcu_read_unlock();
-
-	return ret;
+	/* Caller needs to hold rcu_read_lock() (!), otherwise program
+	 * can be released while still running, or map elements could be
+	 * freed early while still having concurrent users. XDP fastpath
+	 * already takes rcu_read_lock() when fetching the program, so
+	 * it's not necessary here anymore.
+	 */
+	return BPF_PROG_RUN(prog, xdp);
 }
 
 static inline unsigned int bpf_prog_size(unsigned int proglen)
-- 
cgit v1.2.3


From fc831825f99eb3a2f1bf3fe7307b392513b642a5 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuval.mintz@cavium.com>
Date: Thu, 1 Dec 2016 00:21:06 -0800
Subject: qed: Add support for hardware offloaded iSCSI.

This adds the backbone required for the various HW initalizations
which are necessary for the iSCSI driver (qedi) for QLogic FastLinQ
4xxxx line of adapters - FW notification, resource initializations, etc.

Signed-off-by: Arun Easi <arun.easi@cavium.com>
Signed-off-by: Yuval Mintz <yuval.mintz@cavium.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h       |   2 +
 include/linux/qed/qed_iscsi_if.h | 229 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+)
 create mode 100644 include/linux/qed/qed_iscsi_if.h

(limited to 'include')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index ea095b4893aa..4b454f4f5b25 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -166,6 +166,7 @@ struct qed_iscsi_pf_params {
 	u32 max_cwnd;
 	u16 cq_num_entries;
 	u16 cmdq_num_entries;
+	u32 two_msl_timer;
 	u16 dup_ack_threshold;
 	u16 tx_sws_timer;
 	u16 min_rto;
@@ -275,6 +276,7 @@ struct qed_dev_info {
 enum qed_sb_type {
 	QED_SB_TYPE_L2_QUEUE,
 	QED_SB_TYPE_CNQ,
+	QED_SB_TYPE_STORAGE,
 };
 
 enum qed_protocol {
diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h
new file mode 100644
index 000000000000..d27912480cb3
--- /dev/null
+++ b/include/linux/qed/qed_iscsi_if.h
@@ -0,0 +1,229 @@
+/* QLogic qed NIC Driver
+ * Copyright (c) 2015 QLogic Corporation
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#ifndef _QED_ISCSI_IF_H
+#define _QED_ISCSI_IF_H
+#include <linux/types.h>
+#include <linux/qed/qed_if.h>
+
+typedef int (*iscsi_event_cb_t) (void *context,
+				 u8 fw_event_code, void *fw_handle);
+struct qed_iscsi_stats {
+	u64 iscsi_rx_bytes_cnt;
+	u64 iscsi_rx_packet_cnt;
+	u64 iscsi_rx_new_ooo_isle_events_cnt;
+	u32 iscsi_cmdq_threshold_cnt;
+	u32 iscsi_rq_threshold_cnt;
+	u32 iscsi_immq_threshold_cnt;
+
+	u64 iscsi_rx_dropped_pdus_task_not_valid;
+
+	u64 iscsi_rx_data_pdu_cnt;
+	u64 iscsi_rx_r2t_pdu_cnt;
+	u64 iscsi_rx_total_pdu_cnt;
+
+	u64 iscsi_tx_go_to_slow_start_event_cnt;
+	u64 iscsi_tx_fast_retransmit_event_cnt;
+
+	u64 iscsi_tx_data_pdu_cnt;
+	u64 iscsi_tx_r2t_pdu_cnt;
+	u64 iscsi_tx_total_pdu_cnt;
+
+	u64 iscsi_tx_bytes_cnt;
+	u64 iscsi_tx_packet_cnt;
+};
+
+struct qed_dev_iscsi_info {
+	struct qed_dev_info common;
+
+	void __iomem *primary_dbq_rq_addr;
+	void __iomem *secondary_bdq_rq_addr;
+};
+
+struct qed_iscsi_id_params {
+	u8 mac[ETH_ALEN];
+	u32 ip[4];
+	u16 port;
+};
+
+struct qed_iscsi_params_offload {
+	u8 layer_code;
+	dma_addr_t sq_pbl_addr;
+	u32 initial_ack;
+
+	struct qed_iscsi_id_params src;
+	struct qed_iscsi_id_params dst;
+	u16 vlan_id;
+	u8 tcp_flags;
+	u8 ip_version;
+	u8 default_cq;
+
+	u8 ka_max_probe_cnt;
+	u8 dup_ack_theshold;
+	u32 rcv_next;
+	u32 snd_una;
+	u32 snd_next;
+	u32 snd_max;
+	u32 snd_wnd;
+	u32 rcv_wnd;
+	u32 snd_wl1;
+	u32 cwnd;
+	u32 ss_thresh;
+	u16 srtt;
+	u16 rtt_var;
+	u32 ts_time;
+	u32 ts_recent;
+	u32 ts_recent_age;
+	u32 total_rt;
+	u32 ka_timeout_delta;
+	u32 rt_timeout_delta;
+	u8 dup_ack_cnt;
+	u8 snd_wnd_probe_cnt;
+	u8 ka_probe_cnt;
+	u8 rt_cnt;
+	u32 flow_label;
+	u32 ka_timeout;
+	u32 ka_interval;
+	u32 max_rt_time;
+	u32 initial_rcv_wnd;
+	u8 ttl;
+	u8 tos_or_tc;
+	u16 remote_port;
+	u16 local_port;
+	u16 mss;
+	u8 snd_wnd_scale;
+	u8 rcv_wnd_scale;
+	u32 ts_ticks_per_second;
+	u16 da_timeout_value;
+	u8 ack_frequency;
+};
+
+struct qed_iscsi_params_update {
+	u8 update_flag;
+#define QED_ISCSI_CONN_HD_EN            BIT(0)
+#define QED_ISCSI_CONN_DD_EN            BIT(1)
+#define QED_ISCSI_CONN_INITIAL_R2T      BIT(2)
+#define QED_ISCSI_CONN_IMMEDIATE_DATA   BIT(3)
+
+	u32 max_seq_size;
+	u32 max_recv_pdu_length;
+	u32 max_send_pdu_length;
+	u32 first_seq_length;
+	u32 exp_stat_sn;
+};
+
+#define MAX_TID_BLOCKS_ISCSI (512)
+struct qed_iscsi_tid {
+	u32 size;		/* In bytes per task */
+	u32 num_tids_per_block;
+	u8 *blocks[MAX_TID_BLOCKS_ISCSI];
+};
+
+struct qed_iscsi_cb_ops {
+	struct qed_common_cb_ops common;
+};
+
+/**
+ * struct qed_iscsi_ops - qed iSCSI operations.
+ * @common:		common operations pointer
+ * @ll2:		light L2 operations pointer
+ * @fill_dev_info:	fills iSCSI specific information
+ *			@param cdev
+ *			@param info
+ *			@return 0 on sucesss, otherwise error value.
+ * @register_ops:	register iscsi operations
+ *			@param cdev
+ *			@param ops - specified using qed_iscsi_cb_ops
+ *			@param cookie - driver private
+ * @start:		iscsi in FW
+ *			@param cdev
+ *			@param tasks - qed will fill information about tasks
+ *			return 0 on success, otherwise error value.
+ * @stop:		iscsi in FW
+ *			@param cdev
+ *			return 0 on success, otherwise error value.
+ * @acquire_conn:	acquire a new iscsi connection
+ *			@param cdev
+ *			@param handle - qed will fill handle that should be
+ *				used henceforth as identifier of the
+ *				connection.
+ *			@param p_doorbell - qed will fill the address of the
+ *				doorbell.
+ *			@return 0 on sucesss, otherwise error value.
+ * @release_conn:	release a previously acquired iscsi connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @offload_conn:	configures an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			@return 0 on success, otherwise error value.
+ * @update_conn:	updates an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@param conn_info - the configuration to use for the
+ *				offload.
+ *			@return 0 on success, otherwise error value.
+ * @destroy_conn:	stops an offloaded connection
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @clear_sq:		clear all task in sq
+ *			@param cdev
+ *			@param handle - the connection handle.
+ *			@return 0 on success, otherwise error value.
+ * @get_stats:		iSCSI related statistics
+ *			@param cdev
+ *			@param stats - pointer to struck that would be filled
+ *				we stats
+ *			@return 0 on success, error otherwise.
+ */
+struct qed_iscsi_ops {
+	const struct qed_common_ops *common;
+
+	const struct qed_ll2_ops *ll2;
+
+	int (*fill_dev_info)(struct qed_dev *cdev,
+			     struct qed_dev_iscsi_info *info);
+
+	void (*register_ops)(struct qed_dev *cdev,
+			     struct qed_iscsi_cb_ops *ops, void *cookie);
+
+	int (*start)(struct qed_dev *cdev,
+		     struct qed_iscsi_tid *tasks,
+		     void *event_context, iscsi_event_cb_t async_event_cb);
+
+	int (*stop)(struct qed_dev *cdev);
+
+	int (*acquire_conn)(struct qed_dev *cdev,
+			    u32 *handle,
+			    u32 *fw_cid, void __iomem **p_doorbell);
+
+	int (*release_conn)(struct qed_dev *cdev, u32 handle);
+
+	int (*offload_conn)(struct qed_dev *cdev,
+			    u32 handle,
+			    struct qed_iscsi_params_offload *conn_info);
+
+	int (*update_conn)(struct qed_dev *cdev,
+			   u32 handle,
+			   struct qed_iscsi_params_update *conn_info);
+
+	int (*destroy_conn)(struct qed_dev *cdev, u32 handle, u8 abrt_conn);
+
+	int (*clear_sq)(struct qed_dev *cdev, u32 handle);
+
+	int (*get_stats)(struct qed_dev *cdev,
+			 struct qed_iscsi_stats *stats);
+};
+
+const struct qed_iscsi_ops *qed_get_iscsi_ops(void);
+void qed_put_iscsi_ops(void);
+#endif
-- 
cgit v1.2.3


From 95a22caee396cef0bb2ca8fafdd82966a49367bb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 1 Dec 2016 11:32:06 +0100
Subject: tcp: randomize tcp timestamp offsets for each connection

jiffies based timestamps allow for easy inference of number of devices
behind NAT translators and also makes tracking of hosts simpler.

commit ceaa1fef65a7c2e ("tcp: adding a per-socket timestamp offset")
added the main infrastructure that is needed for per-connection ts
randomization, in particular writing/reading the on-wire tcp header
format takes the offset into account so rest of stack can use normal
tcp_time_stamp (jiffies).

So only two items are left:
 - add a tsoffset for request sockets
 - extend the tcp isn generator to also return another 32bit number
   in addition to the ISN.

Re-use of ISN generator also means timestamps are still monotonically
increasing for same connection quadruple, i.e. PAWS will still work.

Includes fixes from Eric Dumazet.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      | 1 +
 include/net/secure_seq.h | 8 ++++----
 include/net/tcp.h        | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 00e0ee8f001f..734bab4c3bef 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -123,6 +123,7 @@ struct tcp_request_sock {
 	u32				txhash;
 	u32				rcv_isn;
 	u32				snt_isn;
+	u32				ts_off;
 	u32				last_oow_ack_time; /* last SYNACK */
 	u32				rcv_nxt; /* the ack # by SYNACK. For
 						  * FastOpen it's the seq#
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 3f36d45b714a..0caee631a836 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -6,10 +6,10 @@
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
-				 __be16 sport, __be16 dport);
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
-				   __be16 sport, __be16 dport);
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+			       __be16 sport, __be16 dport, u32 *tsoff);
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+				 __be16 sport, __be16 dport, u32 *tsoff);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3e097e39d4d2..207147b4c6b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1827,7 +1827,7 @@ struct tcp_request_sock_ops {
 	struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
 				       const struct request_sock *req,
 				       bool *strict);
-	__u32 (*init_seq)(const struct sk_buff *skb);
+	__u32 (*init_seq)(const struct sk_buff *skb, u32 *tsoff);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
-- 
cgit v1.2.3


From 55330f05969437c5d22fcc2ae2e54810b5236b7b Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:33 +0200
Subject: net/sched: Add separate check for skip_hw flag

Creating a difference between two possible cases:
1. Not offloading tc rule since the user sets 'skip_hw' flag.
2. Not offloading tc rule since the device doesn't support offloading.

This patch doesn't add any new functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 767b03a3fe67..45ad9aab9bba 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -425,16 +425,14 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(const struct net_device *dev,
-				     const struct tcf_proto *tp, u32 flags)
+static inline bool tc_can_offload(const struct net_device *dev,
+				  const struct tcf_proto *tp)
 {
 	const struct Qdisc *sch = tp->q;
 	const struct Qdisc_class_ops *cops = sch->ops->cl_ops;
 
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
-	if (flags & TCA_CLS_FLAGS_SKIP_HW)
-		return false;
 	if (!dev->netdev_ops->ndo_setup_tc)
 		return false;
 	if (cops && cops->tcf_cl_offload)
@@ -443,6 +441,19 @@ static inline bool tc_should_offload(const struct net_device *dev,
 	return true;
 }
 
+static inline bool tc_skip_hw(u32 flags)
+{
+	return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false;
+}
+
+static inline bool tc_should_offload(const struct net_device *dev,
+				     const struct tcf_proto *tp, u32 flags)
+{
+	if (tc_skip_hw(flags))
+		return false;
+	return tc_can_offload(dev, tp);
+}
+
 static inline bool tc_skip_sw(u32 flags)
 {
 	return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false;
-- 
cgit v1.2.3


From 255cb30425c0ced57d6d85f3e7cddb99b9576046 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:36 +0200
Subject: net/sched: act_mirred: Add new tc_action_ops get_dev()

Adding support to a new tc_action_ops.
get_dev is a general option which allows to get the underline
device when trying to offload a tc rule.

In case of mirred action the returned device is the mirred (egress)
device.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index d8eae87ea778..9dddf77a69cc 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -119,6 +119,8 @@ struct tc_action_ops {
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int, const struct tc_action_ops *);
 	void	(*stats_update)(struct tc_action *, u64, u32, u64);
+	int	(*get_dev)(const struct tc_action *a, struct net *net,
+			   struct net_device **mirred_dev);
 };
 
 struct tc_action_net {
-- 
cgit v1.2.3


From 7091d8c7055d7310339435ae3af2fb490a92524d Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:37 +0200
Subject: net/sched: cls_flower: Add offload support using egress Hardware
 device

In order to support hardware offloading when the device given by the tc
rule is different from the Hardware underline device, extract the mirred
(egress) device from the tc action when a filter is added, using the new
tc_action_ops, get_dev().

Flower caches the information about the mirred device and use it for
calling ndo_setup_tc in filter change, update stats and delete.

Calling ndo_setup_tc of the mirred (egress) device instead of the
ingress device will allow a resolution between the software ingress
device and the underline hardware device.

The resolution will take place inside the offloading driver using
'egress_device' flag added to tc_to_netdev struct which is provided to
the offloading driver.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 +
 include/net/pkt_cls.h     | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3755317cc6a9..1ff5ea6e1221 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -802,6 +802,7 @@ struct tc_to_netdev {
 		struct tc_cls_matchall_offload *cls_mall;
 		struct tc_cls_bpf_offload *cls_bpf;
 	};
+	bool egress_dev;
 };
 
 /* These structures hold the attributes of xdp state that are being passed
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 45ad9aab9bba..f0a051480c6c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -171,6 +171,8 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 		     struct tcf_exts *src);
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
 int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+		     struct net_device **hw_dev);
 
 /**
  * struct tcf_pkt_info - packet information
-- 
cgit v1.2.3


From b2cd12574aa3e1625f471ff57cde7f628a18a46b Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 1 Dec 2016 08:48:03 -0800
Subject: bpf: Refactor cgroups code in prep for new type

Code move and rename only; no functional change intended.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 0cf1adfadd2d..af2ca8b432c0 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -36,31 +36,31 @@ void cgroup_bpf_update(struct cgroup *cgrp,
 		       struct bpf_prog *prog,
 		       enum bpf_attach_type type);
 
-int __cgroup_bpf_run_filter(struct sock *sk,
-			    struct sk_buff *skb,
-			    enum bpf_attach_type type);
-
-/* Wrappers for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled. */
-#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb)			\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled)						\
-		__ret = __cgroup_bpf_run_filter(sk, skb,		\
-						BPF_CGROUP_INET_INGRESS); \
-									\
-	__ret;								\
+int __cgroup_bpf_run_filter_skb(struct sock *sk,
+				struct sk_buff *skb,
+				enum bpf_attach_type type);
+
+/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
+#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_run_filter_skb(sk, skb,		      \
+						    BPF_CGROUP_INET_INGRESS); \
+									      \
+	__ret;								      \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb)				\
-({									\
-	int __ret = 0;							\
-	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		\
-		typeof(sk) __sk = sk_to_full_sk(sk);			\
-		if (sk_fullsock(__sk))					\
-			__ret = __cgroup_bpf_run_filter(__sk, skb,	\
-						BPF_CGROUP_INET_EGRESS); \
-	}								\
-	__ret;								\
+#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk && sk == skb->sk) {		       \
+		typeof(sk) __sk = sk_to_full_sk(sk);			       \
+		if (sk_fullsock(__sk))					       \
+			__ret = __cgroup_bpf_run_filter_skb(__sk, skb,	       \
+						      BPF_CGROUP_INET_EGRESS); \
+	}								       \
+	__ret;								       \
 })
 
 #else
-- 
cgit v1.2.3


From 61023658760032e97869b07d54be9681d2529e77 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 1 Dec 2016 08:48:04 -0800
Subject: bpf: Add new cgroup attach type to enable sock modifications

Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h | 14 ++++++++++++++
 include/uapi/linux/bpf.h   |  6 ++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index af2ca8b432c0..7b6e5d168c95 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -40,6 +40,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+			       enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -63,6 +66,16 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled && sk) {					       \
+		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
+						 BPF_CGROUP_INET_SOCK_CREATE); \
+	}								       \
+	__ret;								       \
+})
+
 #else
 
 struct cgroup_bpf {};
@@ -72,6 +85,7 @@ static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 22ac82792687..bfe5e31a1288 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_CGROUP_SOCK,
 	BPF_PROG_TYPE_LWT_IN,
 	BPF_PROG_TYPE_LWT_OUT,
 	BPF_PROG_TYPE_LWT_XMIT,
@@ -109,6 +110,7 @@ enum bpf_prog_type {
 enum bpf_attach_type {
 	BPF_CGROUP_INET_INGRESS,
 	BPF_CGROUP_INET_EGRESS,
+	BPF_CGROUP_INET_SOCK_CREATE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -567,6 +569,10 @@ enum bpf_ret_code {
 	/* >127 are reserved for prog type specific return codes */
 };
 
+struct bpf_sock {
+	__u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
-- 
cgit v1.2.3


From aa4c1037a30f4e88f444e83d42c2befbe0d5caf5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 1 Dec 2016 08:48:06 -0800
Subject: bpf: Add support for reading socket family, type, protocol

Add socket family, type and protocol to bpf_sock allowing bpf programs
read-only access.

Add __sk_flags_offset[0] to struct sock before the bitfield to
programmtically determine the offset of the unsigned int containing
protocol and type.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h       | 15 +++++++++++++++
 include/uapi/linux/bpf.h |  3 +++
 2 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 442cbb118a07..69afda6bea15 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -389,6 +389,21 @@ struct sock {
 	 * Because of non atomicity rules, all
 	 * changes are protected by socket lock.
 	 */
+	unsigned int		__sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT  16
+#define SK_FL_PROTO_MASK   0x00ff0000
+
+#define SK_FL_TYPE_SHIFT   0
+#define SK_FL_TYPE_MASK    0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT  8
+#define SK_FL_PROTO_MASK   0x0000ff00
+
+#define SK_FL_TYPE_SHIFT   16
+#define SK_FL_TYPE_MASK    0xffff0000
+#endif
+
 	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 2,
 				sk_no_check_tx : 1,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bfe5e31a1288..6123d9b8e828 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -571,6 +571,9 @@ enum bpf_ret_code {
 
 struct bpf_sock {
 	__u32 bound_dev_if;
+	__u32 family;
+	__u32 type;
+	__u32 protocol;
 };
 
 /* User return codes for XDP prog type.
-- 
cgit v1.2.3


From 4f7df337fe79bba1e4c2d525525d63b5ba186bbd Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 2 Dec 2016 03:59:06 +0300
Subject: netlink: 2-clause nla_ok()

nla_ok() consists of 3 clauses:

	1) int rem >= (int)sizeof(struct nlattr)

	2) u16 nla_len >= sizeof(struct nlattr)

	3) u16 nla_len <= int rem

The statement is that clause (1) is redundant.

What it does is ensuring that "rem" is a positive number,
so that in clause (3) positive number will be compared to positive number
with no problems.

However, "u16" fully fits into "int" and integers do not change value
when upcasting even to signed type. Negative integers will be rejected
by clause (3) just fine. Small positive integers will be rejected
by transitivity of comparison operator.

NOTE: all of the above DOES NOT apply to nlmsg_ok() where ->nlmsg_len is
u32(!), so 3 clauses AND A CAST TO INT are necessary.

Obligatory space savings report: -1.6 KB

	$ ./scripts/bloat-o-meter ../vmlinux-000* ../vmlinux-001*
	add/remove: 0/0 grow/shrink: 3/63 up/down: 35/-1692 (-1657)
	function                                     old     new   delta
	validate_scan_freqs                          142     155     +13
	tcf_em_tree_validate                         867     879     +12
	dcbnl_ieee_del                               328     338     +10
	netlbl_cipsov4_add_common.isra               218     215      -3
		...
	ovs_nla_put_actions                          888     806     -82
	netlbl_cipsov4_add_std                      1648    1566     -82
	nl80211_parse_sched_scan                    2889    2780    -109
	ip_tun_from_nlattr                          3086    2945    -141

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index d3938f11ae52..dd657a33f8c3 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -698,8 +698,7 @@ static inline int nla_len(const struct nlattr *nla)
  */
 static inline int nla_ok(const struct nlattr *nla, int remaining)
 {
-	return remaining >= (int) sizeof(*nla) &&
-	       nla->nla_len >= sizeof(*nla) &&
+	return nla->nla_len >= sizeof(*nla) &&
 	       nla->nla_len <= remaining;
 }
 
-- 
cgit v1.2.3


From 9bfc7b9969dbb800460e2577f1dea59336269ce4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 2 Dec 2016 04:12:58 +0300
Subject: netns: add dummy struct inside "struct net_generic"

This is precursor to fixing "[id - 1]" bloat inside net_generic().

Name "s" is chosen to complement name "u" often used for dummy unions.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/generic.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index d315786bcfd7..65ccce69b040 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -25,8 +25,10 @@
  */
 
 struct net_generic {
-	unsigned int len;
-	struct rcu_head rcu;
+	struct {
+		unsigned int len;
+		struct rcu_head rcu;
+	} s;
 
 	void *ptr[0];
 };
-- 
cgit v1.2.3


From 6af2d5fff2fdcd481cb9a4f354a0880142b17c60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 2 Dec 2016 04:21:32 +0300
Subject: netns: fix net_generic() "id - 1" bloat

net_generic() function is both a) inline and b) used ~600 times.

It has the following code inside

		...
	ptr = ng->ptr[id - 1];
		...

"id" is never compile time constant so compiler is forced to subtract 1.
And those decrements or LEA [r32 - 1] instructions add up.

We also start id'ing from 1 to catch bugs where pernet sybsystem id
is not initialized and 0. This is quite pointless idea (nothing will
work or immediate interference with first registered subsystem) in
general but it hints what needs to be done for code size reduction.

Namely, overlaying allocation of pointer array and fixed part of
structure in the beginning and using usual base-0 addressing.

Ids are just cookies, their exact values do not matter, so lets start
with 3 on x86_64.

Code size savings (oh boy): -4.2 KB

As usual, ignore the initial compiler stupidity part of the table.

	add/remove: 0/0 grow/shrink: 12/670 up/down: 89/-4297 (-4208)
	function                                     old     new   delta
	tipc_nametbl_insert_publ                    1250    1270     +20
	nlmclnt_lookup_host                          686     703     +17
	nfsd4_encode_fattr                          5930    5941     +11
	nfs_get_client                              1050    1061     +11
	register_pernet_operations                   333     342      +9
	tcf_mirred_init                              843     849      +6
	tcf_bpf_init                                1143    1149      +6
	gss_setup_upcall                             990     994      +4
	idmap_name_to_id                             432     434      +2
	ops_init                                     274     275      +1
	nfsd_inject_forget_client                    259     260      +1
	nfs4_alloc_client                            612     613      +1
	tunnel_key_walker                            164     163      -1

		...

	tipc_bcbase_select_primary                   392     360     -32
	mac80211_hwsim_new_radio                    2808    2767     -41
	ipip6_tunnel_ioctl                          2228    2186     -42
	tipc_bcast_rcv                               715     672     -43
	tipc_link_build_proto_msg                   1140    1089     -51
	nfsd4_lock                                  3851    3796     -55
	tipc_mon_rcv                                1012     956     -56
	Total: Before=156643951, After=156639743, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/generic.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 65ccce69b040..f15daaa89385 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -25,12 +25,14 @@
  */
 
 struct net_generic {
-	struct {
-		unsigned int len;
-		struct rcu_head rcu;
-	} s;
-
-	void *ptr[0];
+	union {
+		struct {
+			unsigned int len;
+			struct rcu_head rcu;
+		} s;
+
+		void *ptr[0];
+	};
 };
 
 static inline void *net_generic(const struct net *net, unsigned int id)
@@ -40,7 +42,7 @@ static inline void *net_generic(const struct net *net, unsigned int id)
 
 	rcu_read_lock();
 	ng = rcu_dereference(net->gen);
-	ptr = ng->ptr[id - 1];
+	ptr = ng->ptr[id];
 	rcu_read_unlock();
 
 	return ptr;
-- 
cgit v1.2.3


From 1c677b3d2828a84e0360e1a07d1204531540dc03 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sat, 3 Dec 2016 16:44:59 +0100
Subject: ipv4: fib: Add fib_info_hold() helper

As explained in the previous commit, modules are going to need to take a
reference on fib info and then drop it using fib_info_put().

Add the fib_info_hold() helper to make the code more readable and also
symmetric with fib_info_put().

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f390c3bb05c5..6c67b9391fc0 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -397,6 +397,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 
 void free_fib_info(struct fib_info *fi);
 
+static inline void fib_info_hold(struct fib_info *fi)
+{
+	atomic_inc(&fi->fib_clntref);
+}
+
 static inline void fib_info_put(struct fib_info *fi)
 {
 	if (atomic_dec_and_test(&fi->fib_clntref))
-- 
cgit v1.2.3


From cacaad11f43aefbbe5fca00af3b9c16e6aee1ba4 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sat, 3 Dec 2016 16:45:06 +0100
Subject: ipv4: fib: Allow for consistent FIB dumping

The next patch will enable listeners of the FIB notification chain to
request a dump of the FIB tables. However, since RTNL isn't taken during
the dump, it's possible for the FIB tables to change mid-dump, which
will result in inconsistency between the listener's table and the
kernel's.

Allow listeners to know about changes that occurred mid-dump, by adding
a change sequence counter to each net namespace. The counter is
incremented just before a notification is sent in the FIB chain.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7adf4386ac8f..f0cf5a1b777e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,9 @@ struct netns_ipv4 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
 #endif
+
+	unsigned int	fib_seq;	/* protected by rtnl_mutex */
+
 	atomic_t	rt_genid;
 };
 #endif
-- 
cgit v1.2.3


From c3852ef7f2f8f75a9f85a864bec1f6f5a3068eea Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sat, 3 Dec 2016 16:45:07 +0100
Subject: ipv4: fib: Replay events when registering FIB notifier

Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
introduced a new notification chain to notify listeners (f.e., switchdev
drivers) about addition and deletion of routes.

However, upon registration to the chain the FIB tables can already be
populated, which means potential listeners will have an incomplete view
of the tables.

Solve that by dumping the FIB tables and replaying the events to the
passed notification block. The dump itself is done using RCU in order
not to starve consumers that need RTNL to make progress.

The integrity of the dump is ensured by reading the FIB change sequence
counter before and after the dump under RTNL. This allows us to avoid
the problematic situation in which the dumping process sends a ENTRY_ADD
notification following ENTRY_DEL generated by another process holding
RTNL.

Callers of the registration function may pass a callback that is
executed in case the dump was inconsistent with current FIB tables.

The number of retries until a consistent dump is achieved is set to a
fixed number to prevent callers from looping for long periods of time.
In case current limit proves to be problematic in the future, it can be
easily converted to be configurable using a sysctl.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 6c67b9391fc0..5f376af377c7 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -221,7 +221,8 @@ enum fib_event_type {
 	FIB_EVENT_RULE_DEL,
 };
 
-int register_fib_notifier(struct notifier_block *nb);
+int register_fib_notifier(struct notifier_block *nb,
+			  void (*cb)(struct notifier_block *nb));
 int unregister_fib_notifier(struct notifier_block *nb);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
-- 
cgit v1.2.3


From adc176c5472214971d77c1a61c83db9b01e9cdc7 Mon Sep 17 00:00:00 2001
From: Erik Nordmark <nordmark@arista.com>
Date: Fri, 2 Dec 2016 14:00:08 -0800
Subject: ipv6 addrconf: Implemented enhanced DAD (RFC7527)

Implemented RFC7527 Enhanced DAD.
IPv6 duplicate address detection can fail if there is some temporary
loopback of Ethernet frames. RFC7527 solves this by including a random
nonce in the NS messages used for DAD, and if an NS is received with the
same nonce it is assumed to be a looped back DAD probe and is ignored.
RFC7527 is enabled by default. Can be disabled by setting both of
conf/{all,interface}/enhanced_dad to zero.

Signed-off-by: Erik Nordmark <nordmark@arista.com>
Signed-off-by: Bob Gilligan <gilligan@arista.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h      | 1 +
 include/net/if_inet6.h    | 1 +
 include/net/ndisc.h       | 5 ++++-
 include/uapi/linux/ipv6.h | 1 +
 4 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 3f95233b2733..671d014e6429 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -68,6 +68,7 @@ struct ipv6_devconf {
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	__s32		seg6_require_hmac;
 #endif
+	__u32		enhanced_dad;
 
 	struct ctl_table_header *sysctl_header;
 };
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index b0576cb2ab25..0fa4c324b713 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -55,6 +55,7 @@ struct inet6_ifaddr {
 	__u8			stable_privacy_retry;
 
 	__u16			scope;
+	__u64			dad_nonce;
 
 	unsigned long		cstamp;	/* created timestamp */
 	unsigned long		tstamp; /* updated timestamp */
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index be1fe2283254..d562a2fe4860 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -31,6 +31,7 @@ enum {
 	ND_OPT_PREFIX_INFO = 3,		/* RFC2461 */
 	ND_OPT_REDIRECT_HDR = 4,	/* RFC2461 */
 	ND_OPT_MTU = 5,			/* RFC2461 */
+	ND_OPT_NONCE = 14,              /* RFC7527 */
 	__ND_OPT_ARRAY_MAX,
 	ND_OPT_ROUTE_INFO = 24,		/* RFC4191 */
 	ND_OPT_RDNSS = 25,		/* RFC5006 */
@@ -121,6 +122,7 @@ struct ndisc_options {
 #define nd_opts_pi_end			nd_opt_array[__ND_OPT_PREFIX_INFO_END]
 #define nd_opts_rh			nd_opt_array[ND_OPT_REDIRECT_HDR]
 #define nd_opts_mtu			nd_opt_array[ND_OPT_MTU]
+#define nd_opts_nonce			nd_opt_array[ND_OPT_NONCE]
 #define nd_802154_opts_src_lladdr	nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
 #define nd_802154_opts_tgt_lladdr	nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]
 
@@ -398,7 +400,8 @@ void ndisc_cleanup(void);
 int ndisc_rcv(struct sk_buff *skb);
 
 void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr);
+		   const struct in6_addr *daddr, const struct in6_addr *saddr,
+		   u64 nonce);
 
 void ndisc_send_rs(struct net_device *dev,
 		   const struct in6_addr *saddr, const struct in6_addr *daddr);
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 53561be1ac21..eaf65dc82e22 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -181,6 +181,7 @@ enum {
 	DEVCONF_RTR_SOLICIT_MAX_INTERVAL,
 	DEVCONF_SEG6_ENABLED,
 	DEVCONF_SEG6_REQUIRE_HMAC,
+	DEVCONF_ENHANCED_DAD,
 	DEVCONF_MAX
 };
 
-- 
cgit v1.2.3


From 0c4e966eafff8253bec545d8c27b9efa231c1f62 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 20 Oct 2016 18:33:01 +0200
Subject: netfilter: built-in NAT support for DCCP

CONFIG_NF_NAT_PROTO_DCCP is no more a tristate. When set to y, NAT
support for DCCP protocol is built-in into nf_nat.ko.

footprint test:

(nf_nat_proto_)           | dccp   || nf_nat
--------------------------+--------++--------
no builtin                | 409800 || 2241312
DCCP builtin              |   -    || 2578968

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l4proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 12f4cc841b6e..92b147be00ef 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -54,6 +54,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_udp;
 extern const struct nf_nat_l4proto nf_nat_l4proto_icmp;
 extern const struct nf_nat_l4proto nf_nat_l4proto_icmpv6;
 extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 			     enum nf_nat_manip_type maniptype,
-- 
cgit v1.2.3


From 7a2dd28c703408ef27d6fe6a4fcd7c58968ce3bf Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 20 Oct 2016 18:33:02 +0200
Subject: netfilter: built-in NAT support for SCTP

CONFIG_NF_NAT_PROTO_SCTP is no more a tristate. When set to y, NAT
support for SCTP protocol is built-in into nf_nat.ko.

footprint test:

(nf_nat_proto_)           | sctp   || nf_nat
--------------------------+--------++--------
no builtin                | 428344 || 2241312
SCTP builtin              |   -    || 2597032

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l4proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 92b147be00ef..2cbaf3856e21 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -57,6 +57,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
 #ifdef CONFIG_NF_NAT_PROTO_DCCP
 extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+extern const struct nf_nat_l4proto nf_nat_l4proto_sctp;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 			     enum nf_nat_manip_type maniptype,
-- 
cgit v1.2.3


From b8ad652f9779976d0300ae199961e413859d5378 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 20 Oct 2016 18:33:03 +0200
Subject: netfilter: built-in NAT support for UDPlite

CONFIG_NF_NAT_PROTO_UDPLITE is no more a tristate. When set to y, NAT
support for UDPlite protocol is built-in into nf_nat.ko.

footprint test:

(nf_nat_proto_)           |udplite || nf_nat
--------------------------+--------++--------
no builtin                | 408048 || 2241312
UDPLITE builtin           |   -    || 2577256

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l4proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 2cbaf3856e21..3923150f2a1e 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -60,6 +60,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
 #ifdef CONFIG_NF_NAT_PROTO_SCTP
 extern const struct nf_nat_l4proto nf_nat_l4proto_sctp;
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+extern const struct nf_nat_l4proto nf_nat_l4proto_udplite;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 			     enum nf_nat_manip_type maniptype,
-- 
cgit v1.2.3


From 673ab46f345557e9d741e97ca0301280360d1af1 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Mon, 14 Nov 2016 22:39:25 +0800
Subject: netfilter: nf_log: do not assume ethernet header in netdev family

In netdev family, we will handle non ethernet packets, so using
eth_hdr(skb)->h_proto is incorrect.

Meanwhile, we can use socket(AF_PACKET...) to sending packets, so
skb->protocol is not always set in bridge family.

Add an extra parameter into nf_log_l2packet to solve this issue.

Fixes: 1fddf4bad0ac ("netfilter: nf_log: add packet logging for netdev family")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index a559aa41253c..450f87f95415 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -109,7 +109,9 @@ void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
 			       const struct net_device *out,
 			       const struct nf_loginfo *loginfo,
 			       const char *prefix);
-void nf_log_l2packet(struct net *net, u_int8_t pf, unsigned int hooknum,
+void nf_log_l2packet(struct net *net, u_int8_t pf,
+		     __be16 protocol,
+		     unsigned int hooknum,
 		     const struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
-- 
cgit v1.2.3


From 3fefeb88d002850e591339fed291eb6a795d9f21 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:24 +0100
Subject: netfilter: nf_conntrack_tuple_common.h: fix #include

To allow usage of enum ip_conntrack_dir in include/net/netns/conntrack.h,
this patch encloses #include <linux/netfilter.h> in a #ifndef __KERNEL__
directive, so that compiler errors caused by unwanted inclusion of
include/linux/netfilter.h are avoided.
In addition, #include <linux/netfilter/nf_conntrack_common.h> line has
been added to resolve correctly CTINFO2DIR macro.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Mikko Rapeli <mikko.rapeli@iki.fi>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_tuple_common.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index a9c3834abdd4..526b42496b78 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -2,7 +2,10 @@
 #define _NF_CONNTRACK_TUPLE_COMMON_H
 
 #include <linux/types.h>
+#ifndef __KERNEL__
 #include <linux/netfilter.h>
+#endif
+#include <linux/netfilter/nf_conntrack_common.h> /* IP_CT_IS_REPLY */
 
 enum ip_conntrack_dir {
 	IP_CT_DIR_ORIGINAL,
-- 
cgit v1.2.3


From c51d39010a1bccc9c1294e2d7c00005aefeb2b5c Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:25 +0100
Subject: netfilter: conntrack: built-in support for DCCP

CONFIG_NF_CT_PROTO_DCCP is no more a tristate. When set to y, connection
tracking support for DCCP protocol is built-in into nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_dccp,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)||  dccp  |  ipv4  |  ipv6  | nf_conntrack
---------++--------+--------+--------+--------------
none     || 469140 | 828755 | 828676 | 6141434
DCCP     ||   -    | 830566 | 829935 | 6533526

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nf_conntrack_dccp.h    |  2 +-
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +++
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +++
 include/net/netns/conntrack.h                  | 14 ++++++++++++++
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter/nf_conntrack_dccp.h b/include/linux/netfilter/nf_conntrack_dccp.h
index 40dcc82058d1..ff721d7325cf 100644
--- a/include/linux/netfilter/nf_conntrack_dccp.h
+++ b/include/linux/netfilter/nf_conntrack_dccp.h
@@ -25,7 +25,7 @@ enum ct_dccp_roles {
 #define CT_DCCP_ROLE_MAX	(__CT_DCCP_ROLE_MAX - 1)
 
 #ifdef __KERNEL__
-#include <net/netfilter/nf_conntrack_tuple.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
 
 struct nf_ct_dccp {
 	u_int8_t	role[IP_CT_DIR_MAX];
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 981c327374da..c2f155fd9299 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -15,6 +15,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index a4c993685795..5ec66c0d21c4 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -6,6 +6,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 3d06d94d2e52..440b781baf0b 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -6,6 +6,9 @@
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+#include <linux/netfilter/nf_conntrack_dccp.h>
+#endif
 #include <linux/seqlock.h>
 
 struct ctl_table_header;
@@ -48,12 +51,23 @@ struct nf_icmp_net {
 	unsigned int timeout;
 };
 
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+struct nf_dccp_net {
+	struct nf_proto_net pn;
+	int dccp_loose;
+	unsigned int dccp_timeout[CT_DCCP_MAX + 1];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
 	struct nf_udp_net	udp;
 	struct nf_icmp_net	icmp;
 	struct nf_icmp_net	icmpv6;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	struct nf_dccp_net	dccp;
+#endif
 };
 
 struct ct_pcpu {
-- 
cgit v1.2.3


From a85406afeb3e045b001b2aac5b4f89f4266fede3 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:26 +0100
Subject: netfilter: conntrack: built-in support for SCTP

CONFIG_NF_CT_PROTO_SCTP is no more a tristate. When set to y, connection
tracking support for SCTP protocol is built-in into nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_sctp,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)||  sctp  |  ipv4  |  ipv6  | nf_conntrack
---------++--------+--------+--------+--------------
none     || 498243 | 828755 | 828676 | 6141434
SCTP     ||   -    | 829254 | 829175 | 6547872

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +++
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +++
 include/net/netns/conntrack.h                  | 13 +++++++++++++
 3 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index c2f155fd9299..5f1fc15a51fb 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -18,6 +18,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index 5ec66c0d21c4..f70d191a8820 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -9,6 +9,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 440b781baf0b..17724c62de97 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -9,6 +9,9 @@
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 #include <linux/netfilter/nf_conntrack_dccp.h>
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+#include <linux/netfilter/nf_conntrack_sctp.h>
+#endif
 #include <linux/seqlock.h>
 
 struct ctl_table_header;
@@ -59,6 +62,13 @@ struct nf_dccp_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+struct nf_sctp_net {
+	struct nf_proto_net pn;
+	unsigned int timeouts[SCTP_CONNTRACK_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -68,6 +78,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 	struct nf_dccp_net	dccp;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+	struct nf_sctp_net	sctp;
+#endif
 };
 
 struct ct_pcpu {
-- 
cgit v1.2.3


From 9b91c96c5d1f9da79438292f8c82f65cbf078645 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:27 +0100
Subject: netfilter: conntrack: built-in support for UDPlite

CONFIG_NF_CT_PROTO_UDPLITE is no more a tristate. When set to y,
connection tracking support for UDPlite protocol is built-in into
nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_udplite,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)|| udplite|  ipv4  |  ipv6  |nf_conntrack
---------++--------+--------+--------+--------------
none     || 432538 | 828755 | 828676 | 6141434
UDPlite  ||   -    | 829649 | 829362 | 6498204

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +++
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +++
 include/net/netns/conntrack.h                  | 16 ++++++++++++++++
 3 files changed, 22 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 5f1fc15a51fb..919e4e8af327 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -21,6 +21,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index f70d191a8820..eaea968f8657 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -12,6 +12,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 17724c62de97..cf799fc3fdec 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -69,6 +69,19 @@ struct nf_sctp_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+enum udplite_conntrack {
+	UDPLITE_CT_UNREPLIED,
+	UDPLITE_CT_REPLIED,
+	UDPLITE_CT_MAX
+};
+
+struct nf_udplite_net {
+	struct nf_proto_net pn;
+	unsigned int timeouts[UDPLITE_CT_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -81,6 +94,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	struct nf_sctp_net	sctp;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+	struct nf_udplite_net	udplite;
+#endif
 };
 
 struct ct_pcpu {
-- 
cgit v1.2.3


From a379854d91b2cb0af07b0f62845449f4dacbd673 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:39 +0100
Subject: netfilter: conntrack: remove unused init_net hook

since adf0516845bcd0 ("netfilter: remove ip_conntrack* sysctl compat code")
the only user (ipv4 tracker) sets this to an empty stub function.

After this change nf_ct_l3proto_pernet_register() is also empty,
but this will change in a followup patch to add conditional register
of the hooks.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index 8992e4229da9..cf8f3dfd810d 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -63,9 +63,6 @@ struct nf_conntrack_l3proto {
 
 	size_t nla_size;
 
-	/* Init l3proto pernet data */
-	int (*init_net)(struct net *net);
-
 	/* Module (if any) which this is connected to. */
 	struct module *me;
 };
-- 
cgit v1.2.3


From ecb2421b5ddf48e6e116fced7f74c985bb546138 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:40 +0100
Subject: netfilter: add and use nf_ct_netns_get/put

currently aliased to try_module_get/_put.
Will be changed in next patch when we add functions to make use of ->net
argument to store usercount per l3proto tracker.

This is needed to avoid registering the conntrack hooks in all netns and
later only enable connection tracking in those that need conntrack.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index d9d52c020a70..5916aa9ab3f0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -181,6 +181,10 @@ static inline void nf_ct_put(struct nf_conn *ct)
 int nf_ct_l3proto_try_module_get(unsigned short l3proto);
 void nf_ct_l3proto_module_put(unsigned short l3proto);
 
+/* load module; enable/disable conntrack in this namespace */
+int nf_ct_netns_get(struct net *net, u8 nfproto);
+void nf_ct_netns_put(struct net *net, u8 nfproto);
+
 /*
  * Allocate a hashtable of hlist_head (if nulls == 0),
  * or hlist_nulls_head (if nulls == 1)
-- 
cgit v1.2.3


From 0c66dc1ea3f0366221f8a5a16c73f01ea9259678 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:43 +0100
Subject: netfilter: conntrack: register hooks in netns when needed by ruleset

This makes use of nf_ct_netns_get/put added in previous patch.
We add get/put functions to nf_conntrack_l3proto structure, ipv4 and ipv6
then implement use-count to track how many users (nft or xtables modules)
have a dependency on ipv4 and/or ipv6 connection tracking functionality.

When count reaches zero, the hooks are unregistered.

This delays activation of connection tracking inside a namespace until
stateful firewall rule or nat rule gets added.

This patch breaks backwards compatibility in the sense that connection
tracking won't be active anymore when the protocol tracker module is
loaded.  This breaks e.g. setups that ctnetlink for flow accounting and
the like, without any '-m conntrack' packet filter rules.

Followup patch restores old behavour and makes new delayed scheme
optional via sysctl.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index cf8f3dfd810d..e7dcd72be21c 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -52,6 +52,10 @@ struct nf_conntrack_l3proto {
 	int (*tuple_to_nlattr)(struct sk_buff *skb,
 			       const struct nf_conntrack_tuple *t);
 
+	/* Called when netns wants to use connection tracking */
+	int (*net_ns_get)(struct net *);
+	void (*net_ns_put)(struct net *);
+
 	/*
 	 * Calculate size of tuple nlattr
 	 */
-- 
cgit v1.2.3


From 481fa3734769b67f00ed09a42f2a6a8cbd00b869 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:44 +0100
Subject: netfilter: conntrack: add nf_conntrack_default_on sysctl

This switch (default on) can be used to disable automatic registration
of connection tracking functionality in newly created network
namespaces.

This means that when net namespace goes down (or the tracker protocol
module is unloaded) we *might* have to unregister the hooks.

We can either add another per-netns variable that tells if
the hooks got registered by default, or, alternatively, just call
the protocol _put() function and have the callee deal with a possible
'extra' put() operation that doesn't pair with a get() one.

This uses the latter approach, i.e. a put() without a get has no effect.

Conntrack is still enabled automatically regardless of the new sysctl
setting if the new net namespace requires connection tracking, e.g. when
NAT rules are created.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index e7dcd72be21c..e01559b4d781 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -73,9 +73,18 @@ struct nf_conntrack_l3proto {
 
 extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX];
 
+#ifdef CONFIG_SYSCTL
 /* Protocol pernet registration. */
 int nf_ct_l3proto_pernet_register(struct net *net,
 				  struct nf_conntrack_l3proto *proto);
+#else
+static inline int nf_ct_l3proto_pernet_register(struct net *n,
+						struct nf_conntrack_l3proto *p)
+{
+	return 0;
+}
+#endif
+
 void nf_ct_l3proto_pernet_unregister(struct net *net,
 				     struct nf_conntrack_l3proto *proto);
 
-- 
cgit v1.2.3


From 40fc3423b983b864bf70b03199191260ae9b2ea6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:50 -0800
Subject: tcp: tsq: add tsq_flags / tsq_enum

This is a cleanup, to ease code review of following patches.

Old 'enum tsq_flags' is renamed, and a new enumeration is added
with the flags used in cmpxchg() operations as opposed to
single bit operations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 734bab4c3bef..d8be083ab0b0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -364,7 +364,7 @@ struct tcp_sock {
 	u32	*saved_syn;
 };
 
-enum tsq_flags {
+enum tsq_enum {
 	TSQ_THROTTLED,
 	TSQ_QUEUED,
 	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
@@ -375,6 +375,15 @@ enum tsq_flags {
 				    */
 };
 
+enum tsq_flags {
+	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
+	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+};
+
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 {
 	return (struct tcp_sock *)sk;
-- 
cgit v1.2.3


From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:56 -0800
Subject: net: reorganize struct sock for better data locality

Group fields used in TX path, and keep some cache lines mostly read
to permit sharing among cpus.

Gained two 4 bytes holes on 64bit arches.

Added a place holder for tcp tsq_flags, next to sk_wmem_alloc
to speed up tcp_wfree() in the following patch.

I have not added ____cacheline_aligned_in_smp, this might be done later.
I prefer doing this once inet and tcp/udp sockets reorg is also done.

Tested with both TCP and UDP.

UDP receiver performance under flood increased by ~20 % :
Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops
was moved away from a critical cache line, now mostly read and shared.

	/* --- cacheline 4 boundary (256 bytes) --- */
	unsigned int               sk_napi_id;           /* 0x100   0x4 */
	int                        sk_rcvbuf;            /* 0x104   0x4 */
	struct sk_filter *         sk_filter;            /* 0x108   0x8 */
	union {
		struct socket_wq * sk_wq;                /*         0x8 */
		struct socket_wq * sk_wq_raw;            /*         0x8 */
	};                                               /* 0x110   0x8 */
	struct xfrm_policy *       sk_policy[2];         /* 0x118  0x10 */
	struct dst_entry *         sk_rx_dst;            /* 0x128   0x8 */
	struct dst_entry *         sk_dst_cache;         /* 0x130   0x8 */
	atomic_t                   sk_omem_alloc;        /* 0x138   0x4 */
	int                        sk_sndbuf;            /* 0x13c   0x4 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	int                        sk_wmem_queued;       /* 0x140   0x4 */
	atomic_t                   sk_wmem_alloc;        /* 0x144   0x4 */
	long unsigned int          sk_tsq_flags;         /* 0x148   0x8 */
	struct sk_buff *           sk_send_head;         /* 0x150   0x8 */
	struct sk_buff_head        sk_write_queue;       /* 0x158  0x18 */
	__s32                      sk_peek_off;          /* 0x170   0x4 */
	int                        sk_write_pending;     /* 0x174   0x4 */
	long int                   sk_sndtimeo;          /* 0x178   0x8 */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 69afda6bea15..6dfe3aa22b97 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -343,6 +343,9 @@ struct sock {
 #define sk_rxhash		__sk_common.skc_rxhash
 
 	socket_lock_t		sk_lock;
+	atomic_t		sk_drops;
+	int			sk_rcvlowat;
+	struct sk_buff_head	sk_error_queue;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
 		struct sk_buff	*tail;
 	} sk_backlog;
 #define sk_rmem_alloc sk_backlog.rmem_alloc
-	int			sk_forward_alloc;
 
-	__u32			sk_txhash;
+	int			sk_forward_alloc;
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int		sk_napi_id;
 	unsigned int		sk_ll_usec;
+	/* ===== mostly read cache line ===== */
+	unsigned int		sk_napi_id;
 #endif
-	atomic_t		sk_drops;
 	int			sk_rcvbuf;
 
 	struct sk_filter __rcu	*sk_filter;
@@ -379,11 +381,30 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
-	/* Note: 32bit hole on 64bit arches */
-	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
+
+	/* ===== cache line for TX ===== */
+	int			sk_wmem_queued;
+	atomic_t		sk_wmem_alloc;
+	unsigned long		sk_tsq_flags;
+	struct sk_buff		*sk_send_head;
 	struct sk_buff_head	sk_write_queue;
+	__s32			sk_peek_off;
+	int			sk_write_pending;
+	long			sk_sndtimeo;
+	struct timer_list	sk_timer;
+	__u32			sk_priority;
+	__u32			sk_mark;
+	u32			sk_pacing_rate; /* bytes per second */
+	u32			sk_max_pacing_rate;
+	struct page_frag	sk_frag;
+	netdev_features_t	sk_route_caps;
+	netdev_features_t	sk_route_nocaps;
+	int			sk_gso_type;
+	unsigned int		sk_gso_max_size;
+	gfp_t			sk_allocation;
+	__u32			sk_txhash;
 
 	/*
 	 * Because of non atomicity rules, all
@@ -414,42 +435,24 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
 	kmemcheck_bitfield_end(flags);
 
-	int			sk_wmem_queued;
-	gfp_t			sk_allocation;
-	u32			sk_pacing_rate; /* bytes per second */
-	u32			sk_max_pacing_rate;
-	netdev_features_t	sk_route_caps;
-	netdev_features_t	sk_route_nocaps;
-	int			sk_gso_type;
-	unsigned int		sk_gso_max_size;
 	u16			sk_gso_max_segs;
-	int			sk_rcvlowat;
 	unsigned long	        sk_lingertime;
-	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
 				sk_err_soft;
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
-	__u32			sk_priority;
-	__u32			sk_mark;
 	kuid_t			sk_uid;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
-	long			sk_sndtimeo;
-	struct timer_list	sk_timer;
 	ktime_t			sk_stamp;
 	u16			sk_tsflags;
 	u8			sk_shutdown;
 	u32			sk_tskey;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
-	struct page_frag	sk_frag;
-	struct sk_buff		*sk_send_head;
-	__s32			sk_peek_off;
-	int			sk_write_pending;
 #ifdef CONFIG_SECURITY
 	void			*sk_security;
 #endif
-- 
cgit v1.2.3


From 7aa5470c2c09265902b5e4289afa82e4e7c2987e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:57 -0800
Subject: tcp: tsq: move tsq_flags close to sk_wmem_alloc

tsq_flags being in the same cache line than sk_wmem_alloc
makes a lot of sense. Both fields are changed from tcp_wfree()
and more generally by various TSQ related functions.

Prior patch made room in struct sock and added sk_tsq_flags,
this patch deletes tsq_flags from struct tcp_sock.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index d8be083ab0b0..fc5848dad7a4 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -186,7 +186,6 @@ struct tcp_sock {
 	u32	tsoffset;	/* timestamp offset */
 
 	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
-	unsigned long	tsq_flags;
 
 	/* Data for direct copy to user */
 	struct {
-- 
cgit v1.2.3


From 1c0d32fde5bdf1184bc274f864c09799278a1114 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 4 Dec 2016 09:48:16 -0800
Subject: net_sched: gen_estimator: complete rewrite of rate estimators

1) Old code was hard to maintain, due to complex lock chains.
   (We probably will be able to remove some kfree_rcu() in callers)

2) Using a single timer to update all estimators does not scale.

3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
   is not supposed to work well)

In this rewrite :

- I removed the RB tree that had to be scanned in
  gen_estimator_active(). qdisc dumps should be much faster.

- Each estimator has its own timer.

- Estimations are maintained in net_rate_estimator structure,
  instead of dirtying the qdisc. Minor, but part of the simplification.

- Reading the estimator uses RCU and a seqcount to provide proper
  support for 32bit kernels.

- We reduce memory need when estimators are not used, since
  we store a pointer, instead of the bytes/packets counters.

- xt_rateest_mt() no longer has to grab a spinlock.
  (In the future, xt_rateest_tg() could be switched to per cpu counters)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h              |  2 +-
 include/net/gen_stats.h            | 17 +++++++++--------
 include/net/netfilter/xt_rateest.h | 10 +++++++---
 include/net/sch_generic.h          |  2 +-
 4 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9dddf77a69cc..1d716449209e 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -36,7 +36,7 @@ struct tc_action {
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
 	struct gnet_stats_queue		tcfa_qstats;
-	struct gnet_stats_rate_est64	tcfa_rate_est;
+	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121cc7d9..8b7aa370e7a4 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
 	struct u64_stats_sync syncp;
 };
 
+struct net_rate_estimator;
+
 struct gnet_dump {
 	spinlock_t *      lock;
 	struct sk_buff *  skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
-			     const struct gnet_stats_basic_packed *b,
-			     struct gnet_stats_rate_est64 *r);
+			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
 			  struct gnet_stats_queue __percpu *cpu_q,
 			  struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **ptr,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+			struct gnet_stats_rate_est64 *sample);
 #endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 79f45e19f31e..130e58361f99 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -1,19 +1,23 @@
 #ifndef _XT_RATEEST_H
 #define _XT_RATEEST_H
 
+#include <net/gen_stats.h>
+
 struct xt_rateest {
 	/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
 	struct gnet_stats_basic_packed	bstats;
 	spinlock_t			lock;
-	/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
-	struct gnet_stats_rate_est64	rstats;
+
 
 	/* following fields not accessed in hot path */
+	unsigned int			refcnt;
 	struct hlist_node		list;
 	char				name[IFNAMSIZ];
-	unsigned int			refcnt;
 	struct gnet_estimator		params;
 	struct rcu_head			rcu;
+
+	/* keep this field far away to speedup xt_rateest_mt() */
+	struct net_rate_estimator __rcu *rate_est;
 };
 
 struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672..498f81b229a4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -76,7 +76,7 @@ struct Qdisc {
 
 	struct netdev_queue	*dev_queue;
 
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 
-- 
cgit v1.2.3


From 7bd509e311f408f7a5132fcdde2069af65fa05ae Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 4 Dec 2016 23:19:41 +0100
Subject: bpf: add prog_digest and expose it via fdinfo/netlink

When loading a BPF program via bpf(2), calculate the digest over
the program's instruction stream and store it in struct bpf_prog's
digest member. This is done at a point in time before any instructions
are rewritten by the verifier. Any unstable map file descriptor
number part of the imm field will be zeroed for the hash.

fdinfo example output for progs:

  # cat /proc/1590/fdinfo/5
  pos:          0
  flags:        02000002
  mnt_id:       11
  prog_type:    1
  prog_jited:   1
  prog_digest:  b27e8b06da22707513aa97363dfb11c7c3675d28
  memlock:      4096

When programs are pinned and retrieved by an ELF loader, the loader
can check the program's digest through fdinfo and compare it against
one that was generated over the ELF file's program section to see
if the program needs to be reloaded. Furthermore, this can also be
exposed through other means such as netlink in case of a tc cls/act
dump (or xdp in future), but also through tracepoints or other
facilities to identify the program. Other than that, the digest can
also serve as a base name for the work in progress kallsyms support
of programs. The digest doesn't depend/select the crypto layer, since
we need to keep dependencies to a minimum. iproute2 will get support
for this facility.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h                | 1 +
 include/linux/filter.h             | 7 ++++++-
 include/uapi/linux/pkt_cls.h       | 1 +
 include/uapi/linux/tc_act/tc_bpf.h | 1 +
 4 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 69d0a7f12a3b..8796ff03f472 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,6 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+void bpf_prog_calc_digest(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 97338134398f..f078d2b1cff6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -14,6 +14,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/capability.h>
+#include <linux/cryptohash.h>
 
 #include <net/sch_generic.h>
 
@@ -56,6 +57,9 @@ struct bpf_prog_aux;
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
+/* Maximum BPF program size in bytes. */
+#define MAX_BPF_SIZE	(BPF_MAXINSNS * sizeof(struct bpf_insn))
+
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -404,8 +408,9 @@ struct bpf_prog {
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
-	u32			len;		/* Number of filter blocks */
 	enum bpf_prog_type	type;		/* Type of BPF program */
+	u32			len;		/* Number of filter blocks */
+	u32			digest[SHA_DIGEST_WORDS]; /* Program digest */
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	unsigned int		(*bpf_func)(const void *ctx,
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 86786d45ee66..1adc0b654996 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -397,6 +397,7 @@ enum {
 	TCA_BPF_NAME,
 	TCA_BPF_FLAGS,
 	TCA_BPF_FLAGS_GEN,
+	TCA_BPF_DIGEST,
 	__TCA_BPF_MAX,
 };
 
diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h
index 063d9d465119..a6b88a6f7f71 100644
--- a/include/uapi/linux/tc_act/tc_bpf.h
+++ b/include/uapi/linux/tc_act/tc_bpf.h
@@ -27,6 +27,7 @@ enum {
 	TCA_ACT_BPF_FD,
 	TCA_ACT_BPF_NAME,
 	TCA_ACT_BPF_PAD,
+	TCA_ACT_BPF_DIGEST,
 	__TCA_ACT_BPF_MAX,
 };
 #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
-- 
cgit v1.2.3


From 834184b1f3a4635efbdfdae5fb437f109f6605fa Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:45 +0100
Subject: netfilter: defrag: only register defrag functionality if needed

nf_defrag modules for ipv4 and ipv6 export an empty stub function.
Any module that needs the defragmentation hooks registered simply 'calls'
this empty function to create a phony module dependency -- modprobe will
then load the defrag module too.

This extends netfilter ipv4/ipv6 defragmentation modules to delay the hook
registration until the functionality is requested within a network namespace
instead of module load time for all namespaces.

Hooks are only un-registered on module unload or when a namespace that used
such defrag functionality exits.

We have to use struct net for this as the register hooks can be called
before netns initialization here from the ipv4/ipv6 conntrack module
init path.

There is no unregister functionality support, defrag will always be
active once it was requested inside a net namespace.

The reason is that defrag has impact on nft and iptables rulesets
(without defrag we might see framents).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_defrag_ipv4.h | 3 ++-
 include/net/netfilter/ipv6/nf_defrag_ipv6.h | 3 ++-
 include/net/netns/netfilter.h               | 6 ++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h
index f01ef208dff6..db405f70e538 100644
--- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h
@@ -1,6 +1,7 @@
 #ifndef _NF_DEFRAG_IPV4_H
 #define _NF_DEFRAG_IPV4_H
 
-void nf_defrag_ipv4_enable(void);
+struct net;
+int nf_defrag_ipv4_enable(struct net *);
 
 #endif /* _NF_DEFRAG_IPV4_H */
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index ddf162f7966f..7664efe37974 100644
--- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -1,7 +1,8 @@
 #ifndef _NF_DEFRAG_IPV6_H
 #define _NF_DEFRAG_IPV6_H
 
-void nf_defrag_ipv6_enable(void);
+struct net;
+int nf_defrag_ipv6_enable(struct net *);
 
 int nf_ct_frag6_init(void);
 void nf_ct_frag6_cleanup(void);
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 58487b1cc99a..cea396b53a60 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -17,5 +17,11 @@ struct netns_nf {
 	struct ctl_table_header *nf_log_dir_header;
 #endif
 	struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+	bool			defrag_ipv4;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	bool			defrag_ipv6;
+#endif
 };
 #endif
-- 
cgit v1.2.3


From 0aa8c57a04907a5d02068ff9f917629be97ea78d Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@bytheb.org>
Date: Tue, 15 Nov 2016 17:48:44 -0500
Subject: netfilter: introduce accessor functions for hook entries

This allows easier future refactoring.

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 69230140215b..575aa198097e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -79,6 +79,33 @@ struct nf_hook_entry {
 	const struct nf_hook_ops	*orig_ops;
 };
 
+static inline void
+nf_hook_entry_init(struct nf_hook_entry *entry,	const struct nf_hook_ops *ops)
+{
+	entry->next = NULL;
+	entry->ops = *ops;
+	entry->orig_ops = ops;
+}
+
+static inline int
+nf_hook_entry_priority(const struct nf_hook_entry *entry)
+{
+	return entry->ops.priority;
+}
+
+static inline int
+nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
+		     struct nf_hook_state *state)
+{
+	return entry->ops.hook(entry->ops.priv, skb, state);
+}
+
+static inline const struct nf_hook_ops *
+nf_hook_entry_ops(const struct nf_hook_entry *entry)
+{
+	return entry->orig_ops;
+}
+
 static inline void nf_hook_state_init(struct nf_hook_state *p,
 				      unsigned int hook,
 				      u_int8_t pf,
-- 
cgit v1.2.3


From d415b9eb76fc55c03ef5451691170aa5771dcea3 Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@redhat.com>
Date: Tue, 15 Nov 2016 17:48:45 -0500
Subject: netfilter: decouple nf_hook_entry and nf_hook_ops

During nfhook traversal we only need a very small subset of
nf_hook_ops members.

We need:
- next element
- hook function to call
- hook function priv argument

Bridge netfilter also needs 'thresh'; can be obtained via ->orig_ops.

nf_hook_entry struct is now 32 bytes on x86_64.

A followup patch will turn the run-time list into an array that only
stores hook functions plus their priv arguments, eliminating the ->next
element.

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 575aa198097e..a4b97be30b28 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -75,7 +75,8 @@ struct nf_hook_ops {
 
 struct nf_hook_entry {
 	struct nf_hook_entry __rcu	*next;
-	struct nf_hook_ops		ops;
+	nf_hookfn			*hook;
+	void				*priv;
 	const struct nf_hook_ops	*orig_ops;
 };
 
@@ -83,21 +84,22 @@ static inline void
 nf_hook_entry_init(struct nf_hook_entry *entry,	const struct nf_hook_ops *ops)
 {
 	entry->next = NULL;
-	entry->ops = *ops;
+	entry->hook = ops->hook;
+	entry->priv = ops->priv;
 	entry->orig_ops = ops;
 }
 
 static inline int
 nf_hook_entry_priority(const struct nf_hook_entry *entry)
 {
-	return entry->ops.priority;
+	return entry->orig_ops->priority;
 }
 
 static inline int
 nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb,
 		     struct nf_hook_state *state)
 {
-	return entry->ops.hook(entry->ops.priv, skb, state);
+	return entry->hook(entry->priv, skb, state);
 }
 
 static inline const struct nf_hook_ops *
-- 
cgit v1.2.3


From 4d31eef5176df06f218201bc9c0ce40babb41660 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 22 Nov 2016 14:44:17 +0100
Subject: netfilter: x_tables: pass xt_counters struct instead of packet
 counter

On SMP we overload the packet counter (unsigned long) to contain
percpu offset.  Hide this from callers and pass xt_counters address
instead.

Preparation patch to allocate the percpu counters in page-sized batch
chunks.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index cd4eaf8df445..6e61edeb68e3 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -430,11 +430,7 @@ static inline unsigned long xt_percpu_counter_alloc(void)
 
 	return 0;
 }
-static inline void xt_percpu_counter_free(u64 pcnt)
-{
-	if (nr_cpu_ids > 1)
-		free_percpu((void __percpu *) (unsigned long) pcnt);
-}
+void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
 xt_get_this_cpu_counter(struct xt_counters *cnt)
-- 
cgit v1.2.3


From f28e15bacedd444608e25421c72eb2cf4527c9ca Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 22 Nov 2016 14:44:18 +0100
Subject: netfilter: x_tables: pass xt_counters struct to counter allocator

Keeps some noise away from a followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 27 +--------------------------
 1 file changed, 1 insertion(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 6e61edeb68e3..05a94bd32c55 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -404,32 +404,7 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 }
 
 
-/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
- * real (percpu) counter.  On !SMP, its just the packet count,
- * so nothing needs to be done there.
- *
- * xt_percpu_counter_alloc returns the address of the percpu
- * counter, or 0 on !SMP. We force an alignment of 16 bytes
- * so that bytes/packets share a common cache line.
- *
- * Hence caller must use IS_ERR_VALUE to check for error, this
- * allows us to return 0 for single core systems without forcing
- * callers to deal with SMP vs. NONSMP issues.
- */
-static inline unsigned long xt_percpu_counter_alloc(void)
-{
-	if (nr_cpu_ids > 1) {
-		void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
-						    sizeof(struct xt_counters));
-
-		if (res == NULL)
-			return -ENOMEM;
-
-		return (__force unsigned long) res;
-	}
-
-	return 0;
-}
+bool xt_percpu_counter_alloc(struct xt_counters *counters);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
-- 
cgit v1.2.3


From ae0ac0ed6fcf5af3be0f63eb935f483f44a402d2 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 22 Nov 2016 14:44:19 +0100
Subject: netfilter: x_tables: pack percpu counter allocations

instead of allocating each xt_counter individually, allocate 4k chunks
and then use these for counter allocation requests.

This should speed up rule evaluation by increasing data locality,
also speeds up ruleset loading because we reduce calls to the percpu
allocator.

As Eric points out we can't use PAGE_SIZE, page_allocator would fail on
arches with 64k page size.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 05a94bd32c55..5117e4d2ddfa 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -403,8 +403,13 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 	return ret;
 }
 
+struct xt_percpu_counter_alloc_state {
+	unsigned int off;
+	const char __percpu *mem;
+};
 
-bool xt_percpu_counter_alloc(struct xt_counters *counters);
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+			     struct xt_counters *counter);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
-- 
cgit v1.2.3


From 1814096980bbe546c4384b7b064126cbe7d40d30 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 24 Nov 2016 12:04:55 +0100
Subject: netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader
 fields

This patch adds a new flag that signals the kernel to update layer 4
checksum if the packet field belongs to the layer 4 pseudoheader. This
implicitly provides stateless NAT 1:1 that is useful under very specific
usecases.

Since rules mangling layer 3 fields that are part of the pseudoheader
may potentially convey any layer 4 packet, we have to deal with the
layer 4 checksum adjustment using protocol specific code.

This patch adds support for TCP, UDP and ICMPv6, since they include the
pseudoheader in the layer 4 checksum calculation. ICMP doesn't, so we
can skip it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h   | 1 +
 include/uapi/linux/netfilter/nf_tables.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 862373d4ea9d..8f690effec37 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -45,6 +45,7 @@ struct nft_payload_set {
 	enum nft_registers	sreg:8;
 	u8			csum_type;
 	u8			csum_offset;
+	u8			csum_flags;
 };
 
 extern const struct nft_expr_ops nft_payload_fast_ops;
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 14e5f619167e..f030e59aa2ec 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -659,6 +659,10 @@ enum nft_payload_csum_types {
 	NFT_PAYLOAD_CSUM_INET,
 };
 
+enum nft_payload_csum_flags {
+	NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0),
+};
+
 /**
  * enum nft_payload_attributes - nf_tables payload expression netlink attributes
  *
@@ -669,6 +673,7 @@ enum nft_payload_csum_types {
  * @NFTA_PAYLOAD_SREG: source register to load data from (NLA_U32: nft_registers)
  * @NFTA_PAYLOAD_CSUM_TYPE: checksum type (NLA_U32)
  * @NFTA_PAYLOAD_CSUM_OFFSET: checksum offset relative to base (NLA_U32)
+ * @NFTA_PAYLOAD_CSUM_FLAGS: checksum flags (NLA_U32)
  */
 enum nft_payload_attributes {
 	NFTA_PAYLOAD_UNSPEC,
@@ -679,6 +684,7 @@ enum nft_payload_attributes {
 	NFTA_PAYLOAD_SREG,
 	NFTA_PAYLOAD_CSUM_TYPE,
 	NFTA_PAYLOAD_CSUM_OFFSET,
+	NFTA_PAYLOAD_CSUM_FLAGS,
 	__NFTA_PAYLOAD_MAX
 };
 #define NFTA_PAYLOAD_MAX	(__NFTA_PAYLOAD_MAX - 1)
-- 
cgit v1.2.3


From df122f58b834b24c27d7e2ac02a4910d3e56f6ae Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 28 Nov 2016 11:40:05 +0100
Subject: netfilter: ingress: translate 0 nf_hook_slow retval to -1

The caller assumes that < 0 means that skb was stolen (or free'd).

All other return values continue skb processing.

nf_hook_slow returns 3 different return value types:

A) a (negative) errno value: the skb was dropped (NF_DROP, e.g.
by iptables '-j DROP' rule).

B) 0. The skb was stolen by the hook or queued to userspace.

C) 1. all hooks returned NF_ACCEPT so the caller should invoke
   the okfn so packet processing can continue.

nft ingress facility currently doesn't have the 'okfn' that
the NF_HOOK() macros use; there is no nfqueue support either.

So 1 means that nf_hook_ingress() caller should go on processing the skb.

In order to allow use of NF_STOLEN from ingress we need to translate
this to an errno number, else we'd crash because we continue with
already-free'd (or about to be free-d) skb.

The errno value isn't checked, its just important that its less than 0,
so return -1.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ingress.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h
index 2dc3b49b804a..59476061de86 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -19,6 +19,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 {
 	struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress);
 	struct nf_hook_state state;
+	int ret;
 
 	/* Must recheck the ingress hook head, in the event it became NULL
 	 * after the check in nf_hook_ingress_active evaluated to true.
@@ -29,7 +30,11 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
-	return nf_hook_slow(skb, &state, e);
+	ret = nf_hook_slow(skb, &state, e);
+	if (ret == 0)
+		return -1;
+
+	return ret;
 }
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
-- 
cgit v1.2.3


From 3bf3276119455bd0fc7a7e31be2823118e613842 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 28 Nov 2016 11:40:06 +0100
Subject: netfilter: add and use nf_fwd_netdev_egress

... so we can use current skb instead of working with a clone.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_dup_netdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
index 397dcae349f9..3e919356bedf 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -2,5 +2,6 @@
 #define _NF_DUP_NETDEV_H_
 
 void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif);
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
 
 #endif
-- 
cgit v1.2.3


From e50092404c1bc7aaeb0a0f4077fa6f07b073a20f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:04:32 +0100
Subject: netfilter: nf_tables: add stateful objects

This patch augments nf_tables to support stateful objects. This new
infrastructure allows you to create, dump and delete stateful objects,
that are identified by a user-defined name.

This patch adds the generic infrastructure, follow up patches add
support for two stateful objects: counters and quotas.

This patch provides a native infrastructure for nf_tables to replace
nfacct, the extended accounting infrastructure for iptables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        | 79 ++++++++++++++++++++++++++++++++
 include/uapi/linux/netfilter/nf_tables.h | 29 ++++++++++++
 2 files changed, 108 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 32970cba184a..903cd618f50e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -875,6 +875,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@list: used internally
  *	@chains: chains in the table
  *	@sets: sets in the table
+ *	@objects: stateful objects in the table
  *	@hgenerator: handle generator state
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
@@ -885,6 +886,7 @@ struct nft_table {
 	struct list_head		list;
 	struct list_head		chains;
 	struct list_head		sets;
+	struct list_head		objects;
 	u64				hgenerator;
 	u32				use;
 	u16				flags:14,
@@ -934,6 +936,73 @@ void nft_unregister_expr(struct nft_expr_type *);
 int nft_verdict_dump(struct sk_buff *skb, int type,
 		     const struct nft_verdict *v);
 
+/**
+ *	struct nft_object - nf_tables stateful object
+ *
+ *	@list: table stateful object list node
+ *	@type: pointer to object type
+ *	@data: pointer to object data
+ *	@name: name of this stateful object
+ *	@genmask: generation mask
+ *	@use: number of references to this stateful object
+ * 	@data: object data, layout depends on type
+ */
+struct nft_object {
+	struct list_head		list;
+	char				name[NFT_OBJ_MAXNAMELEN];
+	u32				genmask:2,
+					use:30;
+	/* runtime data below here */
+	const struct nft_object_type	*type ____cacheline_aligned;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(u64))));
+};
+
+static inline void *nft_obj_data(const struct nft_object *obj)
+{
+	return (void *)obj->data;
+}
+
+#define nft_expr_obj(expr)	*((struct nft_object **)nft_expr_priv(expr))
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+					const struct nlattr *nla, u32 objtype,
+					u8 genmask);
+
+/**
+ *	struct nft_object_type - stateful object type
+ *
+ *	@eval: stateful object evaluation function
+ *	@list: list node in list of object types
+ *	@type: stateful object numeric type
+ *	@size: stateful object size
+ *	@owner: module owner
+ *	@maxattr: maximum netlink attribute
+ *	@policy: netlink attribute policy
+ *	@init: initialize object from netlink attributes
+ *	@destroy: release existing stateful object
+ *	@dump: netlink dump stateful object
+ */
+struct nft_object_type {
+	void				(*eval)(struct nft_object *obj,
+						struct nft_regs *regs,
+						const struct nft_pktinfo *pkt);
+	struct list_head		list;
+	u32				type;
+	unsigned int			size;
+	unsigned int			maxattr;
+	struct module			*owner;
+	const struct nla_policy		*policy;
+	int				(*init)(const struct nlattr * const tb[],
+						struct nft_object *obj);
+	void				(*destroy)(struct nft_object *obj);
+	int				(*dump)(struct sk_buff *skb,
+						const struct nft_object *obj);
+};
+
+int nft_register_obj(struct nft_object_type *obj_type);
+void nft_unregister_obj(struct nft_object_type *obj_type);
+
 /**
  *	struct nft_traceinfo - nft tracing information and state
  *
@@ -981,6 +1050,9 @@ void nft_trace_notify(struct nft_traceinfo *info);
 #define MODULE_ALIAS_NFT_SET() \
 	MODULE_ALIAS("nft-set")
 
+#define MODULE_ALIAS_NFT_OBJ(type) \
+	MODULE_ALIAS("nft-obj-" __stringify(type))
+
 /*
  * The gencursor defines two generations, the currently active and the
  * next one. Objects contain a bitmask of 2 bits specifying the generations
@@ -1157,4 +1229,11 @@ struct nft_trans_elem {
 #define nft_trans_elem(trans)	\
 	(((struct nft_trans_elem *)trans->data)->elem)
 
+struct nft_trans_obj {
+	struct nft_object		*obj;
+};
+
+#define nft_trans_obj(trans)	\
+	(((struct nft_trans_obj *)trans->data)->obj)
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index f030e59aa2ec..18e30dbc8c3f 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -4,6 +4,7 @@
 #define NFT_TABLE_MAXNAMELEN	32
 #define NFT_CHAIN_MAXNAMELEN	32
 #define NFT_SET_MAXNAMELEN	32
+#define NFT_OBJ_MAXNAMELEN	32
 #define NFT_USERDATA_MAXLEN	256
 
 /**
@@ -85,6 +86,9 @@ enum nft_verdicts {
  * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes)
  * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes)
  * @NFT_MSG_TRACE: trace event (enum nft_trace_attributes)
+ * @NFT_MSG_NEWOBJ: create a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes)
  */
 enum nf_tables_msg_types {
 	NFT_MSG_NEWTABLE,
@@ -105,6 +109,9 @@ enum nf_tables_msg_types {
 	NFT_MSG_NEWGEN,
 	NFT_MSG_GETGEN,
 	NFT_MSG_TRACE,
+	NFT_MSG_NEWOBJ,
+	NFT_MSG_GETOBJ,
+	NFT_MSG_DELOBJ,
 	NFT_MSG_MAX,
 };
 
@@ -1178,6 +1185,28 @@ enum nft_fib_flags {
 	NFTA_FIB_F_OIF		= 1 << 4,	/* restrict to oif */
 };
 
+#define NFT_OBJECT_UNSPEC	0
+
+/**
+ * enum nft_object_attributes - nf_tables stateful object netlink attributes
+ *
+ * @NFTA_OBJ_TABLE: name of the table containing the expression (NLA_STRING)
+ * @NFTA_OBJ_NAME: name of this expression type (NLA_STRING)
+ * @NFTA_OBJ_TYPE: stateful object type (NLA_U32)
+ * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED)
+ * @NFTA_OBJ_USE: number of references to this expression (NLA_U32)
+ */
+enum nft_object_attributes {
+	NFTA_OBJ_UNSPEC,
+	NFTA_OBJ_TABLE,
+	NFTA_OBJ_NAME,
+	NFTA_OBJ_TYPE,
+	NFTA_OBJ_DATA,
+	NFTA_OBJ_USE,
+	__NFTA_OBJ_MAX
+};
+#define NFTA_OBJ_MAX		(__NFTA_OBJ_MAX - 1)
+
 /**
  * enum nft_trace_attributes - nf_tables trace netlink attributes
  *
-- 
cgit v1.2.3


From b1ce0ced101ee134c5d0bbb378b2c3cadc617f20 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:04:36 +0100
Subject: netfilter: nft_counter: add stateful object type

Register a new percpu counter stateful object type into the stateful
object infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 18e30dbc8c3f..e352ef65d753 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1186,6 +1186,7 @@ enum nft_fib_flags {
 };
 
 #define NFT_OBJECT_UNSPEC	0
+#define NFT_OBJECT_COUNTER	1
 
 /**
  * enum nft_object_attributes - nf_tables stateful object netlink attributes
-- 
cgit v1.2.3


From 173705d9a2df1490478bf0d39f1b517bd489c8fa Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:04:43 +0100
Subject: netfilter: nft_quota: add stateful object type

Register a new quota stateful object type into the new stateful object
infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index e352ef65d753..ad0577ba5d2a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1187,6 +1187,7 @@ enum nft_fib_flags {
 
 #define NFT_OBJECT_UNSPEC	0
 #define NFT_OBJECT_COUNTER	1
+#define NFT_OBJECT_QUOTA	2
 
 /**
  * enum nft_object_attributes - nf_tables stateful object netlink attributes
-- 
cgit v1.2.3


From c97d22e68bfedfacb9e752dee536c69916ae0933 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:38 +0100
Subject: netfilter: nf_tables: add stateful object reference expression

This new expression allows us to refer to existing stateful objects from
rules.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index ad0577ba5d2a..1043ce4250c5 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1137,6 +1137,20 @@ enum nft_fwd_attributes {
 };
 #define NFTA_FWD_MAX	(__NFTA_FWD_MAX - 1)
 
+/**
+ * enum nft_objref_attributes - nf_tables stateful object expression netlink attributes
+ *
+ * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: nft_register)
+ * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING)
+ */
+enum nft_objref_attributes {
+	NFTA_OBJREF_UNSPEC,
+	NFTA_OBJREF_IMM_TYPE,
+	NFTA_OBJREF_IMM_NAME,
+	__NFTA_OBJREF_MAX
+};
+#define NFTA_OBJREF_MAX	(__NFTA_OBJREF_MAX - 1)
+
 /**
  * enum nft_gen_attributes - nf_tables ruleset generation attributes
  *
-- 
cgit v1.2.3


From 795595f68d6c787028345804bb06f5a633af24a2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:52 +0100
Subject: netfilter: nft_quota: dump consumed quota

Add a new attribute NFTA_QUOTA_CONSUMED that displays the amount of
quota that has been already consumed. This allows us to restore the
internal state of the quota object between reboots as well as to monitor
how wasted it is.

This patch changes the logic to account for the consumed bytes, instead
of the bytes that remain to be consumed.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 1043ce4250c5..3d47582caa80 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -988,12 +988,14 @@ enum nft_quota_flags {
  *
  * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U16)
  * @NFTA_QUOTA_FLAGS: flags (NLA_U32)
+ * @NFTA_QUOTA_CONSUMED: quota already consumed in bytes (NLA_U64)
  */
 enum nft_quota_attributes {
 	NFTA_QUOTA_UNSPEC,
 	NFTA_QUOTA_BYTES,
 	NFTA_QUOTA_FLAGS,
 	NFTA_QUOTA_PAD,
+	NFTA_QUOTA_CONSUMED,
 	__NFTA_QUOTA_MAX
 };
 #define NFTA_QUOTA_MAX		(__NFTA_QUOTA_MAX - 1)
-- 
cgit v1.2.3


From 43da04a593d8b2626f1cf4b56efe9402f6b53652 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:44 +0100
Subject: netfilter: nf_tables: atomic dump and reset for stateful objects

This patch adds a new NFT_MSG_GETOBJ_RESET command perform an atomic
dump-and-reset of the stateful object. This also comes with add support
for atomic dump and reset for counter and quota objects.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        | 3 ++-
 include/uapi/linux/netfilter/nf_tables.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 903cd618f50e..6f7d6a1dc09c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -997,7 +997,8 @@ struct nft_object_type {
 						struct nft_object *obj);
 	void				(*destroy)(struct nft_object *obj);
 	int				(*dump)(struct sk_buff *skb,
-						const struct nft_object *obj);
+						struct nft_object *obj,
+						bool reset);
 };
 
 int nft_register_obj(struct nft_object_type *obj_type);
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 3d47582caa80..399eac1eee91 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -89,6 +89,7 @@ enum nft_verdicts {
  * @NFT_MSG_NEWOBJ: create a stateful object (enum nft_obj_attributes)
  * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes)
  * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_GETOBJ_RESET: get and reset a stateful object (enum nft_obj_attributes)
  */
 enum nf_tables_msg_types {
 	NFT_MSG_NEWTABLE,
@@ -112,6 +113,7 @@ enum nf_tables_msg_types {
 	NFT_MSG_NEWOBJ,
 	NFT_MSG_GETOBJ,
 	NFT_MSG_DELOBJ,
+	NFT_MSG_GETOBJ_RESET,
 	NFT_MSG_MAX,
 };
 
-- 
cgit v1.2.3


From 2599e98934c5ad166ad184b3682e38aadcb63fb3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:48 +0100
Subject: netfilter: nf_tables: notify internal updates of stateful objects

Introduce nf_tables_obj_notify() to notify internal state changes in
stateful objects. This is used by the quota object to report depletion
in a follow up patch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 6f7d6a1dc09c..339e374c28b5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -969,6 +969,10 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 					const struct nlattr *nla, u32 objtype,
 					u8 genmask);
 
+int nft_obj_notify(struct net *net, struct nft_table *table,
+		   struct nft_object *obj, u32 portid, u32 seq,
+		   int event, int family, int report, gfp_t gfp);
+
 /**
  *	struct nft_object_type - stateful object type
  *
-- 
cgit v1.2.3


From 1896531710abcd9a961a17d0c5c6a9f537d479b6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:56 +0100
Subject: netfilter: nft_quota: add depleted flag for objects

Notify on depleted quota objects. The NFT_QUOTA_F_DEPLETED flag
indicates we have reached overquota.

Add pointer to table from nft_object, so we can use it when sending the
depletion notification to userspace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        | 2 ++
 include/uapi/linux/netfilter/nf_tables.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 339e374c28b5..ce6fb6e83b32 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -940,6 +940,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  *	struct nft_object - nf_tables stateful object
  *
  *	@list: table stateful object list node
+ *	@table: table this object belongs to
  *	@type: pointer to object type
  *	@data: pointer to object data
  *	@name: name of this stateful object
@@ -950,6 +951,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
 struct nft_object {
 	struct list_head		list;
 	char				name[NFT_OBJ_MAXNAMELEN];
+	struct nft_table		*table;
 	u32				genmask:2,
 					use:30;
 	/* runtime data below here */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 399eac1eee91..4864caca1e8e 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -983,6 +983,7 @@ enum nft_queue_attributes {
 
 enum nft_quota_flags {
 	NFT_QUOTA_F_INV		= (1 << 0),
+	NFT_QUOTA_F_DEPLETED	= (1 << 1),
 };
 
 /**
-- 
cgit v1.2.3


From 8aeff920dcc9b3f8cf43042a76428582634d9208 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:06:00 +0100
Subject: netfilter: nf_tables: add stateful object reference to set elements

This patch allows you to refer to stateful objects from set elements.
This provides the infrastructure to create maps where the right hand
side of the mapping is a stateful object.

This allows us to build dictionaries of stateful objects, that you can
use to perform fast lookups using any arbitrary key combination.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        | 9 +++++++++
 include/uapi/linux/netfilter/nf_tables.h | 8 ++++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ce6fb6e83b32..85f0f03f1e87 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -326,6 +326,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
  * 	@name: name of the set
  * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
  * 	@dtype: data type (verdict or numeric type defined by userspace)
+ * 	@objtype: object type (see NFT_OBJECT_* definitions)
  * 	@size: maximum set size
  * 	@nelems: number of elements
  * 	@ndeact: number of deactivated elements queued for removal
@@ -347,6 +348,7 @@ struct nft_set {
 	char				name[NFT_SET_MAXNAMELEN];
 	u32				ktype;
 	u32				dtype;
+	u32				objtype;
 	u32				size;
 	atomic_t			nelems;
 	u32				ndeact;
@@ -416,6 +418,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  *	@NFT_SET_EXT_EXPIRATION: element expiration time
  *	@NFT_SET_EXT_USERDATA: user data associated with the element
  *	@NFT_SET_EXT_EXPR: expression assiociated with the element
+ *	@NFT_SET_EXT_OBJREF: stateful object reference associated with element
  *	@NFT_SET_EXT_NUM: number of extension types
  */
 enum nft_set_extensions {
@@ -426,6 +429,7 @@ enum nft_set_extensions {
 	NFT_SET_EXT_EXPIRATION,
 	NFT_SET_EXT_USERDATA,
 	NFT_SET_EXT_EXPR,
+	NFT_SET_EXT_OBJREF,
 	NFT_SET_EXT_NUM
 };
 
@@ -554,6 +558,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
 	return elem + set->ops->elemsize;
 }
 
+static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
+{
+	return nft_set_ext(ext, NFT_SET_EXT_OBJREF);
+}
+
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
 			const u32 *key, const u32 *data,
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 4864caca1e8e..a6b52dbff08c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -255,6 +255,7 @@ enum nft_rule_compat_attributes {
  * @NFT_SET_MAP: set is used as a dictionary
  * @NFT_SET_TIMEOUT: set uses timeouts
  * @NFT_SET_EVAL: set contains expressions for evaluation
+ * @NFT_SET_OBJECT: set contains stateful objects
  */
 enum nft_set_flags {
 	NFT_SET_ANONYMOUS		= 0x1,
@@ -263,6 +264,7 @@ enum nft_set_flags {
 	NFT_SET_MAP			= 0x8,
 	NFT_SET_TIMEOUT			= 0x10,
 	NFT_SET_EVAL			= 0x20,
+	NFT_SET_OBJECT			= 0x40,
 };
 
 /**
@@ -304,6 +306,7 @@ enum nft_set_desc_attributes {
  * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64)
  * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32)
  * @NFTA_SET_USERDATA: user data (NLA_BINARY)
+ * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*)
  */
 enum nft_set_attributes {
 	NFTA_SET_UNSPEC,
@@ -321,6 +324,7 @@ enum nft_set_attributes {
 	NFTA_SET_GC_INTERVAL,
 	NFTA_SET_USERDATA,
 	NFTA_SET_PAD,
+	NFTA_SET_OBJ_TYPE,
 	__NFTA_SET_MAX
 };
 #define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
@@ -344,6 +348,7 @@ enum nft_set_elem_flags {
  * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64)
  * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY)
  * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING)
  */
 enum nft_set_elem_attributes {
 	NFTA_SET_ELEM_UNSPEC,
@@ -355,6 +360,7 @@ enum nft_set_elem_attributes {
 	NFTA_SET_ELEM_USERDATA,
 	NFTA_SET_ELEM_EXPR,
 	NFTA_SET_ELEM_PAD,
+	NFTA_SET_ELEM_OBJREF,
 	__NFTA_SET_ELEM_MAX
 };
 #define NFTA_SET_ELEM_MAX	(__NFTA_SET_ELEM_MAX - 1)
@@ -1207,6 +1213,8 @@ enum nft_fib_flags {
 #define NFT_OBJECT_UNSPEC	0
 #define NFT_OBJECT_COUNTER	1
 #define NFT_OBJECT_QUOTA	2
+#define __NFT_OBJECT_MAX	3
+#define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
  * enum nft_object_attributes - nf_tables stateful object netlink attributes
-- 
cgit v1.2.3


From 63aea29060025fd2732680aa48a6b97687b93af8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:06:03 +0100
Subject: netfilter: nft_objref: support for stateful object maps

This patch allows us to refer to stateful object dictionaries, the
source register indicates the key data to be used to look up for the
corresponding state object. We can refer to these maps through names or,
alternatively, the map transaction id. This allows us to refer to both
anonymous and named maps.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a6b52dbff08c..881d49e94569 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1153,11 +1153,17 @@ enum nft_fwd_attributes {
  *
  * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: nft_register)
  * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING)
+ * @NFTA_OBJREF_SET_SREG: source register of the data to look for (NLA_U32: nft_registers)
+ * @NFTA_OBJREF_SET_NAME: name of the set where to look for (NLA_STRING)
+ * @NFTA_OBJREF_SET_ID: id of the set where to look for in this transaction (NLA_U32)
  */
 enum nft_objref_attributes {
 	NFTA_OBJREF_UNSPEC,
 	NFTA_OBJREF_IMM_TYPE,
 	NFTA_OBJREF_IMM_NAME,
+	NFTA_OBJREF_SET_SREG,
+	NFTA_OBJREF_SET_NAME,
+	NFTA_OBJREF_SET_ID,
 	__NFTA_OBJREF_MAX
 };
 #define NFTA_OBJREF_MAX	(__NFTA_OBJREF_MAX - 1)
-- 
cgit v1.2.3


From 8411b6442e59810fe0750a2f321b9dcb7d0a3d17 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 5 Dec 2016 23:35:50 +0100
Subject: netfilter: nf_tables: support for set flushing

This patch adds support for set flushing, that consists of walking over
the set elements if the NFTA_SET_ELEM_LIST_ELEMENTS attribute is set.
This patch requires the following changes:

1) Add set->ops->deactivate_one() operation: This allows us to
   deactivate an element from the set element walk path, given we can
   skip the lookup that happens in ->deactivate().

2) Add a new nft_trans_alloc_gfp() function since we need to allocate
   transactions using GFP_ATOMIC given the set walk path happens with
   held rcu_read_lock.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 85f0f03f1e87..924325c46aab 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -259,7 +259,8 @@ struct nft_expr;
  *	@lookup: look up an element within the set
  *	@insert: insert new element into set
  *	@activate: activate new element in the next generation
- *	@deactivate: deactivate element in the next generation
+ *	@deactivate: lookup for element and deactivate it in the next generation
+ *	@deactivate_one: deactivate element in the next generation
  *	@remove: remove element from set
  *	@walk: iterate over all set elemeennts
  *	@privsize: function to return size of set private data
@@ -294,6 +295,9 @@ struct nft_set_ops {
 	void *				(*deactivate)(const struct net *net,
 						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
+	bool				(*deactivate_one)(const struct net *net,
+							  const struct nft_set *set,
+							  void *priv);
 	void				(*remove)(const struct nft_set *set,
 						  const struct nft_set_elem *elem);
 	void				(*walk)(const struct nft_ctx *ctx,
-- 
cgit v1.2.3


From 2c16d60332643e90d4fa244f4a706c454b8c7569 Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Tue, 6 Dec 2016 16:25:02 -0500
Subject: netfilter: xt_bpf: support ebpf

Add support for attaching an eBPF object by file descriptor.

The iptables binary can be called with a path to an elf object or a
pinned bpf object. Also pass the mode and path to the kernel to be
able to return it later for iptables dump and save.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/xt_bpf.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index 1fad2c27ac32..b97725af2ac0 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -2,9 +2,11 @@
 #define _XT_BPF_H
 
 #include <linux/filter.h>
+#include <linux/limits.h>
 #include <linux/types.h>
 
 #define XT_BPF_MAX_NUM_INSTR	64
+#define XT_BPF_PATH_MAX		(XT_BPF_MAX_NUM_INSTR * sizeof(struct sock_filter))
 
 struct bpf_prog;
 
@@ -16,4 +18,23 @@ struct xt_bpf_info {
 	struct bpf_prog *filter __attribute__((aligned(8)));
 };
 
+enum xt_bpf_modes {
+	XT_BPF_MODE_BYTECODE,
+	XT_BPF_MODE_FD_PINNED,
+	XT_BPF_MODE_FD_ELF,
+};
+
+struct xt_bpf_info_v1 {
+	__u16 mode;
+	__u16 bpf_program_num_elem;
+	__s32 fd;
+	union {
+		struct sock_filter bpf_program[XT_BPF_MAX_NUM_INSTR];
+		char path[XT_BPF_PATH_MAX];
+	};
+
+	/* only used in the kernel */
+	struct bpf_prog *filter __attribute__((aligned(8)));
+};
+
 #endif /*_XT_BPF_H */
-- 
cgit v1.2.3


From 5b8e2f61b9df529ca4af057daf7bfb1de348bdf1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Dec 2016 19:32:50 -0800
Subject: net: sock_rps_record_flow() is for connected sockets

Paolo noticed a cache line miss in UDP recvmsg() to access
sk_rxhash, sharing a cache line with sk_drops.

sk_drops might be heavily incremented by cpus handling a flood targeting
this socket.

We might place sk_drops on a separate cache line, but lets try
to avoid wasting 64 bytes per socket just for this, since we have
other bottlenecks to take care of.

sock_rps_record_flow() should only access sk_rxhash for connected
flows.

Testing sk_state for TCP_ESTABLISHED covers most of the cases for
connected sockets, for a zero cost, since system calls using
sock_rps_record_flow() also access sk->sk_prot which is on the
same cache line.

A follow up patch will provide a static_key (Jump Label) since most
hosts do not even use RFS.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 6dfe3aa22b97..1749e38d0301 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -913,7 +913,17 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	sock_rps_record_flow_hash(sk->sk_rxhash);
+	/* Reading sk->sk_rxhash might incur an expensive cache line miss.
+	 *
+	 * TCP_ESTABLISHED does cover almost all states where RFS
+	 * might be useful, and is cheaper [1] than testing :
+	 *	IPv4: inet_sk(sk)->inet_daddr
+	 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+	 * OR	an additional socket flag
+	 * [1] : sk_state and sk_prot are in the same cache line.
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED)
+		sock_rps_record_flow_hash(sk->sk_rxhash);
 #endif
 }
 
-- 
cgit v1.2.3


From faa3ffce78298b2b782297765cffd05f52fed9d4 Mon Sep 17 00:00:00 2001
From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Wed, 7 Dec 2016 14:03:10 +0200
Subject: net/sched: cls_flower: Add support for matching on flags

Add UAPI to provide set of flags for matching, where the flags
provided from user-space are mapped to flow-dissector flags.

The 1st flag allows to match on whether the packet is an
IP fragment and corresponds to the FLOW_DIS_IS_FRAGMENT flag.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 1adc0b654996..0ad9f0bce043 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -458,11 +458,18 @@ enum {
 	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,	/* be16 */
 	TCA_FLOWER_KEY_ENC_UDP_DST_PORT,	/* be16 */
 	TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_FLAGS,		/* be32 */
+	TCA_FLOWER_KEY_FLAGS_MASK,	/* be32 */
 	__TCA_FLOWER_MAX,
 };
 
 #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
 
+enum {
+	TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
+};
+
 /* Match-all classifier */
 
 enum {
-- 
cgit v1.2.3


From 972d3876faa8a9195122b2d2bcd3155f904fff37 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Wed, 7 Dec 2016 13:48:27 +0100
Subject: flow dissector: ICMP support

Allow dissection of ICMP(V6) type and code. This should only occur
if a packet is ICMP(V6) and the dissector has FLOW_DISSECTOR_KEY_ICMP set.

There are currently no users of FLOW_DISSECTOR_KEY_ICMP.
A follow-up patch will allow FLOW_DISSECTOR_KEY_ICMP to be used by
the flower classifier.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index c4f31666afd2..d896a33e00d4 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -104,6 +104,22 @@ struct flow_dissector_key_ports {
 	};
 };
 
+/**
+ * flow_dissector_key_icmp:
+ *	@ports: type and code of ICMP header
+ *		icmp: ICMP type (high) and code (low)
+ *		type: ICMP type
+ *		code: ICMP code
+ */
+struct flow_dissector_key_icmp {
+	union {
+		__be16 icmp;
+		struct {
+			u8 type;
+			u8 code;
+		};
+	};
+};
 
 /**
  * struct flow_dissector_key_eth_addrs:
@@ -122,6 +138,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
+	FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
 	FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
-- 
cgit v1.2.3


From 7b684884fbfab33251115fa5054fb821c34b93be Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Wed, 7 Dec 2016 13:48:28 +0100
Subject: net/sched: cls_flower: Support matching on ICMP type and code

Support matching on ICMP type and code.

Example usage:

tc qdisc add dev eth0 ingress

tc filter add dev eth0 protocol ip parent ffff: flower \
	indev eth0 ip_proto icmp type 8 code 0 action drop

tc filter add dev eth0 protocol ipv6 parent ffff: flower \
	indev eth0 ip_proto icmpv6 type 128 code 0 action drop

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/pkt_cls.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 0ad9f0bce043..cb4bcdc58543 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -461,6 +461,16 @@ enum {
 
 	TCA_FLOWER_KEY_FLAGS,		/* be32 */
 	TCA_FLOWER_KEY_FLAGS_MASK,	/* be32 */
+
+	TCA_FLOWER_KEY_ICMPV4_CODE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_TYPE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_CODE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_TYPE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */
+
 	__TCA_FLOWER_MAX,
 };
 
-- 
cgit v1.2.3


From 89caaa2d80b7bf9bd8632cd3137254f8c685e5db Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 15:20:07 +0100
Subject: net: stmmac: add support for independent DMA pbl for tx/rx

GMAC and newer supports independent programmable burst lengths for
DMA tx/rx. Add new optional devicetree properties representing this.

To be backwards compatible, snps,pbl will still be valid, but
snps,txpbl/snps,rxpbl will override the value in snps,pbl if set.

If the IP is synthesized to use the AXI interface, there is a register
and a matching DT property inside the optional stmmac-axi-config DT node
for controlling burst lengths, named snps,blen.
However, using this register, it is not possible to control tx and rx
independently. Also, this register is not available if the IP was
synthesized with, e.g., the AHB interface.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Acked-by: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/stmmac.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 3537fb33cc90..e6d7a5940819 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -88,6 +88,8 @@ struct stmmac_mdio_bus_data {
 
 struct stmmac_dma_cfg {
 	int pbl;
+	int txpbl;
+	int rxpbl;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
-- 
cgit v1.2.3


From 4022d039a315951e59d95d22e79198d861ce4490 Mon Sep 17 00:00:00 2001
From: Niklas Cassel <niklas.cassel@axis.com>
Date: Wed, 7 Dec 2016 15:20:08 +0100
Subject: net: smmac: allow configuring lower pbl values

The driver currently always sets the PBLx8/PBLx4 bit, which means that
the pbl values configured via the pbl/txpbl/rxpbl DT properties are
always multiplied by 8/4 in the hardware.

In order to allow the DT to configure lower pbl values, while at the
same time not changing behavior of any existing device trees using the
pbl/txpbl/rxpbl settings, add a property to disable the multiplication
of the pbl by 8/4 in the hardware.

Suggested-by: Rabin Vincent <rabinv@axis.com>
Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
Acked-by: Alexandre Torgue <alexandre.torgue@st.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/stmmac.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index e6d7a5940819..266dab9ad782 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -90,6 +90,7 @@ struct stmmac_dma_cfg {
 	int pbl;
 	int txpbl;
 	int rxpbl;
+	bool pblx8;
 	int fixed_burst;
 	int mixed_burst;
 	bool aal;
-- 
cgit v1.2.3


From 13bfff25c081f4e060af761c4082b5a96f756810 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 08:29:10 -0800
Subject: net: rfs: add a jump label

RFS is not commonly used, so add a jump label to avoid some conditionals
in fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  1 +
 include/net/sock.h        | 25 ++++++++++++++-----------
 2 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 1ff5ea6e1221..994f7423a74b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -192,6 +192,7 @@ struct net_device_stats {
 #ifdef CONFIG_RPS
 #include <linux/static_key.h>
 extern struct static_key rps_needed;
+extern struct static_key rfs_needed;
 #endif
 
 struct neighbour;
diff --git a/include/net/sock.h b/include/net/sock.h
index 1749e38d0301..2729e77950b7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -913,17 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	/* Reading sk->sk_rxhash might incur an expensive cache line miss.
-	 *
-	 * TCP_ESTABLISHED does cover almost all states where RFS
-	 * might be useful, and is cheaper [1] than testing :
-	 *	IPv4: inet_sk(sk)->inet_daddr
-	 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-	 * OR	an additional socket flag
-	 * [1] : sk_state and sk_prot are in the same cache line.
-	 */
-	if (sk->sk_state == TCP_ESTABLISHED)
-		sock_rps_record_flow_hash(sk->sk_rxhash);
+	if (static_key_false(&rfs_needed)) {
+		/* Reading sk->sk_rxhash might incur an expensive cache line
+		 * miss.
+		 *
+		 * TCP_ESTABLISHED does cover almost all states where RFS
+		 * might be useful, and is cheaper [1] than testing :
+		 *	IPv4: inet_sk(sk)->inet_daddr
+		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+		 * OR	an additional socket flag
+		 * [1] : sk_state and sk_prot are in the same cache line.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED)
+			sock_rps_record_flow_hash(sk->sk_rxhash);
+	}
 #endif
 }
 
-- 
cgit v1.2.3


From c8c8b127091b758f5768f906bcdeeb88bc9951ca Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 09:19:33 -0800
Subject: udp: under rx pressure, try to condense skbs

Under UDP flood, many softirq producers try to add packets to
UDP receive queue, and one user thread is burning one cpu trying
to dequeue packets as fast as possible.

Two parts of the per packet cost are :
- copying payload from kernel space to user space,
- freeing memory pieces associated with skb.

If socket is under pressure, softirq handler(s) can try to pull in
skb->head the payload of the packet if it fits.

Meaning the softirq handler(s) can free/reuse the page fragment
immediately, instead of letting udp_recvmsg() do this hundreds of usec
later, possibly from another node.

Additional gains :
- We reduce skb->truesize and thus can store more packets per SO_RCVBUF
- We avoid cache line misses at copyout() time and consume_skb() time,
and avoid one put_page() with potential alien freeing on NUMA hosts.

This comes at the cost of a copy, bounded to available tail room, which
is usually small. (We might have to fix GRO_MAX_HEAD which looks bigger
than necessary)

This patch gave me about 5 % increase in throughput in my tests.

skb_condense() helper could probably used in other contexts.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9c535fbccf2c..0cd92b0f2af5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1966,6 +1966,8 @@ static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
 	return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
 }
 
+void skb_condense(struct sk_buff *skb);
+
 /**
  *	skb_headroom - bytes at buffer head
  *	@skb: buffer to check
-- 
cgit v1.2.3


From 3665f3817cd354ab7a811b3a4f282c4f5cb1a0d0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 10:05:36 -0800
Subject: net: do not read sk_drops if application does not care

sk_drops can be an often written field, do not read it unless
application showed interest.

Note that sk_drops can be read via inet_diag, so applications
can avoid getting this info from every received packet.

In the future, 'reading' sk_drops might require folding per node or per
cpu fields, and thus become even more expensive than today.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2729e77950b7..e17aa3de2b4d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2163,7 +2163,8 @@ struct sock_skb_cb {
 static inline void
 sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 {
-	SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+	SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+						atomic_read(&sk->sk_drops) : 0;
 }
 
 static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
-- 
cgit v1.2.3


From d2a4dd37f6b41fbcad76efbf63124eb3126c66fe Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 7 Dec 2016 10:57:59 -0800
Subject: bpf: fix state equivalence

Commmits 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
and 484611357c19 ("bpf: allow access into map value arrays") by themselves
are correct, but in combination they make state equivalence ignore 'id' field
of the register state which can lead to accepting invalid program.

Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7453c1281531..a13b031dc6b8 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -18,13 +18,6 @@
 
 struct bpf_reg_state {
 	enum bpf_reg_type type;
-	/*
-	 * Used to determine if any memory access using this register will
-	 * result in a bad access.
-	 */
-	s64 min_value;
-	u64 max_value;
-	u32 id;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
 		s64 imm;
@@ -40,6 +33,13 @@ struct bpf_reg_state {
 		 */
 		struct bpf_map *map_ptr;
 	};
+	u32 id;
+	/* Used to determine if any memory access using this register will
+	 * result in a bad access. These two fields must be last.
+	 * See states_equal()
+	 */
+	s64 min_value;
+	u64 max_value;
 };
 
 enum bpf_stack_slot_type {
-- 
cgit v1.2.3


From f38e7a32ee4fc9c8aeeac59e6e0462cd281586e3 Mon Sep 17 00:00:00 2001
From: "Woojung.Huh@microchip.com" <Woojung.Huh@microchip.com>
Date: Wed, 7 Dec 2016 20:26:07 +0000
Subject: phy: add phy fixup unregister functions

>From : Woojung Huh <woojung.huh@microchip.com>

Add functions to unregister phy fixup for modules.

int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask)
	Unregister phy fixup from phy_fixup_list per bus_id, phy_uid &
	phy_uid_mask

int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask)
	Unregister phy fixup from phy_fixup_list.
	Use it for fixup registered by phy_register_fixup_for_uid()

int phy_unregister_fixup_for_id(const char *bus_id)
	Unregister phy fixup from phy_fixup_list.
	Use it for fixup registered by phy_register_fixup_for_id()

Signed-off-by: Woojung Huh <woojung.huh@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index feb8a98e8dd3..f7d95f644eed 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -860,6 +860,10 @@ int phy_register_fixup_for_id(const char *bus_id,
 int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask,
 			       int (*run)(struct phy_device *));
 
+int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask);
+int phy_unregister_fixup_for_id(const char *bus_id);
+int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask);
+
 int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable);
 int phy_get_eee_err(struct phy_device *phydev);
 int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data);
-- 
cgit v1.2.3


From 17bedab2723145d17b14084430743549e6943d03 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 7 Dec 2016 15:53:11 -0800
Subject: bpf: xdp: Allow head adjustment in XDP prog

This patch allows XDP prog to extend/remove the packet
data at the head (like adding or removing header).  It is
done by adding a new XDP helper bpf_xdp_adjust_head().

It also renames bpf_helper_changes_skb_data() to
bpf_helper_changes_pkt_data() to better reflect
that XDP prog does not work on skb.

This patch adds one "xdp_adjust_head" bit to bpf_prog for the
XDP-capable driver to check if the XDP prog requires
bpf_xdp_adjust_head() support.  The driver can then decide
to error out during XDP_SETUP_PROG.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h   |  6 ++++--
 include/uapi/linux/bpf.h | 11 ++++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index f078d2b1cff6..6a1658308612 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -406,7 +406,8 @@ struct bpf_prog {
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				xdp_adjust_head:1; /* Adjusting pkt head? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
@@ -440,6 +441,7 @@ struct bpf_skb_data_end {
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_hard_start;
 };
 
 /* compute the linear packet data range [data, data_end) which
@@ -595,7 +597,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
-bool bpf_helper_changes_skb_data(void *func);
+bool bpf_helper_changes_pkt_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6123d9b8e828..0eb0e87dbe9f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -424,6 +424,12 @@ union bpf_attr {
  *     @len: length of header to be pushed in front
  *     @flags: Flags (unused for now)
  *     Return: 0 on success or negative error
+ *
+ * int bpf_xdp_adjust_head(xdp_md, delta)
+ *     Adjust the xdp_md.data by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -469,7 +475,8 @@ union bpf_attr {
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
 	FN(get_numa_node_id),		\
-	FN(skb_change_head),
+	FN(skb_change_head),		\
+	FN(xdp_adjust_head),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -576,6 +583,8 @@ struct bpf_sock {
 	__u32 protocol;
 };
 
+#define XDP_PACKET_HEADROOM 256
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
-- 
cgit v1.2.3


From 2fa436b3a2a7009c11a3bc03fe0ff4c26e80fd87 Mon Sep 17 00:00:00 2001
From: Vamsi Krishna <vamsin@qti.qualcomm.com>
Date: Fri, 2 Dec 2016 23:59:08 +0200
Subject: nl80211: Use different attrs for BSSID and random MAC addr in scan
 req

NL80211_ATTR_MAC was used to set both the specific BSSID to be scanned
and the random MAC address to be used when privacy is enabled. When both
the features are enabled, both the BSSID and the local MAC address were
getting same value causing Probe Request frames to go with unintended
DA. Hence, this has been fixed by using a different NL80211_ATTR_BSSID
attribute to set the specific BSSID (which was the more recent addition
in cfg80211) for a scan.

Backwards compatibility with old userspace software is maintained to
some extent by allowing NL80211_ATTR_MAC to be used to set the specific
BSSID when scanning without enabling random MAC address use.

Scanning with random source MAC address was introduced by commit
ad2b26abc157 ("cfg80211: allow drivers to support random MAC addresses
for scan") and the issue was introduced with the addition of the second
user for the same attribute in commit 818965d39177 ("cfg80211: Allow a
scan request for a specific BSSID").

Fixes: 818965d39177 ("cfg80211: Allow a scan request for a specific BSSID")
Signed-off-by: Vamsi Krishna <vamsin@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 259c9c77fdc1..6b76e3b0c18e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -323,7 +323,7 @@
  * @NL80211_CMD_GET_SCAN: get scan results
  * @NL80211_CMD_TRIGGER_SCAN: trigger a new scan with the given parameters
  *	%NL80211_ATTR_TX_NO_CCK_RATE is used to decide whether to send the
- *	probe requests at CCK rate or not. %NL80211_ATTR_MAC can be used to
+ *	probe requests at CCK rate or not. %NL80211_ATTR_BSSID can be used to
  *	specify a BSSID to scan for; if not included, the wildcard BSSID will
  *	be used.
  * @NL80211_CMD_NEW_SCAN_RESULTS: scan notification (as a reply to
@@ -1977,6 +1977,9 @@ enum nl80211_commands {
  * @NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED: Indicates whether or not multicast
  *	packets should be send out as unicast to all stations (flag attribute).
  *
+ * @NL80211_ATTR_BSSID: The BSSID of the AP. Note that %NL80211_ATTR_MAC is also
+ *	used in various commands/events for specifying the BSSID.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -2381,6 +2384,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED,
 
+	NL80211_ATTR_BSSID,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
-- 
cgit v1.2.3


From e6f462df9acd2a3295e5d34eb29e2823220cf129 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 8 Dec 2016 17:22:09 +0100
Subject: cfg80211/mac80211: fix BSS leaks when abandoning assoc attempts

When mac80211 abandons an association attempt, it may free
all the data structures, but inform cfg80211 and userspace
about it only by sending the deauth frame it received, in
which case cfg80211 has no link to the BSS struct that was
used and will not cfg80211_unhold_bss() it.

Fix this by providing a way to inform cfg80211 of this with
the BSS entry passed, so that it can clean up properly, and
use this ability in the appropriate places in mac80211.

This isn't ideal: some code is more or less duplicated and
tracing is missing. However, it's a fairly small change and
it's thus easier to backport - cleanups can come later.

Cc: stable@vger.kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2019310cf135..814be4b4200c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4685,6 +4685,17 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
  */
 void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss);
 
+/**
+ * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt
+ * @dev: network device
+ * @bss: The BSS entry with which association was abandoned.
+ *
+ * Call this whenever - for reasons reported through other API, like deauth RX,
+ * an association attempt was abandoned.
+ * This function may sleep. The caller must hold the corresponding wdev's mutex.
+ */
+void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss);
+
 /**
  * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame
  * @dev: network device
-- 
cgit v1.2.3


From c84d949057cab262b4d3110ead9a42a58c2958f7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Dec 2016 11:41:55 -0800
Subject: udp: copy skb->truesize in the first cache line

In UDP RX handler, we currently clear skb->dev before skb
is added to receive queue, because device pointer is no longer
available once we exit from RCU section.

Since this first cache line is always hot, lets reuse this space
to store skb->truesize and thus avoid a cache line miss at
udp_recvmsg()/udp_skb_destructor time while receive queue
spinlock is held.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0cd92b0f2af5..332e76756f54 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -645,8 +645,15 @@ struct sk_buff {
 		struct rb_node	rbnode; /* used in netem & tcp stack */
 	};
 	struct sock		*sk;
-	struct net_device	*dev;
 
+	union {
+		struct net_device	*dev;
+		/* Some protocols might use this space to store information,
+		 * while device pointer would be NULL.
+		 * UDP receive path is one user.
+		 */
+		unsigned long		dev_scratch;
+	};
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
-- 
cgit v1.2.3


From 6b229cf77d683f634f0edd876c6d1015402303ad Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Dec 2016 11:41:56 -0800
Subject: udp: add batching to udp_rmem_release()

If udp_recvmsg() constantly releases sk_rmem_alloc
for every read packet, it gives opportunity for
producers to immediately grab spinlocks and desperatly
try adding another packet, causing false sharing.

We can add a simple heuristic to give the signal
by batches of ~25 % of the queue capacity.

This patch considerably increases performance under
flood by about 50 %, since the thread draining the queue
is no longer slowed by false sharing.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/udp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/udp.h b/include/linux/udp.h
index d1fd8cd39478..c0f530809d1f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -79,6 +79,9 @@ struct udp_sock {
 	int			(*gro_complete)(struct sock *sk,
 						struct sk_buff *skb,
 						int nhoff);
+
+	/* This field is dirtied by udp_recvmsg() */
+	int		forward_deficit;
 };
 
 static inline struct udp_sock *udp_sk(const struct sock *sk)
-- 
cgit v1.2.3


From 41c43fbee68f4f9a2a9675d83bca91c77862d7f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= <asbjorn@asbjorn.st>
Date: Sun, 11 Dec 2016 00:18:57 +0000
Subject: net: l2tp: export debug flags to UAPI

Move the L2TP_MSG_* definitions to UAPI, as it is part of
the netlink API.

Signed-off-by: Asbjoern Sloth Toennesen <asbjorn@asbjorn.st>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/l2tp.h | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h
index 5daa48e2571e..85ddb74fcd1c 100644
--- a/include/uapi/linux/l2tp.h
+++ b/include/uapi/linux/l2tp.h
@@ -108,7 +108,7 @@ enum {
 	L2TP_ATTR_VLAN_ID,		/* u16 */
 	L2TP_ATTR_COOKIE,		/* 0, 4 or 8 bytes */
 	L2TP_ATTR_PEER_COOKIE,		/* 0, 4 or 8 bytes */
-	L2TP_ATTR_DEBUG,		/* u32 */
+	L2TP_ATTR_DEBUG,		/* u32, enum l2tp_debug_flags */
 	L2TP_ATTR_RECV_SEQ,		/* u8 */
 	L2TP_ATTR_SEND_SEQ,		/* u8 */
 	L2TP_ATTR_LNS_MODE,		/* u8 */
@@ -175,6 +175,21 @@ enum l2tp_seqmode {
 	L2TP_SEQ_ALL = 2,
 };
 
+/**
+ * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions
+ *
+ * @L2TP_MSG_DEBUG: verbose debug (if compiled in)
+ * @L2TP_MSG_CONTROL: userspace - kernel interface
+ * @L2TP_MSG_SEQ: sequence numbers
+ * @L2TP_MSG_DATA: data packets
+ */
+enum l2tp_debug_flags {
+	L2TP_MSG_DEBUG		= (1 << 0),
+	L2TP_MSG_CONTROL	= (1 << 1),
+	L2TP_MSG_SEQ		= (1 << 2),
+	L2TP_MSG_DATA		= (1 << 3),
+};
+
 /*
  * NETLINK_GENERIC related info
  */
-- 
cgit v1.2.3


From 47c3e7783be4e142b861d34b5c2e223330b05d8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= <asbjorn@asbjorn.st>
Date: Sun, 11 Dec 2016 00:18:58 +0000
Subject: net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_*

PPPOL2TP_MSG_* and L2TP_MSG_* are duplicates, and are being used
interchangeably in the kernel, so let's standardize on L2TP_MSG_*
internally, and keep PPPOL2TP_MSG_* defined in UAPI for compatibility.

Signed-off-by: Asbjoern Sloth Toennesen <asbjorn@asbjorn.st>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_pppol2tp.h | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_pppol2tp.h b/include/uapi/linux/if_pppol2tp.h
index 4bd1f55d6377..6418c4d10241 100644
--- a/include/uapi/linux/if_pppol2tp.h
+++ b/include/uapi/linux/if_pppol2tp.h
@@ -18,6 +18,7 @@
 #include <linux/types.h>
 #include <linux/in.h>
 #include <linux/in6.h>
+#include <linux/l2tp.h>
 
 /* Structure used to connect() the socket to a particular tunnel UDP
  * socket over IPv4.
@@ -90,14 +91,12 @@ enum {
 	PPPOL2TP_SO_REORDERTO	= 5,
 };
 
-/* Debug message categories for the DEBUG socket option */
+/* Debug message categories for the DEBUG socket option (deprecated) */
 enum {
-	PPPOL2TP_MSG_DEBUG	= (1 << 0),	/* verbose debug (if
-						 * compiled in) */
-	PPPOL2TP_MSG_CONTROL	= (1 << 1),	/* userspace - kernel
-						 * interface */
-	PPPOL2TP_MSG_SEQ	= (1 << 2),	/* sequence numbers */
-	PPPOL2TP_MSG_DATA	= (1 << 3),	/* data packets */
+	PPPOL2TP_MSG_DEBUG	= L2TP_MSG_DEBUG,
+	PPPOL2TP_MSG_CONTROL	= L2TP_MSG_CONTROL,
+	PPPOL2TP_MSG_SEQ	= L2TP_MSG_SEQ,
+	PPPOL2TP_MSG_DATA	= L2TP_MSG_DATA,
 };
 
 
-- 
cgit v1.2.3