From e227300c8395dffaa7614ce7c7666a82180ebc60 Mon Sep 17 00:00:00 2001
From: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
Date: Wed, 12 Oct 2016 18:25:35 +0530
Subject: cfg80211: pass struct to interface combination check/iter

Move the growing parameter list to a structure for the interface
combination check and iteration functions in cfg80211 and mac80211
to make the code easier to understand.

Signed-off-by: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
[edit commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index fe78f02a242e..ea108541e1e0 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -771,6 +771,26 @@ struct cfg80211_csa_settings {
 	u8 count;
 };
 
+/**
+ * struct iface_combination_params - input parameters for interface combinations
+ *
+ * Used to pass interface combination parameters
+ *
+ * @num_different_channels: the number of different channels we want
+ *	to use for verification
+ * @radar_detect: a bitmap where each bit corresponds to a channel
+ *	width where radar detection is needed, as in the definition of
+ *	&struct ieee80211_iface_combination.@radar_detect_widths
+ * @iftype_num: array with the number of interfaces of each interface
+ *	type.  The index is the interface type as specified in &enum
+ *	nl80211_iftype.
+ */
+struct iface_combination_params {
+	int num_different_channels;
+	u8 radar_detect;
+	int iftype_num[NUM_NL80211_IFTYPES];
+};
+
 /**
  * enum station_parameters_apply_mask - station parameter values to apply
  * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp)
@@ -5575,36 +5595,20 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy);
  * cfg80211_check_combinations - check interface combinations
  *
  * @wiphy: the wiphy
- * @num_different_channels: the number of different channels we want
- *	to use for verification
- * @radar_detect: a bitmap where each bit corresponds to a channel
- *	width where radar detection is needed, as in the definition of
- *	&struct ieee80211_iface_combination.@radar_detect_widths
- * @iftype_num: array with the numbers of interfaces of each interface
- *	type.  The index is the interface type as specified in &enum
- *	nl80211_iftype.
+ * @params: the interface combinations parameter
  *
  * This function can be called by the driver to check whether a
  * combination of interfaces and their types are allowed according to
  * the interface combinations.
  */
 int cfg80211_check_combinations(struct wiphy *wiphy,
-				const int num_different_channels,
-				const u8 radar_detect,
-				const int iftype_num[NUM_NL80211_IFTYPES]);
+				struct iface_combination_params *params);
 
 /**
  * cfg80211_iter_combinations - iterate over matching combinations
  *
  * @wiphy: the wiphy
- * @num_different_channels: the number of different channels we want
- *	to use for verification
- * @radar_detect: a bitmap where each bit corresponds to a channel
- *	width where radar detection is needed, as in the definition of
- *	&struct ieee80211_iface_combination.@radar_detect_widths
- * @iftype_num: array with the numbers of interfaces of each interface
- *	type.  The index is the interface type as specified in &enum
- *	nl80211_iftype.
+ * @params: the interface combinations parameter
  * @iter: function to call for each matching combination
  * @data: pointer to pass to iter function
  *
@@ -5613,9 +5617,7 @@ int cfg80211_check_combinations(struct wiphy *wiphy,
  * purposes.
  */
 int cfg80211_iter_combinations(struct wiphy *wiphy,
-			       const int num_different_channels,
-			       const u8 radar_detect,
-			       const int iftype_num[NUM_NL80211_IFTYPES],
+			       struct iface_combination_params *params,
 			       void (*iter)(const struct ieee80211_iface_combination *c,
 					    void *data),
 			       void *data);
-- 
cgit v1.2.3


From 0c317a02ca982ca093e71bf07cb562265ba40032 Mon Sep 17 00:00:00 2001
From: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
Date: Wed, 12 Oct 2016 18:26:51 +0530
Subject: cfg80211: support virtual interfaces with different beacon intervals

This commit provides a mechanism for the host drivers to advertise the
support for different beacon intervals among the respective interface
combinations in a group, through NL80211_IFACE_COMB_BI_MIN_GCD (u32).

This value will be compared against GCD of all beaconing interfaces of
matching combinations.

If the driver doesn't advertise this value, the old behaviour where
all beacon intervals must be identical is retained.

If it is specified, then any beacon interval for an interface in the
interface combination as well as the GCD of all active beacon intervals
in the combination must be greater or equal to this value.

Signed-off-by: Purushottam Kushwaha <pkushwah@qti.qualcomm.com>
[change commit message, some variable names, small other things]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ea108541e1e0..5000ec758eb3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -784,11 +784,19 @@ struct cfg80211_csa_settings {
  * @iftype_num: array with the number of interfaces of each interface
  *	type.  The index is the interface type as specified in &enum
  *	nl80211_iftype.
+ * @beacon_int_gcd: a value specifying GCD of all beaconing interfaces,
+ *	the GCD of a single value is considered the value itself, so for
+ *	a single interface this should be set to that interface's beacon
+ *	interval
+ * @beacon_int_different: a flag indicating whether or not all beacon
+ *	intervals (of beaconing interfaces) are different or not.
  */
 struct iface_combination_params {
 	int num_different_channels;
 	u8 radar_detect;
 	int iftype_num[NUM_NL80211_IFTYPES];
+	u32 beacon_int_gcd;
+	bool beacon_int_different;
 };
 
 /**
@@ -3100,6 +3108,12 @@ struct ieee80211_iface_limit {
  *	only in special cases.
  * @radar_detect_widths: bitmap of channel widths supported for radar detection
  * @radar_detect_regions: bitmap of regions supported for radar detection
+ * @beacon_int_min_gcd: This interface combination supports different
+ *	beacon intervals.
+ *	= 0 - all beacon intervals for different interface must be same.
+ *	> 0 - any beacon interval for the interface part of this combination AND
+ *	      *GCD* of all beacon intervals from beaconing interfaces of this
+ *	      combination must be greater or equal to this value.
  *
  * With this structure the driver can describe which interface
  * combinations it supports concurrently.
@@ -3158,6 +3172,7 @@ struct ieee80211_iface_combination {
 	bool beacon_int_infra_match;
 	u8 radar_detect_widths;
 	u8 radar_detect_regions;
+	u32 beacon_int_min_gcd;
 };
 
 struct ieee80211_txrx_stypes {
-- 
cgit v1.2.3


From cc6ac9bccf6b9814d37932e86a92f8e6a92960dc Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 8 Oct 2016 11:36:05 +0800
Subject: sctp: reuse sent_count to avoid retransmitted chunks for RTT
 measurements

Now sctp uses chunk->resent to record if a chunk is retransmitted, for
RTT measurements with retransmitted DATA chunks. chunk->sent_count was
introduced to record how many times one chunk has been sent for prsctp
RTX policy before. We actually can know if one chunk is retransmitted
by checking chunk->sent_count is greater than 1.

This patch is to remove resent from sctp_chunk and reuse sent_count
to avoid retransmitted chunks for RTT measurements.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 11c3bf262a85..27a933e4c90e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -641,7 +641,6 @@ struct sctp_chunk {
 #define SCTP_NEED_FRTX 0x1
 #define SCTP_DONT_FRTX 0x2
 	__u16	rtt_in_progress:1,	/* This chunk used for RTT calc? */
-		resent:1,		/* Has this chunk ever been resent. */
 		has_tsn:1,		/* Does this chunk have a TSN yet? */
 		has_ssn:1,		/* Does this chunk have a SSN yet? */
 		singleton:1,		/* Only chunk in the packet? */
@@ -656,6 +655,7 @@ struct sctp_chunk {
 		fast_retransmit:2;	/* Is this chunk fast retransmitted? */
 };
 
+#define sctp_chunk_retransmitted(chunk)	(chunk->sent_count > 1)
 void sctp_chunk_hold(struct sctp_chunk *);
 void sctp_chunk_put(struct sctp_chunk *);
 int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len,
-- 
cgit v1.2.3


From 8ae808eb853e3789b81b8a502cdf22bb01b76880 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 8 Oct 2016 11:40:16 +0800
Subject: sctp: remove the old ttl expires policy

The prsctp polices include ttl expires policy already, we should remove
the old ttl expires codes, and just adjust the new polices' codes to be
compatible with the old one for users.

This patch is to remove all the old expires codes, and if prsctp polices
are not set, it will still set msg's expires_at and check the expires in
sctp_check_abandoned.

Note that asoc->prsctp_enable is set by default, so users can't feel any
difference even if they use the old expires api in userspace.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 27a933e4c90e..bd4a3ded7c87 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -530,7 +530,6 @@ struct sctp_datamsg {
 	/* Did the messenge fail to send? */
 	int send_error;
 	u8 send_failed:1,
-	   can_abandon:1,   /* can chunks from this message can be abandoned. */
 	   can_delay;	    /* should this message be Nagle delayed */
 };
 
-- 
cgit v1.2.3


From 165779231ff9e9c4ac7baaee84eff91d589f3e22 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Thu, 13 Oct 2016 09:06:41 +0300
Subject: net/sched: act_mirred: Rename tcfm_ok_push to tcfm_mac_header_xmit
 and make it a bool

'tcfm_ok_push' specifies whether a mac_len sized push is needed upon
egress to the target device (if action is performed at ingress).

Rename it to 'tcfm_mac_header_xmit' as this is actually an attribute of
the target device (and use a bool instead of int).

This allows to decouple the attribute from the action to be taken.

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_mirred.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 62770add15bd..95431092c4b6 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -8,7 +8,7 @@ struct tcf_mirred {
 	struct tc_action	common;
 	int			tcfm_eaction;
 	int			tcfm_ifindex;
-	int			tcfm_ok_push;
+	bool			tcfm_mac_header_xmit;
 	struct net_device __rcu	*tcfm_dev;
 	struct list_head	tcfm_list;
 };
-- 
cgit v1.2.3


From 5724b8b5694794829a071c6da7dd0bc146df0756 Mon Sep 17 00:00:00 2001
From: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Date: Thu, 13 Oct 2016 09:06:43 +0300
Subject: net/sched: tc_mirred: Rename public predicates
 'is_tcf_mirred_redirect' and 'is_tcf_mirred_mirror'

These accessors are used in various drivers that support tc offloading,
to detect properties of a given 'tc_action'.

'is_tcf_mirred_redirect' tests that the action is TCA_EGRESS_REDIR.
'is_tcf_mirred_mirror' tests that the action is TCA_EGRESS_MIRROR.

As a prep towards supporting INGRESS redir/mirror, rename these
predicates to reflect their true meaning:
  s/is_tcf_mirred_redirect/is_tcf_mirred_egress_redirect/
  s/is_tcf_mirred_mirror/is_tcf_mirred_egress_mirror/

Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: Hariprasad S <hariprasad@chelsio.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Cc: Ido Schimmel <idosch@mellanox.com>
Cc: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_mirred.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h
index 95431092c4b6..604bc31e23ab 100644
--- a/include/net/tc_act/tc_mirred.h
+++ b/include/net/tc_act/tc_mirred.h
@@ -14,7 +14,7 @@ struct tcf_mirred {
 };
 #define to_mirred(a) ((struct tcf_mirred *)a)
 
-static inline bool is_tcf_mirred_redirect(const struct tc_action *a)
+static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
@@ -23,7 +23,7 @@ static inline bool is_tcf_mirred_redirect(const struct tc_action *a)
 	return false;
 }
 
-static inline bool is_tcf_mirred_mirror(const struct tc_action *a)
+static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (a->ops && a->ops->type == TCA_ACT_MIRRED)
-- 
cgit v1.2.3


From 1104d9ba443a3972052ea4eaa01e51f9ee084652 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Fri, 14 Oct 2016 11:25:36 -0700
Subject: lwtunnel: Add destroy state operation

Users of lwt tunnels may set up some secondary state in build_state
function. Add a corresponding destroy_state function to allow users to
clean up state. This destroy state function is called from lwstate_free.
Also, we now free lwstate using kfree_rcu so user can assume structure
is not freed before rcu.

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index ea3f80f58fd6..67d235f43202 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -29,6 +29,7 @@ struct lwtunnel_state {
 	int		(*orig_input)(struct sk_buff *);
 	int             len;
 	__u16		headroom;
+	struct		rcu_head rcu;
 	__u8            data[0];
 };
 
@@ -36,6 +37,7 @@ struct lwtunnel_encap_ops {
 	int (*build_state)(struct net_device *dev, struct nlattr *encap,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts);
+	void (*destroy_state)(struct lwtunnel_state *lws);
 	int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int (*input)(struct sk_buff *skb);
 	int (*fill_encap)(struct sk_buff *skb,
@@ -46,10 +48,7 @@ struct lwtunnel_encap_ops {
 };
 
 #ifdef CONFIG_LWTUNNEL
-static inline void lwtstate_free(struct lwtunnel_state *lws)
-{
-	kfree(lws);
-}
+void lwtstate_free(struct lwtunnel_state *lws);
 
 static inline struct lwtunnel_state *
 lwtstate_get(struct lwtunnel_state *lws)
-- 
cgit v1.2.3


From a3e2f4b6ed9de85086850fe49801f9b00adb6ae1 Mon Sep 17 00:00:00 2001
From: Michael Braun <michael-dev@fami-braun.de>
Date: Sat, 15 Oct 2016 13:28:19 +0200
Subject: mac80211: fix A-MSDU outer SA/DA

According to IEEE 802.11-2012 section 8.3.2 table 8-19, the outer SA/DA
of A-MSDU frames need to be changed depending on FromDS/ToDS values.

Signed-off-by: Michael Braun <michael-dev@fami-braun.de>
[use ether_addr_copy and add alignment annotations]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a810dfcb83c2..e50c9e02889a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1438,7 +1438,7 @@ enum ieee80211_vif_flags {
 struct ieee80211_vif {
 	enum nl80211_iftype type;
 	struct ieee80211_bss_conf bss_conf;
-	u8 addr[ETH_ALEN];
+	u8 addr[ETH_ALEN] __aligned(2);
 	bool p2p;
 	bool csa_active;
 	bool mu_mimo_owner;
-- 
cgit v1.2.3


From a1264c3d6c04f0e4e9d447caaa249d6288b01520 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 18 Oct 2016 23:12:07 +0300
Subject: wireless: radiotap: fix timestamp sampling position values

The values don't match the radiotap spec, corrected that.

Reported-by: Oz Shalev <oz.shalev@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/ieee80211_radiotap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index ba07b9d8ed63..d0e7e3f8e67a 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -333,9 +333,9 @@ enum ieee80211_radiotap_type {
 #define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_NS			0x0003
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_MASK			0x00F0
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_BEGIN_MDPU		0x0000
-#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU		0x0010
+#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ		0x0010
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_PPDU		0x0020
-#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ		0x0030
+#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU		0x0030
 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_UNKNOWN		0x00F0
 
 #define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT			0x00
-- 
cgit v1.2.3


From 0aa419ec6e7b98d485f6c66a62a90965eda3c1bb Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Tue, 18 Oct 2016 23:12:10 +0300
Subject: mac80211: allow the driver not to pass the tid to
 ieee80211_sta_uapsd_trigger

iwlwifi will check internally that the tid maps to an AC
that is trigger enabled, but can't know what tid exactly.
Allow the driver to pass a generic tid and make mac80211
assume that a trigger frame was received.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index e50c9e02889a..f3dbadafe16e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4087,6 +4087,10 @@ void ieee80211_sta_pspoll(struct ieee80211_sta *sta);
  * This must be used in conjunction with ieee80211_sta_ps_transition()
  * and possibly ieee80211_sta_pspoll(); calls to all three must be
  * serialized.
+ * %IEEE80211_NUM_TIDS can be passed as the tid if the tid is unknown.
+ * In this case, mac80211 will not check that this tid maps to an AC
+ * that is trigger enabled and assume that the caller did the proper
+ * checks.
  */
 void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid);
 
-- 
cgit v1.2.3


From f3fe4e93dd6346c01fd4070ae02ec746fbae73bb Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Tue, 18 Oct 2016 23:12:11 +0300
Subject: mac80211: add a HW flag for supporting HW TX fragmentation

Currently mac80211 determines whether HW does fragmentation
by checking whether the set_frag_threshold callback is set
or not.
However, some drivers may want to set the HW fragmentation
capability depending on HW generation.
Allow this by checking a HW flag instead of checking the
callback.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
[added the flag to ath10k and wlcore]
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f3dbadafe16e..a1a27021f452 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2025,6 +2025,10 @@ struct ieee80211_txq {
  *	drivers, mac80211 packet loss mechanism will not be triggered and driver
  *	is completely depending on firmware event for station kickout.
  *
+ * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself.
+ *	The stack will not do fragmentation.
+ *	The callback for @set_frag_threshold should be set as well.
+ *
  * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
  */
 enum ieee80211_hw_flags {
@@ -2066,6 +2070,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_TX_AMSDU,
 	IEEE80211_HW_TX_FRAG_LIST,
 	IEEE80211_HW_REPORTS_LOW_ACK,
+	IEEE80211_HW_SUPPORTS_TX_FRAG,
 
 	/* keep last, obviously */
 	NUM_IEEE80211_HW_FLAGS
@@ -3093,8 +3098,9 @@ enum ieee80211_reconfig_type {
  *	The callback must be atomic.
  *
  * @set_frag_threshold: Configuration of fragmentation threshold. Assign this
- *	if the device does fragmentation by itself; if this callback is
- *	implemented then the stack will not do fragmentation.
+ *	if the device does fragmentation by itself. Note that to prevent the
+ *	stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG
+ *	should be set as well.
  *	The callback can sleep.
  *
  * @set_rts_threshold: Configuration of RTS threshold (if device needs it)
-- 
cgit v1.2.3


From f438ceb81d424cb90a5a1aad569056bd7c2ab4c5 Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Tue, 18 Oct 2016 23:12:12 +0300
Subject: mac80211: uapsd_queues is in QoS IE order

The uapsd_queue field is in QoS IE order and not in
IEEE80211_AC_*'s order.
This means that mac80211 would get confused between
BK and BE which is certainly not such a big deal but
needs to be fixed.

Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 3 ++-
 include/net/mac80211.h | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 5000ec758eb3..10a26f0fbafe 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4574,7 +4574,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
  *	moves to cfg80211 in this call
  * @buf: authentication frame (header + body)
  * @len: length of the frame data
- * @uapsd_queues: bitmap of ACs configured to uapsd. -1 if n/a.
+ * @uapsd_queues: bitmap of queues configured for uapsd. Same format
+ *	as the AC bitmap in the QoS info field
  *
  * After being asked to associate via cfg80211_ops::assoc() the driver must
  * call either this function or cfg80211_auth_timeout().
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index a1a27021f452..b9b24abd9103 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1745,7 +1745,8 @@ struct ieee80211_sta_rates {
  * @drv_priv: data area for driver use, will always be aligned to
  *	sizeof(void *), size is determined in hw information.
  * @uapsd_queues: bitmap of queues configured for uapsd. Only valid
- *	if wme is supported.
+ *	if wme is supported. The bits order is like in
+ *	IEEE80211_WMM_IE_STA_QOSINFO_AC_*.
  * @max_sp: max Service Period. Only valid if wme is supported.
  * @bandwidth: current bandwidth the station can receive with
  * @rx_nss: in HT/VHT, the maximum number of spatial streams the
-- 
cgit v1.2.3


From 0711d638786941ec02551dd9b4aa0d8341f7db5b Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Tue, 18 Oct 2016 23:12:13 +0300
Subject: cfg80211: allow aborting in-progress connection atttempts

On a disconnect request from userspace, cfg80211 currently calls
called rdev_disconnect() only in case that 'current_bss' was set,
i.e. connection had been established.

Change this to allow the userspace call to succeed and call the
driver's disconnect() method also while the connection attempt is
in progress, to be able to abort attempts.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
[change commit subject/message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 10a26f0fbafe..2bbbcc3eecac 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2564,9 +2564,10 @@ struct cfg80211_nan_func {
  *	cases, the result of roaming is indicated with a call to
  *	cfg80211_roamed() or cfg80211_roamed_bss().
  *	(invoked with the wireless_dev mutex held)
- * @disconnect: Disconnect from the BSS/ESS. Once done, call
- *	cfg80211_disconnected().
- *	(invoked with the wireless_dev mutex held)
+ * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
+ *      connection is in progress. Once done, call cfg80211_disconnected() in
+ *      case connection was already established (invoked with the
+ *      wireless_dev mutex held), otherwise call cfg80211_connect_timeout().
  *
  * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call
  *	cfg80211_ibss_joined(), also call that function when changing BSSID due
-- 
cgit v1.2.3


From f8c3bf00d440df2bc2c3f669d460868d9ba67226 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 21 Oct 2016 13:55:45 +0200
Subject: net/socket: factor out helpers for memory and queue manipulation

Basic sock operations that udp code can use with its own
memory accounting schema. No functional change is introduced
in the existing APIs.

v4 -> v5:
  - avoid whitespace changes

v2 -> v4:
  - avoid exporting __sock_enqueue_skb

v1 -> v2:
  - avoid export sock_rmem_free

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index ebf75db08e06..276489553338 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1274,7 +1274,9 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
 /*
  * Functions for memory accounting
  */
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
 int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
 #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
@@ -1950,6 +1952,8 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
 
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+			unsigned int flags);
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
-- 
cgit v1.2.3


From f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 21 Oct 2016 13:55:46 +0200
Subject: udp: implement memory accounting helpers

Avoid using the generic helpers.
Use the receive queue spin lock to protect the memory
accounting operation, both on enqueue and on dequeue.

On dequeue perform partial memory reclaiming, trying to
leave a quantum of forward allocated memory.

On enqueue use a custom helper, to allow some optimizations:
- use a plain spin_lock() variant instead of the slightly
  costly spin_lock_irqsave(),
- avoid dst_force check, since the calling code has already
  dropped the skb dst
- avoid orphaning the skb, since skb_steal_sock() already did
  the work for us

The above needs custom memory reclaiming on shutdown, provided
by the udp_destruct_sock().

v5 -> v6:
  - don't orphan the skb on enqueue

v4 -> v5:
  - replace the mem_lock with the receive queue spin lock
  - ensure that the bh is always allowed to enqueue at least
    a skb, even if sk_rcvbuf is exceeded

v3 -> v4:
  - reworked memory accunting, simplifying the schema
  - provide an helper for both memory scheduling and enqueuing

v1 -> v2:
  - use a udp specific destrctor to perform memory reclaiming
  - remove a couple of helpers, unneeded after the above cleanup
  - do not reclaim memory on dequeue if not under memory
    pressure
  - reworked the fwd accounting schema to avoid potential
    integer overflow

Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/udp.h b/include/net/udp.h
index ea53a87d880f..18f1e6b91927 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -246,6 +246,9 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 }
 
 /* net/ipv4/udp.c */
+void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
+int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
 		 int (*saddr_cmp)(const struct sock *,
@@ -258,6 +261,7 @@ void udp_flush_pending_frames(struct sock *sk);
 void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
 int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int udp_init_sock(struct sock *sk);
 int udp_disconnect(struct sock *sk, int flags);
 unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait);
 struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
-- 
cgit v1.2.3


From f76a9db351f8beb3259c4ba38058de0058ab8000 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Fri, 21 Oct 2016 16:10:22 +0200
Subject: lwt: Remove unused len field

The field is initialized by ILA and MPLS but never used. Remove it.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 67d235f43202..82e76fe1c1f7 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -24,11 +24,10 @@ enum {
 struct lwtunnel_state {
 	__u16		type;
 	__u16		flags;
+	__u16		headroom;
 	atomic_t	refcnt;
 	int		(*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int		(*orig_input)(struct sk_buff *);
-	int             len;
-	__u16		headroom;
 	struct		rcu_head rcu;
 	__u8            data[0];
 };
-- 
cgit v1.2.3


From 432490f9d455fb842d70219f22d9d2c812371676 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 21 Oct 2016 13:03:44 +0300
Subject: net: ip, diag -- Add diag interface for raw sockets

In criu we are actively using diag interface to collect sockets
present in the system when dumping applications. And while for
unix, tcp, udp[lite], packet, netlink it works as expected,
the raw sockets do not have. Thus add it.

v2:
 - add missing sock_put calls in raw_diag_dump_one (by eric.dumazet@)
 - implement @destroy for diag requests (by dsa@)

v3:
 - add export of raw_abort for IPv6 (by dsa@)
 - pass net-admin flag into inet_sk_diag_fill due to
   changes in net-next branch (by dsa@)

v4:
 - use @pad in struct inet_diag_req_v2 for raw socket
   protocol specification: raw module carries sockets
   which may have custom protocol passed from socket()
   syscall and sole @sdiag_protocol is not enough to
   match underlied ones
 - start reporting protocol specifed in socket() call
   when sockets are raw ones for the same reason: user
   space tools like ss may parse this attribute and use
   it for socket matching

v5 (by eric.dumazet@):
 - use sock_hold in raw_sock_get instead of atomic_inc,
   we're holding (raw_v4_hashinfo|raw_v6_hashinfo)->lock
   when looking up so counter won't be zero here.

v6:
 - use sdiag_raw_protocol() helper which will access @pad
   structure used for raw sockets protocol specification:
   we can't simply rename this member without breaking uapi

v7:
 - sine sdiag_raw_protocol() helper is not suitable for
   uapi lets rather make an alias structure with proper
   names. __check_inet_diag_req_raw helper will catch
   if any of structure unintentionally changed.

CC: David S. Miller <davem@davemloft.net>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: David Ahern <dsa@cumulusnetworks.com>
CC: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
CC: James Morris <jmorris@namei.org>
CC: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
CC: Patrick McHardy <kaber@trash.net>
CC: Andrey Vagin <avagin@openvz.org>
CC: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/raw.h   | 6 ++++++
 include/net/rawv6.h | 7 +++++++
 2 files changed, 13 insertions(+)

(limited to 'include/net')

diff --git a/include/net/raw.h b/include/net/raw.h
index 3e789008394d..57c33dd22ec4 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -23,6 +23,12 @@
 
 extern struct proto raw_prot;
 
+extern struct raw_hashinfo raw_v4_hashinfo;
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, __be32 raddr,
+			     __be32 laddr, int dif);
+
+int raw_abort(struct sock *sk, int err);
 void raw_icmp_error(struct sk_buff *, int, u32);
 int raw_local_deliver(struct sk_buff *, int);
 
diff --git a/include/net/rawv6.h b/include/net/rawv6.h
index 87783dea0791..cbe4e9de1894 100644
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,6 +3,13 @@
 
 #include <net/protocol.h>
 
+extern struct raw_hashinfo raw_v6_hashinfo;
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, const struct in6_addr *loc_addr,
+			     const struct in6_addr *rmt_addr, int dif);
+
+int raw_abort(struct sock *sk, int err);
+
 void raw6_icmp_error(struct sk_buff *, int nexthdr,
 		u8 type, u8 code, int inner_offset, __be32);
 bool raw6_local_deliver(struct sk_buff *, int);
-- 
cgit v1.2.3


From 73c7da3dae1e7cd8febeab13767b2698b84dfa15 Mon Sep 17 00:00:00 2001
From: Arend Van Spriel <arend.vanspriel@broadcom.com>
Date: Thu, 20 Oct 2016 20:08:22 +0100
Subject: cfg80211: add generic helper to check interface is running

Add a helper using wdev to check if interface is running. This
deals with both non-netdev and netdev interfaces. In struct
wireless_dev replace 'p2p_started' and 'nan_started' by
'is_running' as those are mutually exclusive anyway, and unify
all the code to use wdev_running().

Signed-off-by: Arend van Spriel <arend.vanspriel@broadcom.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2bbbcc3eecac..ec39f891b7a3 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -3781,8 +3781,8 @@ struct cfg80211_cached_keys;
  * @beacon_interval: beacon interval used on this device for transmitting
  *	beacons, 0 when not valid
  * @address: The address for this device, valid only if @netdev is %NULL
- * @p2p_started: true if this is a P2P Device that has been started
- * @nan_started: true if this is a NAN interface that has been started
+ * @is_running: true if this is a non-netdev device that has been started, e.g.
+ *	the P2P Device.
  * @cac_started: true if DFS channel availability check has been started
  * @cac_start_time: timestamp (jiffies) when the dfs state was entered.
  * @cac_time_ms: CAC time in ms
@@ -3814,7 +3814,7 @@ struct wireless_dev {
 
 	struct mutex mtx;
 
-	bool use_4addr, p2p_started, nan_started;
+	bool use_4addr, is_running;
 
 	u8 address[ETH_ALEN] __aligned(sizeof(u16));
 
@@ -3871,6 +3871,13 @@ static inline u8 *wdev_address(struct wireless_dev *wdev)
 	return wdev->address;
 }
 
+static inline bool wdev_running(struct wireless_dev *wdev)
+{
+	if (wdev->netdev)
+		return netif_running(wdev->netdev);
+	return wdev->is_running;
+}
+
 /**
  * wdev_priv - return wiphy priv from wireless_dev
  *
-- 
cgit v1.2.3


From 4c8dea638c16141adb046fd2e0cab51dfe43650c Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 21 Oct 2016 14:25:13 +0200
Subject: cfg80211: validate beacon int as part of iface combinations

Remove the pointless checking against interface combinations in
the initial basic beacon interval validation, that currently isn't
taking into account radar detection or channels properly. Instead,
just validate the basic range there, and then delay real checking
to the interface combination validation that drivers must do.

This means that drivers wanting to use the beacon_int_min_gcd will
now have to pass the new_beacon_int when validating the AP/mesh
start.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index ec39f891b7a3..d1ffbc3a8e55 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -784,19 +784,15 @@ struct cfg80211_csa_settings {
  * @iftype_num: array with the number of interfaces of each interface
  *	type.  The index is the interface type as specified in &enum
  *	nl80211_iftype.
- * @beacon_int_gcd: a value specifying GCD of all beaconing interfaces,
- *	the GCD of a single value is considered the value itself, so for
- *	a single interface this should be set to that interface's beacon
- *	interval
- * @beacon_int_different: a flag indicating whether or not all beacon
- *	intervals (of beaconing interfaces) are different or not.
+ * @new_beacon_int: set this to the beacon interval of a new interface
+ *	that's not operating yet, if such is to be checked as part of
+ *	the verification
  */
 struct iface_combination_params {
 	int num_different_channels;
 	u8 radar_detect;
 	int iftype_num[NUM_NL80211_IFTYPES];
-	u32 beacon_int_gcd;
-	bool beacon_int_different;
+	u32 new_beacon_int;
 };
 
 /**
-- 
cgit v1.2.3


From 11b6b5a4ced2f2c76073b97ee08ca0eab8358fde Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:41:58 +0300
Subject: cfg80211: Rename SAE_DATA to more generic AUTH_DATA

This adds defines and nl80211 extensions to allow FILS Authentication to
be implemented similarly to SAE. FILS does not need the special rules
for the Authentication transaction number and Status code fields, but it
does need to add non-IE fields. The previously used
NL80211_ATTR_SAE_DATA can be reused for this to avoid having to
duplicate that implementation. Rename that attribute to more generic
NL80211_ATTR_AUTH_DATA (with backwards compatibility define for
NL80211_SAE_DATA).

Also document the special rules related to the Authentication
transaction number and Status code fiels.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index d1ffbc3a8e55..dffc265a4fd6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1785,9 +1785,11 @@ const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie);
  * @key_len: length of WEP key for shared key authentication
  * @key_idx: index of WEP key for shared key authentication
  * @key: WEP key for shared key authentication
- * @sae_data: Non-IE data to use with SAE or %NULL. This starts with
- *	Authentication transaction sequence number field.
- * @sae_data_len: Length of sae_data buffer in octets
+ * @auth_data: Fields and elements in Authentication frames. This contains
+ *	the authentication frame body (non-IE and IE data), excluding the
+ *	Authentication algorithm number, i.e., starting at the Authentication
+ *	transaction sequence number field.
+ * @auth_data_len: Length of auth_data buffer in octets
  */
 struct cfg80211_auth_request {
 	struct cfg80211_bss *bss;
@@ -1796,8 +1798,8 @@ struct cfg80211_auth_request {
 	enum nl80211_auth_type auth_type;
 	const u8 *key;
 	u8 key_len, key_idx;
-	const u8 *sae_data;
-	size_t sae_data_len;
+	const u8 *auth_data;
+	size_t auth_data_len;
 };
 
 /**
-- 
cgit v1.2.3


From 3f817fe718c6cb3ddcc2ab04ba86faecc20ef8fe Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:01 +0300
Subject: cfg80211: Define IEEE P802.11ai (FILS) information elements

Define the Element IDs and Element ID Extensions from IEEE
P802.11ai/D11.0. In addition, add a new cfg80211_find_ext_ie() wrapper
to make it easier to find information elements that used the Element ID
Extension field.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index dffc265a4fd6..8ca2e9f354f7 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4180,6 +4180,27 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len)
 	return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0);
 }
 
+/**
+ * cfg80211_find_ext_ie - find information element with EID Extension in data
+ *
+ * @ext_eid: element ID Extension
+ * @ies: data consisting of IEs
+ * @len: length of data
+ *
+ * Return: %NULL if the extended element ID could not be found or if
+ * the element is invalid (claims to be longer than the given
+ * data), or a pointer to the first byte of the requested
+ * element, that is the byte containing the element ID.
+ *
+ * Note: There are no checks on the element length other than
+ * having to fit into the given data.
+ */
+static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len)
+{
+	return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len,
+				      &ext_eid, 1, 2);
+}
+
 /**
  * cfg80211_find_vendor_ie - find vendor specific information element in data
  *
-- 
cgit v1.2.3


From 348bd456699801920a309c66e382380809fbdf41 Mon Sep 17 00:00:00 2001
From: Jouni Malinen <jouni@qca.qualcomm.com>
Date: Thu, 27 Oct 2016 00:42:03 +0300
Subject: cfg80211: Add KEK/nonces for FILS association frames

The new nl80211 attributes can be used to provide KEK and nonces to
allow the driver to encrypt and decrypt FILS (Re)Association
Request/Response frames in station mode.

Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 8ca2e9f354f7..738b4d8a4666 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1840,6 +1840,12 @@ enum cfg80211_assoc_req_flags {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa: VHT capability override
  * @vht_capa_mask: VHT capability mask indicating which fields to use
+ * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or
+ *	%NULL if FILS is not used.
+ * @fils_kek_len: Length of fils_kek in octets
+ * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association
+ *	Request/Response frame or %NULL if FILS is not used. This field starts
+ *	with 16 octets of STA Nonce followed by 16 octets of AP Nonce.
  */
 struct cfg80211_assoc_request {
 	struct cfg80211_bss *bss;
@@ -1851,6 +1857,9 @@ struct cfg80211_assoc_request {
 	struct ieee80211_ht_cap ht_capa;
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa, vht_capa_mask;
+	const u8 *fils_kek;
+	size_t fils_kek_len;
+	const u8 *fils_nonces;
 };
 
 /**
-- 
cgit v1.2.3


From ce0ce13a1c89ff8b94b7f8fb32eb4c43e111c82e Mon Sep 17 00:00:00 2001
From: Michael Braun <michael-dev@fami-braun.de>
Date: Mon, 10 Oct 2016 19:12:22 +0200
Subject: cfg80211: configure multicast to unicast for AP interfaces

Add the ability to configure if an AP (and associated VLANs) will
do multicast-to-unicast conversion for ARP, IPv4 and IPv6 frames
(possibly within 802.1Q). If enabled, such frames are to be sent
to each station separately, with the DA replaced by their own MAC
address rather than the group address.

Note that this may break certain expectations of the receiver,
such as the ability to drop unicast IP packets received within
multicast L2 frames, or the ability to not send ICMP destination
unreachable messages for packets received in L2 multicast (which
is required, but the receiver can't tell the difference if this
new option is enabled.)

This also doesn't implement the 802.11 DMS (directed multicast
service).

Signed-off-by: Michael Braun <michael-dev@fami-braun.de>
[fix disabling, add better documentation & commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 738b4d8a4666..41ae3f500957 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2742,6 +2742,8 @@ struct cfg80211_nan_func {
  * @nan_change_conf: changes NAN configuration. The changed parameters must
  *	be specified in @changes (using &enum cfg80211_nan_conf_changes);
  *	All other parameters must be ignored.
+ *
+ * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
  */
 struct cfg80211_ops {
 	int	(*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow);
@@ -3018,6 +3020,10 @@ struct cfg80211_ops {
 				   struct wireless_dev *wdev,
 				   struct cfg80211_nan_conf *conf,
 				   u32 changes);
+
+	int	(*set_multicast_to_unicast)(struct wiphy *wiphy,
+					    struct net_device *dev,
+					    const bool enabled);
 };
 
 /*
-- 
cgit v1.2.3


From 088e8df82f91a24728d49d9532cab7ebdee5117f Mon Sep 17 00:00:00 2001
From: vamsi krishna <vamsin@qti.qualcomm.com>
Date: Thu, 27 Oct 2016 16:51:11 +0300
Subject: cfg80211: Add support to update connection parameters

Add functionality to update the connection parameters when in connected
state, so that driver/firmware uses the updated parameters for
subsequent roaming. This is for drivers that support internal BSS
selection and roaming. The new command does not change the current
association state, i.e., it can be used to update IE contents for future
(re)associations without causing an immediate disassociation or
reassociation with the current BSS.

This commit implements the required functionality for updating IEs for
(Re)Association Request frame only. Other parameters can be added in
future when required.

Signed-off-by: vamsi krishna <vamsin@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 41ae3f500957..c575583b50fb 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2050,6 +2050,18 @@ struct cfg80211_connect_params {
 	const u8 *prev_bssid;
 };
 
+/**
+ * enum cfg80211_connect_params_changed - Connection parameters being updated
+ *
+ * This enum provides information of all connect parameters that
+ * have to be updated as part of update_connect_params() call.
+ *
+ * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated
+ */
+enum cfg80211_connect_params_changed {
+	UPDATE_ASSOC_IES		= BIT(0),
+};
+
 /**
  * enum wiphy_params_flags - set_wiphy_params bitfield values
  * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed
@@ -2571,6 +2583,14 @@ struct cfg80211_nan_func {
  *	cases, the result of roaming is indicated with a call to
  *	cfg80211_roamed() or cfg80211_roamed_bss().
  *	(invoked with the wireless_dev mutex held)
+ * @update_connect_params: Update the connect parameters while connected to a
+ *	BSS. The updated parameters can be used by driver/firmware for
+ *	subsequent BSS selection (roaming) decisions and to form the
+ *	Authentication/(Re)Association Request frames. This call does not
+ *	request an immediate disassociation or reassociation with the current
+ *	BSS, i.e., this impacts only subsequent (re)associations. The bits in
+ *	changed are defined in &enum cfg80211_connect_params_changed.
+ *	(invoked with the wireless_dev mutex held)
  * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if
  *      connection is in progress. Once done, call cfg80211_disconnected() in
  *      case connection was already established (invoked with the
@@ -2858,6 +2878,10 @@ struct cfg80211_ops {
 
 	int	(*connect)(struct wiphy *wiphy, struct net_device *dev,
 			   struct cfg80211_connect_params *sme);
+	int	(*update_connect_params)(struct wiphy *wiphy,
+					 struct net_device *dev,
+					 struct cfg80211_connect_params *sme,
+					 u32 changed);
 	int	(*disconnect)(struct wiphy *wiphy, struct net_device *dev,
 			      u16 reason_code);
 
-- 
cgit v1.2.3


From 4fe77d82ef80c77031c9c6f8554cd0dee2aa423a Mon Sep 17 00:00:00 2001
From: Antonio Quartulli <a@unstable.cc>
Date: Mon, 24 Oct 2016 20:32:57 +0800
Subject: skbedit: allow the user to specify bitmask for mark

The user may want to use only some bits of the skb mark in
his skbedit rules because the remaining part might be used by
something else.

Introduce the "mask" parameter to the skbedit actor in order
to implement such functionality.

When the mask is specified, only those bits selected by the
latter are altered really changed by the actor, while the
rest is left untouched.

Signed-off-by: Antonio Quartulli <antonio@open-mesh.com>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_skbedit.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 5767e9dbcf92..19cd3d345804 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -27,6 +27,7 @@ struct tcf_skbedit {
 	u32		flags;
 	u32		priority;
 	u32		mark;
+	u32		mask;
 	u16		queue_mapping;
 	u16		ptype;
 };
-- 
cgit v1.2.3


From c90c39dab3e02ce45427a214746711f33ad13be6 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:01 +0200
Subject: genetlink: introduce and use genl_family_attrbuf()

This helper function allows family implementations to access
their family's attrbuf. This gets rid of the attrbuf usage
in families, and also adds locking validation, since it's not
valid to use the attrbuf with parallel_ops or outside of the
dumpit callback.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 8d4608ce8716..ef9defb3f5bc 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -73,6 +73,8 @@ struct genl_family {
 	struct module		*module;
 };
 
+struct nlattr **genl_family_attrbuf(struct genl_family *family);
+
 /**
  * struct genl_info - receiving information
  * @snd_seq: sending sequence number
-- 
cgit v1.2.3


From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:02 +0200
Subject: genetlink: no longer support using static family IDs

Static family IDs have never really been used, the only
use case was the workaround I introduced for those users
that assumed their family ID was also their multicast
group ID.

Additionally, because static family IDs would never be
reserved by the generic netlink code, using a relatively
low ID would only work for built-in families that can be
registered immediately after generic netlink is started,
which is basically only the control family (apart from
the workaround code, which I also had to add code for so
it would reserve those IDs)

Thus, anything other than GENL_ID_GENERATE is flawed and
luckily not used except in the cases I mentioned. Move
those workarounds into a few lines of code, and then get
rid of GENL_ID_GENERATE entirely, making it more robust.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index ef9defb3f5bc..43a5c3975a2f 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -20,7 +20,7 @@ struct genl_info;
 
 /**
  * struct genl_family - generic netlink family
- * @id: protocol family idenfitier
+ * @id: protocol family identifier (private)
  * @hdrsize: length of user specific header in bytes
  * @name: name of family
  * @version: protocol version
@@ -48,7 +48,7 @@ struct genl_info;
  * @n_ops: number of operations supported by this family (private)
  */
 struct genl_family {
-	unsigned int		id;
+	unsigned int		id;		/* private */
 	unsigned int		hdrsize;
 	char			name[GENL_NAMSIZ];
 	unsigned int		version;
@@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family)
  * Registers the specified family and operations from the specified table.
  * Only one family may be registered with the same family name or identifier.
  *
- * The family id may equal GENL_ID_GENERATE causing an unique id to
- * be automatically generated and assigned.
- *
  * Either a doit or dumpit callback must be specified for every registered
  * operation or the function will fail. Only one operation structure per
  * command identifier may be registered.
-- 
cgit v1.2.3


From 489111e5c25b93be80340c3113d71903d7c82136 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:03 +0200
Subject: genetlink: statically initialize families

Instead of providing macros/inline functions to initialize
the families, make all users initialize them statically and
get rid of the macros.

This reduces the kernel code size by about 1.6k on x86-64
(with allyesconfig).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 71 +++++++++----------------------------------------
 1 file changed, 12 insertions(+), 59 deletions(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 43a5c3975a2f..2298b50cee34 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -39,13 +39,14 @@ struct genl_info;
  *	Note that unbind() will not be called symmetrically if the
  *	generic netlink family is removed while there are still open
  *	sockets.
- * @attrbuf: buffer to store parsed attributes
- * @family_list: family list
- * @mcgrps: multicast groups used by this family (private)
- * @n_mcgrps: number of multicast groups (private)
+ * @attrbuf: buffer to store parsed attributes (private)
+ * @family_list: family list (private)
+ * @mcgrps: multicast groups used by this family
+ * @n_mcgrps: number of multicast groups
  * @mcgrp_offset: starting number of multicast group IDs in this family
- * @ops: the operations supported by this family (private)
- * @n_ops: number of operations supported by this family (private)
+ *	(private)
+ * @ops: the operations supported by this family
+ * @n_ops: number of operations supported by this family
  */
 struct genl_family {
 	unsigned int		id;		/* private */
@@ -64,10 +65,10 @@ struct genl_family {
 	int			(*mcast_bind)(struct net *net, int group);
 	void			(*mcast_unbind)(struct net *net, int group);
 	struct nlattr **	attrbuf;	/* private */
-	const struct genl_ops *	ops;		/* private */
-	const struct genl_multicast_group *mcgrps; /* private */
-	unsigned int		n_ops;		/* private */
-	unsigned int		n_mcgrps;	/* private */
+	const struct genl_ops *	ops;
+	const struct genl_multicast_group *mcgrps;
+	unsigned int		n_ops;
+	unsigned int		n_mcgrps;
 	unsigned int		mcgrp_offset;	/* private */
 	struct list_head	family_list;	/* private */
 	struct module		*module;
@@ -132,55 +133,7 @@ struct genl_ops {
 	u8			flags;
 };
 
-int __genl_register_family(struct genl_family *family);
-
-static inline int genl_register_family(struct genl_family *family)
-{
-	family->module = THIS_MODULE;
-	return __genl_register_family(family);
-}
-
-/**
- * genl_register_family_with_ops - register a generic netlink family with ops
- * @family: generic netlink family
- * @ops: operations to be registered
- * @n_ops: number of elements to register
- *
- * Registers the specified family and operations from the specified table.
- * Only one family may be registered with the same family name or identifier.
- *
- * Either a doit or dumpit callback must be specified for every registered
- * operation or the function will fail. Only one operation structure per
- * command identifier may be registered.
- *
- * See include/net/genetlink.h for more documenation on the operations
- * structure.
- *
- * Return 0 on success or a negative error code.
- */
-static inline int
-_genl_register_family_with_ops_grps(struct genl_family *family,
-				    const struct genl_ops *ops, size_t n_ops,
-				    const struct genl_multicast_group *mcgrps,
-				    size_t n_mcgrps)
-{
-	family->module = THIS_MODULE;
-	family->ops = ops;
-	family->n_ops = n_ops;
-	family->mcgrps = mcgrps;
-	family->n_mcgrps = n_mcgrps;
-	return __genl_register_family(family);
-}
-
-#define genl_register_family_with_ops(family, ops)			\
-	_genl_register_family_with_ops_grps((family),			\
-					    (ops), ARRAY_SIZE(ops),	\
-					    NULL, 0)
-#define genl_register_family_with_ops_groups(family, ops, grps)	\
-	_genl_register_family_with_ops_grps((family),			\
-					    (ops), ARRAY_SIZE(ops),	\
-					    (grps), ARRAY_SIZE(grps))
-
+int genl_register_family(struct genl_family *family);
 int genl_unregister_family(struct genl_family *family);
 void genl_notify(struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
-- 
cgit v1.2.3


From 2ae0f17df1cd52aafd1ab0415ea1f1dd56dc0e2a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 24 Oct 2016 14:40:04 +0200
Subject: genetlink: use idr to track families

Since generic netlink family IDs are small integers, allocated
densely, IDR is an ideal match for lookups. Replace the existing
hand-written hash-table with IDR for allocation and lookup.

This lets the families only be written to once, during register,
since the list_head can be removed and removal of a family won't
cause any writes.

It also slightly reduces the code size (by about 1.3k on x86-64).

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 2298b50cee34..3ec87bacc0f5 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -40,7 +40,6 @@ struct genl_info;
  *	generic netlink family is removed while there are still open
  *	sockets.
  * @attrbuf: buffer to store parsed attributes (private)
- * @family_list: family list (private)
  * @mcgrps: multicast groups used by this family
  * @n_mcgrps: number of multicast groups
  * @mcgrp_offset: starting number of multicast group IDs in this family
@@ -70,11 +69,10 @@ struct genl_family {
 	unsigned int		n_ops;
 	unsigned int		n_mcgrps;
 	unsigned int		mcgrp_offset;	/* private */
-	struct list_head	family_list;	/* private */
 	struct module		*module;
 };
 
-struct nlattr **genl_family_attrbuf(struct genl_family *family);
+struct nlattr **genl_family_attrbuf(const struct genl_family *family);
 
 /**
  * struct genl_info - receiving information
@@ -134,12 +132,12 @@ struct genl_ops {
 };
 
 int genl_register_family(struct genl_family *family);
-int genl_unregister_family(struct genl_family *family);
-void genl_notify(struct genl_family *family, struct sk_buff *skb,
+int genl_unregister_family(const struct genl_family *family);
+void genl_notify(const struct genl_family *family, struct sk_buff *skb,
 		 struct genl_info *info, u32 group, gfp_t flags);
 
 void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
-		  struct genl_family *family, int flags, u8 cmd);
+		  const struct genl_family *family, int flags, u8 cmd);
 
 /**
  * genlmsg_nlhdr - Obtain netlink header from user specified header
@@ -148,8 +146,8 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
  *
  * Returns pointer to netlink header.
  */
-static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr,
-					     struct genl_family *family)
+static inline struct nlmsghdr *
+genlmsg_nlhdr(void *user_hdr, const struct genl_family *family)
 {
 	return (struct nlmsghdr *)((char *)user_hdr -
 				   family->hdrsize -
@@ -185,7 +183,7 @@ static inline int genlmsg_parse(const struct nlmsghdr *nlh,
  */
 static inline void genl_dump_check_consistent(struct netlink_callback *cb,
 					      void *user_hdr,
-					      struct genl_family *family)
+					      const struct genl_family *family)
 {
 	nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family));
 }
@@ -202,7 +200,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb,
  */
 static inline void *genlmsg_put_reply(struct sk_buff *skb,
 				      struct genl_info *info,
-				      struct genl_family *family,
+				      const struct genl_family *family,
 				      int flags, u8 cmd)
 {
 	return genlmsg_put(skb, info->snd_portid, info->snd_seq, family,
@@ -239,7 +237,7 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr)
  * @group: offset of multicast group in groups array
  * @flags: allocation flags
  */
-static inline int genlmsg_multicast_netns(struct genl_family *family,
+static inline int genlmsg_multicast_netns(const struct genl_family *family,
 					  struct net *net, struct sk_buff *skb,
 					  u32 portid, unsigned int group, gfp_t flags)
 {
@@ -257,7 +255,7 @@ static inline int genlmsg_multicast_netns(struct genl_family *family,
  * @group: offset of multicast group in groups array
  * @flags: allocation flags
  */
-static inline int genlmsg_multicast(struct genl_family *family,
+static inline int genlmsg_multicast(const struct genl_family *family,
 				    struct sk_buff *skb, u32 portid,
 				    unsigned int group, gfp_t flags)
 {
@@ -275,7 +273,7 @@ static inline int genlmsg_multicast(struct genl_family *family,
  *
  * This function must hold the RTNL or rcu_read_lock().
  */
-int genlmsg_multicast_allns(struct genl_family *family,
+int genlmsg_multicast_allns(const struct genl_family *family,
 			    struct sk_buff *skb, u32 portid,
 			    unsigned int group, gfp_t flags);
 
@@ -359,8 +357,9 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
  * This function returns the number of broadcast listeners that have set the
  * NETLINK_RECV_NO_ENOBUFS socket option.
  */
-static inline int genl_set_err(struct genl_family *family, struct net *net,
-			       u32 portid, u32 group, int code)
+static inline int genl_set_err(const struct genl_family *family,
+			       struct net *net, u32 portid,
+			       u32 group, int code)
 {
 	if (WARN_ON_ONCE(group >= family->n_mcgrps))
 		return -EINVAL;
@@ -368,7 +367,7 @@ static inline int genl_set_err(struct genl_family *family, struct net *net,
 	return netlink_set_err(net->genl_sock, portid, group, code);
 }
 
-static inline int genl_has_listeners(struct genl_family *family,
+static inline int genl_has_listeners(const struct genl_family *family,
 				     struct net *net, unsigned int group)
 {
 	if (WARN_ON_ONCE(group >= family->n_mcgrps))
-- 
cgit v1.2.3


From b15ca182ed136087f6a2cb9ffe880c923f36a56e Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Wed, 26 Oct 2016 10:53:16 +0200
Subject: netlink: Add nla_memdup() to wrap kmemdup() use on nlattr

Wrap several common instances of:
	kmemdup(nla_data(attr), nla_len(attr), GFP_KERNEL);

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 254a0fc01800..a34f53acb6d6 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1190,6 +1190,16 @@ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
 	return tmp;
 }
 
+/**
+ * nla_memdup - duplicate attribute memory (kmemdup)
+ * @src: netlink attribute to duplicate from
+ * @gfp: GFP mask
+ */
+static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
+{
+	return kmemdup(nla_data(src), nla_len(src), gfp);
+}
+
 /**
  * nla_nest_start - Start a new level of nested attributes
  * @skb: socket buffer to add attributes to
-- 
cgit v1.2.3


From 5ea8ea2cb7f1d0db15762c9b0bb9e7330425a071 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 26 Oct 2016 09:27:57 -0700
Subject: tcp/dccp: drop SYN packets if accept queue is full

Per listen(fd, backlog) rules, there is really no point accepting a SYN,
sending a SYNACK, and dropping the following ACK packet if accept queue
is full, because application is not draining accept queue fast enough.

This behavior is fooling TCP clients that believe they established a
flow, while there is nothing at server side. They might then send about
10 MSS (if using IW10) that will be dropped anyway while server is under
stress.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 197a30d221e9..146054ceea8e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -289,11 +289,6 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 	return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
 }
 
-static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
-{
-	return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue);
-}
-
 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 {
 	return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
-- 
cgit v1.2.3


From bd68a2a854ad5a85f0c8d0a9c8048ca3f6391efb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 31 Oct 2016 13:32:55 -0700
Subject: net: set SK_MEM_QUANTUM to 4096

Systems with large pages (64KB pages for example) do not always have
huge quantity of memory.

A big SK_MEM_QUANTUM value leads to fewer interactions with the
global counters (like tcp_memory_allocated) but might trigger
memory pressure much faster, giving suboptimal TCP performance
since windows are lowered to ridiculous values.

Note that sysctl_mem units being in pages and in ABI, we also need
to change sk_prot_mem_limits() accordingly.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index f13ac87a8015..93331a1492db 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1162,11 +1162,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
 	sk->sk_prot->enter_memory_pressure(sk);
 }
 
-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
-	return sk->sk_prot->sysctl_mem[index];
-}
-
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
@@ -1281,11 +1276,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
 void __sk_mem_reduce_allocated(struct sock *sk, int amount);
 void __sk_mem_reclaim(struct sock *sk, int amount);
 
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+/* We used to have PAGE_SIZE here, but systems with 64KB pages
+ * do not necessarily have 16x time more memory than 4KB ones.
+ */
+#define SK_MEM_QUANTUM 4096
 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
 #define SK_MEM_SEND	0
 #define SK_MEM_RECV	1
 
+/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+	long val = sk->sk_prot->sysctl_mem[index];
+
+#if PAGE_SIZE > SK_MEM_QUANTUM
+	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+#elif PAGE_SIZE < SK_MEM_QUANTUM
+	val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
+#endif
+	return val;
+}
+
 static inline int sk_mem_pages(int amt)
 {
 	return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
-- 
cgit v1.2.3


From f6d0cbcf09c506b9b022df8f9d7693a7cec3c732 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 24 Oct 2016 16:56:40 +0200
Subject: netfilter: nf_tables: add fib expression

Add FIB expression, supported for ipv4, ipv6 and inet family (the latter
just dispatches to ipv4 or ipv6 one based on nfproto).

Currently supports fetching output interface index/name and the
rtm_type associated with an address.

This can be used for adding path filtering. rtm_type is useful
to e.g. enforce a strong-end host model where packets
are only accepted if daddr is configured on the interface the
packet arrived on.

The fib expression is a native nftables alternative to the
xtables addrtype and rp_filter matches.

FIB result order for oif/oifname retrieval is as follows:
 - if packet is local (skb has rtable, RTF_LOCAL set, this
   will also catch looped-back multicast packets), set oif to
   the loopback interface.
 - if fib lookup returns an error, or result points to local,
   store zero result.  This means '--local' option of -m rpfilter
   is not supported. It is possible to use 'fib type local' or add
   explicit saddr/daddr matching rules to create exceptions if this
   is really needed.
 - store result in the destination register.
   In case of multiple routes, search set for desired oif in case
   strict matching is requested.

ipv4 and ipv6 behave fib expressions are supposed to behave the same.

[ I have collapsed Arnd Bergmann's ("netfilter: nf_tables: fib warnings")

	http://patchwork.ozlabs.org/patch/688615/

  to address fallout from this patch after rebasing nf-next, that was
  posted to address compilation warnings. --pablo ]

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_fib.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 include/net/netfilter/nft_fib.h

(limited to 'include/net')

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
new file mode 100644
index 000000000000..cbedda077db2
--- /dev/null
+++ b/include/net/netfilter/nft_fib.h
@@ -0,0 +1,31 @@
+#ifndef _NFT_FIB_H_
+#define _NFT_FIB_H_
+
+struct nft_fib {
+	enum nft_registers	dreg:8;
+	u8			result;
+	u32			flags;
+};
+
+extern const struct nla_policy nft_fib_policy[];
+
+int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr);
+int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		 const struct nlattr * const tb[]);
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		     const struct nft_data **data);
+
+
+void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+			const struct nft_pktinfo *pkt);
+void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
+		   const struct nft_pktinfo *pkt);
+
+void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
+			const struct nft_pktinfo *pkt);
+void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
+		   const struct nft_pktinfo *pkt);
+
+void nft_fib_store_result(void *reg, enum nft_fib_result r,
+			  const struct nft_pktinfo *pkt, int index);
+#endif
-- 
cgit v1.2.3


From 1fddf4bad0ac9f4d32c74af286fc1eec2a03c82c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 27 Oct 2016 19:49:42 +0100
Subject: netfilter: nf_log: add packet logging for netdev family

Move layer 2 packet logging into nf_log_l2packet() that resides in
nf_log_common.c, so this can be shared by both bridge and netdev
families.

This patch adds the boiler plate code to register the netdev logging
family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 309cd267be4f..a559aa41253c 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -109,5 +109,10 @@ void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
 			       const struct net_device *out,
 			       const struct nf_loginfo *loginfo,
 			       const char *prefix);
+void nf_log_l2packet(struct net *net, u_int8_t pf, unsigned int hooknum,
+		     const struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     const struct nf_loginfo *loginfo, const char *prefix);
 
 #endif /* _NF_LOG_H */
-- 
cgit v1.2.3


From 8db4c5be88f62ffd7a552f70687a10c614dc697b Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 27 Oct 2016 19:49:48 +0100
Subject: netfilter: move socket lookup infrastructure to nf_socket_ipv{4,6}.c

We need this split to reuse existing codebase for the upcoming nf_tables
socket expression.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_socket.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 include/net/netfilter/nf_socket.h

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h
new file mode 100644
index 000000000000..f2fc39c97d43
--- /dev/null
+++ b/include/net/netfilter/nf_socket.h
@@ -0,0 +1,27 @@
+#ifndef _NF_SOCK_H_
+#define _NF_SOCK_H_
+
+struct net_device;
+struct sk_buff;
+struct sock;
+struct net;
+
+static inline bool nf_sk_is_transparent(struct sock *sk)
+{
+	switch (sk->sk_state) {
+	case TCP_TIME_WAIT:
+		return inet_twsk(sk)->tw_transparent;
+	case TCP_NEW_SYN_RECV:
+		return inet_rsk(inet_reqsk(sk))->no_srccheck;
+	default:
+		return inet_sk(sk)->transparent;
+	}
+}
+
+struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
+				  const struct net_device *indev);
+
+struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
+				  const struct net_device *indev);
+
+#endif
-- 
cgit v1.2.3


From 613dbd95723aee7abd16860745691b6c7bda20dc Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:21 +0100
Subject: netfilter: x_tables: move hook state into xt_action_param structure

Place pointer to hook state in xt_action_param structure instead of
copying the fields that we need. After this change xt_action_param fits
into one cacheline.

This patch also adds a set of new wrapper functions to fetch relevant
hook state structure fields.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 5031e072567b..44060344f958 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -30,11 +30,12 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 				   const struct nf_hook_state *state)
 {
 	pkt->skb = skb;
-	pkt->net = pkt->xt.net = state->net;
-	pkt->in = pkt->xt.in = state->in;
-	pkt->out = pkt->xt.out = state->out;
-	pkt->hook = pkt->xt.hooknum = state->hook;
-	pkt->pf = pkt->xt.family = state->pf;
+	pkt->net = state->net;
+	pkt->in = state->in;
+	pkt->out = state->out;
+	pkt->hook = state->hook;
+	pkt->pf = state->pf;
+	pkt->xt.state = state;
 }
 
 static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt,
-- 
cgit v1.2.3


From 0e5a1c7eb3fc705c4cc6c1e058e81d1f2e721c72 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:26 +0100
Subject: netfilter: nf_tables: use hook state from xt_action_param structure

Don't copy relevant fields from hook state structure, instead use the
one that is already available in struct xt_action_param.

This patch also adds a set of new wrapper functions to fetch relevant
hook state structure fields.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 44060344f958..3295fb85bff6 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -14,27 +14,42 @@
 
 struct nft_pktinfo {
 	struct sk_buff			*skb;
-	struct net			*net;
-	const struct net_device		*in;
-	const struct net_device		*out;
-	u8				pf;
-	u8				hook;
 	bool				tprot_set;
 	u8				tprot;
 	/* for x_tables compatibility */
 	struct xt_action_param		xt;
 };
 
+static inline struct net *nft_net(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->net;
+}
+
+static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->hook;
+}
+
+static inline u8 nft_pf(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->pf;
+}
+
+static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->in;
+}
+
+static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
+{
+	return pkt->xt.state->out;
+}
+
 static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
 				   struct sk_buff *skb,
 				   const struct nf_hook_state *state)
 {
 	pkt->skb = skb;
-	pkt->net = state->net;
-	pkt->in = state->in;
-	pkt->out = state->out;
-	pkt->hook = state->hook;
-	pkt->pf = state->pf;
 	pkt->xt.state = state;
 }
 
-- 
cgit v1.2.3


From 01886bd91f1ba418ce669dfe97a06ca9504e482a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 3 Nov 2016 10:56:35 +0100
Subject: netfilter: remove hook_entries field from nf_hook_state

This field is only useful for nf_queue, so store it in the
nf_queue_entry structure instead, away from the core path. Pass
hook_head to nf_hook_slow().

Since we always have a valid entry on the first iteration in
nf_iterate(), we can use 'do { ... } while (entry)' loop instead.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 2280cfe86c56..09948d10e38e 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -12,6 +12,7 @@ struct nf_queue_entry {
 	unsigned int		id;
 
 	struct nf_hook_state	state;
+	struct nf_hook_entry	*hook;
 	u16			size; /* sizeof(entry) + saved route keys */
 
 	/* extra space to store route keys */
-- 
cgit v1.2.3


From 70ecc24841326396a827deb55c3fefac582a729d Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 2 Nov 2016 11:02:16 -0400
Subject: ipv4: add IP_RECVFRAGSIZE cmsg

The IP stack records the largest fragment of a reassembled packet
in IPCB(skb)->frag_max_size. When reading a datagram or raw packet
that arrived fragmented, expose the value to allow applications to
estimate receive path MTU.

Tested:
  Sent data over a veth pair of which the source has a small mtu.
  Sent data using netcat, received using a dedicated process.

  Verified that the cmsg IP_RECVFRAGSIZE is returned only when
  data arrives fragmented, and in that cases matches the veth mtu.

    ip link add veth0 type veth peer name veth1

    ip netns add from
    ip netns add to

    ip link set dev veth1 netns to
    ip netns exec to ip addr add dev veth1 192.168.10.1/24
    ip netns exec to ip link set dev veth1 up

    ip link set dev veth0 netns from
    ip netns exec from ip addr add dev veth0 192.168.10.2/24
    ip netns exec from ip link set dev veth0 up
    ip netns exec from ip link set dev veth0 mtu 1300
    ip netns exec from ethtool -K veth0 ufo off

    dd if=/dev/zero bs=1 count=1400 2>/dev/null > payload

    ip netns exec to ./recv_cmsg_recvfragsize -4 -u -p 6000 &
    ip netns exec from nc -q 1 -u 192.168.10.1 6000 < payload

  using github.com/wdebruij/kerneltools/blob/master/tests/recvfragsize.c

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_sock.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 236a81034fef..c9cff977a7fb 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -228,6 +228,7 @@ struct inet_sock {
 #define IP_CMSG_PASSSEC		BIT(5)
 #define IP_CMSG_ORIGDSTADDR	BIT(6)
 #define IP_CMSG_CHECKSUM	BIT(7)
+#define IP_CMSG_RECVFRAGSIZE	BIT(8)
 
 /**
  * sk_to_full_sk - Access to a full socket
-- 
cgit v1.2.3


From 86741ec25462e4c8cdce6df2f41ead05568c7d5e Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 4 Nov 2016 02:23:41 +0900
Subject: net: core: Add a UID field to struct sock.

Protocol sockets (struct sock) don't have UIDs, but most of the
time, they map 1:1 to userspace sockets (struct socket) which do.

Various operations such as the iptables xt_owner match need
access to the "UID of a socket", and do so by following the
backpointer to the struct socket. This involves taking
sk_callback_lock and doesn't work when there is no socket
because userspace has already called close().

Simplify this by adding a sk_uid field to struct sock whose value
matches the UID of the corresponding struct socket. The semantics
are as follows:

1. Whenever sk_socket is non-null: sk_uid is the same as the UID
   in sk_socket, i.e., matches the return value of sock_i_uid.
   Specifically, the UID is set when userspace calls socket(),
   fchown(), or accept().
2. When sk_socket is NULL, sk_uid is defined as follows:
   - For a socket that no longer has a sk_socket because
     userspace has called close(): the previous UID.
   - For a cloned socket (e.g., an incoming connection that is
     established but on which userspace has not yet called
     accept): the UID of the socket it was cloned from.
   - For a socket that has never had an sk_socket: UID 0 inside
     the user namespace corresponding to the network namespace
     the socket belongs to.

Kernel sockets created by sock_create_kern are a special case
of #1 and sk_uid is the user that created them. For kernel
sockets created at network namespace creation time, such as the
per-processor ICMP and TCP sockets, this is the user that created
the network namespace.

Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 93331a1492db..cf617ee16723 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,6 +419,7 @@ struct sock {
 	u32			sk_max_ack_backlog;
 	__u32			sk_priority;
 	__u32			sk_mark;
+	kuid_t			sk_uid;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
@@ -1664,6 +1665,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 	sk->sk_wq = parent->wq;
 	parent->sk = sk;
 	sk_set_socket(sk, parent);
+	sk->sk_uid = SOCK_INODE(parent)->i_uid;
 	security_sock_graft(sk, parent);
 	write_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1671,6 +1673,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+	return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
 	u32 v = prandom_u32();
-- 
cgit v1.2.3


From 622ec2c9d52405973c9f1ca5116eb1c393adfc7d Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 4 Nov 2016 02:23:42 +0900
Subject: net: core: add UID to flows, rules, and routes

- Define a new FIB rule attributes, FRA_UID_RANGE, to describe a
  range of UIDs.
- Define a RTA_UID attribute for per-UID route lookups and dumps.
- Support passing these attributes to and from userspace via
  rtnetlink. The value INVALID_UID indicates no UID was
  specified.
- Add a UID field to the flow structures.

Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h | 9 ++++++++-
 include/net/flow.h      | 5 +++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 456e4a6006ab..8dbfdf728cd8 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -8,6 +8,11 @@
 #include <net/flow.h>
 #include <net/rtnetlink.h>
 
+struct fib_kuid_range {
+	kuid_t start;
+	kuid_t end;
+};
+
 struct fib_rule {
 	struct list_head	list;
 	int			iifindex;
@@ -30,6 +35,7 @@ struct fib_rule {
 	int			suppress_prefixlen;
 	char			iifname[IFNAMSIZ];
 	char			oifname[IFNAMSIZ];
+	struct fib_kuid_range	uid_range;
 	struct rcu_head		rcu;
 };
 
@@ -92,7 +98,8 @@ struct fib_rules_ops {
 	[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
 	[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
 	[FRA_GOTO]	= { .type = NLA_U32 }, \
-	[FRA_L3MDEV]	= { .type = NLA_U8 }
+	[FRA_L3MDEV]	= { .type = NLA_U8 }, \
+	[FRA_UID_RANGE]	= { .len = sizeof(struct fib_rule_uid_range) }
 
 static inline void fib_rule_get(struct fib_rule *rule)
 {
diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa7716967..51373f3a5e31 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -11,6 +11,7 @@
 #include <linux/in6.h>
 #include <linux/atomic.h>
 #include <net/flow_dissector.h>
+#include <linux/uidgid.h>
 
 /*
  * ifindex generation is per-net namespace, and loopback is
@@ -37,6 +38,7 @@ struct flowi_common {
 #define FLOWI_FLAG_SKIP_NH_OIF		0x04
 	__u32	flowic_secid;
 	struct flowi_tunnel flowic_tun_key;
+	kuid_t  flowic_uid;
 };
 
 union flowi_uli {
@@ -74,6 +76,7 @@ struct flowi4 {
 #define flowi4_flags		__fl_common.flowic_flags
 #define flowi4_secid		__fl_common.flowic_secid
 #define flowi4_tun_key		__fl_common.flowic_tun_key
+#define flowi4_uid		__fl_common.flowic_uid
 
 	/* (saddr,daddr) must be grouped, same order as in IP header */
 	__be32			saddr;
@@ -131,6 +134,7 @@ struct flowi6 {
 #define flowi6_flags		__fl_common.flowic_flags
 #define flowi6_secid		__fl_common.flowic_secid
 #define flowi6_tun_key		__fl_common.flowic_tun_key
+#define flowi6_uid		__fl_common.flowic_uid
 	struct in6_addr		daddr;
 	struct in6_addr		saddr;
 	/* Note: flowi6_tos is encoded in flowlabel, too. */
@@ -176,6 +180,7 @@ struct flowi {
 #define flowi_flags	u.__fl_common.flowic_flags
 #define flowi_secid	u.__fl_common.flowic_secid
 #define flowi_tun_key	u.__fl_common.flowic_tun_key
+#define flowi_uid	u.__fl_common.flowic_uid
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
-- 
cgit v1.2.3


From e2d118a1cb5e60d077131a09db1d81b90a5295fe Mon Sep 17 00:00:00 2001
From: Lorenzo Colitti <lorenzo@google.com>
Date: Fri, 4 Nov 2016 02:23:43 +0900
Subject: net: inet: Support UID-based routing in IP protocols.

- Use the UID in routing lookups made by protocol connect() and
  sendmsg() functions.
- Make sure that routing lookups triggered by incoming packets
  (e.g., Path MTU discovery) take the UID of the socket into
  account.
- For packets not associated with a userspace socket, (e.g., ping
  replies) use UID 0 inside the user namespace corresponding to
  the network namespace the socket belongs to. This allows
  all namespaces to apply routing and iptables rules to
  kernel-originated traffic in that namespaces by matching UID 0.
  This is better than using the UID of the kernel socket that is
  sending the traffic, because the UID of kernel sockets created
  at namespace creation time (e.g., the per-processor ICMP and
  TCP sockets) is the UID of the user that created the socket,
  which might not be mapped in the namespace.

Tested: compiles allnoconfig, allyesconfig, allmodconfig
Tested: https://android-review.googlesource.com/253302
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h      | 4 +++-
 include/net/ip.h        | 1 +
 include/net/ip6_route.h | 5 +++--
 include/net/route.h     | 5 +++--
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/flow.h b/include/net/flow.h
index 51373f3a5e31..6bbbca8af8e3 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -96,7 +96,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 				      __u32 mark, __u8 tos, __u8 scope,
 				      __u8 proto, __u8 flags,
 				      __be32 daddr, __be32 saddr,
-				      __be16 dport, __be16 sport)
+				      __be16 dport, __be16 sport,
+				      kuid_t uid)
 {
 	fl4->flowi4_oif = oif;
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
@@ -107,6 +108,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
 	fl4->flowi4_flags = flags;
 	fl4->flowi4_secid = 0;
 	fl4->flowi4_tun_key.tun_id = 0;
+	fl4->flowi4_uid = uid;
 	fl4->daddr = daddr;
 	fl4->saddr = saddr;
 	fl4->fl4_dport = dport;
diff --git a/include/net/ip.h b/include/net/ip.h
index 5413883ac47f..55cdaac02957 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -179,6 +179,7 @@ struct ip_reply_arg {
 				/* -1 if not needed */ 
 	int	    bound_dev_if;
 	u8  	    tos;
+	kuid_t	    uid;
 }; 
 
 #define IP_REPLY_ARG_NOSRCCHECK 1
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f83e78d071a3..9dc2c182a263 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -140,9 +140,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 		  const struct in6_addr *gwaddr);
 
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
-		     u32 mark);
+		     u32 mark, kuid_t uid);
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark);
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+		  kuid_t uid);
 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
 			    u32 mark);
 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
diff --git a/include/net/route.h b/include/net/route.h
index 0429d47cad25..c0874c87c173 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -153,7 +153,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi
 	flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
 			   RT_SCOPE_UNIVERSE, proto,
 			   sk ? inet_sk_flowi_flags(sk) : 0,
-			   daddr, saddr, dport, sport);
+			   daddr, saddr, dport, sport, sock_net_uid(net, sk));
 	if (sk)
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
 	return ip_route_output_flow(net, fl4, sk);
@@ -269,7 +269,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32
 		flow_flags |= FLOWI_FLAG_ANYSRC;
 
 	flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
-			   protocol, flow_flags, dst, src, dport, sport);
+			   protocol, flow_flags, dst, src, dport, sport,
+			   sk->sk_uid);
 }
 
 static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
-- 
cgit v1.2.3


From ad959036a70890bea121403c6a4e373dff5b7311 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 Nov 2016 11:28:58 +0100
Subject: net/sock: add an explicit sk argument for ip_cmsg_recv_offset()

So that we can use it even after orphaining the skbuff.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/ip.h b/include/net/ip.h
index 55cdaac02957..f48c67cab222 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -579,7 +579,8 @@ int ip_options_rcv_srr(struct sk_buff *skb);
  */
 
 void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb);
-void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset);
+void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
+			 struct sk_buff *skb, int tlen, int offset);
 int ip_cmsg_send(struct sock *sk, struct msghdr *msg,
 		 struct ipcm_cookie *ipc, bool allow_ipv6);
 int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
@@ -601,7 +602,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
 
 static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 {
-	ip_cmsg_recv_offset(msg, skb, 0, 0);
+	ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
 }
 
 bool icmp_global_allow(void);
-- 
cgit v1.2.3


From 7c13f97ffde63cc792c49ec1513f3974f2f05229 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 4 Nov 2016 11:28:59 +0100
Subject: udp: do fwd memory scheduling on dequeue

A new argument is added to __skb_recv_datagram to provide
an explicit skb destructor, invoked under the receive queue
lock.
The UDP protocol uses such argument to perform memory
reclaiming on dequeue, so that the UDP protocol does not
set anymore skb->desctructor.
Instead explicit memory reclaiming is performed at close() time and
when skbs are removed from the receive queue.
The in kernel UDP protocol users now need to call a
skb_recv_udp() variant instead of skb_recv_datagram() to
properly perform memory accounting on dequeue.

Overall, this allows acquiring only once the receive queue
lock on dequeue.

Tested using pktgen with random src port, 64 bytes packet,
wire-speed on a 10G link as sender and udp_sink as the receiver,
using an l4 tuple rxhash to stress the contention, and one or more
udp_sink instances with reuseport.

nr sinks	vanilla		patched
1		440		560
3		2150		2300
6		3650		3800
9		4450		4600
12		6250		6450

v1 -> v2:
 - do rmem and allocated memory scheduling under the receive lock
 - do bulk scheduling in first_packet_length() and in udp_destruct_sock()
 - avoid the typdef for the dequeue callback

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/udp.h b/include/net/udp.h
index 6134f37ba3ab..e6e4e19be387 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -248,6 +248,21 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 /* net/ipv4/udp.c */
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
+void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
+static inline struct sk_buff *
+__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *peeked,
+	       int *off, int *err)
+{
+	return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
+				   udp_skb_destructor, peeked, off, err);
+}
+static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
+					   int noblock, int *err)
+{
+	int peeked, off = 0;
+
+	return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err);
+}
 
 void udp_v4_early_demux(struct sk_buff *skb);
 int udp_get_port(struct sock *sk, unsigned short snum,
-- 
cgit v1.2.3


From d0a81f67cd6286d32f42a167d19c7a387c23db79 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 3 Nov 2016 14:56:01 +0100
Subject: net: make default TX queue length a defined constant

The default TX queue length of Ethernet devices have been a magic
constant of 1000, ever since the initial git import.

Looking back in historical trees[1][2] the value used to be 100,
with the same comment "Ethernet wants good queues". The commit[3]
that changed this from 100 to 1000 didn't describe why, but from
conversations with Robert Olsson it seems that it was changed
when Ethernet devices went from 100Mbit/s to 1Gbit/s, because the
link speed increased x10 the queue size were also adjusted.  This
value later caused much heartache for the bufferbloat community.

This patch merely moves the value into a defined constant.

[1] https://git.kernel.org/cgit/linux/kernel/git/davem/netdev-vger-cvs.git/
[2] https://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/
[3] https://git.kernel.org/tglx/history/c/98921832c232

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index cd334c9584e9..f1b76b8e6d2d 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -6,6 +6,8 @@
 #include <linux/if_vlan.h>
 #include <net/sch_generic.h>
 
+#define DEFAULT_TX_QUEUE_LEN	1000
+
 struct qdisc_walker {
 	int	stop;
 	int	skip;
-- 
cgit v1.2.3


From 9ce183b4c4d24559467d7712e313f2b3f9277437 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:36 +0200
Subject: net/sched: act_tunnel_key: add helper inlines to access
 tcf_tunnel_key

Needed for drivers to pick the relevant action when offloading tunnel
key act.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_tunnel_key.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index 253f8da6c2a6..efef0b4b1b2b 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -12,6 +12,8 @@
 #define __NET_TC_TUNNEL_KEY_H
 
 #include <net/act_api.h>
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/dst_metadata.h>
 
 struct tcf_tunnel_key_params {
 	struct rcu_head		rcu;
@@ -27,4 +29,39 @@ struct tcf_tunnel_key {
 
 #define to_tunnel_key(a) ((struct tcf_tunnel_key *)a)
 
+static inline bool is_tcf_tunnel_set(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+
+	if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
+		return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET;
+#endif
+	return false;
+}
+
+static inline bool is_tcf_tunnel_release(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+
+	if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY)
+		return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE;
+#endif
+	return false;
+}
+
+static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params = rtnl_dereference(t->params);
+
+	return &params->tcft_enc_metadata->u.tun_info;
+#else
+	return NULL;
+#endif
+}
 #endif /* __NET_TC_TUNNEL_KEY_H */
-- 
cgit v1.2.3


From 9ba6a9a9f7a42673e9fc08ff3594f64caae64d3c Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:37 +0200
Subject: flow_dissector: Add enums for encapsulation keys

New encapsulation keys were added to the flower classifier, which allow
classification according to outer (encapsulation) headers attributes
such as key and IP addresses.
In order to expose those attributes outside flower, add
corresponding enums in the flow dissector.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d9534927d93b..4e7cf38a7750 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -128,6 +128,10 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
 	FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
 	FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
+	FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
+	FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
+	FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
-- 
cgit v1.2.3


From f4d997fd613001e612543339e0275c037f94ffe9 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:39 +0200
Subject: net/sched: cls_flower: Add UDP port to tunnel parameters

The current IP tunneling classification supports only IP addresses and key.
Enhance UDP based IP tunneling classification parameters by adding UDP
src and dst port.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 4e7cf38a7750..c4f31666afd2 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -132,6 +132,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
 	FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 	FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
+	FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
 
 	FLOW_DISSECTOR_KEY_MAX,
 };
-- 
cgit v1.2.3


From 24ba898d43e87f9ac87353c7a13eef4ee726cab7 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Mon, 7 Nov 2016 15:14:40 +0200
Subject: net/dst: Add dst port to dst_metadata utility functions

Add dst port parameter to __ip_tun_set_dst and __ipv6_tun_set_dst
utility functions.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_metadata.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 6965c8f68ade..701fc814d0af 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -115,6 +115,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb
 static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
 						    __be32 daddr,
 						    __u8 tos, __u8 ttl,
+						    __be16 tp_dst,
 						    __be16 flags,
 						    __be64 tunnel_id,
 						    int md_size)
@@ -127,7 +128,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
 
 	ip_tunnel_key_init(&tun_dst->u.tun_info.key,
 			   saddr, daddr, tos, ttl,
-			   0, 0, 0, tunnel_id, flags);
+			   0, 0, tp_dst, tunnel_id, flags);
 	return tun_dst;
 }
 
@@ -139,12 +140,13 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 	const struct iphdr *iph = ip_hdr(skb);
 
 	return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
-				flags, tunnel_id, md_size);
+				0, flags, tunnel_id, md_size);
 }
 
 static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr,
 						      const struct in6_addr *daddr,
 						      __u8 tos, __u8 ttl,
+						      __be16 tp_dst,
 						      __be32 label,
 						      __be16 flags,
 						      __be64 tunnel_id,
@@ -162,7 +164,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad
 	info->key.tun_flags = flags;
 	info->key.tun_id = tunnel_id;
 	info->key.tp_src = 0;
-	info->key.tp_dst = 0;
+	info->key.tp_dst = tp_dst;
 
 	info->key.u.ipv6.src = *saddr;
 	info->key.u.ipv6.dst = *daddr;
@@ -183,7 +185,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
 
 	return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr,
 				  ipv6_get_dsfield(ip6h), ip6h->hop_limit,
-				  ip6_flowlabel(ip6h), flags, tunnel_id,
+				  0, ip6_flowlabel(ip6h), flags, tunnel_id,
 				  md_size);
 }
 #endif /* __NET_DST_METADATA_H */
-- 
cgit v1.2.3


From 4e24877e61e8507c0843e4bddbc6ecccbfd2e87d Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sun, 6 Nov 2016 21:15:51 +0800
Subject: netfilter: nf_tables: simplify the basic expressions' init routine

Some basic expressions are built into nf_tables.ko, such as nft_cmp,
nft_lookup, nft_range and so on. But these basic expressions' init
routine is a little ugly, too many goto errX labels, and we forget
to call nft_range_module_exit in the exit routine, although it is
harmless.

Acctually, the init and exit routines of these basic expressions
are same, i.e. do nft_register_expr in the init routine and do
nft_unregister_expr in the exit routine.

So it's better to arrange them into an array and deal with them
together.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 00f4f6b1b1ba..862373d4ea9d 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -1,12 +1,18 @@
 #ifndef _NET_NF_TABLES_CORE_H
 #define _NET_NF_TABLES_CORE_H
 
+extern struct nft_expr_type nft_imm_type;
+extern struct nft_expr_type nft_cmp_type;
+extern struct nft_expr_type nft_lookup_type;
+extern struct nft_expr_type nft_bitwise_type;
+extern struct nft_expr_type nft_byteorder_type;
+extern struct nft_expr_type nft_payload_type;
+extern struct nft_expr_type nft_dynset_type;
+extern struct nft_expr_type nft_range_type;
+
 int nf_tables_core_module_init(void);
 void nf_tables_core_module_exit(void);
 
-int nft_immediate_module_init(void);
-void nft_immediate_module_exit(void);
-
 struct nft_cmp_fast_expr {
 	u32			data;
 	enum nft_registers	sreg:8;
@@ -25,24 +31,6 @@ static inline u32 nft_cmp_fast_mask(unsigned int len)
 
 extern const struct nft_expr_ops nft_cmp_fast_ops;
 
-int nft_cmp_module_init(void);
-void nft_cmp_module_exit(void);
-
-int nft_range_module_init(void);
-void nft_range_module_exit(void);
-
-int nft_lookup_module_init(void);
-void nft_lookup_module_exit(void);
-
-int nft_dynset_module_init(void);
-void nft_dynset_module_exit(void);
-
-int nft_bitwise_module_init(void);
-void nft_bitwise_module_exit(void);
-
-int nft_byteorder_module_init(void);
-void nft_byteorder_module_exit(void);
-
 struct nft_payload {
 	enum nft_payload_bases	base:8;
 	u8			offset;
@@ -62,7 +50,4 @@ struct nft_payload_set {
 extern const struct nft_expr_ops nft_payload_fast_ops;
 extern struct static_key_false nft_trace_enabled;
 
-int nft_payload_module_init(void);
-void nft_payload_module_exit(void);
-
 #endif /* _NET_NF_TABLES_CORE_H */
-- 
cgit v1.2.3


From 0e54d2179f650bac80d89a9def429dbdbed58c11 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Mon, 7 Nov 2016 18:31:17 +0100
Subject: netfilter: conntrack: simplify init/uninit of L4 protocol trackers

modify registration and deregistration of layer-4 protocol trackers to
facilitate inclusion of new elements into the current list of builtin
protocols. Both builtin (TCP, UDP, ICMP) and non-builtin (DCCP, GRE, SCTP,
UDPlite) layer-4 protocol trackers usually register/deregister themselves
using consecutive calls to nf_ct_l4proto_{,pernet}_{,un}register(...).
This sequence is interrupted and rolled back in case of error; in order to
simplify addition of builtin protocols, the input of the above functions
has been modified to allow registering/unregistering multiple protocols.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index de629f1520df..2152b70626d5 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -125,14 +125,24 @@ struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto,
 void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p);
 
 /* Protocol pernet registration. */
+int nf_ct_l4proto_pernet_register_one(struct net *net,
+				      struct nf_conntrack_l4proto *proto);
+void nf_ct_l4proto_pernet_unregister_one(struct net *net,
+					 struct nf_conntrack_l4proto *proto);
 int nf_ct_l4proto_pernet_register(struct net *net,
-				  struct nf_conntrack_l4proto *proto);
+				  struct nf_conntrack_l4proto *proto[],
+				  unsigned int num_proto);
 void nf_ct_l4proto_pernet_unregister(struct net *net,
-				     struct nf_conntrack_l4proto *proto);
+				     struct nf_conntrack_l4proto *proto[],
+				     unsigned int num_proto);
 
 /* Protocol global registration. */
-int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto);
-void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto);
+int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto);
+void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *proto);
+int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[],
+			   unsigned int num_proto);
+void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[],
+			      unsigned int num_proto);
 
 /* Generic netlink helpers */
 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
-- 
cgit v1.2.3


From 1ababeba4a21f3dba3da3523c670b207fb2feb62 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:39 +0100
Subject: ipv6: implement dataplane support for rthdr type 4 (Segment Routing
 Header)

Implement minimal support for processing of SR-enabled packets
as described in
https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-02.

This patch implements the following operations:
- Intermediate segment endpoint: incrementation of active segment and rerouting.
- Egress for SR-encapsulated packets: decapsulation of outer IPv6 header + SRH
  and routing of inner packet.
- Cleanup flag support for SR-inlined packets: removal of SRH if we are the
  penultimate segment endpoint.

A per-interface sysctl seg6_enabled is provided, to accept/deny SR-enabled
packets. Default is deny.

This patch does not provide support for HMAC-signed packets.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/seg6.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 include/net/seg6.h

(limited to 'include/net')

diff --git a/include/net/seg6.h b/include/net/seg6.h
new file mode 100644
index 000000000000..4dd52a7e95f1
--- /dev/null
+++ b/include/net/seg6.h
@@ -0,0 +1,36 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_H
+#define _NET_SEG6_H
+
+static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
+				     __be32 to)
+{
+	__be32 diff[] = { ~from, to };
+
+	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
+}
+
+static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
+				      __be32 *to)
+{
+	__be32 diff[] = {
+		~from[0], ~from[1], ~from[2], ~from[3],
+		to[0], to[1], to[2], to[3],
+	};
+
+	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
+}
+
+#endif
-- 
cgit v1.2.3


From 915d7e5e5930b4f01d0971d93b9b25ed17d221aa Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:40 +0100
Subject: ipv6: sr: add code base for control plane support of SR-IPv6

This patch adds the necessary hooks and structures to provide support
for SR-IPv6 control plane, essentially the Generic Netlink commands
that will be used for userspace control over the Segment Routing
kernel structures.

The genetlink commands provide control over two different structures:
tunnel source and HMAC data. The tunnel source is the source address
that will be used by default when encapsulating packets into an
outer IPv6 header + SRH. If the tunnel source is set to :: then an
address of the outgoing interface will be selected as the source.

The HMAC commands currently just return ENOTSUPP and will be implemented
in a future patch.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv6.h |  1 +
 include/net/seg6.h       | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 10d0848f5b8a..de7745e2edcc 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -85,6 +85,7 @@ struct netns_ipv6 {
 #endif
 	atomic_t		dev_addr_genid;
 	atomic_t		fib6_sernum;
+	struct seg6_pernet_data *seg6_data;
 };
 
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
diff --git a/include/net/seg6.h b/include/net/seg6.h
index 4dd52a7e95f1..7c7b8ed39661 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -14,6 +14,9 @@
 #ifndef _NET_SEG6_H
 #define _NET_SEG6_H
 
+#include <linux/net.h>
+#include <linux/ipv6.h>
+
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
 {
@@ -33,4 +36,17 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
 	skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum);
 }
 
+struct seg6_pernet_data {
+	struct mutex lock;
+	struct in6_addr __rcu *tun_src;
+};
+
+static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
+{
+	return net->ipv6.seg6_data;
+}
+
+extern int seg6_init(void);
+extern void seg6_exit(void);
+
 #endif
-- 
cgit v1.2.3


From 6c8702c60b88651072460f3f4026c7dfe2521d12 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:41 +0100
Subject: ipv6: sr: add support for SRH encapsulation and injection with
 lwtunnels

This patch creates a new type of interfaceless lightweight tunnel (SEG6),
enabling the encapsulation and injection of SRH within locally emitted
packets and forwarded packets.

>From a configuration viewpoint, a seg6 tunnel would be configured as follows:

  ip -6 ro ad fc00::1/128 encap seg6 mode encap segs fc42::1,fc42::2,fc42::3 dev eth0

Any packet whose destination address is fc00::1 would thus be encapsulated
within an outer IPv6 header containing the SRH with three segments, and would
actually be routed to the first segment of the list. If `mode inline' was
specified instead of `mode encap', then the SRH would be directly inserted
after the IPv6 header without outer encapsulation.

The inline mode is only available if CONFIG_IPV6_SEG6_INLINE is enabled. This
feature was made configurable because direct header insertion may break
several mechanisms such as PMTUD or IPSec AH.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/seg6.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/net')

diff --git a/include/net/seg6.h b/include/net/seg6.h
index 7c7b8ed39661..ff5da0ce83e9 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -16,6 +16,8 @@
 
 #include <linux/net.h>
 #include <linux/ipv6.h>
+#include <net/lwtunnel.h>
+#include <linux/seg6.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
@@ -48,5 +50,9 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
 
 extern int seg6_init(void);
 extern void seg6_exit(void);
+extern int seg6_iptunnel_init(void);
+extern void seg6_iptunnel_exit(void);
+
+extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len);
 
 #endif
-- 
cgit v1.2.3


From bf355b8d2c30a289232042cacc1cfaea4923936c Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:57:42 +0100
Subject: ipv6: sr: add core files for SR HMAC support

This patch adds the necessary functions to compute and check the HMAC signature
of an SR-enabled packet. Two HMAC algorithms are supported: hmac(sha1) and
hmac(sha256).

In order to avoid dynamic memory allocation for each HMAC computation,
a per-cpu ring buffer is allocated for this purpose.

A new per-interface sysctl called seg6_require_hmac is added, allowing a
user-defined policy for processing HMAC-signed SR-enabled packets.
A value of -1 means that the HMAC field will always be ignored.
A value of 0 means that if an HMAC field is present, its validity will
be enforced (the packet is dropped is the signature is incorrect).
Finally, a value of 1 means that any SR-enabled packet that does not
contain an HMAC signature or whose signature is incorrect will be dropped.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/seg6.h      |  4 ++++
 include/net/seg6_hmac.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 include/net/seg6_hmac.h

(limited to 'include/net')

diff --git a/include/net/seg6.h b/include/net/seg6.h
index ff5da0ce83e9..4e0357517d79 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -18,6 +18,7 @@
 #include <linux/ipv6.h>
 #include <net/lwtunnel.h>
 #include <linux/seg6.h>
+#include <linux/rhashtable.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
@@ -41,6 +42,9 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from,
 struct seg6_pernet_data {
 	struct mutex lock;
 	struct in6_addr __rcu *tun_src;
+#ifdef CONFIG_IPV6_SEG6_HMAC
+	struct rhashtable hmac_infos;
+#endif
 };
 
 static inline struct seg6_pernet_data *seg6_pernet(struct net *net)
diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h
new file mode 100644
index 000000000000..69c3a106056b
--- /dev/null
+++ b/include/net/seg6_hmac.h
@@ -0,0 +1,62 @@
+/*
+ *  SR-IPv6 implementation
+ *
+ *  Author:
+ *  David Lebrun <david.lebrun@uclouvain.be>
+ *
+ *
+ *  This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _NET_SEG6_HMAC_H
+#define _NET_SEG6_HMAC_H
+
+#include <net/flow.h>
+#include <net/ip6_fib.h>
+#include <net/sock.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/route.h>
+#include <net/seg6.h>
+#include <linux/seg6_hmac.h>
+#include <linux/rhashtable.h>
+
+#define SEG6_HMAC_MAX_DIGESTSIZE	160
+#define SEG6_HMAC_RING_SIZE		256
+
+struct seg6_hmac_info {
+	struct rhash_head node;
+	struct rcu_head rcu;
+
+	u32 hmackeyid;
+	char secret[SEG6_HMAC_SECRET_LEN];
+	u8 slen;
+	u8 alg_id;
+};
+
+struct seg6_hmac_algo {
+	u8 alg_id;
+	char name[64];
+	struct crypto_shash * __percpu *tfms;
+	struct shash_desc * __percpu *shashs;
+};
+
+extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo,
+			     struct ipv6_sr_hdr *hdr, struct in6_addr *saddr,
+			     u8 *output);
+extern struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key);
+extern int seg6_hmac_info_add(struct net *net, u32 key,
+			      struct seg6_hmac_info *hinfo);
+extern int seg6_hmac_info_del(struct net *net, u32 key);
+extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
+			  struct ipv6_sr_hdr *srh);
+extern bool seg6_hmac_validate_skb(struct sk_buff *skb);
+extern int seg6_hmac_init(void);
+extern void seg6_hmac_exit(void);
+extern int seg6_hmac_net_init(struct net *net);
+extern void seg6_hmac_net_exit(struct net *net);
+
+#endif
-- 
cgit v1.2.3


From 613fa3ca9e9e6af57927dab238121010c510fe4c Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 8 Nov 2016 14:59:20 +0100
Subject: ipv6: add source address argument for ipv6_push_nfrag_opts

This patch prepares for insertion of SRH through setsockopt().
The new source address argument is used when an HMAC field is
present in the SRH, which must be filled. The HMAC signature
process requires the source address as input text.

Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 8fed1cd78658..0a3622bf086f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -932,7 +932,8 @@ int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
  */
 
 void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
-			  u8 *proto, struct in6_addr **daddr_p);
+			  u8 *proto, struct in6_addr **daddr_p,
+			  struct in6_addr *saddr);
 void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
 			 u8 *proto);
 
-- 
cgit v1.2.3


From f41cd11d64b2b21012eb4abffbe579bc0b90467f Mon Sep 17 00:00:00 2001
From: Yotam Gigi <yotamg@mellanox.com>
Date: Tue, 8 Nov 2016 17:24:03 +0200
Subject: tc_act: Remove tcf_act macro

tc_act macro addressed a non existing field, and was not used in the
kernel source.

Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 82f3c912a5b1..d8eae87ea778 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -42,7 +42,6 @@ struct tc_action {
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 };
-#define tcf_act		common.tcfa_act
 #define tcf_head	common.tcfa_head
 #define tcf_index	common.tcfa_index
 #define tcf_refcnt	common.tcfa_refcnt
-- 
cgit v1.2.3


From 98e4321b97cc790c6aac3fb7c92bbf13212452ee Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 13 Nov 2016 12:14:59 -0500
Subject: genetlink: Make family a signed integer.

The idr_alloc(), idr_remove(), et al. routines all expect IDs to be
signed integers.  Therefore make the genl_family member 'id' signed
too.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/genetlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 3ec87bacc0f5..a34275be3600 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -48,7 +48,7 @@ struct genl_info;
  * @n_ops: number of operations supported by this family
  */
 struct genl_family {
-	unsigned int		id;		/* private */
+	int			id;		/* private */
 	unsigned int		hdrsize;
 	char			name[GENL_NAMSIZ];
 	unsigned int		version;
-- 
cgit v1.2.3


From 7e416ad7416307e22871a0ef0f0f14e2bb66a0d1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 10 Nov 2016 14:17:01 +0100
Subject: netfilter: conntrack: remove unused netns_ct member

since 23014011ba420 ('netfilter: conntrack: support a fixed size of 128 distinct labels')
this isn't needed anymore.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/conntrack.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index e469e85de3f9..3d06d94d2e52 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -91,7 +91,6 @@ struct netns_ct {
 	struct nf_ip_net	nf_ct_proto;
 #if defined(CONFIG_NF_CONNTRACK_LABELS)
 	unsigned int		labels_used;
-	u8			label_words;
 #endif
 };
 #endif
-- 
cgit v1.2.3


From d9dc8b0f8b4ec8cdc48ad5a20a3105387138be82 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Fri, 11 Nov 2016 10:20:50 -0800
Subject: net: fix sleeping for sk_wait_event()

Similar to commit 14135f30e33c ("inet: fix sleeping inside inet_wait_for_connect()"),
sk_wait_event() needs to fix too, because release_sock() is blocking,
it changes the process state back to running after sleep, which breaks
the previous prepare_to_wait().

Switch to the new wait API.

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index cf617ee16723..9d905ed0cd25 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -915,14 +915,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
 #endif
 }
 
-#define sk_wait_event(__sk, __timeo, __condition)			\
+#define sk_wait_event(__sk, __timeo, __condition, __wait)		\
 	({	int __rc;						\
 		release_sock(__sk);					\
 		__rc = __condition;					\
 		if (!__rc) {						\
-			*(__timeo) = schedule_timeout(*(__timeo));	\
+			*(__timeo) = wait_woken(__wait,			\
+						TASK_INTERRUPTIBLE,	\
+						*(__timeo));		\
 		}							\
-		sched_annotate_sleep();						\
+		sched_annotate_sleep();					\
 		lock_sock(__sk);					\
 		__rc = __condition;					\
 		__rc;							\
-- 
cgit v1.2.3


From c915fe13cbaae5c7aa7b44f367d05addd60c9008 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 15 Nov 2016 16:37:53 +0100
Subject: udplite: fix NULL pointer dereference

The commit 850cbaddb52d ("udp: use it's own memory accounting schema")
assumes that the socket proto has memory accounting enabled,
but this is not the case for UDPLITE.
Fix it enabling memory accounting for UDPLITE and performing
fwd allocated memory reclaiming on socket shutdown.
UDP and UDPLITE share now the same memory accounting limits.
Also drop the backlog receive operation, since is no more needed.

Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema")
Reported-by: Andrei Vagin <avagin@gmail.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp.h     | 1 +
 include/net/udplite.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/udp.h b/include/net/udp.h
index e6e4e19be387..1661791e8ca1 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -246,6 +246,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 }
 
 /* net/ipv4/udp.c */
+void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/udplite.h b/include/net/udplite.h
index 80761938b9a7..36097d388219 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -27,6 +27,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 static inline int udplite_sk_init(struct sock *sk)
 {
 	udp_sk(sk)->pcflag = UDPLITE_BIT;
+	sk->sk_destruct = udp_destruct_sock;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9efdb92d68c726e70066e5b4189c1186c9b6f90c Mon Sep 17 00:00:00 2001
From: pravin shelar <pshelar@ovn.org>
Date: Sun, 13 Nov 2016 20:43:58 -0800
Subject: vxlan: remove unsed vxlan_dev_dst_port()

Signed-off-by: Pravin B Shelar <pshelar@ovn.org>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/vxlan.h | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'include/net')

diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 308adc4154f4..49a59202f85e 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -281,16 +281,6 @@ struct vxlan_dev {
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
 
-static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan,
-					unsigned short family)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	if (family == AF_INET6)
-		return inet_sk(vxlan->vn6_sock->sock->sk)->inet_sport;
-#endif
-	return inet_sk(vxlan->vn4_sock->sock->sk)->inet_sport;
-}
-
 static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
 						     netdev_features_t features)
 {
-- 
cgit v1.2.3


From 21cb84c48ca0619181106f0f44f3802a989de024 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 15 Nov 2016 10:15:12 -0800
Subject: net: busy-poll: remove need_resched() from sk_can_busy_loop()

Now sk_busy_loop() can schedule by itself, we can remove
need_resched() check from sk_can_busy_loop()

Also add a const to its struct sock parameter.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: Adam Belay <abelay@google.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
Cc: Ariel Elior <ariel.elior@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 2fbeb1313c0f..965e52b9b5a3 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void)
 	return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll);
 }
 
-static inline bool sk_can_busy_loop(struct sock *sk)
+static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return sk->sk_ll_usec && sk->sk_napi_id &&
-	       !need_resched() && !signal_pending(current);
+	return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current);
 }
 
 
-- 
cgit v1.2.3


From a23a8f5bc17e6209eefadcd1cb3d3e28a2cdf530 Mon Sep 17 00:00:00 2001
From: David Lebrun <david.lebrun@uclouvain.be>
Date: Wed, 16 Nov 2016 10:05:46 +0100
Subject: lwtunnel: subtract tunnel headroom from mtu on output redirect

This patch changes the lwtunnel_headroom() function which is called
in ipv4_mtu() and ip6_mtu(), to also return the correct headroom
value when the lwtunnel state is OUTPUT_REDIRECT.

This patch enables e.g. SR-IPv6 encapsulations to work without
manually setting the route mtu.

Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 82e76fe1c1f7..d4c1c75b8862 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -94,7 +94,8 @@ static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
 static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
 					     unsigned int mtu)
 {
-	if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu)
+	if ((lwtunnel_xmit_redirect(lwtstate) ||
+	     lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu)
 		return lwtstate->headroom;
 
 	return 0;
-- 
cgit v1.2.3


From 7fda702f9315e6f4a74fee155c540750788a2d66 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 15 Nov 2016 23:23:11 +0800
Subject: sctp: use new rhlist interface on sctp transport rhashtable

Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key
to hash a node to one chain. If in one host thousands of assocs connect
to one server with the same lport and different laddrs (although it's
not a normal case), all the transports would be hashed into the same
chain.

It may cause to keep returning -EBUSY when inserting a new node, as the
chain is too long and sctp inserts a transport node in a loop, which
could even lead to system hangs there.

The new rhlist interface works for this case that there are many nodes
with the same key in one chain. It puts them into a list then makes this
list be as a node of the chain.

This patch is to replace rhashtable_ interface with rhltable_ interface.
Since a chain would not be too long and it would not return -EBUSY with
this fix when inserting a node, the reinsert loop is also removed here.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h    | 2 +-
 include/net/sctp/structs.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 31acc3f4f132..f0dcaebebddb 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -164,7 +164,7 @@ void sctp_backlog_migrate(struct sctp_association *assoc,
 			  struct sock *oldsk, struct sock *newsk);
 int sctp_transport_hashtable_init(void);
 void sctp_transport_hashtable_destroy(void);
-void sctp_hash_transport(struct sctp_transport *t);
+int sctp_hash_transport(struct sctp_transport *t);
 void sctp_unhash_transport(struct sctp_transport *t);
 struct sctp_transport *sctp_addrs_lookup_transport(
 				struct net *net,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index bd4a3ded7c87..92daabdc007d 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -124,7 +124,7 @@ extern struct sctp_globals {
 	/* This is the sctp port control hash.	*/
 	struct sctp_bind_hashbucket *port_hashtable;
 	/* This is the hash of all transports. */
-	struct rhashtable transport_hashtable;
+	struct rhltable transport_hashtable;
 
 	/* Sizes of above hashtables. */
 	int ep_hashsize;
@@ -761,7 +761,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet)
 struct sctp_transport {
 	/* A list of transports. */
 	struct list_head transports;
-	struct rhash_head node;
+	struct rhlist_head node;
 
 	/* Reference counting. */
 	atomic_t refcnt;
-- 
cgit v1.2.3


From e68b6e50fa359cc5aad4d2f8ac2bdbc1a8f4fd59 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 16 Nov 2016 09:10:42 -0800
Subject: udp: enable busy polling for all sockets

UDP busy polling is restricted to connected UDP sockets.

This is because sk_busy_loop() only takes care of one NAPI context.

There are cases where it could be extended.

1) Some hosts receive traffic on a single NIC, with one RX queue.

2) Some applications use SO_REUSEPORT and associated BPF filter
   to split the incoming traffic on one UDP socket per RX
queue/thread/cpu

3) Some UDP sockets are used to send/receive traffic for one flow, but
they do not bother with connect()

This patch records the napi_id of first received skb, giving more
reach to busy polling.

Tested:

lpaa23:~# echo 70 >/proc/sys/net/core/busy_read
lpaa24:~# echo 70 >/proc/sys/net/core/busy_read

lpaa23:~# for f in `seq 1 10`; do ./super_netperf 1 -H lpaa24 -t UDP_RR -l 5; done

Before patch :
   27867   28870   37324   41060   41215
   36764   36838   44455   41282   43843
After patch :
   73920   73213   70147   74845   71697
   68315   68028   75219   70082   73707

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'include/net')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 965e52b9b5a3..d73b849e29a6 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -80,11 +80,6 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 	skb->napi_id = napi->napi_id;
 }
 
-/* used in the protocol hanlder to propagate the napi_id to the socket */
-static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb)
-{
-	sk->sk_napi_id = skb->napi_id;
-}
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
 static inline unsigned long net_busy_loop_on(void)
@@ -107,10 +102,6 @@ static inline void skb_mark_napi_id(struct sk_buff *skb,
 {
 }
 
-static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb)
-{
-}
-
 static inline bool busy_loop_timeout(unsigned long end_time)
 {
 	return true;
@@ -122,4 +113,23 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
 }
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/* used in the protocol hanlder to propagate the napi_id to the socket */
+static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	sk->sk_napi_id = skb->napi_id;
+#endif
+}
+
+/* variant used for unconnected sockets */
+static inline void sk_mark_napi_id_once(struct sock *sk,
+					const struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (!sk->sk_napi_id)
+		sk->sk_napi_id = skb->napi_id;
+#endif
+}
+
 #endif /* _LINUX_NET_BUSY_POLL_H */
-- 
cgit v1.2.3


From c7d03a00b56fc23c3a01a8353789ad257363e281 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 17 Nov 2016 04:58:21 +0300
Subject: netns: make struct pernet_operations::id unsigned int

Make struct pernet_operations::id unsigned.

There are 2 reasons to do so:

1)
This field is really an index into an zero based array and
thus is unsigned entity. Using negative value is out-of-bound
access by definition.

2)
On x86_64 unsigned 32-bit data which are mixed with pointers
via array indexing or offsets added or subtracted to pointers
are preffered to signed 32-bit data.

"int" being used as an array index needs to be sign-extended
to 64-bit before being used.

	void f(long *p, int i)
	{
		g(p[i]);
	}

  roughly translates to

	movsx	rsi, esi
	mov	rdi, [rsi+...]
	call 	g

MOVSX is 3 byte instruction which isn't necessary if the variable is
unsigned because x86_64 is zero extending by default.

Now, there is net_generic() function which, you guessed it right, uses
"int" as an array index:

	static inline void *net_generic(const struct net *net, int id)
	{
		...
		ptr = ng->ptr[id - 1];
		...
	}

And this function is used a lot, so those sign extensions add up.

Patch snipes ~1730 bytes on allyesconfig kernel (without all junk
messing with code generation):

	add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)

Unfortunately some functions actually grow bigger.
This is a semmingly random artefact of code generation with register
allocator being used differently. gcc decides that some variable
needs to live in new r8+ registers and every access now requires REX
prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be
used which is longer than [r8]

However, overall balance is in negative direction:

	add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730)
	function                                     old     new   delta
	nfsd4_lock                                  3886    3959     +73
	tipc_link_build_proto_msg                   1096    1140     +44
	mac80211_hwsim_new_radio                    2776    2808     +32
	tipc_mon_rcv                                1032    1058     +26
	svcauth_gss_legacy_init                     1413    1429     +16
	tipc_bcbase_select_primary                   379     392     +13
	nfsd4_exchange_id                           1247    1260     +13
	nfsd4_setclientid_confirm                    782     793     +11
		...
	put_client_renew_locked                      494     480     -14
	ip_set_sockfn_get                            730     716     -14
	geneve_sock_add                              829     813     -16
	nfsd4_sequence_done                          721     703     -18
	nlmclnt_lookup_host                          708     686     -22
	nfsd4_lockt                                 1085    1063     -22
	nfs_get_client                              1077    1050     -27
	tcf_bpf_init                                1106    1076     -30
	nfsd4_encode_fattr                          5997    5930     -67
	Total: Before=154856051, After=154854321, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/bonding.h                         | 2 +-
 include/net/ip_tunnels.h                      | 6 +++---
 include/net/net_namespace.h                   | 2 +-
 include/net/netfilter/nf_conntrack_l4proto.h  | 2 +-
 include/net/netfilter/nf_conntrack_synproxy.h | 2 +-
 include/net/netns/generic.h                   | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bonding.h b/include/net/bonding.h
index f32f7ef8a23a..3c857778a6ca 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -681,7 +681,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
 }
 
 /* exported from bond_main.c */
-extern int bond_net_id;
+extern unsigned int bond_net_id;
 extern const struct bond_parm_tbl bond_lacp_tbl[];
 extern const struct bond_parm_tbl xmit_hashtype_tbl[];
 extern const struct bond_parm_tbl arp_validate_tbl[];
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 59557c07904b..e893fe43dd13 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -129,7 +129,7 @@ struct ip_tunnel {
 #endif
 	struct ip_tunnel_prl_entry __rcu *prl;	/* potential router list */
 	unsigned int		prl_count;	/* # of entries in PRL */
-	int			ip_tnl_net_id;
+	unsigned int		ip_tnl_net_id;
 	struct gro_cells	gro_cells;
 	bool			collect_md;
 	bool			ignore_df;
@@ -248,7 +248,7 @@ void ip_tunnel_uninit(struct net_device *dev);
 void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
 struct net *ip_tunnel_get_link_net(const struct net_device *dev);
 int ip_tunnel_get_iflink(const struct net_device *dev);
-int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
@@ -275,7 +275,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 			 struct ip_tunnel_parm *p);
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p);
-void ip_tunnel_setup(struct net_device *dev, int net_id);
+void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
 struct ip_tunnel_encap_ops {
 	size_t (*encap_hlen)(struct ip_tunnel_encap *e);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fc4f757107df..d7149e93a60a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -291,7 +291,7 @@ struct pernet_operations {
 	int (*init)(struct net *net);
 	void (*exit)(struct net *net);
 	void (*exit_batch)(struct list_head *net_exit_list);
-	int *id;
+	unsigned int *id;
 	size_t size;
 };
 
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 2152b70626d5..e7b836590f0b 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -98,7 +98,7 @@ struct nf_conntrack_l4proto {
 		const struct nla_policy *nla_policy;
 	} ctnl_timeout;
 #endif
-	int	*net_id;
+	unsigned int	*net_id;
 	/* Init l4proto pernet data */
 	int (*init_net)(struct net *net, u_int16_t proto);
 
diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h
index e6937318546c..b0ca402c1f72 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -54,7 +54,7 @@ struct synproxy_net {
 	struct synproxy_stats __percpu	*stats;
 };
 
-extern int synproxy_net_id;
+extern unsigned int synproxy_net_id;
 static inline struct synproxy_net *synproxy_pernet(struct net *net)
 {
 	return net_generic(net, synproxy_net_id);
diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 70e158551704..d315786bcfd7 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -31,7 +31,7 @@ struct net_generic {
 	void *ptr[0];
 };
 
-static inline void *net_generic(const struct net *net, int id)
+static inline void *net_generic(const struct net *net, unsigned int id)
 {
 	struct net_generic *ng;
 	void *ptr;
-- 
cgit v1.2.3


From 3b2c75d371740fb0dcd0c9eac545ab1dd28b4706 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 19 Nov 2016 03:54:35 +0300
Subject: netlink: use "unsigned int" in nla_next()

->nla_len is unsigned entity (it's length after all) and u16,
thus it can't overflow when being aligned into int/unsigned int.

(nlmsg_next has the same code, but I didn't yet convince myself
it is correct to do so).

There is pointer arithmetic in this function and offset being
unsigned is better:

	add/remove: 0/0 grow/shrink: 1/64 up/down: 5/-309 (-304)
	function                                     old     new   delta
	nl80211_set_wiphy                           1444    1449      +5
	team_nl_cmd_options_set                      997     995      -2
	tcf_em_tree_validate                         872     870      -2
	switchdev_port_bridge_setlink                352     350      -2
	switchdev_port_br_afspec                     312     310      -2
	rtm_to_fib_config                            428     426      -2
	qla4xxx_sysfs_ddb_set_param                 2193    2191      -2
	qla4xxx_iface_set_param                     4470    4468      -2
	ovs_nla_free_flow_actions                    152     150      -2
	output_userspace                             518     516      -2
		...
	nl80211_set_reg                              654     649      -5
	validate_scan_freqs                          148     142      -6
	validate_linkmsg                             288     282      -6
	nl80211_parse_connkeys                       489     483      -6
	nlattr_set                                   231     224      -7
	nf_tables_delsetelem                         267     260      -7
	do_setlink                                  3416    3408      -8
	netlbl_cipsov4_add_std                      1672    1659     -13
	nl80211_parse_sched_scan                    2902    2888     -14
	nl80211_trigger_scan                        1738    1720     -18
	do_execute_actions                          2821    2738     -83
	Total: Before=154865355, After=154865051, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index a34f53acb6d6..d3938f11ae52 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -713,7 +713,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining)
  */
 static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 {
-	int totlen = NLA_ALIGN(nla->nla_len);
+	unsigned int totlen = NLA_ALIGN(nla->nla_len);
 
 	*remaining -= totlen;
 	return (struct nlattr *) ((char *) nla + totlen);
-- 
cgit v1.2.3


From e97991832a4ea4a5f47d65f068a4c966a2eb5730 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 21 Nov 2016 14:18:38 +0100
Subject: tcp: make undo_cwnd mandatory for congestion modules

The undo_cwnd fallback in the stack doubles cwnd based on ssthresh,
which un-does reno halving behaviour.

It seems more appropriate to let congctl algorithms pair .ssthresh
and .undo_cwnd properly. Add a 'tcp_reno_undo_cwnd' function and wire it
up for all congestion algorithms that used to rely on the fallback.

Cc: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 123979fe12bf..7de80739adab 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -958,6 +958,7 @@ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
 u32 tcp_reno_ssthresh(struct sock *sk);
+u32 tcp_reno_undo_cwnd(struct sock *sk);
 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
 extern struct tcp_congestion_ops tcp_reno;
 
-- 
cgit v1.2.3


From 59bfde01fab0c4550778cd53e8d266f1dfddf7b7 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@mellanox.com>
Date: Tue, 22 Nov 2016 23:09:57 +0200
Subject: devlink: Add E-Switch inline mode control

Some HWs need the VF driver to put part of the packet headers on the
TX descriptor so the e-switch can do proper matching and steering.

The supported modes: none, link, network, transport.

Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/devlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 211bd3c37028..d29e5fc82582 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -92,6 +92,8 @@ struct devlink_ops {
 
 	int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode);
 	int (*eswitch_mode_set)(struct devlink *devlink, u16 mode);
+	int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode);
+	int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode);
 };
 
 static inline void *devlink_priv(struct devlink *devlink)
-- 
cgit v1.2.3


From 21fcf572716f61926f5d23a358e79d06a9d4d54f Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Wed, 5 Oct 2016 22:51:42 +0200
Subject: Bluetooth: __ variants of u8 and friends are not neccessary inside
 kernel

bluetooth.h is not part of user API, so __ variants are not neccessary
here.

Signed-off-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 include/net/bluetooth/bluetooth.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'include/net')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 0a1e21d7bce1..01487192f628 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -197,7 +197,7 @@ typedef struct {
 #define BDADDR_LE_PUBLIC	0x01
 #define BDADDR_LE_RANDOM	0x02
 
-static inline bool bdaddr_type_is_valid(__u8 type)
+static inline bool bdaddr_type_is_valid(u8 type)
 {
 	switch (type) {
 	case BDADDR_BREDR:
@@ -209,7 +209,7 @@ static inline bool bdaddr_type_is_valid(__u8 type)
 	return false;
 }
 
-static inline bool bdaddr_type_is_le(__u8 type)
+static inline bool bdaddr_type_is_le(u8 type)
 {
 	switch (type) {
 	case BDADDR_LE_PUBLIC:
@@ -279,15 +279,16 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock);
 
 /* Skb helpers */
 struct l2cap_ctrl {
-	__u8	sframe:1,
+	u8	sframe:1,
 		poll:1,
 		final:1,
 		fcs:1,
 		sar:2,
 		super:2;
-	__u16	reqseq;
-	__u16	txseq;
-	__u8	retries;
+
+	u16	reqseq;
+	u16	txseq;
+	u8	retries;
 	__le16  psm;
 	bdaddr_t bdaddr;
 	struct l2cap_chan *chan;
@@ -303,7 +304,7 @@ typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status,
 #define HCI_REQ_SKB	BIT(1)
 
 struct hci_ctrl {
-	__u16 opcode;
+	u16 opcode;
 	u8 req_flags;
 	u8 req_event;
 	union {
@@ -313,10 +314,10 @@ struct hci_ctrl {
 };
 
 struct bt_skb_cb {
-	__u8 pkt_type;
-	__u8 force_active;
-	__u16 expect;
-	__u8 incoming:1;
+	u8 pkt_type;
+	u8 force_active;
+	u16 expect;
+	u8 incoming:1;
 	union {
 		struct l2cap_ctrl l2cap;
 		struct hci_ctrl hci;
@@ -366,7 +367,7 @@ out:
 	return NULL;
 }
 
-int bt_to_errno(__u16 code);
+int bt_to_errno(u16 code);
 
 void hci_sock_set_flag(struct sock *sk, int nr);
 void hci_sock_clear_flag(struct sock *sk, int nr);
-- 
cgit v1.2.3


From 05b055e89121394058c75dc354e9a46e1e765579 Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:13 -0800
Subject: tcp: instrument tcp sender limits chronographs

This patch implements the skeleton of the TCP chronograph
instrumentation on sender side limits:

	1) idle (unspec)
	2) busy sending data other than 3-4 below
	3) rwnd-limited
	4) sndbuf-limited

The limits are enumerated 'tcp_chrono'. Since a connection in
theory can idle forever, we do not track the actual length of this
uninteresting idle period. For the rest we track how long the sender
spends in each limit. At any point during the life time of a
connection, the sender must be in one of the four states.

If there are multiple conditions worthy of tracking in a chronograph
then the highest priority enum takes precedence over
the other conditions. So that if something "more interesting"
starts happening, stop the previous chrono and start a new one.

The time unit is jiffy(u32) in order to save space in tcp_sock.
This implies application must sample the stats no longer than every
49 days of 1ms jiffy.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7de80739adab..e5ff4083870d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1516,6 +1516,20 @@ struct tcp_fastopen_context {
 	struct rcu_head		rcu;
 };
 
+/* Latencies incurred by various limits for a sender. They are
+ * chronograph-like stats that are mutually exclusive.
+ */
+enum tcp_chrono {
+	TCP_CHRONO_UNSPEC,
+	TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
+	TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
+	TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
+	__TCP_CHRONO_MAX,
+};
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
-- 
cgit v1.2.3


From 0f87230d1a6c253681550c6064715d06a32be73d Mon Sep 17 00:00:00 2001
From: Francis Yan <francisyyan@gmail.com>
Date: Sun, 27 Nov 2016 23:07:14 -0800
Subject: tcp: instrument how long TCP is busy sending

This patch measures TCP busy time, which is defined as the period
of time when sender has data (or FIN) to send. The time starts when
data is buffered and stops when the write queue is flushed by ACKs
or error events.

Note the busy time does not include SYN time, unless data is
included in SYN (i.e. Fast Open). It does include FIN time even
if the FIN carries no payload. Excluding pure FIN is possible but
would incur one additional test in the fast path, which may not
be worth it.

Signed-off-by: Francis Yan <francisyyan@gmail.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e5ff4083870d..3e097e39d4d2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1535,6 +1535,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
+	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
 	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
 		sk_wmem_free_skb(sk, skb);
 	sk_mem_reclaim(sk);
@@ -1593,8 +1594,10 @@ static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-	if (sk->sk_send_head == skb_unlinked)
+	if (sk->sk_send_head == skb_unlinked) {
 		sk->sk_send_head = NULL;
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	}
 	if (tcp_sk(sk)->highest_sack == skb_unlinked)
 		tcp_sk(sk)->highest_sack = NULL;
 }
@@ -1616,6 +1619,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
 	/* Queue it, remembering where we must start sending. */
 	if (sk->sk_send_head == NULL) {
 		sk->sk_send_head = skb;
+		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 		if (tcp_sk(sk)->highest_sack == NULL)
 			tcp_sk(sk)->highest_sack = skb;
-- 
cgit v1.2.3


From 95a22caee396cef0bb2ca8fafdd82966a49367bb Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 1 Dec 2016 11:32:06 +0100
Subject: tcp: randomize tcp timestamp offsets for each connection

jiffies based timestamps allow for easy inference of number of devices
behind NAT translators and also makes tracking of hosts simpler.

commit ceaa1fef65a7c2e ("tcp: adding a per-socket timestamp offset")
added the main infrastructure that is needed for per-connection ts
randomization, in particular writing/reading the on-wire tcp header
format takes the offset into account so rest of stack can use normal
tcp_time_stamp (jiffies).

So only two items are left:
 - add a tsoffset for request sockets
 - extend the tcp isn generator to also return another 32bit number
   in addition to the ISN.

Re-use of ISN generator also means timestamps are still monotonically
increasing for same connection quadruple, i.e. PAWS will still work.

Includes fixes from Eric Dumazet.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/secure_seq.h | 8 ++++----
 include/net/tcp.h        | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 3f36d45b714a..0caee631a836 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -6,10 +6,10 @@
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport);
-__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
-				 __be16 sport, __be16 dport);
-__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
-				   __be16 sport, __be16 dport);
+u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
+			       __be16 sport, __be16 dport, u32 *tsoff);
+u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
+				 __be16 sport, __be16 dport, u32 *tsoff);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3e097e39d4d2..207147b4c6b2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1827,7 +1827,7 @@ struct tcp_request_sock_ops {
 	struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl,
 				       const struct request_sock *req,
 				       bool *strict);
-	__u32 (*init_seq)(const struct sk_buff *skb);
+	__u32 (*init_seq)(const struct sk_buff *skb, u32 *tsoff);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   struct tcp_fastopen_cookie *foc,
-- 
cgit v1.2.3


From 55330f05969437c5d22fcc2ae2e54810b5236b7b Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:33 +0200
Subject: net/sched: Add separate check for skip_hw flag

Creating a difference between two possible cases:
1. Not offloading tc rule since the user sets 'skip_hw' flag.
2. Not offloading tc rule since the device doesn't support offloading.

This patch doesn't add any new functionality.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 767b03a3fe67..45ad9aab9bba 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -425,16 +425,14 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(const struct net_device *dev,
-				     const struct tcf_proto *tp, u32 flags)
+static inline bool tc_can_offload(const struct net_device *dev,
+				  const struct tcf_proto *tp)
 {
 	const struct Qdisc *sch = tp->q;
 	const struct Qdisc_class_ops *cops = sch->ops->cl_ops;
 
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
-	if (flags & TCA_CLS_FLAGS_SKIP_HW)
-		return false;
 	if (!dev->netdev_ops->ndo_setup_tc)
 		return false;
 	if (cops && cops->tcf_cl_offload)
@@ -443,6 +441,19 @@ static inline bool tc_should_offload(const struct net_device *dev,
 	return true;
 }
 
+static inline bool tc_skip_hw(u32 flags)
+{
+	return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false;
+}
+
+static inline bool tc_should_offload(const struct net_device *dev,
+				     const struct tcf_proto *tp, u32 flags)
+{
+	if (tc_skip_hw(flags))
+		return false;
+	return tc_can_offload(dev, tp);
+}
+
 static inline bool tc_skip_sw(u32 flags)
 {
 	return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false;
-- 
cgit v1.2.3


From 255cb30425c0ced57d6d85f3e7cddb99b9576046 Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:36 +0200
Subject: net/sched: act_mirred: Add new tc_action_ops get_dev()

Adding support to a new tc_action_ops.
get_dev is a general option which allows to get the underline
device when trying to offload a tc rule.

In case of mirred action the returned device is the mirred (egress)
device.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index d8eae87ea778..9dddf77a69cc 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -119,6 +119,8 @@ struct tc_action_ops {
 	int     (*walk)(struct net *, struct sk_buff *,
 			struct netlink_callback *, int, const struct tc_action_ops *);
 	void	(*stats_update)(struct tc_action *, u64, u32, u64);
+	int	(*get_dev)(const struct tc_action *a, struct net *net,
+			   struct net_device **mirred_dev);
 };
 
 struct tc_action_net {
-- 
cgit v1.2.3


From 7091d8c7055d7310339435ae3af2fb490a92524d Mon Sep 17 00:00:00 2001
From: Hadar Hen Zion <hadarh@mellanox.com>
Date: Thu, 1 Dec 2016 14:06:37 +0200
Subject: net/sched: cls_flower: Add offload support using egress Hardware
 device

In order to support hardware offloading when the device given by the tc
rule is different from the Hardware underline device, extract the mirred
(egress) device from the tc action when a filter is added, using the new
tc_action_ops, get_dev().

Flower caches the information about the mirred device and use it for
calling ndo_setup_tc in filter change, update stats and delete.

Calling ndo_setup_tc of the mirred (egress) device instead of the
ingress device will allow a resolution between the software ingress
device and the underline hardware device.

The resolution will take place inside the offloading driver using
'egress_device' flag added to tc_to_netdev struct which is provided to
the offloading driver.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 45ad9aab9bba..f0a051480c6c 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -171,6 +171,8 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
 		     struct tcf_exts *src);
 int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
 int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
+int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts,
+		     struct net_device **hw_dev);
 
 /**
  * struct tcf_pkt_info - packet information
-- 
cgit v1.2.3


From aa4c1037a30f4e88f444e83d42c2befbe0d5caf5 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Thu, 1 Dec 2016 08:48:06 -0800
Subject: bpf: Add support for reading socket family, type, protocol

Add socket family, type and protocol to bpf_sock allowing bpf programs
read-only access.

Add __sk_flags_offset[0] to struct sock before the bitfield to
programmtically determine the offset of the unsigned int containing
protocol and type.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 442cbb118a07..69afda6bea15 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -389,6 +389,21 @@ struct sock {
 	 * Because of non atomicity rules, all
 	 * changes are protected by socket lock.
 	 */
+	unsigned int		__sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT  16
+#define SK_FL_PROTO_MASK   0x00ff0000
+
+#define SK_FL_TYPE_SHIFT   0
+#define SK_FL_TYPE_MASK    0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT  8
+#define SK_FL_PROTO_MASK   0x0000ff00
+
+#define SK_FL_TYPE_SHIFT   16
+#define SK_FL_TYPE_MASK    0xffff0000
+#endif
+
 	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 2,
 				sk_no_check_tx : 1,
-- 
cgit v1.2.3


From 4f7df337fe79bba1e4c2d525525d63b5ba186bbd Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 2 Dec 2016 03:59:06 +0300
Subject: netlink: 2-clause nla_ok()

nla_ok() consists of 3 clauses:

	1) int rem >= (int)sizeof(struct nlattr)

	2) u16 nla_len >= sizeof(struct nlattr)

	3) u16 nla_len <= int rem

The statement is that clause (1) is redundant.

What it does is ensuring that "rem" is a positive number,
so that in clause (3) positive number will be compared to positive number
with no problems.

However, "u16" fully fits into "int" and integers do not change value
when upcasting even to signed type. Negative integers will be rejected
by clause (3) just fine. Small positive integers will be rejected
by transitivity of comparison operator.

NOTE: all of the above DOES NOT apply to nlmsg_ok() where ->nlmsg_len is
u32(!), so 3 clauses AND A CAST TO INT are necessary.

Obligatory space savings report: -1.6 KB

	$ ./scripts/bloat-o-meter ../vmlinux-000* ../vmlinux-001*
	add/remove: 0/0 grow/shrink: 3/63 up/down: 35/-1692 (-1657)
	function                                     old     new   delta
	validate_scan_freqs                          142     155     +13
	tcf_em_tree_validate                         867     879     +12
	dcbnl_ieee_del                               328     338     +10
	netlbl_cipsov4_add_common.isra               218     215      -3
		...
	ovs_nla_put_actions                          888     806     -82
	netlbl_cipsov4_add_std                      1648    1566     -82
	nl80211_parse_sched_scan                    2889    2780    -109
	ip_tun_from_nlattr                          3086    2945    -141

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index d3938f11ae52..dd657a33f8c3 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -698,8 +698,7 @@ static inline int nla_len(const struct nlattr *nla)
  */
 static inline int nla_ok(const struct nlattr *nla, int remaining)
 {
-	return remaining >= (int) sizeof(*nla) &&
-	       nla->nla_len >= sizeof(*nla) &&
+	return nla->nla_len >= sizeof(*nla) &&
 	       nla->nla_len <= remaining;
 }
 
-- 
cgit v1.2.3


From 9bfc7b9969dbb800460e2577f1dea59336269ce4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 2 Dec 2016 04:12:58 +0300
Subject: netns: add dummy struct inside "struct net_generic"

This is precursor to fixing "[id - 1]" bloat inside net_generic().

Name "s" is chosen to complement name "u" often used for dummy unions.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/generic.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index d315786bcfd7..65ccce69b040 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -25,8 +25,10 @@
  */
 
 struct net_generic {
-	unsigned int len;
-	struct rcu_head rcu;
+	struct {
+		unsigned int len;
+		struct rcu_head rcu;
+	} s;
 
 	void *ptr[0];
 };
-- 
cgit v1.2.3


From 6af2d5fff2fdcd481cb9a4f354a0880142b17c60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 2 Dec 2016 04:21:32 +0300
Subject: netns: fix net_generic() "id - 1" bloat

net_generic() function is both a) inline and b) used ~600 times.

It has the following code inside

		...
	ptr = ng->ptr[id - 1];
		...

"id" is never compile time constant so compiler is forced to subtract 1.
And those decrements or LEA [r32 - 1] instructions add up.

We also start id'ing from 1 to catch bugs where pernet sybsystem id
is not initialized and 0. This is quite pointless idea (nothing will
work or immediate interference with first registered subsystem) in
general but it hints what needs to be done for code size reduction.

Namely, overlaying allocation of pointer array and fixed part of
structure in the beginning and using usual base-0 addressing.

Ids are just cookies, their exact values do not matter, so lets start
with 3 on x86_64.

Code size savings (oh boy): -4.2 KB

As usual, ignore the initial compiler stupidity part of the table.

	add/remove: 0/0 grow/shrink: 12/670 up/down: 89/-4297 (-4208)
	function                                     old     new   delta
	tipc_nametbl_insert_publ                    1250    1270     +20
	nlmclnt_lookup_host                          686     703     +17
	nfsd4_encode_fattr                          5930    5941     +11
	nfs_get_client                              1050    1061     +11
	register_pernet_operations                   333     342      +9
	tcf_mirred_init                              843     849      +6
	tcf_bpf_init                                1143    1149      +6
	gss_setup_upcall                             990     994      +4
	idmap_name_to_id                             432     434      +2
	ops_init                                     274     275      +1
	nfsd_inject_forget_client                    259     260      +1
	nfs4_alloc_client                            612     613      +1
	tunnel_key_walker                            164     163      -1

		...

	tipc_bcbase_select_primary                   392     360     -32
	mac80211_hwsim_new_radio                    2808    2767     -41
	ipip6_tunnel_ioctl                          2228    2186     -42
	tipc_bcast_rcv                               715     672     -43
	tipc_link_build_proto_msg                   1140    1089     -51
	nfsd4_lock                                  3851    3796     -55
	tipc_mon_rcv                                1012     956     -56
	Total: Before=156643951, After=156639743, chg -0.00%

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/generic.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 65ccce69b040..f15daaa89385 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -25,12 +25,14 @@
  */
 
 struct net_generic {
-	struct {
-		unsigned int len;
-		struct rcu_head rcu;
-	} s;
-
-	void *ptr[0];
+	union {
+		struct {
+			unsigned int len;
+			struct rcu_head rcu;
+		} s;
+
+		void *ptr[0];
+	};
 };
 
 static inline void *net_generic(const struct net *net, unsigned int id)
@@ -40,7 +42,7 @@ static inline void *net_generic(const struct net *net, unsigned int id)
 
 	rcu_read_lock();
 	ng = rcu_dereference(net->gen);
-	ptr = ng->ptr[id - 1];
+	ptr = ng->ptr[id];
 	rcu_read_unlock();
 
 	return ptr;
-- 
cgit v1.2.3


From 1c677b3d2828a84e0360e1a07d1204531540dc03 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sat, 3 Dec 2016 16:44:59 +0100
Subject: ipv4: fib: Add fib_info_hold() helper

As explained in the previous commit, modules are going to need to take a
reference on fib info and then drop it using fib_info_put().

Add the fib_info_hold() helper to make the code more readable and also
symmetric with fib_info_put().

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Suggested-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f390c3bb05c5..6c67b9391fc0 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -397,6 +397,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
 
 void free_fib_info(struct fib_info *fi);
 
+static inline void fib_info_hold(struct fib_info *fi)
+{
+	atomic_inc(&fi->fib_clntref);
+}
+
 static inline void fib_info_put(struct fib_info *fi)
 {
 	if (atomic_dec_and_test(&fi->fib_clntref))
-- 
cgit v1.2.3


From cacaad11f43aefbbe5fca00af3b9c16e6aee1ba4 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sat, 3 Dec 2016 16:45:06 +0100
Subject: ipv4: fib: Allow for consistent FIB dumping

The next patch will enable listeners of the FIB notification chain to
request a dump of the FIB tables. However, since RTNL isn't taken during
the dump, it's possible for the FIB tables to change mid-dump, which
will result in inconsistency between the listener's table and the
kernel's.

Allow listeners to know about changes that occurred mid-dump, by adding
a change sequence counter to each net namespace. The counter is
incremented just before a notification is sent in the FIB chain.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netns/ipv4.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7adf4386ac8f..f0cf5a1b777e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,9 @@ struct netns_ipv4 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int sysctl_fib_multipath_use_neigh;
 #endif
+
+	unsigned int	fib_seq;	/* protected by rtnl_mutex */
+
 	atomic_t	rt_genid;
 };
 #endif
-- 
cgit v1.2.3


From c3852ef7f2f8f75a9f85a864bec1f6f5a3068eea Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Sat, 3 Dec 2016 16:45:07 +0100
Subject: ipv4: fib: Replay events when registering FIB notifier

Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
introduced a new notification chain to notify listeners (f.e., switchdev
drivers) about addition and deletion of routes.

However, upon registration to the chain the FIB tables can already be
populated, which means potential listeners will have an incomplete view
of the tables.

Solve that by dumping the FIB tables and replaying the events to the
passed notification block. The dump itself is done using RCU in order
not to starve consumers that need RTNL to make progress.

The integrity of the dump is ensured by reading the FIB change sequence
counter before and after the dump under RTNL. This allows us to avoid
the problematic situation in which the dumping process sends a ENTRY_ADD
notification following ENTRY_DEL generated by another process holding
RTNL.

Callers of the registration function may pass a callback that is
executed in case the dump was inconsistent with current FIB tables.

The number of retries until a consistent dump is achieved is set to a
fixed number to prevent callers from looping for long periods of time.
In case current limit proves to be problematic in the future, it can be
easily converted to be configurable using a sysctl.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_fib.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 6c67b9391fc0..5f376af377c7 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -221,7 +221,8 @@ enum fib_event_type {
 	FIB_EVENT_RULE_DEL,
 };
 
-int register_fib_notifier(struct notifier_block *nb);
+int register_fib_notifier(struct notifier_block *nb,
+			  void (*cb)(struct notifier_block *nb));
 int unregister_fib_notifier(struct notifier_block *nb);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info);
-- 
cgit v1.2.3


From adc176c5472214971d77c1a61c83db9b01e9cdc7 Mon Sep 17 00:00:00 2001
From: Erik Nordmark <nordmark@arista.com>
Date: Fri, 2 Dec 2016 14:00:08 -0800
Subject: ipv6 addrconf: Implemented enhanced DAD (RFC7527)

Implemented RFC7527 Enhanced DAD.
IPv6 duplicate address detection can fail if there is some temporary
loopback of Ethernet frames. RFC7527 solves this by including a random
nonce in the NS messages used for DAD, and if an NS is received with the
same nonce it is assumed to be a looped back DAD probe and is ignored.
RFC7527 is enabled by default. Can be disabled by setting both of
conf/{all,interface}/enhanced_dad to zero.

Signed-off-by: Erik Nordmark <nordmark@arista.com>
Signed-off-by: Bob Gilligan <gilligan@arista.com>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/if_inet6.h | 1 +
 include/net/ndisc.h    | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index b0576cb2ab25..0fa4c324b713 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -55,6 +55,7 @@ struct inet6_ifaddr {
 	__u8			stable_privacy_retry;
 
 	__u16			scope;
+	__u64			dad_nonce;
 
 	unsigned long		cstamp;	/* created timestamp */
 	unsigned long		tstamp; /* updated timestamp */
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index be1fe2283254..d562a2fe4860 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -31,6 +31,7 @@ enum {
 	ND_OPT_PREFIX_INFO = 3,		/* RFC2461 */
 	ND_OPT_REDIRECT_HDR = 4,	/* RFC2461 */
 	ND_OPT_MTU = 5,			/* RFC2461 */
+	ND_OPT_NONCE = 14,              /* RFC7527 */
 	__ND_OPT_ARRAY_MAX,
 	ND_OPT_ROUTE_INFO = 24,		/* RFC4191 */
 	ND_OPT_RDNSS = 25,		/* RFC5006 */
@@ -121,6 +122,7 @@ struct ndisc_options {
 #define nd_opts_pi_end			nd_opt_array[__ND_OPT_PREFIX_INFO_END]
 #define nd_opts_rh			nd_opt_array[ND_OPT_REDIRECT_HDR]
 #define nd_opts_mtu			nd_opt_array[ND_OPT_MTU]
+#define nd_opts_nonce			nd_opt_array[ND_OPT_NONCE]
 #define nd_802154_opts_src_lladdr	nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
 #define nd_802154_opts_tgt_lladdr	nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]
 
@@ -398,7 +400,8 @@ void ndisc_cleanup(void);
 int ndisc_rcv(struct sk_buff *skb);
 
 void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
-		   const struct in6_addr *daddr, const struct in6_addr *saddr);
+		   const struct in6_addr *daddr, const struct in6_addr *saddr,
+		   u64 nonce);
 
 void ndisc_send_rs(struct net_device *dev,
 		   const struct in6_addr *saddr, const struct in6_addr *daddr);
-- 
cgit v1.2.3


From 0c4e966eafff8253bec545d8c27b9efa231c1f62 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 20 Oct 2016 18:33:01 +0200
Subject: netfilter: built-in NAT support for DCCP

CONFIG_NF_NAT_PROTO_DCCP is no more a tristate. When set to y, NAT
support for DCCP protocol is built-in into nf_nat.ko.

footprint test:

(nf_nat_proto_)           | dccp   || nf_nat
--------------------------+--------++--------
no builtin                | 409800 || 2241312
DCCP builtin              |   -    || 2578968

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l4proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 12f4cc841b6e..92b147be00ef 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -54,6 +54,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_udp;
 extern const struct nf_nat_l4proto nf_nat_l4proto_icmp;
 extern const struct nf_nat_l4proto nf_nat_l4proto_icmpv6;
 extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 			     enum nf_nat_manip_type maniptype,
-- 
cgit v1.2.3


From 7a2dd28c703408ef27d6fe6a4fcd7c58968ce3bf Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 20 Oct 2016 18:33:02 +0200
Subject: netfilter: built-in NAT support for SCTP

CONFIG_NF_NAT_PROTO_SCTP is no more a tristate. When set to y, NAT
support for SCTP protocol is built-in into nf_nat.ko.

footprint test:

(nf_nat_proto_)           | sctp   || nf_nat
--------------------------+--------++--------
no builtin                | 428344 || 2241312
SCTP builtin              |   -    || 2597032

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l4proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 92b147be00ef..2cbaf3856e21 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -57,6 +57,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
 #ifdef CONFIG_NF_NAT_PROTO_DCCP
 extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+extern const struct nf_nat_l4proto nf_nat_l4proto_sctp;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 			     enum nf_nat_manip_type maniptype,
-- 
cgit v1.2.3


From b8ad652f9779976d0300ae199961e413859d5378 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 20 Oct 2016 18:33:03 +0200
Subject: netfilter: built-in NAT support for UDPlite

CONFIG_NF_NAT_PROTO_UDPLITE is no more a tristate. When set to y, NAT
support for UDPlite protocol is built-in into nf_nat.ko.

footprint test:

(nf_nat_proto_)           |udplite || nf_nat
--------------------------+--------++--------
no builtin                | 408048 || 2241312
UDPLITE builtin           |   -    || 2577256

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_nat_l4proto.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h
index 2cbaf3856e21..3923150f2a1e 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -60,6 +60,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
 #ifdef CONFIG_NF_NAT_PROTO_SCTP
 extern const struct nf_nat_l4proto nf_nat_l4proto_sctp;
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+extern const struct nf_nat_l4proto nf_nat_l4proto_udplite;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 			     enum nf_nat_manip_type maniptype,
-- 
cgit v1.2.3


From 673ab46f345557e9d741e97ca0301280360d1af1 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Mon, 14 Nov 2016 22:39:25 +0800
Subject: netfilter: nf_log: do not assume ethernet header in netdev family

In netdev family, we will handle non ethernet packets, so using
eth_hdr(skb)->h_proto is incorrect.

Meanwhile, we can use socket(AF_PACKET...) to sending packets, so
skb->protocol is not always set in bridge family.

Add an extra parameter into nf_log_l2packet to solve this issue.

Fixes: 1fddf4bad0ac ("netfilter: nf_log: add packet logging for netdev family")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index a559aa41253c..450f87f95415 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -109,7 +109,9 @@ void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
 			       const struct net_device *out,
 			       const struct nf_loginfo *loginfo,
 			       const char *prefix);
-void nf_log_l2packet(struct net *net, u_int8_t pf, unsigned int hooknum,
+void nf_log_l2packet(struct net *net, u_int8_t pf,
+		     __be16 protocol,
+		     unsigned int hooknum,
 		     const struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
-- 
cgit v1.2.3


From c51d39010a1bccc9c1294e2d7c00005aefeb2b5c Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:25 +0100
Subject: netfilter: conntrack: built-in support for DCCP

CONFIG_NF_CT_PROTO_DCCP is no more a tristate. When set to y, connection
tracking support for DCCP protocol is built-in into nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_dccp,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)||  dccp  |  ipv4  |  ipv6  | nf_conntrack
---------++--------+--------+--------+--------------
none     || 469140 | 828755 | 828676 | 6141434
DCCP     ||   -    | 830566 | 829935 | 6533526

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +++
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +++
 include/net/netns/conntrack.h                  | 14 ++++++++++++++
 3 files changed, 20 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 981c327374da..c2f155fd9299 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -15,6 +15,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index a4c993685795..5ec66c0d21c4 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -6,6 +6,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6;
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 3d06d94d2e52..440b781baf0b 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -6,6 +6,9 @@
 #include <linux/atomic.h>
 #include <linux/workqueue.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+#include <linux/netfilter/nf_conntrack_dccp.h>
+#endif
 #include <linux/seqlock.h>
 
 struct ctl_table_header;
@@ -48,12 +51,23 @@ struct nf_icmp_net {
 	unsigned int timeout;
 };
 
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+struct nf_dccp_net {
+	struct nf_proto_net pn;
+	int dccp_loose;
+	unsigned int dccp_timeout[CT_DCCP_MAX + 1];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
 	struct nf_udp_net	udp;
 	struct nf_icmp_net	icmp;
 	struct nf_icmp_net	icmpv6;
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	struct nf_dccp_net	dccp;
+#endif
 };
 
 struct ct_pcpu {
-- 
cgit v1.2.3


From a85406afeb3e045b001b2aac5b4f89f4266fede3 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:26 +0100
Subject: netfilter: conntrack: built-in support for SCTP

CONFIG_NF_CT_PROTO_SCTP is no more a tristate. When set to y, connection
tracking support for SCTP protocol is built-in into nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_sctp,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)||  sctp  |  ipv4  |  ipv6  | nf_conntrack
---------++--------+--------+--------+--------------
none     || 498243 | 828755 | 828676 | 6141434
SCTP     ||   -    | 829254 | 829175 | 6547872

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +++
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +++
 include/net/netns/conntrack.h                  | 13 +++++++++++++
 3 files changed, 19 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index c2f155fd9299..5f1fc15a51fb 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -18,6 +18,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index 5ec66c0d21c4..f70d191a8820 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -9,6 +9,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 440b781baf0b..17724c62de97 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -9,6 +9,9 @@
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 #include <linux/netfilter/nf_conntrack_dccp.h>
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+#include <linux/netfilter/nf_conntrack_sctp.h>
+#endif
 #include <linux/seqlock.h>
 
 struct ctl_table_header;
@@ -59,6 +62,13 @@ struct nf_dccp_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+struct nf_sctp_net {
+	struct nf_proto_net pn;
+	unsigned int timeouts[SCTP_CONNTRACK_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -68,6 +78,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_DCCP
 	struct nf_dccp_net	dccp;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+	struct nf_sctp_net	sctp;
+#endif
 };
 
 struct ct_pcpu {
-- 
cgit v1.2.3


From 9b91c96c5d1f9da79438292f8c82f65cbf078645 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Tue, 15 Nov 2016 15:08:27 +0100
Subject: netfilter: conntrack: built-in support for UDPlite

CONFIG_NF_CT_PROTO_UDPLITE is no more a tristate. When set to y,
connection tracking support for UDPlite protocol is built-in into
nf_conntrack.ko.

footprint test:
$ ls -l net/netfilter/nf_conntrack{_proto_udplite,}.ko \
        net/ipv4/netfilter/nf_conntrack_ipv4.ko \
        net/ipv6/netfilter/nf_conntrack_ipv6.ko

(builtin)|| udplite|  ipv4  |  ipv6  |nf_conntrack
---------++--------+--------+--------+--------------
none     || 432538 | 828755 | 828676 | 6141434
UDPlite  ||   -    | 829649 | 829362 | 6498204

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_conntrack_ipv4.h |  3 +++
 include/net/netfilter/ipv6/nf_conntrack_ipv6.h |  3 +++
 include/net/netns/conntrack.h                  | 16 ++++++++++++++++
 3 files changed, 22 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 5f1fc15a51fb..919e4e8af327 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -21,6 +21,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4;
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4;
+#endif
 
 int nf_conntrack_ipv4_compat_init(void);
 void nf_conntrack_ipv4_compat_fini(void);
diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
index f70d191a8820..eaea968f8657 100644
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -12,6 +12,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6;
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6;
+#endif
 
 #include <linux/sysctl.h>
 extern struct ctl_table nf_ct_ipv6_sysctl_table[];
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 17724c62de97..cf799fc3fdec 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -69,6 +69,19 @@ struct nf_sctp_net {
 };
 #endif
 
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+enum udplite_conntrack {
+	UDPLITE_CT_UNREPLIED,
+	UDPLITE_CT_REPLIED,
+	UDPLITE_CT_MAX
+};
+
+struct nf_udplite_net {
+	struct nf_proto_net pn;
+	unsigned int timeouts[UDPLITE_CT_MAX];
+};
+#endif
+
 struct nf_ip_net {
 	struct nf_generic_net   generic;
 	struct nf_tcp_net	tcp;
@@ -81,6 +94,9 @@ struct nf_ip_net {
 #ifdef CONFIG_NF_CT_PROTO_SCTP
 	struct nf_sctp_net	sctp;
 #endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+	struct nf_udplite_net	udplite;
+#endif
 };
 
 struct ct_pcpu {
-- 
cgit v1.2.3


From a379854d91b2cb0af07b0f62845449f4dacbd673 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:39 +0100
Subject: netfilter: conntrack: remove unused init_net hook

since adf0516845bcd0 ("netfilter: remove ip_conntrack* sysctl compat code")
the only user (ipv4 tracker) sets this to an empty stub function.

After this change nf_ct_l3proto_pernet_register() is also empty,
but this will change in a followup patch to add conditional register
of the hooks.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index 8992e4229da9..cf8f3dfd810d 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -63,9 +63,6 @@ struct nf_conntrack_l3proto {
 
 	size_t nla_size;
 
-	/* Init l3proto pernet data */
-	int (*init_net)(struct net *net);
-
 	/* Module (if any) which this is connected to. */
 	struct module *me;
 };
-- 
cgit v1.2.3


From ecb2421b5ddf48e6e116fced7f74c985bb546138 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:40 +0100
Subject: netfilter: add and use nf_ct_netns_get/put

currently aliased to try_module_get/_put.
Will be changed in next patch when we add functions to make use of ->net
argument to store usercount per l3proto tracker.

This is needed to avoid registering the conntrack hooks in all netns and
later only enable connection tracking in those that need conntrack.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index d9d52c020a70..5916aa9ab3f0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -181,6 +181,10 @@ static inline void nf_ct_put(struct nf_conn *ct)
 int nf_ct_l3proto_try_module_get(unsigned short l3proto);
 void nf_ct_l3proto_module_put(unsigned short l3proto);
 
+/* load module; enable/disable conntrack in this namespace */
+int nf_ct_netns_get(struct net *net, u8 nfproto);
+void nf_ct_netns_put(struct net *net, u8 nfproto);
+
 /*
  * Allocate a hashtable of hlist_head (if nulls == 0),
  * or hlist_nulls_head (if nulls == 1)
-- 
cgit v1.2.3


From 0c66dc1ea3f0366221f8a5a16c73f01ea9259678 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:43 +0100
Subject: netfilter: conntrack: register hooks in netns when needed by ruleset

This makes use of nf_ct_netns_get/put added in previous patch.
We add get/put functions to nf_conntrack_l3proto structure, ipv4 and ipv6
then implement use-count to track how many users (nft or xtables modules)
have a dependency on ipv4 and/or ipv6 connection tracking functionality.

When count reaches zero, the hooks are unregistered.

This delays activation of connection tracking inside a namespace until
stateful firewall rule or nat rule gets added.

This patch breaks backwards compatibility in the sense that connection
tracking won't be active anymore when the protocol tracker module is
loaded.  This breaks e.g. setups that ctnetlink for flow accounting and
the like, without any '-m conntrack' packet filter rules.

Followup patch restores old behavour and makes new delayed scheme
optional via sysctl.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index cf8f3dfd810d..e7dcd72be21c 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -52,6 +52,10 @@ struct nf_conntrack_l3proto {
 	int (*tuple_to_nlattr)(struct sk_buff *skb,
 			       const struct nf_conntrack_tuple *t);
 
+	/* Called when netns wants to use connection tracking */
+	int (*net_ns_get)(struct net *);
+	void (*net_ns_put)(struct net *);
+
 	/*
 	 * Calculate size of tuple nlattr
 	 */
-- 
cgit v1.2.3


From 481fa3734769b67f00ed09a42f2a6a8cbd00b869 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:44 +0100
Subject: netfilter: conntrack: add nf_conntrack_default_on sysctl

This switch (default on) can be used to disable automatic registration
of connection tracking functionality in newly created network
namespaces.

This means that when net namespace goes down (or the tracker protocol
module is unloaded) we *might* have to unregister the hooks.

We can either add another per-netns variable that tells if
the hooks got registered by default, or, alternatively, just call
the protocol _put() function and have the callee deal with a possible
'extra' put() operation that doesn't pair with a get() one.

This uses the latter approach, i.e. a put() without a get has no effect.

Conntrack is still enabled automatically regardless of the new sysctl
setting if the new net namespace requires connection tracking, e.g. when
NAT rules are created.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index e7dcd72be21c..e01559b4d781 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -73,9 +73,18 @@ struct nf_conntrack_l3proto {
 
 extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX];
 
+#ifdef CONFIG_SYSCTL
 /* Protocol pernet registration. */
 int nf_ct_l3proto_pernet_register(struct net *net,
 				  struct nf_conntrack_l3proto *proto);
+#else
+static inline int nf_ct_l3proto_pernet_register(struct net *n,
+						struct nf_conntrack_l3proto *p)
+{
+	return 0;
+}
+#endif
+
 void nf_ct_l3proto_pernet_unregister(struct net *net,
 				     struct nf_conntrack_l3proto *proto);
 
-- 
cgit v1.2.3


From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 3 Dec 2016 11:14:56 -0800
Subject: net: reorganize struct sock for better data locality

Group fields used in TX path, and keep some cache lines mostly read
to permit sharing among cpus.

Gained two 4 bytes holes on 64bit arches.

Added a place holder for tcp tsq_flags, next to sk_wmem_alloc
to speed up tcp_wfree() in the following patch.

I have not added ____cacheline_aligned_in_smp, this might be done later.
I prefer doing this once inet and tcp/udp sockets reorg is also done.

Tested with both TCP and UDP.

UDP receiver performance under flood increased by ~20 % :
Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops
was moved away from a critical cache line, now mostly read and shared.

	/* --- cacheline 4 boundary (256 bytes) --- */
	unsigned int               sk_napi_id;           /* 0x100   0x4 */
	int                        sk_rcvbuf;            /* 0x104   0x4 */
	struct sk_filter *         sk_filter;            /* 0x108   0x8 */
	union {
		struct socket_wq * sk_wq;                /*         0x8 */
		struct socket_wq * sk_wq_raw;            /*         0x8 */
	};                                               /* 0x110   0x8 */
	struct xfrm_policy *       sk_policy[2];         /* 0x118  0x10 */
	struct dst_entry *         sk_rx_dst;            /* 0x128   0x8 */
	struct dst_entry *         sk_dst_cache;         /* 0x130   0x8 */
	atomic_t                   sk_omem_alloc;        /* 0x138   0x4 */
	int                        sk_sndbuf;            /* 0x13c   0x4 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	int                        sk_wmem_queued;       /* 0x140   0x4 */
	atomic_t                   sk_wmem_alloc;        /* 0x144   0x4 */
	long unsigned int          sk_tsq_flags;         /* 0x148   0x8 */
	struct sk_buff *           sk_send_head;         /* 0x150   0x8 */
	struct sk_buff_head        sk_write_queue;       /* 0x158  0x18 */
	__s32                      sk_peek_off;          /* 0x170   0x4 */
	int                        sk_write_pending;     /* 0x174   0x4 */
	long int                   sk_sndtimeo;          /* 0x178   0x8 */

Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 69afda6bea15..6dfe3aa22b97 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -343,6 +343,9 @@ struct sock {
 #define sk_rxhash		__sk_common.skc_rxhash
 
 	socket_lock_t		sk_lock;
+	atomic_t		sk_drops;
+	int			sk_rcvlowat;
+	struct sk_buff_head	sk_error_queue;
 	struct sk_buff_head	sk_receive_queue;
 	/*
 	 * The backlog queue is special, it is always used with
@@ -359,14 +362,13 @@ struct sock {
 		struct sk_buff	*tail;
 	} sk_backlog;
 #define sk_rmem_alloc sk_backlog.rmem_alloc
-	int			sk_forward_alloc;
 
-	__u32			sk_txhash;
+	int			sk_forward_alloc;
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int		sk_napi_id;
 	unsigned int		sk_ll_usec;
+	/* ===== mostly read cache line ===== */
+	unsigned int		sk_napi_id;
 #endif
-	atomic_t		sk_drops;
 	int			sk_rcvbuf;
 
 	struct sk_filter __rcu	*sk_filter;
@@ -379,11 +381,30 @@ struct sock {
 #endif
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
-	/* Note: 32bit hole on 64bit arches */
-	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
+
+	/* ===== cache line for TX ===== */
+	int			sk_wmem_queued;
+	atomic_t		sk_wmem_alloc;
+	unsigned long		sk_tsq_flags;
+	struct sk_buff		*sk_send_head;
 	struct sk_buff_head	sk_write_queue;
+	__s32			sk_peek_off;
+	int			sk_write_pending;
+	long			sk_sndtimeo;
+	struct timer_list	sk_timer;
+	__u32			sk_priority;
+	__u32			sk_mark;
+	u32			sk_pacing_rate; /* bytes per second */
+	u32			sk_max_pacing_rate;
+	struct page_frag	sk_frag;
+	netdev_features_t	sk_route_caps;
+	netdev_features_t	sk_route_nocaps;
+	int			sk_gso_type;
+	unsigned int		sk_gso_max_size;
+	gfp_t			sk_allocation;
+	__u32			sk_txhash;
 
 	/*
 	 * Because of non atomicity rules, all
@@ -414,42 +435,24 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
 	kmemcheck_bitfield_end(flags);
 
-	int			sk_wmem_queued;
-	gfp_t			sk_allocation;
-	u32			sk_pacing_rate; /* bytes per second */
-	u32			sk_max_pacing_rate;
-	netdev_features_t	sk_route_caps;
-	netdev_features_t	sk_route_nocaps;
-	int			sk_gso_type;
-	unsigned int		sk_gso_max_size;
 	u16			sk_gso_max_segs;
-	int			sk_rcvlowat;
 	unsigned long	        sk_lingertime;
-	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
 				sk_err_soft;
 	u32			sk_ack_backlog;
 	u32			sk_max_ack_backlog;
-	__u32			sk_priority;
-	__u32			sk_mark;
 	kuid_t			sk_uid;
 	struct pid		*sk_peer_pid;
 	const struct cred	*sk_peer_cred;
 	long			sk_rcvtimeo;
-	long			sk_sndtimeo;
-	struct timer_list	sk_timer;
 	ktime_t			sk_stamp;
 	u16			sk_tsflags;
 	u8			sk_shutdown;
 	u32			sk_tskey;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
-	struct page_frag	sk_frag;
-	struct sk_buff		*sk_send_head;
-	__s32			sk_peek_off;
-	int			sk_write_pending;
 #ifdef CONFIG_SECURITY
 	void			*sk_security;
 #endif
-- 
cgit v1.2.3


From 1c0d32fde5bdf1184bc274f864c09799278a1114 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 4 Dec 2016 09:48:16 -0800
Subject: net_sched: gen_estimator: complete rewrite of rate estimators

1) Old code was hard to maintain, due to complex lock chains.
   (We probably will be able to remove some kfree_rcu() in callers)

2) Using a single timer to update all estimators does not scale.

3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
   is not supposed to work well)

In this rewrite :

- I removed the RB tree that had to be scanned in
  gen_estimator_active(). qdisc dumps should be much faster.

- Each estimator has its own timer.

- Estimations are maintained in net_rate_estimator structure,
  instead of dirtying the qdisc. Minor, but part of the simplification.

- Reading the estimator uses RCU and a seqcount to provide proper
  support for 32bit kernels.

- We reduce memory need when estimators are not used, since
  we store a pointer, instead of the bytes/packets counters.

- xt_rateest_mt() no longer has to grab a spinlock.
  (In the future, xt_rateest_tg() could be switched to per cpu counters)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/act_api.h              |  2 +-
 include/net/gen_stats.h            | 17 +++++++++--------
 include/net/netfilter/xt_rateest.h | 10 +++++++---
 include/net/sch_generic.h          |  2 +-
 4 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include/net')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9dddf77a69cc..1d716449209e 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -36,7 +36,7 @@ struct tc_action {
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
 	struct gnet_stats_queue		tcfa_qstats;
-	struct gnet_stats_rate_est64	tcfa_rate_est;
+	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121cc7d9..8b7aa370e7a4 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
 	struct u64_stats_sync syncp;
 };
 
+struct net_rate_estimator;
+
 struct gnet_dump {
 	spinlock_t *      lock;
 	struct sk_buff *  skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
-			     const struct gnet_stats_basic_packed *b,
-			     struct gnet_stats_rate_est64 *r);
+			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
 			  struct gnet_stats_queue __percpu *cpu_q,
 			  struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **ptr,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+			struct gnet_stats_rate_est64 *sample);
 #endif
diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h
index 79f45e19f31e..130e58361f99 100644
--- a/include/net/netfilter/xt_rateest.h
+++ b/include/net/netfilter/xt_rateest.h
@@ -1,19 +1,23 @@
 #ifndef _XT_RATEEST_H
 #define _XT_RATEEST_H
 
+#include <net/gen_stats.h>
+
 struct xt_rateest {
 	/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
 	struct gnet_stats_basic_packed	bstats;
 	spinlock_t			lock;
-	/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
-	struct gnet_stats_rate_est64	rstats;
+
 
 	/* following fields not accessed in hot path */
+	unsigned int			refcnt;
 	struct hlist_node		list;
 	char				name[IFNAMSIZ];
-	unsigned int			refcnt;
 	struct gnet_estimator		params;
 	struct rcu_head			rcu;
+
+	/* keep this field far away to speedup xt_rateest_mt() */
+	struct net_rate_estimator __rcu *rate_est;
 };
 
 struct xt_rateest *xt_rateest_lookup(const char *name);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672..498f81b229a4 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -76,7 +76,7 @@ struct Qdisc {
 
 	struct netdev_queue	*dev_queue;
 
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 
-- 
cgit v1.2.3


From 834184b1f3a4635efbdfdae5fb437f109f6605fa Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 15 Nov 2016 21:36:45 +0100
Subject: netfilter: defrag: only register defrag functionality if needed

nf_defrag modules for ipv4 and ipv6 export an empty stub function.
Any module that needs the defragmentation hooks registered simply 'calls'
this empty function to create a phony module dependency -- modprobe will
then load the defrag module too.

This extends netfilter ipv4/ipv6 defragmentation modules to delay the hook
registration until the functionality is requested within a network namespace
instead of module load time for all namespaces.

Hooks are only un-registered on module unload or when a namespace that used
such defrag functionality exits.

We have to use struct net for this as the register hooks can be called
before netns initialization here from the ipv4/ipv6 conntrack module
init path.

There is no unregister functionality support, defrag will always be
active once it was requested inside a net namespace.

The reason is that defrag has impact on nft and iptables rulesets
(without defrag we might see framents).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/ipv4/nf_defrag_ipv4.h | 3 ++-
 include/net/netfilter/ipv6/nf_defrag_ipv6.h | 3 ++-
 include/net/netns/netfilter.h               | 6 ++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h
index f01ef208dff6..db405f70e538 100644
--- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h
@@ -1,6 +1,7 @@
 #ifndef _NF_DEFRAG_IPV4_H
 #define _NF_DEFRAG_IPV4_H
 
-void nf_defrag_ipv4_enable(void);
+struct net;
+int nf_defrag_ipv4_enable(struct net *);
 
 #endif /* _NF_DEFRAG_IPV4_H */
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index ddf162f7966f..7664efe37974 100644
--- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -1,7 +1,8 @@
 #ifndef _NF_DEFRAG_IPV6_H
 #define _NF_DEFRAG_IPV6_H
 
-void nf_defrag_ipv6_enable(void);
+struct net;
+int nf_defrag_ipv6_enable(struct net *);
 
 int nf_ct_frag6_init(void);
 void nf_ct_frag6_cleanup(void);
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index 58487b1cc99a..cea396b53a60 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -17,5 +17,11 @@ struct netns_nf {
 	struct ctl_table_header *nf_log_dir_header;
 #endif
 	struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS];
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+	bool			defrag_ipv4;
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	bool			defrag_ipv6;
+#endif
 };
 #endif
-- 
cgit v1.2.3


From 1814096980bbe546c4384b7b064126cbe7d40d30 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 24 Nov 2016 12:04:55 +0100
Subject: netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader
 fields

This patch adds a new flag that signals the kernel to update layer 4
checksum if the packet field belongs to the layer 4 pseudoheader. This
implicitly provides stateless NAT 1:1 that is useful under very specific
usecases.

Since rules mangling layer 3 fields that are part of the pseudoheader
may potentially convey any layer 4 packet, we have to deal with the
layer 4 checksum adjustment using protocol specific code.

This patch adds support for TCP, UDP and ICMPv6, since they include the
pseudoheader in the layer 4 checksum calculation. ICMP doesn't, so we
can skip it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables_core.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 862373d4ea9d..8f690effec37 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -45,6 +45,7 @@ struct nft_payload_set {
 	enum nft_registers	sreg:8;
 	u8			csum_type;
 	u8			csum_offset;
+	u8			csum_flags;
 };
 
 extern const struct nft_expr_ops nft_payload_fast_ops;
-- 
cgit v1.2.3


From 3bf3276119455bd0fc7a7e31be2823118e613842 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 28 Nov 2016 11:40:06 +0100
Subject: netfilter: add and use nf_fwd_netdev_egress

... so we can use current skb instead of working with a clone.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_dup_netdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
index 397dcae349f9..3e919356bedf 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -2,5 +2,6 @@
 #define _NF_DUP_NETDEV_H_
 
 void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif);
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
 
 #endif
-- 
cgit v1.2.3


From e50092404c1bc7aaeb0a0f4077fa6f07b073a20f Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:04:32 +0100
Subject: netfilter: nf_tables: add stateful objects

This patch augments nf_tables to support stateful objects. This new
infrastructure allows you to create, dump and delete stateful objects,
that are identified by a user-defined name.

This patch adds the generic infrastructure, follow up patches add
support for two stateful objects: counters and quotas.

This patch provides a native infrastructure for nf_tables to replace
nfacct, the extended accounting infrastructure for iptables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 79 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 32970cba184a..903cd618f50e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -875,6 +875,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@list: used internally
  *	@chains: chains in the table
  *	@sets: sets in the table
+ *	@objects: stateful objects in the table
  *	@hgenerator: handle generator state
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
@@ -885,6 +886,7 @@ struct nft_table {
 	struct list_head		list;
 	struct list_head		chains;
 	struct list_head		sets;
+	struct list_head		objects;
 	u64				hgenerator;
 	u32				use;
 	u16				flags:14,
@@ -934,6 +936,73 @@ void nft_unregister_expr(struct nft_expr_type *);
 int nft_verdict_dump(struct sk_buff *skb, int type,
 		     const struct nft_verdict *v);
 
+/**
+ *	struct nft_object - nf_tables stateful object
+ *
+ *	@list: table stateful object list node
+ *	@type: pointer to object type
+ *	@data: pointer to object data
+ *	@name: name of this stateful object
+ *	@genmask: generation mask
+ *	@use: number of references to this stateful object
+ * 	@data: object data, layout depends on type
+ */
+struct nft_object {
+	struct list_head		list;
+	char				name[NFT_OBJ_MAXNAMELEN];
+	u32				genmask:2,
+					use:30;
+	/* runtime data below here */
+	const struct nft_object_type	*type ____cacheline_aligned;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(u64))));
+};
+
+static inline void *nft_obj_data(const struct nft_object *obj)
+{
+	return (void *)obj->data;
+}
+
+#define nft_expr_obj(expr)	*((struct nft_object **)nft_expr_priv(expr))
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+					const struct nlattr *nla, u32 objtype,
+					u8 genmask);
+
+/**
+ *	struct nft_object_type - stateful object type
+ *
+ *	@eval: stateful object evaluation function
+ *	@list: list node in list of object types
+ *	@type: stateful object numeric type
+ *	@size: stateful object size
+ *	@owner: module owner
+ *	@maxattr: maximum netlink attribute
+ *	@policy: netlink attribute policy
+ *	@init: initialize object from netlink attributes
+ *	@destroy: release existing stateful object
+ *	@dump: netlink dump stateful object
+ */
+struct nft_object_type {
+	void				(*eval)(struct nft_object *obj,
+						struct nft_regs *regs,
+						const struct nft_pktinfo *pkt);
+	struct list_head		list;
+	u32				type;
+	unsigned int			size;
+	unsigned int			maxattr;
+	struct module			*owner;
+	const struct nla_policy		*policy;
+	int				(*init)(const struct nlattr * const tb[],
+						struct nft_object *obj);
+	void				(*destroy)(struct nft_object *obj);
+	int				(*dump)(struct sk_buff *skb,
+						const struct nft_object *obj);
+};
+
+int nft_register_obj(struct nft_object_type *obj_type);
+void nft_unregister_obj(struct nft_object_type *obj_type);
+
 /**
  *	struct nft_traceinfo - nft tracing information and state
  *
@@ -981,6 +1050,9 @@ void nft_trace_notify(struct nft_traceinfo *info);
 #define MODULE_ALIAS_NFT_SET() \
 	MODULE_ALIAS("nft-set")
 
+#define MODULE_ALIAS_NFT_OBJ(type) \
+	MODULE_ALIAS("nft-obj-" __stringify(type))
+
 /*
  * The gencursor defines two generations, the currently active and the
  * next one. Objects contain a bitmask of 2 bits specifying the generations
@@ -1157,4 +1229,11 @@ struct nft_trans_elem {
 #define nft_trans_elem(trans)	\
 	(((struct nft_trans_elem *)trans->data)->elem)
 
+struct nft_trans_obj {
+	struct nft_object		*obj;
+};
+
+#define nft_trans_obj(trans)	\
+	(((struct nft_trans_obj *)trans->data)->obj)
+
 #endif /* _NET_NF_TABLES_H */
-- 
cgit v1.2.3


From 43da04a593d8b2626f1cf4b56efe9402f6b53652 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:44 +0100
Subject: netfilter: nf_tables: atomic dump and reset for stateful objects

This patch adds a new NFT_MSG_GETOBJ_RESET command perform an atomic
dump-and-reset of the stateful object. This also comes with add support
for atomic dump and reset for counter and quota objects.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 903cd618f50e..6f7d6a1dc09c 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -997,7 +997,8 @@ struct nft_object_type {
 						struct nft_object *obj);
 	void				(*destroy)(struct nft_object *obj);
 	int				(*dump)(struct sk_buff *skb,
-						const struct nft_object *obj);
+						struct nft_object *obj,
+						bool reset);
 };
 
 int nft_register_obj(struct nft_object_type *obj_type);
-- 
cgit v1.2.3


From 2599e98934c5ad166ad184b3682e38aadcb63fb3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:48 +0100
Subject: netfilter: nf_tables: notify internal updates of stateful objects

Introduce nf_tables_obj_notify() to notify internal state changes in
stateful objects. This is used by the quota object to report depletion
in a follow up patch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 6f7d6a1dc09c..339e374c28b5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -969,6 +969,10 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 					const struct nlattr *nla, u32 objtype,
 					u8 genmask);
 
+int nft_obj_notify(struct net *net, struct nft_table *table,
+		   struct nft_object *obj, u32 portid, u32 seq,
+		   int event, int family, int report, gfp_t gfp);
+
 /**
  *	struct nft_object_type - stateful object type
  *
-- 
cgit v1.2.3


From 1896531710abcd9a961a17d0c5c6a9f537d479b6 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:05:56 +0100
Subject: netfilter: nft_quota: add depleted flag for objects

Notify on depleted quota objects. The NFT_QUOTA_F_DEPLETED flag
indicates we have reached overquota.

Add pointer to table from nft_object, so we can use it when sending the
depletion notification to userspace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 339e374c28b5..ce6fb6e83b32 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -940,6 +940,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  *	struct nft_object - nf_tables stateful object
  *
  *	@list: table stateful object list node
+ *	@table: table this object belongs to
  *	@type: pointer to object type
  *	@data: pointer to object data
  *	@name: name of this stateful object
@@ -950,6 +951,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
 struct nft_object {
 	struct list_head		list;
 	char				name[NFT_OBJ_MAXNAMELEN];
+	struct nft_table		*table;
 	u32				genmask:2,
 					use:30;
 	/* runtime data below here */
-- 
cgit v1.2.3


From 8aeff920dcc9b3f8cf43042a76428582634d9208 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 28 Nov 2016 00:06:00 +0100
Subject: netfilter: nf_tables: add stateful object reference to set elements

This patch allows you to refer to stateful objects from set elements.
This provides the infrastructure to create maps where the right hand
side of the mapping is a stateful object.

This allows us to build dictionaries of stateful objects, that you can
use to perform fast lookups using any arbitrary key combination.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ce6fb6e83b32..85f0f03f1e87 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -326,6 +326,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
  * 	@name: name of the set
  * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
  * 	@dtype: data type (verdict or numeric type defined by userspace)
+ * 	@objtype: object type (see NFT_OBJECT_* definitions)
  * 	@size: maximum set size
  * 	@nelems: number of elements
  * 	@ndeact: number of deactivated elements queued for removal
@@ -347,6 +348,7 @@ struct nft_set {
 	char				name[NFT_SET_MAXNAMELEN];
 	u32				ktype;
 	u32				dtype;
+	u32				objtype;
 	u32				size;
 	atomic_t			nelems;
 	u32				ndeact;
@@ -416,6 +418,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  *	@NFT_SET_EXT_EXPIRATION: element expiration time
  *	@NFT_SET_EXT_USERDATA: user data associated with the element
  *	@NFT_SET_EXT_EXPR: expression assiociated with the element
+ *	@NFT_SET_EXT_OBJREF: stateful object reference associated with element
  *	@NFT_SET_EXT_NUM: number of extension types
  */
 enum nft_set_extensions {
@@ -426,6 +429,7 @@ enum nft_set_extensions {
 	NFT_SET_EXT_EXPIRATION,
 	NFT_SET_EXT_USERDATA,
 	NFT_SET_EXT_EXPR,
+	NFT_SET_EXT_OBJREF,
 	NFT_SET_EXT_NUM
 };
 
@@ -554,6 +558,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
 	return elem + set->ops->elemsize;
 }
 
+static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
+{
+	return nft_set_ext(ext, NFT_SET_EXT_OBJREF);
+}
+
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
 			const u32 *key, const u32 *data,
-- 
cgit v1.2.3


From 8411b6442e59810fe0750a2f321b9dcb7d0a3d17 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 5 Dec 2016 23:35:50 +0100
Subject: netfilter: nf_tables: support for set flushing

This patch adds support for set flushing, that consists of walking over
the set elements if the NFTA_SET_ELEM_LIST_ELEMENTS attribute is set.
This patch requires the following changes:

1) Add set->ops->deactivate_one() operation: This allows us to
   deactivate an element from the set element walk path, given we can
   skip the lookup that happens in ->deactivate().

2) Add a new nft_trans_alloc_gfp() function since we need to allocate
   transactions using GFP_ATOMIC given the set walk path happens with
   held rcu_read_lock.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 85f0f03f1e87..924325c46aab 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -259,7 +259,8 @@ struct nft_expr;
  *	@lookup: look up an element within the set
  *	@insert: insert new element into set
  *	@activate: activate new element in the next generation
- *	@deactivate: deactivate element in the next generation
+ *	@deactivate: lookup for element and deactivate it in the next generation
+ *	@deactivate_one: deactivate element in the next generation
  *	@remove: remove element from set
  *	@walk: iterate over all set elemeennts
  *	@privsize: function to return size of set private data
@@ -294,6 +295,9 @@ struct nft_set_ops {
 	void *				(*deactivate)(const struct net *net,
 						      const struct nft_set *set,
 						      const struct nft_set_elem *elem);
+	bool				(*deactivate_one)(const struct net *net,
+							  const struct nft_set *set,
+							  void *priv);
 	void				(*remove)(const struct nft_set *set,
 						  const struct nft_set_elem *elem);
 	void				(*walk)(const struct nft_ctx *ctx,
-- 
cgit v1.2.3


From 5b8e2f61b9df529ca4af057daf7bfb1de348bdf1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 6 Dec 2016 19:32:50 -0800
Subject: net: sock_rps_record_flow() is for connected sockets

Paolo noticed a cache line miss in UDP recvmsg() to access
sk_rxhash, sharing a cache line with sk_drops.

sk_drops might be heavily incremented by cpus handling a flood targeting
this socket.

We might place sk_drops on a separate cache line, but lets try
to avoid wasting 64 bytes per socket just for this, since we have
other bottlenecks to take care of.

sock_rps_record_flow() should only access sk_rxhash for connected
flows.

Testing sk_state for TCP_ESTABLISHED covers most of the cases for
connected sockets, for a zero cost, since system calls using
sock_rps_record_flow() also access sk->sk_prot which is on the
same cache line.

A follow up patch will provide a static_key (Jump Label) since most
hosts do not even use RFS.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 6dfe3aa22b97..1749e38d0301 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -913,7 +913,17 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	sock_rps_record_flow_hash(sk->sk_rxhash);
+	/* Reading sk->sk_rxhash might incur an expensive cache line miss.
+	 *
+	 * TCP_ESTABLISHED does cover almost all states where RFS
+	 * might be useful, and is cheaper [1] than testing :
+	 *	IPv4: inet_sk(sk)->inet_daddr
+	 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+	 * OR	an additional socket flag
+	 * [1] : sk_state and sk_prot are in the same cache line.
+	 */
+	if (sk->sk_state == TCP_ESTABLISHED)
+		sock_rps_record_flow_hash(sk->sk_rxhash);
 #endif
 }
 
-- 
cgit v1.2.3


From 972d3876faa8a9195122b2d2bcd3155f904fff37 Mon Sep 17 00:00:00 2001
From: Simon Horman <simon.horman@netronome.com>
Date: Wed, 7 Dec 2016 13:48:27 +0100
Subject: flow dissector: ICMP support

Allow dissection of ICMP(V6) type and code. This should only occur
if a packet is ICMP(V6) and the dissector has FLOW_DISSECTOR_KEY_ICMP set.

There are currently no users of FLOW_DISSECTOR_KEY_ICMP.
A follow-up patch will allow FLOW_DISSECTOR_KEY_ICMP to be used by
the flower classifier.

Signed-off-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include/net')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index c4f31666afd2..d896a33e00d4 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -104,6 +104,22 @@ struct flow_dissector_key_ports {
 	};
 };
 
+/**
+ * flow_dissector_key_icmp:
+ *	@ports: type and code of ICMP header
+ *		icmp: ICMP type (high) and code (low)
+ *		type: ICMP type
+ *		code: ICMP code
+ */
+struct flow_dissector_key_icmp {
+	union {
+		__be16 icmp;
+		struct {
+			u8 type;
+			u8 code;
+		};
+	};
+};
 
 /**
  * struct flow_dissector_key_eth_addrs:
@@ -122,6 +138,7 @@ enum flow_dissector_key_id {
 	FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
 	FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
 	FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
+	FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
 	FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
 	FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */
 	FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
-- 
cgit v1.2.3


From 13bfff25c081f4e060af761c4082b5a96f756810 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 08:29:10 -0800
Subject: net: rfs: add a jump label

RFS is not commonly used, so add a jump label to avoid some conditionals
in fast path.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 1749e38d0301..2729e77950b7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -913,17 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	/* Reading sk->sk_rxhash might incur an expensive cache line miss.
-	 *
-	 * TCP_ESTABLISHED does cover almost all states where RFS
-	 * might be useful, and is cheaper [1] than testing :
-	 *	IPv4: inet_sk(sk)->inet_daddr
-	 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
-	 * OR	an additional socket flag
-	 * [1] : sk_state and sk_prot are in the same cache line.
-	 */
-	if (sk->sk_state == TCP_ESTABLISHED)
-		sock_rps_record_flow_hash(sk->sk_rxhash);
+	if (static_key_false(&rfs_needed)) {
+		/* Reading sk->sk_rxhash might incur an expensive cache line
+		 * miss.
+		 *
+		 * TCP_ESTABLISHED does cover almost all states where RFS
+		 * might be useful, and is cheaper [1] than testing :
+		 *	IPv4: inet_sk(sk)->inet_daddr
+		 * 	IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+		 * OR	an additional socket flag
+		 * [1] : sk_state and sk_prot are in the same cache line.
+		 */
+		if (sk->sk_state == TCP_ESTABLISHED)
+			sock_rps_record_flow_hash(sk->sk_rxhash);
+	}
 #endif
 }
 
-- 
cgit v1.2.3


From 3665f3817cd354ab7a811b3a4f282c4f5cb1a0d0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Dec 2016 10:05:36 -0800
Subject: net: do not read sk_drops if application does not care

sk_drops can be an often written field, do not read it unless
application showed interest.

Note that sk_drops can be read via inet_diag, so applications
can avoid getting this info from every received packet.

In the future, 'reading' sk_drops might require folding per node or per
cpu fields, and thus become even more expensive than today.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/sock.h b/include/net/sock.h
index 2729e77950b7..e17aa3de2b4d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2163,7 +2163,8 @@ struct sock_skb_cb {
 static inline void
 sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
 {
-	SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+	SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+						atomic_read(&sk->sk_drops) : 0;
 }
 
 static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
-- 
cgit v1.2.3


From e6f462df9acd2a3295e5d34eb29e2823220cf129 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 8 Dec 2016 17:22:09 +0100
Subject: cfg80211/mac80211: fix BSS leaks when abandoning assoc attempts

When mac80211 abandons an association attempt, it may free
all the data structures, but inform cfg80211 and userspace
about it only by sending the deauth frame it received, in
which case cfg80211 has no link to the BSS struct that was
used and will not cfg80211_unhold_bss() it.

Fix this by providing a way to inform cfg80211 of this with
the BSS entry passed, so that it can clean up properly, and
use this ability in the appropriate places in mac80211.

This isn't ideal: some code is more or less duplicated and
tracing is missing. However, it's a fairly small change and
it's thus easier to backport - cleanups can come later.

Cc: stable@vger.kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 2019310cf135..814be4b4200c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -4685,6 +4685,17 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
  */
 void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss);
 
+/**
+ * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt
+ * @dev: network device
+ * @bss: The BSS entry with which association was abandoned.
+ *
+ * Call this whenever - for reasons reported through other API, like deauth RX,
+ * an association attempt was abandoned.
+ * This function may sleep. The caller must hold the corresponding wdev's mutex.
+ */
+void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss);
+
 /**
  * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame
  * @dev: network device
-- 
cgit v1.2.3