From e227300c8395dffaa7614ce7c7666a82180ebc60 Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Wed, 12 Oct 2016 18:25:35 +0530 Subject: cfg80211: pass struct to interface combination check/iter Move the growing parameter list to a structure for the interface combination check and iteration functions in cfg80211 and mac80211 to make the code easier to understand. Signed-off-by: Purushottam Kushwaha [edit commit message] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index fe78f02a242e..ea108541e1e0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -771,6 +771,26 @@ struct cfg80211_csa_settings { u8 count; }; +/** + * struct iface_combination_params - input parameters for interface combinations + * + * Used to pass interface combination parameters + * + * @num_different_channels: the number of different channels we want + * to use for verification + * @radar_detect: a bitmap where each bit corresponds to a channel + * width where radar detection is needed, as in the definition of + * &struct ieee80211_iface_combination.@radar_detect_widths + * @iftype_num: array with the number of interfaces of each interface + * type. The index is the interface type as specified in &enum + * nl80211_iftype. + */ +struct iface_combination_params { + int num_different_channels; + u8 radar_detect; + int iftype_num[NUM_NL80211_IFTYPES]; +}; + /** * enum station_parameters_apply_mask - station parameter values to apply * @STATION_PARAM_APPLY_UAPSD: apply new uAPSD parameters (uapsd_queues, max_sp) @@ -5575,36 +5595,20 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy); * cfg80211_check_combinations - check interface combinations * * @wiphy: the wiphy - * @num_different_channels: the number of different channels we want - * to use for verification - * @radar_detect: a bitmap where each bit corresponds to a channel - * width where radar detection is needed, as in the definition of - * &struct ieee80211_iface_combination.@radar_detect_widths - * @iftype_num: array with the numbers of interfaces of each interface - * type. The index is the interface type as specified in &enum - * nl80211_iftype. + * @params: the interface combinations parameter * * This function can be called by the driver to check whether a * combination of interfaces and their types are allowed according to * the interface combinations. */ int cfg80211_check_combinations(struct wiphy *wiphy, - const int num_different_channels, - const u8 radar_detect, - const int iftype_num[NUM_NL80211_IFTYPES]); + struct iface_combination_params *params); /** * cfg80211_iter_combinations - iterate over matching combinations * * @wiphy: the wiphy - * @num_different_channels: the number of different channels we want - * to use for verification - * @radar_detect: a bitmap where each bit corresponds to a channel - * width where radar detection is needed, as in the definition of - * &struct ieee80211_iface_combination.@radar_detect_widths - * @iftype_num: array with the numbers of interfaces of each interface - * type. The index is the interface type as specified in &enum - * nl80211_iftype. + * @params: the interface combinations parameter * @iter: function to call for each matching combination * @data: pointer to pass to iter function * @@ -5613,9 +5617,7 @@ int cfg80211_check_combinations(struct wiphy *wiphy, * purposes. */ int cfg80211_iter_combinations(struct wiphy *wiphy, - const int num_different_channels, - const u8 radar_detect, - const int iftype_num[NUM_NL80211_IFTYPES], + struct iface_combination_params *params, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data); -- cgit v1.2.3 From 0c317a02ca982ca093e71bf07cb562265ba40032 Mon Sep 17 00:00:00 2001 From: Purushottam Kushwaha Date: Wed, 12 Oct 2016 18:26:51 +0530 Subject: cfg80211: support virtual interfaces with different beacon intervals This commit provides a mechanism for the host drivers to advertise the support for different beacon intervals among the respective interface combinations in a group, through NL80211_IFACE_COMB_BI_MIN_GCD (u32). This value will be compared against GCD of all beaconing interfaces of matching combinations. If the driver doesn't advertise this value, the old behaviour where all beacon intervals must be identical is retained. If it is specified, then any beacon interval for an interface in the interface combination as well as the GCD of all active beacon intervals in the combination must be greater or equal to this value. Signed-off-by: Purushottam Kushwaha [change commit message, some variable names, small other things] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ea108541e1e0..5000ec758eb3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -784,11 +784,19 @@ struct cfg80211_csa_settings { * @iftype_num: array with the number of interfaces of each interface * type. The index is the interface type as specified in &enum * nl80211_iftype. + * @beacon_int_gcd: a value specifying GCD of all beaconing interfaces, + * the GCD of a single value is considered the value itself, so for + * a single interface this should be set to that interface's beacon + * interval + * @beacon_int_different: a flag indicating whether or not all beacon + * intervals (of beaconing interfaces) are different or not. */ struct iface_combination_params { int num_different_channels; u8 radar_detect; int iftype_num[NUM_NL80211_IFTYPES]; + u32 beacon_int_gcd; + bool beacon_int_different; }; /** @@ -3100,6 +3108,12 @@ struct ieee80211_iface_limit { * only in special cases. * @radar_detect_widths: bitmap of channel widths supported for radar detection * @radar_detect_regions: bitmap of regions supported for radar detection + * @beacon_int_min_gcd: This interface combination supports different + * beacon intervals. + * = 0 - all beacon intervals for different interface must be same. + * > 0 - any beacon interval for the interface part of this combination AND + * *GCD* of all beacon intervals from beaconing interfaces of this + * combination must be greater or equal to this value. * * With this structure the driver can describe which interface * combinations it supports concurrently. @@ -3158,6 +3172,7 @@ struct ieee80211_iface_combination { bool beacon_int_infra_match; u8 radar_detect_widths; u8 radar_detect_regions; + u32 beacon_int_min_gcd; }; struct ieee80211_txrx_stypes { -- cgit v1.2.3 From cc6ac9bccf6b9814d37932e86a92f8e6a92960dc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 8 Oct 2016 11:36:05 +0800 Subject: sctp: reuse sent_count to avoid retransmitted chunks for RTT measurements Now sctp uses chunk->resent to record if a chunk is retransmitted, for RTT measurements with retransmitted DATA chunks. chunk->sent_count was introduced to record how many times one chunk has been sent for prsctp RTX policy before. We actually can know if one chunk is retransmitted by checking chunk->sent_count is greater than 1. This patch is to remove resent from sctp_chunk and reuse sent_count to avoid retransmitted chunks for RTT measurements. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 11c3bf262a85..27a933e4c90e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -641,7 +641,6 @@ struct sctp_chunk { #define SCTP_NEED_FRTX 0x1 #define SCTP_DONT_FRTX 0x2 __u16 rtt_in_progress:1, /* This chunk used for RTT calc? */ - resent:1, /* Has this chunk ever been resent. */ has_tsn:1, /* Does this chunk have a TSN yet? */ has_ssn:1, /* Does this chunk have a SSN yet? */ singleton:1, /* Only chunk in the packet? */ @@ -656,6 +655,7 @@ struct sctp_chunk { fast_retransmit:2; /* Is this chunk fast retransmitted? */ }; +#define sctp_chunk_retransmitted(chunk) (chunk->sent_count > 1) void sctp_chunk_hold(struct sctp_chunk *); void sctp_chunk_put(struct sctp_chunk *); int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, -- cgit v1.2.3 From 8ae808eb853e3789b81b8a502cdf22bb01b76880 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 8 Oct 2016 11:40:16 +0800 Subject: sctp: remove the old ttl expires policy The prsctp polices include ttl expires policy already, we should remove the old ttl expires codes, and just adjust the new polices' codes to be compatible with the old one for users. This patch is to remove all the old expires codes, and if prsctp polices are not set, it will still set msg's expires_at and check the expires in sctp_check_abandoned. Note that asoc->prsctp_enable is set by default, so users can't feel any difference even if they use the old expires api in userspace. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 27a933e4c90e..bd4a3ded7c87 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -530,7 +530,6 @@ struct sctp_datamsg { /* Did the messenge fail to send? */ int send_error; u8 send_failed:1, - can_abandon:1, /* can chunks from this message can be abandoned. */ can_delay; /* should this message be Nagle delayed */ }; -- cgit v1.2.3 From 165779231ff9e9c4ac7baaee84eff91d589f3e22 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Thu, 13 Oct 2016 09:06:41 +0300 Subject: net/sched: act_mirred: Rename tcfm_ok_push to tcfm_mac_header_xmit and make it a bool 'tcfm_ok_push' specifies whether a mac_len sized push is needed upon egress to the target device (if action is performed at ingress). Rename it to 'tcfm_mac_header_xmit' as this is actually an attribute of the target device (and use a bool instead of int). This allows to decouple the attribute from the action to be taken. Signed-off-by: Shmulik Ladkani Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 62770add15bd..95431092c4b6 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,7 +8,7 @@ struct tcf_mirred { struct tc_action common; int tcfm_eaction; int tcfm_ifindex; - int tcfm_ok_push; + bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; struct list_head tcfm_list; }; -- cgit v1.2.3 From 5724b8b5694794829a071c6da7dd0bc146df0756 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Thu, 13 Oct 2016 09:06:43 +0300 Subject: net/sched: tc_mirred: Rename public predicates 'is_tcf_mirred_redirect' and 'is_tcf_mirred_mirror' These accessors are used in various drivers that support tc offloading, to detect properties of a given 'tc_action'. 'is_tcf_mirred_redirect' tests that the action is TCA_EGRESS_REDIR. 'is_tcf_mirred_mirror' tests that the action is TCA_EGRESS_MIRROR. As a prep towards supporting INGRESS redir/mirror, rename these predicates to reflect their true meaning: s/is_tcf_mirred_redirect/is_tcf_mirred_egress_redirect/ s/is_tcf_mirred_mirror/is_tcf_mirred_egress_mirror/ Signed-off-by: Shmulik Ladkani Cc: Hariprasad S Cc: Jeff Kirsher Cc: Saeed Mahameed Cc: Jiri Pirko Cc: Ido Schimmel Cc: Jakub Kicinski Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 95431092c4b6..604bc31e23ab 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -14,7 +14,7 @@ struct tcf_mirred { }; #define to_mirred(a) ((struct tcf_mirred *)a) -static inline bool is_tcf_mirred_redirect(const struct tc_action *a) +static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->type == TCA_ACT_MIRRED) @@ -23,7 +23,7 @@ static inline bool is_tcf_mirred_redirect(const struct tc_action *a) return false; } -static inline bool is_tcf_mirred_mirror(const struct tc_action *a) +static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->type == TCA_ACT_MIRRED) -- cgit v1.2.3 From 1104d9ba443a3972052ea4eaa01e51f9ee084652 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 14 Oct 2016 11:25:36 -0700 Subject: lwtunnel: Add destroy state operation Users of lwt tunnels may set up some secondary state in build_state function. Add a corresponding destroy_state function to allow users to clean up state. This destroy state function is called from lwstate_free. Also, we now free lwstate using kfree_rcu so user can assume structure is not freed before rcu. Acked-by: Roopa Prabhu Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index ea3f80f58fd6..67d235f43202 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -29,6 +29,7 @@ struct lwtunnel_state { int (*orig_input)(struct sk_buff *); int len; __u16 headroom; + struct rcu_head rcu; __u8 data[0]; }; @@ -36,6 +37,7 @@ struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **ts); + void (*destroy_state)(struct lwtunnel_state *lws); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, @@ -46,10 +48,7 @@ struct lwtunnel_encap_ops { }; #ifdef CONFIG_LWTUNNEL -static inline void lwtstate_free(struct lwtunnel_state *lws) -{ - kfree(lws); -} +void lwtstate_free(struct lwtunnel_state *lws); static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) -- cgit v1.2.3 From a3e2f4b6ed9de85086850fe49801f9b00adb6ae1 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Sat, 15 Oct 2016 13:28:19 +0200 Subject: mac80211: fix A-MSDU outer SA/DA According to IEEE 802.11-2012 section 8.3.2 table 8-19, the outer SA/DA of A-MSDU frames need to be changed depending on FromDS/ToDS values. Signed-off-by: Michael Braun [use ether_addr_copy and add alignment annotations] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a810dfcb83c2..e50c9e02889a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1438,7 +1438,7 @@ enum ieee80211_vif_flags { struct ieee80211_vif { enum nl80211_iftype type; struct ieee80211_bss_conf bss_conf; - u8 addr[ETH_ALEN]; + u8 addr[ETH_ALEN] __aligned(2); bool p2p; bool csa_active; bool mu_mimo_owner; -- cgit v1.2.3 From a1264c3d6c04f0e4e9d447caaa249d6288b01520 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 18 Oct 2016 23:12:07 +0300 Subject: wireless: radiotap: fix timestamp sampling position values The values don't match the radiotap spec, corrected that. Reported-by: Oz Shalev Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index ba07b9d8ed63..d0e7e3f8e67a 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -333,9 +333,9 @@ enum ieee80211_radiotap_type { #define IEEE80211_RADIOTAP_TIMESTAMP_UNIT_NS 0x0003 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_MASK 0x00F0 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_BEGIN_MDPU 0x0000 -#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU 0x0010 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ 0x0010 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_PPDU 0x0020 -#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_PLCP_SIG_ACQ 0x0030 +#define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_EO_MPDU 0x0030 #define IEEE80211_RADIOTAP_TIMESTAMP_SPOS_UNKNOWN 0x00F0 #define IEEE80211_RADIOTAP_TIMESTAMP_FLAG_64BIT 0x00 -- cgit v1.2.3 From 0aa419ec6e7b98d485f6c66a62a90965eda3c1bb Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 18 Oct 2016 23:12:10 +0300 Subject: mac80211: allow the driver not to pass the tid to ieee80211_sta_uapsd_trigger iwlwifi will check internally that the tid maps to an AC that is trigger enabled, but can't know what tid exactly. Allow the driver to pass a generic tid and make mac80211 assume that a trigger frame was received. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e50c9e02889a..f3dbadafe16e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4087,6 +4087,10 @@ void ieee80211_sta_pspoll(struct ieee80211_sta *sta); * This must be used in conjunction with ieee80211_sta_ps_transition() * and possibly ieee80211_sta_pspoll(); calls to all three must be * serialized. + * %IEEE80211_NUM_TIDS can be passed as the tid if the tid is unknown. + * In this case, mac80211 will not check that this tid maps to an AC + * that is trigger enabled and assume that the caller did the proper + * checks. */ void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid); -- cgit v1.2.3 From f3fe4e93dd6346c01fd4070ae02ec746fbae73bb Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 18 Oct 2016 23:12:11 +0300 Subject: mac80211: add a HW flag for supporting HW TX fragmentation Currently mac80211 determines whether HW does fragmentation by checking whether the set_frag_threshold callback is set or not. However, some drivers may want to set the HW fragmentation capability depending on HW generation. Allow this by checking a HW flag instead of checking the callback. Signed-off-by: Sara Sharon [added the flag to ath10k and wlcore] Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f3dbadafe16e..a1a27021f452 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2025,6 +2025,10 @@ struct ieee80211_txq { * drivers, mac80211 packet loss mechanism will not be triggered and driver * is completely depending on firmware event for station kickout. * + * @IEEE80211_HW_SUPPORTS_TX_FRAG: Hardware does fragmentation by itself. + * The stack will not do fragmentation. + * The callback for @set_frag_threshold should be set as well. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2066,6 +2070,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TX_AMSDU, IEEE80211_HW_TX_FRAG_LIST, IEEE80211_HW_REPORTS_LOW_ACK, + IEEE80211_HW_SUPPORTS_TX_FRAG, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -3093,8 +3098,9 @@ enum ieee80211_reconfig_type { * The callback must be atomic. * * @set_frag_threshold: Configuration of fragmentation threshold. Assign this - * if the device does fragmentation by itself; if this callback is - * implemented then the stack will not do fragmentation. + * if the device does fragmentation by itself. Note that to prevent the + * stack from doing fragmentation IEEE80211_HW_SUPPORTS_TX_FRAG + * should be set as well. * The callback can sleep. * * @set_rts_threshold: Configuration of RTS threshold (if device needs it) -- cgit v1.2.3 From f438ceb81d424cb90a5a1aad569056bd7c2ab4c5 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 18 Oct 2016 23:12:12 +0300 Subject: mac80211: uapsd_queues is in QoS IE order The uapsd_queue field is in QoS IE order and not in IEEE80211_AC_*'s order. This means that mac80211 would get confused between BK and BE which is certainly not such a big deal but needs to be fixed. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 ++- include/net/mac80211.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5000ec758eb3..10a26f0fbafe 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4574,7 +4574,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * moves to cfg80211 in this call * @buf: authentication frame (header + body) * @len: length of the frame data - * @uapsd_queues: bitmap of ACs configured to uapsd. -1 if n/a. + * @uapsd_queues: bitmap of queues configured for uapsd. Same format + * as the AC bitmap in the QoS info field * * After being asked to associate via cfg80211_ops::assoc() the driver must * call either this function or cfg80211_auth_timeout(). diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a1a27021f452..b9b24abd9103 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1745,7 +1745,8 @@ struct ieee80211_sta_rates { * @drv_priv: data area for driver use, will always be aligned to * sizeof(void *), size is determined in hw information. * @uapsd_queues: bitmap of queues configured for uapsd. Only valid - * if wme is supported. + * if wme is supported. The bits order is like in + * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported. * @bandwidth: current bandwidth the station can receive with * @rx_nss: in HT/VHT, the maximum number of spatial streams the -- cgit v1.2.3 From 0711d638786941ec02551dd9b4aa0d8341f7db5b Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 18 Oct 2016 23:12:13 +0300 Subject: cfg80211: allow aborting in-progress connection atttempts On a disconnect request from userspace, cfg80211 currently calls called rdev_disconnect() only in case that 'current_bss' was set, i.e. connection had been established. Change this to allow the userspace call to succeed and call the driver's disconnect() method also while the connection attempt is in progress, to be able to abort attempts. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho [change commit subject/message] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 10a26f0fbafe..2bbbcc3eecac 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2564,9 +2564,10 @@ struct cfg80211_nan_func { * cases, the result of roaming is indicated with a call to * cfg80211_roamed() or cfg80211_roamed_bss(). * (invoked with the wireless_dev mutex held) - * @disconnect: Disconnect from the BSS/ESS. Once done, call - * cfg80211_disconnected(). - * (invoked with the wireless_dev mutex held) + * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if + * connection is in progress. Once done, call cfg80211_disconnected() in + * case connection was already established (invoked with the + * wireless_dev mutex held), otherwise call cfg80211_connect_timeout(). * * @join_ibss: Join the specified IBSS (or create if necessary). Once done, call * cfg80211_ibss_joined(), also call that function when changing BSSID due -- cgit v1.2.3 From f8c3bf00d440df2bc2c3f669d460868d9ba67226 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Oct 2016 13:55:45 +0200 Subject: net/socket: factor out helpers for memory and queue manipulation Basic sock operations that udp code can use with its own memory accounting schema. No functional change is introduced in the existing APIs. v4 -> v5: - avoid whitespace changes v2 -> v4: - avoid exporting __sock_enqueue_skb v1 -> v2: - avoid export sock_rmem_free Acked-by: Hannes Frederic Sowa Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ebf75db08e06..276489553338 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1274,7 +1274,9 @@ static inline struct inode *SOCK_INODE(struct socket *socket) /* * Functions for memory accounting */ +int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); +void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_QUANTUM ((int)PAGE_SIZE) @@ -1950,6 +1952,8 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer, void sk_stop_timer(struct sock *sk, struct timer_list *timer); +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, + unsigned int flags); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -- cgit v1.2.3 From f970bd9e3a06f06df8d8ecf1f8ad2c8615cc17eb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Oct 2016 13:55:46 +0200 Subject: udp: implement memory accounting helpers Avoid using the generic helpers. Use the receive queue spin lock to protect the memory accounting operation, both on enqueue and on dequeue. On dequeue perform partial memory reclaiming, trying to leave a quantum of forward allocated memory. On enqueue use a custom helper, to allow some optimizations: - use a plain spin_lock() variant instead of the slightly costly spin_lock_irqsave(), - avoid dst_force check, since the calling code has already dropped the skb dst - avoid orphaning the skb, since skb_steal_sock() already did the work for us The above needs custom memory reclaiming on shutdown, provided by the udp_destruct_sock(). v5 -> v6: - don't orphan the skb on enqueue v4 -> v5: - replace the mem_lock with the receive queue spin lock - ensure that the bh is always allowed to enqueue at least a skb, even if sk_rcvbuf is exceeded v3 -> v4: - reworked memory accunting, simplifying the schema - provide an helper for both memory scheduling and enqueuing v1 -> v2: - use a udp specific destrctor to perform memory reclaiming - remove a couple of helpers, unneeded after the above cleanup - do not reclaim memory on dequeue if not under memory pressure - reworked the fwd accounting schema to avoid potential integer overflow Acked-by: Hannes Frederic Sowa Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index ea53a87d880f..18f1e6b91927 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -246,6 +246,9 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, } /* net/ipv4/udp.c */ +void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); +int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); + void udp_v4_early_demux(struct sk_buff *skb); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, @@ -258,6 +261,7 @@ void udp_flush_pending_frames(struct sock *sk); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int udp_init_sock(struct sock *sk); int udp_disconnect(struct sock *sk, int flags); unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, -- cgit v1.2.3 From f76a9db351f8beb3259c4ba38058de0058ab8000 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 21 Oct 2016 16:10:22 +0200 Subject: lwt: Remove unused len field The field is initialized by ILA and MPLS but never used. Remove it. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 67d235f43202..82e76fe1c1f7 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -24,11 +24,10 @@ enum { struct lwtunnel_state { __u16 type; __u16 flags; + __u16 headroom; atomic_t refcnt; int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); - int len; - __u16 headroom; struct rcu_head rcu; __u8 data[0]; }; -- cgit v1.2.3 From 432490f9d455fb842d70219f22d9d2c812371676 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 21 Oct 2016 13:03:44 +0300 Subject: net: ip, diag -- Add diag interface for raw sockets In criu we are actively using diag interface to collect sockets present in the system when dumping applications. And while for unix, tcp, udp[lite], packet, netlink it works as expected, the raw sockets do not have. Thus add it. v2: - add missing sock_put calls in raw_diag_dump_one (by eric.dumazet@) - implement @destroy for diag requests (by dsa@) v3: - add export of raw_abort for IPv6 (by dsa@) - pass net-admin flag into inet_sk_diag_fill due to changes in net-next branch (by dsa@) v4: - use @pad in struct inet_diag_req_v2 for raw socket protocol specification: raw module carries sockets which may have custom protocol passed from socket() syscall and sole @sdiag_protocol is not enough to match underlied ones - start reporting protocol specifed in socket() call when sockets are raw ones for the same reason: user space tools like ss may parse this attribute and use it for socket matching v5 (by eric.dumazet@): - use sock_hold in raw_sock_get instead of atomic_inc, we're holding (raw_v4_hashinfo|raw_v6_hashinfo)->lock when looking up so counter won't be zero here. v6: - use sdiag_raw_protocol() helper which will access @pad structure used for raw sockets protocol specification: we can't simply rename this member without breaking uapi v7: - sine sdiag_raw_protocol() helper is not suitable for uapi lets rather make an alias structure with proper names. __check_inet_diag_req_raw helper will catch if any of structure unintentionally changed. CC: David S. Miller CC: Eric Dumazet CC: David Ahern CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy CC: Andrey Vagin CC: Stephen Hemminger Signed-off-by: Cyrill Gorcunov Signed-off-by: David S. Miller --- include/net/raw.h | 6 ++++++ include/net/rawv6.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/raw.h b/include/net/raw.h index 3e789008394d..57c33dd22ec4 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -23,6 +23,12 @@ extern struct proto raw_prot; +extern struct raw_hashinfo raw_v4_hashinfo; +struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, + unsigned short num, __be32 raddr, + __be32 laddr, int dif); + +int raw_abort(struct sock *sk, int err); void raw_icmp_error(struct sk_buff *, int, u32); int raw_local_deliver(struct sk_buff *, int); diff --git a/include/net/rawv6.h b/include/net/rawv6.h index 87783dea0791..cbe4e9de1894 100644 --- a/include/net/rawv6.h +++ b/include/net/rawv6.h @@ -3,6 +3,13 @@ #include +extern struct raw_hashinfo raw_v6_hashinfo; +struct sock *__raw_v6_lookup(struct net *net, struct sock *sk, + unsigned short num, const struct in6_addr *loc_addr, + const struct in6_addr *rmt_addr, int dif); + +int raw_abort(struct sock *sk, int err); + void raw6_icmp_error(struct sk_buff *, int nexthdr, u8 type, u8 code, int inner_offset, __be32); bool raw6_local_deliver(struct sk_buff *, int); -- cgit v1.2.3 From 73c7da3dae1e7cd8febeab13767b2698b84dfa15 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Thu, 20 Oct 2016 20:08:22 +0100 Subject: cfg80211: add generic helper to check interface is running Add a helper using wdev to check if interface is running. This deals with both non-netdev and netdev interfaces. In struct wireless_dev replace 'p2p_started' and 'nan_started' by 'is_running' as those are mutually exclusive anyway, and unify all the code to use wdev_running(). Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2bbbcc3eecac..ec39f891b7a3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3781,8 +3781,8 @@ struct cfg80211_cached_keys; * @beacon_interval: beacon interval used on this device for transmitting * beacons, 0 when not valid * @address: The address for this device, valid only if @netdev is %NULL - * @p2p_started: true if this is a P2P Device that has been started - * @nan_started: true if this is a NAN interface that has been started + * @is_running: true if this is a non-netdev device that has been started, e.g. + * the P2P Device. * @cac_started: true if DFS channel availability check has been started * @cac_start_time: timestamp (jiffies) when the dfs state was entered. * @cac_time_ms: CAC time in ms @@ -3814,7 +3814,7 @@ struct wireless_dev { struct mutex mtx; - bool use_4addr, p2p_started, nan_started; + bool use_4addr, is_running; u8 address[ETH_ALEN] __aligned(sizeof(u16)); @@ -3871,6 +3871,13 @@ static inline u8 *wdev_address(struct wireless_dev *wdev) return wdev->address; } +static inline bool wdev_running(struct wireless_dev *wdev) +{ + if (wdev->netdev) + return netif_running(wdev->netdev); + return wdev->is_running; +} + /** * wdev_priv - return wiphy priv from wireless_dev * -- cgit v1.2.3 From 4c8dea638c16141adb046fd2e0cab51dfe43650c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 21 Oct 2016 14:25:13 +0200 Subject: cfg80211: validate beacon int as part of iface combinations Remove the pointless checking against interface combinations in the initial basic beacon interval validation, that currently isn't taking into account radar detection or channels properly. Instead, just validate the basic range there, and then delay real checking to the interface combination validation that drivers must do. This means that drivers wanting to use the beacon_int_min_gcd will now have to pass the new_beacon_int when validating the AP/mesh start. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ec39f891b7a3..d1ffbc3a8e55 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -784,19 +784,15 @@ struct cfg80211_csa_settings { * @iftype_num: array with the number of interfaces of each interface * type. The index is the interface type as specified in &enum * nl80211_iftype. - * @beacon_int_gcd: a value specifying GCD of all beaconing interfaces, - * the GCD of a single value is considered the value itself, so for - * a single interface this should be set to that interface's beacon - * interval - * @beacon_int_different: a flag indicating whether or not all beacon - * intervals (of beaconing interfaces) are different or not. + * @new_beacon_int: set this to the beacon interval of a new interface + * that's not operating yet, if such is to be checked as part of + * the verification */ struct iface_combination_params { int num_different_channels; u8 radar_detect; int iftype_num[NUM_NL80211_IFTYPES]; - u32 beacon_int_gcd; - bool beacon_int_different; + u32 new_beacon_int; }; /** -- cgit v1.2.3 From 11b6b5a4ced2f2c76073b97ee08ca0eab8358fde Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:41:58 +0300 Subject: cfg80211: Rename SAE_DATA to more generic AUTH_DATA This adds defines and nl80211 extensions to allow FILS Authentication to be implemented similarly to SAE. FILS does not need the special rules for the Authentication transaction number and Status code fields, but it does need to add non-IE fields. The previously used NL80211_ATTR_SAE_DATA can be reused for this to avoid having to duplicate that implementation. Rename that attribute to more generic NL80211_ATTR_AUTH_DATA (with backwards compatibility define for NL80211_SAE_DATA). Also document the special rules related to the Authentication transaction number and Status code fiels. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d1ffbc3a8e55..dffc265a4fd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1785,9 +1785,11 @@ const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 ie); * @key_len: length of WEP key for shared key authentication * @key_idx: index of WEP key for shared key authentication * @key: WEP key for shared key authentication - * @sae_data: Non-IE data to use with SAE or %NULL. This starts with - * Authentication transaction sequence number field. - * @sae_data_len: Length of sae_data buffer in octets + * @auth_data: Fields and elements in Authentication frames. This contains + * the authentication frame body (non-IE and IE data), excluding the + * Authentication algorithm number, i.e., starting at the Authentication + * transaction sequence number field. + * @auth_data_len: Length of auth_data buffer in octets */ struct cfg80211_auth_request { struct cfg80211_bss *bss; @@ -1796,8 +1798,8 @@ struct cfg80211_auth_request { enum nl80211_auth_type auth_type; const u8 *key; u8 key_len, key_idx; - const u8 *sae_data; - size_t sae_data_len; + const u8 *auth_data; + size_t auth_data_len; }; /** -- cgit v1.2.3 From 3f817fe718c6cb3ddcc2ab04ba86faecc20ef8fe Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:42:01 +0300 Subject: cfg80211: Define IEEE P802.11ai (FILS) information elements Define the Element IDs and Element ID Extensions from IEEE P802.11ai/D11.0. In addition, add a new cfg80211_find_ext_ie() wrapper to make it easier to find information elements that used the Element ID Extension field. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index dffc265a4fd6..8ca2e9f354f7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4180,6 +4180,27 @@ static inline const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) return cfg80211_find_ie_match(eid, ies, len, NULL, 0, 0); } +/** + * cfg80211_find_ext_ie - find information element with EID Extension in data + * + * @ext_eid: element ID Extension + * @ies: data consisting of IEs + * @len: length of data + * + * Return: %NULL if the extended element ID could not be found or if + * the element is invalid (claims to be longer than the given + * data), or a pointer to the first byte of the requested + * element, that is the byte containing the element ID. + * + * Note: There are no checks on the element length other than + * having to fit into the given data. + */ +static inline const u8 *cfg80211_find_ext_ie(u8 ext_eid, const u8 *ies, int len) +{ + return cfg80211_find_ie_match(WLAN_EID_EXTENSION, ies, len, + &ext_eid, 1, 2); +} + /** * cfg80211_find_vendor_ie - find vendor specific information element in data * -- cgit v1.2.3 From 348bd456699801920a309c66e382380809fbdf41 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Thu, 27 Oct 2016 00:42:03 +0300 Subject: cfg80211: Add KEK/nonces for FILS association frames The new nl80211 attributes can be used to provide KEK and nonces to allow the driver to encrypt and decrypt FILS (Re)Association Request/Response frames in station mode. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8ca2e9f354f7..738b4d8a4666 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1840,6 +1840,12 @@ enum cfg80211_assoc_req_flags { * @ht_capa_mask: The bits of ht_capa which are to be used. * @vht_capa: VHT capability override * @vht_capa_mask: VHT capability mask indicating which fields to use + * @fils_kek: FILS KEK for protecting (Re)Association Request/Response frame or + * %NULL if FILS is not used. + * @fils_kek_len: Length of fils_kek in octets + * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association + * Request/Response frame or %NULL if FILS is not used. This field starts + * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -1851,6 +1857,9 @@ struct cfg80211_assoc_request { struct ieee80211_ht_cap ht_capa; struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa, vht_capa_mask; + const u8 *fils_kek; + size_t fils_kek_len; + const u8 *fils_nonces; }; /** -- cgit v1.2.3 From ce0ce13a1c89ff8b94b7f8fb32eb4c43e111c82e Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Mon, 10 Oct 2016 19:12:22 +0200 Subject: cfg80211: configure multicast to unicast for AP interfaces Add the ability to configure if an AP (and associated VLANs) will do multicast-to-unicast conversion for ARP, IPv4 and IPv6 frames (possibly within 802.1Q). If enabled, such frames are to be sent to each station separately, with the DA replaced by their own MAC address rather than the group address. Note that this may break certain expectations of the receiver, such as the ability to drop unicast IP packets received within multicast L2 frames, or the ability to not send ICMP destination unreachable messages for packets received in L2 multicast (which is required, but the receiver can't tell the difference if this new option is enabled.) This also doesn't implement the 802.11 DMS (directed multicast service). Signed-off-by: Michael Braun [fix disabling, add better documentation & commit message] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 738b4d8a4666..41ae3f500957 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2742,6 +2742,8 @@ struct cfg80211_nan_func { * @nan_change_conf: changes NAN configuration. The changed parameters must * be specified in @changes (using &enum cfg80211_nan_conf_changes); * All other parameters must be ignored. + * + * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3018,6 +3020,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes); + + int (*set_multicast_to_unicast)(struct wiphy *wiphy, + struct net_device *dev, + const bool enabled); }; /* -- cgit v1.2.3 From 088e8df82f91a24728d49d9532cab7ebdee5117f Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Thu, 27 Oct 2016 16:51:11 +0300 Subject: cfg80211: Add support to update connection parameters Add functionality to update the connection parameters when in connected state, so that driver/firmware uses the updated parameters for subsequent roaming. This is for drivers that support internal BSS selection and roaming. The new command does not change the current association state, i.e., it can be used to update IE contents for future (re)associations without causing an immediate disassociation or reassociation with the current BSS. This commit implements the required functionality for updating IEs for (Re)Association Request frame only. Other parameters can be added in future when required. Signed-off-by: vamsi krishna Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 41ae3f500957..c575583b50fb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2050,6 +2050,18 @@ struct cfg80211_connect_params { const u8 *prev_bssid; }; +/** + * enum cfg80211_connect_params_changed - Connection parameters being updated + * + * This enum provides information of all connect parameters that + * have to be updated as part of update_connect_params() call. + * + * @UPDATE_ASSOC_IES: Indicates whether association request IEs are updated + */ +enum cfg80211_connect_params_changed { + UPDATE_ASSOC_IES = BIT(0), +}; + /** * enum wiphy_params_flags - set_wiphy_params bitfield values * @WIPHY_PARAM_RETRY_SHORT: wiphy->retry_short has changed @@ -2571,6 +2583,14 @@ struct cfg80211_nan_func { * cases, the result of roaming is indicated with a call to * cfg80211_roamed() or cfg80211_roamed_bss(). * (invoked with the wireless_dev mutex held) + * @update_connect_params: Update the connect parameters while connected to a + * BSS. The updated parameters can be used by driver/firmware for + * subsequent BSS selection (roaming) decisions and to form the + * Authentication/(Re)Association Request frames. This call does not + * request an immediate disassociation or reassociation with the current + * BSS, i.e., this impacts only subsequent (re)associations. The bits in + * changed are defined in &enum cfg80211_connect_params_changed. + * (invoked with the wireless_dev mutex held) * @disconnect: Disconnect from the BSS/ESS or stop connection attempts if * connection is in progress. Once done, call cfg80211_disconnected() in * case connection was already established (invoked with the @@ -2858,6 +2878,10 @@ struct cfg80211_ops { int (*connect)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_connect_params *sme); + int (*update_connect_params)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_connect_params *sme, + u32 changed); int (*disconnect)(struct wiphy *wiphy, struct net_device *dev, u16 reason_code); -- cgit v1.2.3 From 4fe77d82ef80c77031c9c6f8554cd0dee2aa423a Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Mon, 24 Oct 2016 20:32:57 +0800 Subject: skbedit: allow the user to specify bitmask for mark The user may want to use only some bits of the skb mark in his skbedit rules because the remaining part might be used by something else. Introduce the "mask" parameter to the skbedit actor in order to implement such functionality. When the mask is specified, only those bits selected by the latter are altered really changed by the actor, while the rest is left untouched. Signed-off-by: Antonio Quartulli Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 5767e9dbcf92..19cd3d345804 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -27,6 +27,7 @@ struct tcf_skbedit { u32 flags; u32 priority; u32 mark; + u32 mask; u16 queue_mapping; u16 ptype; }; -- cgit v1.2.3 From c90c39dab3e02ce45427a214746711f33ad13be6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:01 +0200 Subject: genetlink: introduce and use genl_family_attrbuf() This helper function allows family implementations to access their family's attrbuf. This gets rid of the attrbuf usage in families, and also adds locking validation, since it's not valid to use the attrbuf with parallel_ops or outside of the dumpit callback. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 8d4608ce8716..ef9defb3f5bc 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -73,6 +73,8 @@ struct genl_family { struct module *module; }; +struct nlattr **genl_family_attrbuf(struct genl_family *family); + /** * struct genl_info - receiving information * @snd_seq: sending sequence number -- cgit v1.2.3 From a07ea4d9941af5a0c6f0be2a71b51ac9c083c5e5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:02 +0200 Subject: genetlink: no longer support using static family IDs Static family IDs have never really been used, the only use case was the workaround I introduced for those users that assumed their family ID was also their multicast group ID. Additionally, because static family IDs would never be reserved by the generic netlink code, using a relatively low ID would only work for built-in families that can be registered immediately after generic netlink is started, which is basically only the control family (apart from the workaround code, which I also had to add code for so it would reserve those IDs) Thus, anything other than GENL_ID_GENERATE is flawed and luckily not used except in the cases I mentioned. Move those workarounds into a few lines of code, and then get rid of GENL_ID_GENERATE entirely, making it more robust. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ef9defb3f5bc..43a5c3975a2f 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -20,7 +20,7 @@ struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family idenfitier + * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -48,7 +48,7 @@ struct genl_info; * @n_ops: number of operations supported by this family (private) */ struct genl_family { - unsigned int id; + unsigned int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; @@ -149,9 +149,6 @@ static inline int genl_register_family(struct genl_family *family) * Registers the specified family and operations from the specified table. * Only one family may be registered with the same family name or identifier. * - * The family id may equal GENL_ID_GENERATE causing an unique id to - * be automatically generated and assigned. - * * Either a doit or dumpit callback must be specified for every registered * operation or the function will fail. Only one operation structure per * command identifier may be registered. -- cgit v1.2.3 From 489111e5c25b93be80340c3113d71903d7c82136 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:03 +0200 Subject: genetlink: statically initialize families Instead of providing macros/inline functions to initialize the families, make all users initialize them statically and get rid of the macros. This reduces the kernel code size by about 1.6k on x86-64 (with allyesconfig). Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 71 +++++++++---------------------------------------- 1 file changed, 12 insertions(+), 59 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 43a5c3975a2f..2298b50cee34 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -39,13 +39,14 @@ struct genl_info; * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. - * @attrbuf: buffer to store parsed attributes - * @family_list: family list - * @mcgrps: multicast groups used by this family (private) - * @n_mcgrps: number of multicast groups (private) + * @attrbuf: buffer to store parsed attributes (private) + * @family_list: family list (private) + * @mcgrps: multicast groups used by this family + * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family - * @ops: the operations supported by this family (private) - * @n_ops: number of operations supported by this family (private) + * (private) + * @ops: the operations supported by this family + * @n_ops: number of operations supported by this family */ struct genl_family { unsigned int id; /* private */ @@ -64,10 +65,10 @@ struct genl_family { int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); struct nlattr ** attrbuf; /* private */ - const struct genl_ops * ops; /* private */ - const struct genl_multicast_group *mcgrps; /* private */ - unsigned int n_ops; /* private */ - unsigned int n_mcgrps; /* private */ + const struct genl_ops * ops; + const struct genl_multicast_group *mcgrps; + unsigned int n_ops; + unsigned int n_mcgrps; unsigned int mcgrp_offset; /* private */ struct list_head family_list; /* private */ struct module *module; @@ -132,55 +133,7 @@ struct genl_ops { u8 flags; }; -int __genl_register_family(struct genl_family *family); - -static inline int genl_register_family(struct genl_family *family) -{ - family->module = THIS_MODULE; - return __genl_register_family(family); -} - -/** - * genl_register_family_with_ops - register a generic netlink family with ops - * @family: generic netlink family - * @ops: operations to be registered - * @n_ops: number of elements to register - * - * Registers the specified family and operations from the specified table. - * Only one family may be registered with the same family name or identifier. - * - * Either a doit or dumpit callback must be specified for every registered - * operation or the function will fail. Only one operation structure per - * command identifier may be registered. - * - * See include/net/genetlink.h for more documenation on the operations - * structure. - * - * Return 0 on success or a negative error code. - */ -static inline int -_genl_register_family_with_ops_grps(struct genl_family *family, - const struct genl_ops *ops, size_t n_ops, - const struct genl_multicast_group *mcgrps, - size_t n_mcgrps) -{ - family->module = THIS_MODULE; - family->ops = ops; - family->n_ops = n_ops; - family->mcgrps = mcgrps; - family->n_mcgrps = n_mcgrps; - return __genl_register_family(family); -} - -#define genl_register_family_with_ops(family, ops) \ - _genl_register_family_with_ops_grps((family), \ - (ops), ARRAY_SIZE(ops), \ - NULL, 0) -#define genl_register_family_with_ops_groups(family, ops, grps) \ - _genl_register_family_with_ops_grps((family), \ - (ops), ARRAY_SIZE(ops), \ - (grps), ARRAY_SIZE(grps)) - +int genl_register_family(struct genl_family *family); int genl_unregister_family(struct genl_family *family); void genl_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); -- cgit v1.2.3 From 2ae0f17df1cd52aafd1ab0415ea1f1dd56dc0e2a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 24 Oct 2016 14:40:04 +0200 Subject: genetlink: use idr to track families Since generic netlink family IDs are small integers, allocated densely, IDR is an ideal match for lookups. Replace the existing hand-written hash-table with IDR for allocation and lookup. This lets the families only be written to once, during register, since the list_head can be removed and removal of a family won't cause any writes. It also slightly reduces the code size (by about 1.3k on x86-64). Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 2298b50cee34..3ec87bacc0f5 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -40,7 +40,6 @@ struct genl_info; * generic netlink family is removed while there are still open * sockets. * @attrbuf: buffer to store parsed attributes (private) - * @family_list: family list (private) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -70,11 +69,10 @@ struct genl_family { unsigned int n_ops; unsigned int n_mcgrps; unsigned int mcgrp_offset; /* private */ - struct list_head family_list; /* private */ struct module *module; }; -struct nlattr **genl_family_attrbuf(struct genl_family *family); +struct nlattr **genl_family_attrbuf(const struct genl_family *family); /** * struct genl_info - receiving information @@ -134,12 +132,12 @@ struct genl_ops { }; int genl_register_family(struct genl_family *family); -int genl_unregister_family(struct genl_family *family); -void genl_notify(struct genl_family *family, struct sk_buff *skb, +int genl_unregister_family(const struct genl_family *family); +void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, - struct genl_family *family, int flags, u8 cmd); + const struct genl_family *family, int flags, u8 cmd); /** * genlmsg_nlhdr - Obtain netlink header from user specified header @@ -148,8 +146,8 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, * * Returns pointer to netlink header. */ -static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr, - struct genl_family *family) +static inline struct nlmsghdr * +genlmsg_nlhdr(void *user_hdr, const struct genl_family *family) { return (struct nlmsghdr *)((char *)user_hdr - family->hdrsize - @@ -185,7 +183,7 @@ static inline int genlmsg_parse(const struct nlmsghdr *nlh, */ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr, - struct genl_family *family) + const struct genl_family *family) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family)); } @@ -202,7 +200,7 @@ static inline void genl_dump_check_consistent(struct netlink_callback *cb, */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, - struct genl_family *family, + const struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, @@ -239,7 +237,7 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) * @group: offset of multicast group in groups array * @flags: allocation flags */ -static inline int genlmsg_multicast_netns(struct genl_family *family, +static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { @@ -257,7 +255,7 @@ static inline int genlmsg_multicast_netns(struct genl_family *family, * @group: offset of multicast group in groups array * @flags: allocation flags */ -static inline int genlmsg_multicast(struct genl_family *family, +static inline int genlmsg_multicast(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { @@ -275,7 +273,7 @@ static inline int genlmsg_multicast(struct genl_family *family, * * This function must hold the RTNL or rcu_read_lock(). */ -int genlmsg_multicast_allns(struct genl_family *family, +int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags); @@ -359,8 +357,9 @@ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. */ -static inline int genl_set_err(struct genl_family *family, struct net *net, - u32 portid, u32 group, int code) +static inline int genl_set_err(const struct genl_family *family, + struct net *net, u32 portid, + u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; @@ -368,7 +367,7 @@ static inline int genl_set_err(struct genl_family *family, struct net *net, return netlink_set_err(net->genl_sock, portid, group, code); } -static inline int genl_has_listeners(struct genl_family *family, +static inline int genl_has_listeners(const struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) -- cgit v1.2.3 From b15ca182ed136087f6a2cb9ffe880c923f36a56e Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 26 Oct 2016 10:53:16 +0200 Subject: netlink: Add nla_memdup() to wrap kmemdup() use on nlattr Wrap several common instances of: kmemdup(nla_data(attr), nla_len(attr), GFP_KERNEL); Signed-off-by: Thomas Graf Acked-by: Johannes Berg Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/netlink.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 254a0fc01800..a34f53acb6d6 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1190,6 +1190,16 @@ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) return tmp; } +/** + * nla_memdup - duplicate attribute memory (kmemdup) + * @src: netlink attribute to duplicate from + * @gfp: GFP mask + */ +static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) +{ + return kmemdup(nla_data(src), nla_len(src), gfp); +} + /** * nla_nest_start - Start a new level of nested attributes * @skb: socket buffer to add attributes to -- cgit v1.2.3 From 5ea8ea2cb7f1d0db15762c9b0bb9e7330425a071 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Oct 2016 09:27:57 -0700 Subject: tcp/dccp: drop SYN packets if accept queue is full Per listen(fd, backlog) rules, there is really no point accepting a SYN, sending a SYNACK, and dropping the following ACK packet if accept queue is full, because application is not draining accept queue fast enough. This behavior is fooling TCP clients that believe they established a flow, while there is nothing at server side. They might then send about 10 MSS (if using IW10) that will be dropped anyway while server is under stress. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 197a30d221e9..146054ceea8e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -289,11 +289,6 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk) return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue); } -static inline int inet_csk_reqsk_queue_young(const struct sock *sk) -{ - return reqsk_queue_len_young(&inet_csk(sk)->icsk_accept_queue); -} - static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) { return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog; -- cgit v1.2.3 From bd68a2a854ad5a85f0c8d0a9c8048ca3f6391efb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Oct 2016 13:32:55 -0700 Subject: net: set SK_MEM_QUANTUM to 4096 Systems with large pages (64KB pages for example) do not always have huge quantity of memory. A big SK_MEM_QUANTUM value leads to fewer interactions with the global counters (like tcp_memory_allocated) but might trigger memory pressure much faster, giving suboptimal TCP performance since windows are lowered to ridiculous values. Note that sysctl_mem units being in pages and in ABI, we also need to change sk_prot_mem_limits() accordingly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index f13ac87a8015..93331a1492db 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1162,11 +1162,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk) sk->sk_prot->enter_memory_pressure(sk); } -static inline long sk_prot_mem_limits(const struct sock *sk, int index) -{ - return sk->sk_prot->sysctl_mem[index]; -} - static inline long sk_memory_allocated(const struct sock *sk) { @@ -1281,11 +1276,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); -#define SK_MEM_QUANTUM ((int)PAGE_SIZE) +/* We used to have PAGE_SIZE here, but systems with 64KB pages + * do not necessarily have 16x time more memory than 4KB ones. + */ +#define SK_MEM_QUANTUM 4096 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 +/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */ +static inline long sk_prot_mem_limits(const struct sock *sk, int index) +{ + long val = sk->sk_prot->sysctl_mem[index]; + +#if PAGE_SIZE > SK_MEM_QUANTUM + val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT; +#elif PAGE_SIZE < SK_MEM_QUANTUM + val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT; +#endif + return val; +} + static inline int sk_mem_pages(int amt) { return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; -- cgit v1.2.3 From f6d0cbcf09c506b9b022df8f9d7693a7cec3c732 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 24 Oct 2016 16:56:40 +0200 Subject: netfilter: nf_tables: add fib expression Add FIB expression, supported for ipv4, ipv6 and inet family (the latter just dispatches to ipv4 or ipv6 one based on nfproto). Currently supports fetching output interface index/name and the rtm_type associated with an address. This can be used for adding path filtering. rtm_type is useful to e.g. enforce a strong-end host model where packets are only accepted if daddr is configured on the interface the packet arrived on. The fib expression is a native nftables alternative to the xtables addrtype and rp_filter matches. FIB result order for oif/oifname retrieval is as follows: - if packet is local (skb has rtable, RTF_LOCAL set, this will also catch looped-back multicast packets), set oif to the loopback interface. - if fib lookup returns an error, or result points to local, store zero result. This means '--local' option of -m rpfilter is not supported. It is possible to use 'fib type local' or add explicit saddr/daddr matching rules to create exceptions if this is really needed. - store result in the destination register. In case of multiple routes, search set for desired oif in case strict matching is requested. ipv4 and ipv6 behave fib expressions are supposed to behave the same. [ I have collapsed Arnd Bergmann's ("netfilter: nf_tables: fib warnings") http://patchwork.ozlabs.org/patch/688615/ to address fallout from this patch after rebasing nf-next, that was posted to address compilation warnings. --pablo ] Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/net/netfilter/nft_fib.h (limited to 'include/net') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h new file mode 100644 index 000000000000..cbedda077db2 --- /dev/null +++ b/include/net/netfilter/nft_fib.h @@ -0,0 +1,31 @@ +#ifndef _NFT_FIB_H_ +#define _NFT_FIB_H_ + +struct nft_fib { + enum nft_registers dreg:8; + u8 result; + u32 flags; +}; + +extern const struct nla_policy nft_fib_policy[]; + +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]); +int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data); + + +void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); +void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); + +void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); +void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt); + +void nft_fib_store_result(void *reg, enum nft_fib_result r, + const struct nft_pktinfo *pkt, int index); +#endif -- cgit v1.2.3 From 1fddf4bad0ac9f4d32c74af286fc1eec2a03c82c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 27 Oct 2016 19:49:42 +0100 Subject: netfilter: nf_log: add packet logging for netdev family Move layer 2 packet logging into nf_log_l2packet() that resides in nf_log_common.c, so this can be shared by both bridge and netdev families. This patch adds the boiler plate code to register the netdev logging family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 309cd267be4f..a559aa41253c 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -109,5 +109,10 @@ void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix); +void nf_log_l2packet(struct net *net, u_int8_t pf, unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *prefix); #endif /* _NF_LOG_H */ -- cgit v1.2.3 From 8db4c5be88f62ffd7a552f70687a10c614dc697b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 27 Oct 2016 19:49:48 +0100 Subject: netfilter: move socket lookup infrastructure to nf_socket_ipv{4,6}.c We need this split to reuse existing codebase for the upcoming nf_tables socket expression. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_socket.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 include/net/netfilter/nf_socket.h (limited to 'include/net') diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h new file mode 100644 index 000000000000..f2fc39c97d43 --- /dev/null +++ b/include/net/netfilter/nf_socket.h @@ -0,0 +1,27 @@ +#ifndef _NF_SOCK_H_ +#define _NF_SOCK_H_ + +struct net_device; +struct sk_buff; +struct sock; +struct net; + +static inline bool nf_sk_is_transparent(struct sock *sk) +{ + switch (sk->sk_state) { + case TCP_TIME_WAIT: + return inet_twsk(sk)->tw_transparent; + case TCP_NEW_SYN_RECV: + return inet_rsk(inet_reqsk(sk))->no_srccheck; + default: + return inet_sk(sk)->transparent; + } +} + +struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, + const struct net_device *indev); + +struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, + const struct net_device *indev); + +#endif -- cgit v1.2.3 From 613dbd95723aee7abd16860745691b6c7bda20dc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 3 Nov 2016 10:56:21 +0100 Subject: netfilter: x_tables: move hook state into xt_action_param structure Place pointer to hook state in xt_action_param structure instead of copying the fields that we need. After this change xt_action_param fits into one cacheline. This patch also adds a set of new wrapper functions to fetch relevant hook state structure fields. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5031e072567b..44060344f958 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -30,11 +30,12 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_state *state) { pkt->skb = skb; - pkt->net = pkt->xt.net = state->net; - pkt->in = pkt->xt.in = state->in; - pkt->out = pkt->xt.out = state->out; - pkt->hook = pkt->xt.hooknum = state->hook; - pkt->pf = pkt->xt.family = state->pf; + pkt->net = state->net; + pkt->in = state->in; + pkt->out = state->out; + pkt->hook = state->hook; + pkt->pf = state->pf; + pkt->xt.state = state; } static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt, -- cgit v1.2.3 From 0e5a1c7eb3fc705c4cc6c1e058e81d1f2e721c72 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 3 Nov 2016 10:56:26 +0100 Subject: netfilter: nf_tables: use hook state from xt_action_param structure Don't copy relevant fields from hook state structure, instead use the one that is already available in struct xt_action_param. This patch also adds a set of new wrapper functions to fetch relevant hook state structure fields. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 44060344f958..3295fb85bff6 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -14,27 +14,42 @@ struct nft_pktinfo { struct sk_buff *skb; - struct net *net; - const struct net_device *in; - const struct net_device *out; - u8 pf; - u8 hook; bool tprot_set; u8 tprot; /* for x_tables compatibility */ struct xt_action_param xt; }; +static inline struct net *nft_net(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->net; +} + +static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->hook; +} + +static inline u8 nft_pf(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->pf; +} + +static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->in; +} + +static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) +{ + return pkt->xt.state->out; +} + static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; - pkt->net = state->net; - pkt->in = state->in; - pkt->out = state->out; - pkt->hook = state->hook; - pkt->pf = state->pf; pkt->xt.state = state; } -- cgit v1.2.3 From 01886bd91f1ba418ce669dfe97a06ca9504e482a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 3 Nov 2016 10:56:35 +0100 Subject: netfilter: remove hook_entries field from nf_hook_state This field is only useful for nf_queue, so store it in the nf_queue_entry structure instead, away from the core path. Pass hook_head to nf_hook_slow(). Since we always have a valid entry on the first iteration in nf_iterate(), we can use 'do { ... } while (entry)' loop instead. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 2280cfe86c56..09948d10e38e 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -12,6 +12,7 @@ struct nf_queue_entry { unsigned int id; struct nf_hook_state state; + struct nf_hook_entry *hook; u16 size; /* sizeof(entry) + saved route keys */ /* extra space to store route keys */ -- cgit v1.2.3 From 70ecc24841326396a827deb55c3fefac582a729d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 2 Nov 2016 11:02:16 -0400 Subject: ipv4: add IP_RECVFRAGSIZE cmsg The IP stack records the largest fragment of a reassembled packet in IPCB(skb)->frag_max_size. When reading a datagram or raw packet that arrived fragmented, expose the value to allow applications to estimate receive path MTU. Tested: Sent data over a veth pair of which the source has a small mtu. Sent data using netcat, received using a dedicated process. Verified that the cmsg IP_RECVFRAGSIZE is returned only when data arrives fragmented, and in that cases matches the veth mtu. ip link add veth0 type veth peer name veth1 ip netns add from ip netns add to ip link set dev veth1 netns to ip netns exec to ip addr add dev veth1 192.168.10.1/24 ip netns exec to ip link set dev veth1 up ip link set dev veth0 netns from ip netns exec from ip addr add dev veth0 192.168.10.2/24 ip netns exec from ip link set dev veth0 up ip netns exec from ip link set dev veth0 mtu 1300 ip netns exec from ethtool -K veth0 ufo off dd if=/dev/zero bs=1 count=1400 2>/dev/null > payload ip netns exec to ./recv_cmsg_recvfragsize -4 -u -p 6000 & ip netns exec from nc -q 1 -u 192.168.10.1 6000 < payload using github.com/wdebruij/kerneltools/blob/master/tests/recvfragsize.c Signed-off-by: Willem de Bruijn Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 236a81034fef..c9cff977a7fb 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -228,6 +228,7 @@ struct inet_sock { #define IP_CMSG_PASSSEC BIT(5) #define IP_CMSG_ORIGDSTADDR BIT(6) #define IP_CMSG_CHECKSUM BIT(7) +#define IP_CMSG_RECVFRAGSIZE BIT(8) /** * sk_to_full_sk - Access to a full socket -- cgit v1.2.3 From 86741ec25462e4c8cdce6df2f41ead05568c7d5e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:41 +0900 Subject: net: core: Add a UID field to struct sock. Protocol sockets (struct sock) don't have UIDs, but most of the time, they map 1:1 to userspace sockets (struct socket) which do. Various operations such as the iptables xt_owner match need access to the "UID of a socket", and do so by following the backpointer to the struct socket. This involves taking sk_callback_lock and doesn't work when there is no socket because userspace has already called close(). Simplify this by adding a sk_uid field to struct sock whose value matches the UID of the corresponding struct socket. The semantics are as follows: 1. Whenever sk_socket is non-null: sk_uid is the same as the UID in sk_socket, i.e., matches the return value of sock_i_uid. Specifically, the UID is set when userspace calls socket(), fchown(), or accept(). 2. When sk_socket is NULL, sk_uid is defined as follows: - For a socket that no longer has a sk_socket because userspace has called close(): the previous UID. - For a cloned socket (e.g., an incoming connection that is established but on which userspace has not yet called accept): the UID of the socket it was cloned from. - For a socket that has never had an sk_socket: UID 0 inside the user namespace corresponding to the network namespace the socket belongs to. Kernel sockets created by sock_create_kern are a special case of #1 and sk_uid is the user that created them. For kernel sockets created at network namespace creation time, such as the per-processor ICMP and TCP sockets, this is the user that created the network namespace. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/sock.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 93331a1492db..cf617ee16723 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -419,6 +419,7 @@ struct sock { u32 sk_max_ack_backlog; __u32 sk_priority; __u32 sk_mark; + kuid_t sk_uid; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; @@ -1664,6 +1665,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) sk->sk_wq = parent->wq; parent->sk = sk; sk_set_socket(sk, parent); + sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } @@ -1671,6 +1673,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) kuid_t sock_i_uid(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); +static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) +{ + return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); +} + static inline u32 net_tx_rndhash(void) { u32 v = prandom_u32(); -- cgit v1.2.3 From 622ec2c9d52405973c9f1ca5116eb1c393adfc7d Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:42 +0900 Subject: net: core: add UID to flows, rules, and routes - Define a new FIB rule attributes, FRA_UID_RANGE, to describe a range of UIDs. - Define a RTA_UID attribute for per-UID route lookups and dumps. - Support passing these attributes to and from userspace via rtnetlink. The value INVALID_UID indicates no UID was specified. - Add a UID field to the flow structures. Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/fib_rules.h | 9 ++++++++- include/net/flow.h | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 456e4a6006ab..8dbfdf728cd8 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -8,6 +8,11 @@ #include #include +struct fib_kuid_range { + kuid_t start; + kuid_t end; +}; + struct fib_rule { struct list_head list; int iifindex; @@ -30,6 +35,7 @@ struct fib_rule { int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; + struct fib_kuid_range uid_range; struct rcu_head rcu; }; @@ -92,7 +98,8 @@ struct fib_rules_ops { [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ - [FRA_L3MDEV] = { .type = NLA_U8 } + [FRA_L3MDEV] = { .type = NLA_U8 }, \ + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/net/flow.h b/include/net/flow.h index 035aa7716967..51373f3a5e31 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -37,6 +38,7 @@ struct flowi_common { #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; + kuid_t flowic_uid; }; union flowi_uli { @@ -74,6 +76,7 @@ struct flowi4 { #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key +#define flowi4_uid __fl_common.flowic_uid /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -131,6 +134,7 @@ struct flowi6 { #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key +#define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ @@ -176,6 +180,7 @@ struct flowi { #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key +#define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) -- cgit v1.2.3 From e2d118a1cb5e60d077131a09db1d81b90a5295fe Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 4 Nov 2016 02:23:43 +0900 Subject: net: inet: Support UID-based routing in IP protocols. - Use the UID in routing lookups made by protocol connect() and sendmsg() functions. - Make sure that routing lookups triggered by incoming packets (e.g., Path MTU discovery) take the UID of the socket into account. - For packets not associated with a userspace socket, (e.g., ping replies) use UID 0 inside the user namespace corresponding to the network namespace the socket belongs to. This allows all namespaces to apply routing and iptables rules to kernel-originated traffic in that namespaces by matching UID 0. This is better than using the UID of the kernel socket that is sending the traffic, because the UID of kernel sockets created at namespace creation time (e.g., the per-processor ICMP and TCP sockets) is the UID of the user that created the socket, which might not be mapped in the namespace. Tested: compiles allnoconfig, allyesconfig, allmodconfig Tested: https://android-review.googlesource.com/253302 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- include/net/flow.h | 4 +++- include/net/ip.h | 1 + include/net/ip6_route.h | 5 +++-- include/net/route.h | 5 +++-- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index 51373f3a5e31..6bbbca8af8e3 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -96,7 +96,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, - __be16 dport, __be16 sport) + __be16 dport, __be16 sport, + kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; @@ -107,6 +108,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; + fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; diff --git a/include/net/ip.h b/include/net/ip.h index 5413883ac47f..55cdaac02957 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -179,6 +179,7 @@ struct ip_reply_arg { /* -1 if not needed */ int bound_dev_if; u8 tos; + kuid_t uid; }; #define IP_REPLY_ARG_NOSRCCHECK 1 diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index f83e78d071a3..9dc2c182a263 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -140,9 +140,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, - u32 mark); + u32 mark, kuid_t uid); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); -void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark); +void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, + kuid_t uid); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, u32 mark); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); diff --git a/include/net/route.h b/include/net/route.h index 0429d47cad25..c0874c87c173 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -153,7 +153,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, - daddr, saddr, dport, sport); + daddr, saddr, dport, sport, sock_net_uid(net, sk)); if (sk) security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -269,7 +269,8 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, - protocol, flow_flags, dst, src, dport, sport); + protocol, flow_flags, dst, src, dport, sport, + sk->sk_uid); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, -- cgit v1.2.3 From ad959036a70890bea121403c6a4e373dff5b7311 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 Nov 2016 11:28:58 +0100 Subject: net/sock: add an explicit sk argument for ip_cmsg_recv_offset() So that we can use it even after orphaining the skbuff. Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 55cdaac02957..f48c67cab222 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -579,7 +579,8 @@ int ip_options_rcv_srr(struct sk_buff *skb); */ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb); -void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, int tlen, int offset); +void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb, int tlen, int offset); int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, @@ -601,7 +602,7 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) { - ip_cmsg_recv_offset(msg, skb, 0, 0); + ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0); } bool icmp_global_allow(void); -- cgit v1.2.3 From 7c13f97ffde63cc792c49ec1513f3974f2f05229 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 Nov 2016 11:28:59 +0100 Subject: udp: do fwd memory scheduling on dequeue A new argument is added to __skb_recv_datagram to provide an explicit skb destructor, invoked under the receive queue lock. The UDP protocol uses such argument to perform memory reclaiming on dequeue, so that the UDP protocol does not set anymore skb->desctructor. Instead explicit memory reclaiming is performed at close() time and when skbs are removed from the receive queue. The in kernel UDP protocol users now need to call a skb_recv_udp() variant instead of skb_recv_datagram() to properly perform memory accounting on dequeue. Overall, this allows acquiring only once the receive queue lock on dequeue. Tested using pktgen with random src port, 64 bytes packet, wire-speed on a 10G link as sender and udp_sink as the receiver, using an l4 tuple rxhash to stress the contention, and one or more udp_sink instances with reuseport. nr sinks vanilla patched 1 440 560 3 2150 2300 6 3650 3800 9 4450 4600 12 6250 6450 v1 -> v2: - do rmem and allocated memory scheduling under the receive lock - do bulk scheduling in first_packet_length() and in udp_destruct_sock() - avoid the typdef for the dequeue callback Suggested-by: Eric Dumazet Acked-by: Hannes Frederic Sowa Signed-off-by: Paolo Abeni Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/udp.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index 6134f37ba3ab..e6e4e19be387 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -248,6 +248,21 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, /* net/ipv4/udp.c */ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); +void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); +static inline struct sk_buff * +__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *peeked, + int *off, int *err) +{ + return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), + udp_skb_destructor, peeked, off, err); +} +static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, + int noblock, int *err) +{ + int peeked, off = 0; + + return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); +} void udp_v4_early_demux(struct sk_buff *skb); int udp_get_port(struct sock *sk, unsigned short snum, -- cgit v1.2.3 From d0a81f67cd6286d32f42a167d19c7a387c23db79 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 3 Nov 2016 14:56:01 +0100 Subject: net: make default TX queue length a defined constant The default TX queue length of Ethernet devices have been a magic constant of 1000, ever since the initial git import. Looking back in historical trees[1][2] the value used to be 100, with the same comment "Ethernet wants good queues". The commit[3] that changed this from 100 to 1000 didn't describe why, but from conversations with Robert Olsson it seems that it was changed when Ethernet devices went from 100Mbit/s to 1Gbit/s, because the link speed increased x10 the queue size were also adjusted. This value later caused much heartache for the bufferbloat community. This patch merely moves the value into a defined constant. [1] https://git.kernel.org/cgit/linux/kernel/git/davem/netdev-vger-cvs.git/ [2] https://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/ [3] https://git.kernel.org/tglx/history/c/98921832c232 Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index cd334c9584e9..f1b76b8e6d2d 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -6,6 +6,8 @@ #include #include +#define DEFAULT_TX_QUEUE_LEN 1000 + struct qdisc_walker { int stop; int skip; -- cgit v1.2.3 From 9ce183b4c4d24559467d7712e313f2b3f9277437 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 7 Nov 2016 15:14:36 +0200 Subject: net/sched: act_tunnel_key: add helper inlines to access tcf_tunnel_key Needed for drivers to pick the relevant action when offloading tunnel key act. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 253f8da6c2a6..efef0b4b1b2b 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -12,6 +12,8 @@ #define __NET_TC_TUNNEL_KEY_H #include +#include +#include struct tcf_tunnel_key_params { struct rcu_head rcu; @@ -27,4 +29,39 @@ struct tcf_tunnel_key { #define to_tunnel_key(a) ((struct tcf_tunnel_key *)a) +static inline bool is_tcf_tunnel_set(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + + if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY) + return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; +#endif + return false; +} + +static inline bool is_tcf_tunnel_release(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + + if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY) + return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; +#endif + return false; +} + +static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + + return ¶ms->tcft_enc_metadata->u.tun_info; +#else + return NULL; +#endif +} #endif /* __NET_TC_TUNNEL_KEY_H */ -- cgit v1.2.3 From 9ba6a9a9f7a42673e9fc08ff3594f64caae64d3c Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 7 Nov 2016 15:14:37 +0200 Subject: flow_dissector: Add enums for encapsulation keys New encapsulation keys were added to the flower classifier, which allow classification according to outer (encapsulation) headers attributes such as key and IP addresses. In order to expose those attributes outside flower, add corresponding enums in the flow dissector. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index d9534927d93b..4e7cf38a7750 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -128,6 +128,10 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ + FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */ + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ + FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_MAX, }; -- cgit v1.2.3 From f4d997fd613001e612543339e0275c037f94ffe9 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 7 Nov 2016 15:14:39 +0200 Subject: net/sched: cls_flower: Add UDP port to tunnel parameters The current IP tunneling classification supports only IP addresses and key. Enhance UDP based IP tunneling classification parameters by adding UDP src and dst port. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 4e7cf38a7750..c4f31666afd2 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -132,6 +132,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ + FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MAX, }; -- cgit v1.2.3 From 24ba898d43e87f9ac87353c7a13eef4ee726cab7 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Mon, 7 Nov 2016 15:14:40 +0200 Subject: net/dst: Add dst port to dst_metadata utility functions Add dst port parameter to __ip_tun_set_dst and __ipv6_tun_set_dst utility functions. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 6965c8f68ade..701fc814d0af 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -115,6 +115,7 @@ static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, __be32 daddr, __u8 tos, __u8 ttl, + __be16 tp_dst, __be16 flags, __be64 tunnel_id, int md_size) @@ -127,7 +128,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr, ip_tunnel_key_init(&tun_dst->u.tun_info.key, saddr, daddr, tos, ttl, - 0, 0, 0, tunnel_id, flags); + 0, 0, tp_dst, tunnel_id, flags); return tun_dst; } @@ -139,12 +140,13 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, const struct iphdr *iph = ip_hdr(skb); return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - flags, tunnel_id, md_size); + 0, flags, tunnel_id, md_size); } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 tos, __u8 ttl, + __be16 tp_dst, __be32 label, __be16 flags, __be64 tunnel_id, @@ -162,7 +164,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad info->key.tun_flags = flags; info->key.tun_id = tunnel_id; info->key.tp_src = 0; - info->key.tp_dst = 0; + info->key.tp_dst = tp_dst; info->key.u.ipv6.src = *saddr; info->key.u.ipv6.dst = *daddr; @@ -183,7 +185,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, return __ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, ipv6_get_dsfield(ip6h), ip6h->hop_limit, - ip6_flowlabel(ip6h), flags, tunnel_id, + 0, ip6_flowlabel(ip6h), flags, tunnel_id, md_size); } #endif /* __NET_DST_METADATA_H */ -- cgit v1.2.3 From 4e24877e61e8507c0843e4bddbc6ecccbfd2e87d Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 6 Nov 2016 21:15:51 +0800 Subject: netfilter: nf_tables: simplify the basic expressions' init routine Some basic expressions are built into nf_tables.ko, such as nft_cmp, nft_lookup, nft_range and so on. But these basic expressions' init routine is a little ugly, too many goto errX labels, and we forget to call nft_range_module_exit in the exit routine, although it is harmless. Acctually, the init and exit routines of these basic expressions are same, i.e. do nft_register_expr in the init routine and do nft_unregister_expr in the exit routine. So it's better to arrange them into an array and deal with them together. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 00f4f6b1b1ba..862373d4ea9d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -1,12 +1,18 @@ #ifndef _NET_NF_TABLES_CORE_H #define _NET_NF_TABLES_CORE_H +extern struct nft_expr_type nft_imm_type; +extern struct nft_expr_type nft_cmp_type; +extern struct nft_expr_type nft_lookup_type; +extern struct nft_expr_type nft_bitwise_type; +extern struct nft_expr_type nft_byteorder_type; +extern struct nft_expr_type nft_payload_type; +extern struct nft_expr_type nft_dynset_type; +extern struct nft_expr_type nft_range_type; + int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); -int nft_immediate_module_init(void); -void nft_immediate_module_exit(void); - struct nft_cmp_fast_expr { u32 data; enum nft_registers sreg:8; @@ -25,24 +31,6 @@ static inline u32 nft_cmp_fast_mask(unsigned int len) extern const struct nft_expr_ops nft_cmp_fast_ops; -int nft_cmp_module_init(void); -void nft_cmp_module_exit(void); - -int nft_range_module_init(void); -void nft_range_module_exit(void); - -int nft_lookup_module_init(void); -void nft_lookup_module_exit(void); - -int nft_dynset_module_init(void); -void nft_dynset_module_exit(void); - -int nft_bitwise_module_init(void); -void nft_bitwise_module_exit(void); - -int nft_byteorder_module_init(void); -void nft_byteorder_module_exit(void); - struct nft_payload { enum nft_payload_bases base:8; u8 offset; @@ -62,7 +50,4 @@ struct nft_payload_set { extern const struct nft_expr_ops nft_payload_fast_ops; extern struct static_key_false nft_trace_enabled; -int nft_payload_module_init(void); -void nft_payload_module_exit(void); - #endif /* _NET_NF_TABLES_CORE_H */ -- cgit v1.2.3 From 0e54d2179f650bac80d89a9def429dbdbed58c11 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 7 Nov 2016 18:31:17 +0100 Subject: netfilter: conntrack: simplify init/uninit of L4 protocol trackers modify registration and deregistration of layer-4 protocol trackers to facilitate inclusion of new elements into the current list of builtin protocols. Both builtin (TCP, UDP, ICMP) and non-builtin (DCCP, GRE, SCTP, UDPlite) layer-4 protocol trackers usually register/deregister themselves using consecutive calls to nf_ct_l4proto_{,pernet}_{,un}register(...). This sequence is interrupted and rolled back in case of error; in order to simplify addition of builtin protocols, the input of the above functions has been modified to allow registering/unregistering multiple protocols. Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index de629f1520df..2152b70626d5 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -125,14 +125,24 @@ struct nf_conntrack_l4proto *nf_ct_l4proto_find_get(u_int16_t l3proto, void nf_ct_l4proto_put(struct nf_conntrack_l4proto *p); /* Protocol pernet registration. */ +int nf_ct_l4proto_pernet_register_one(struct net *net, + struct nf_conntrack_l4proto *proto); +void nf_ct_l4proto_pernet_unregister_one(struct net *net, + struct nf_conntrack_l4proto *proto); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *proto); + struct nf_conntrack_l4proto *proto[], + unsigned int num_proto); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *proto); + struct nf_conntrack_l4proto *proto[], + unsigned int num_proto); /* Protocol global registration. */ -int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto); -void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto); +int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto); +void nf_ct_l4proto_unregister_one(struct nf_conntrack_l4proto *proto); +int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[], + unsigned int num_proto); +void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[], + unsigned int num_proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, -- cgit v1.2.3 From 1ababeba4a21f3dba3da3523c670b207fb2feb62 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:39 +0100 Subject: ipv6: implement dataplane support for rthdr type 4 (Segment Routing Header) Implement minimal support for processing of SR-enabled packets as described in https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-02. This patch implements the following operations: - Intermediate segment endpoint: incrementation of active segment and rerouting. - Egress for SR-encapsulated packets: decapsulation of outer IPv6 header + SRH and routing of inner packet. - Cleanup flag support for SR-inlined packets: removal of SRH if we are the penultimate segment endpoint. A per-interface sysctl seg6_enabled is provided, to accept/deny SR-enabled packets. Default is deny. This patch does not provide support for HMAC-signed packets. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/seg6.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/net/seg6.h (limited to 'include/net') diff --git a/include/net/seg6.h b/include/net/seg6.h new file mode 100644 index 000000000000..4dd52a7e95f1 --- /dev/null +++ b/include/net/seg6.h @@ -0,0 +1,36 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_H +#define _NET_SEG6_H + +static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, + __be32 to) +{ + __be32 diff[] = { ~from, to }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); +} + +static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, + __be32 *to) +{ + __be32 diff[] = { + ~from[0], ~from[1], ~from[2], ~from[3], + to[0], to[1], to[2], to[3], + }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); +} + +#endif -- cgit v1.2.3 From 915d7e5e5930b4f01d0971d93b9b25ed17d221aa Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:40 +0100 Subject: ipv6: sr: add code base for control plane support of SR-IPv6 This patch adds the necessary hooks and structures to provide support for SR-IPv6 control plane, essentially the Generic Netlink commands that will be used for userspace control over the Segment Routing kernel structures. The genetlink commands provide control over two different structures: tunnel source and HMAC data. The tunnel source is the source address that will be used by default when encapsulating packets into an outer IPv6 header + SRH. If the tunnel source is set to :: then an address of the outgoing interface will be selected as the source. The HMAC commands currently just return ENOTSUPP and will be implemented in a future patch. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + include/net/seg6.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 10d0848f5b8a..de7745e2edcc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -85,6 +85,7 @@ struct netns_ipv6 { #endif atomic_t dev_addr_genid; atomic_t fib6_sernum; + struct seg6_pernet_data *seg6_data; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) diff --git a/include/net/seg6.h b/include/net/seg6.h index 4dd52a7e95f1..7c7b8ed39661 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -14,6 +14,9 @@ #ifndef _NET_SEG6_H #define _NET_SEG6_H +#include +#include + static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) { @@ -33,4 +36,17 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } +struct seg6_pernet_data { + struct mutex lock; + struct in6_addr __rcu *tun_src; +}; + +static inline struct seg6_pernet_data *seg6_pernet(struct net *net) +{ + return net->ipv6.seg6_data; +} + +extern int seg6_init(void); +extern void seg6_exit(void); + #endif -- cgit v1.2.3 From 6c8702c60b88651072460f3f4026c7dfe2521d12 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:41 +0100 Subject: ipv6: sr: add support for SRH encapsulation and injection with lwtunnels This patch creates a new type of interfaceless lightweight tunnel (SEG6), enabling the encapsulation and injection of SRH within locally emitted packets and forwarded packets. >From a configuration viewpoint, a seg6 tunnel would be configured as follows: ip -6 ro ad fc00::1/128 encap seg6 mode encap segs fc42::1,fc42::2,fc42::3 dev eth0 Any packet whose destination address is fc00::1 would thus be encapsulated within an outer IPv6 header containing the SRH with three segments, and would actually be routed to the first segment of the list. If `mode inline' was specified instead of `mode encap', then the SRH would be directly inserted after the IPv6 header without outer encapsulation. The inline mode is only available if CONFIG_IPV6_SEG6_INLINE is enabled. This feature was made configurable because direct header insertion may break several mechanisms such as PMTUD or IPSec AH. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/seg6.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/seg6.h b/include/net/seg6.h index 7c7b8ed39661..ff5da0ce83e9 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -16,6 +16,8 @@ #include #include +#include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) @@ -48,5 +50,9 @@ static inline struct seg6_pernet_data *seg6_pernet(struct net *net) extern int seg6_init(void); extern void seg6_exit(void); +extern int seg6_iptunnel_init(void); +extern void seg6_iptunnel_exit(void); + +extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); #endif -- cgit v1.2.3 From bf355b8d2c30a289232042cacc1cfaea4923936c Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:57:42 +0100 Subject: ipv6: sr: add core files for SR HMAC support This patch adds the necessary functions to compute and check the HMAC signature of an SR-enabled packet. Two HMAC algorithms are supported: hmac(sha1) and hmac(sha256). In order to avoid dynamic memory allocation for each HMAC computation, a per-cpu ring buffer is allocated for this purpose. A new per-interface sysctl called seg6_require_hmac is added, allowing a user-defined policy for processing HMAC-signed SR-enabled packets. A value of -1 means that the HMAC field will always be ignored. A value of 0 means that if an HMAC field is present, its validity will be enforced (the packet is dropped is the signature is incorrect). Finally, a value of 1 means that any SR-enabled packet that does not contain an HMAC signature or whose signature is incorrect will be dropped. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/seg6.h | 4 ++++ include/net/seg6_hmac.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) create mode 100644 include/net/seg6_hmac.h (limited to 'include/net') diff --git a/include/net/seg6.h b/include/net/seg6.h index ff5da0ce83e9..4e0357517d79 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -18,6 +18,7 @@ #include #include #include +#include static inline void update_csum_diff4(struct sk_buff *skb, __be32 from, __be32 to) @@ -41,6 +42,9 @@ static inline void update_csum_diff16(struct sk_buff *skb, __be32 *from, struct seg6_pernet_data { struct mutex lock; struct in6_addr __rcu *tun_src; +#ifdef CONFIG_IPV6_SEG6_HMAC + struct rhashtable hmac_infos; +#endif }; static inline struct seg6_pernet_data *seg6_pernet(struct net *net) diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h new file mode 100644 index 000000000000..69c3a106056b --- /dev/null +++ b/include/net/seg6_hmac.h @@ -0,0 +1,62 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _NET_SEG6_HMAC_H +#define _NET_SEG6_HMAC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SEG6_HMAC_MAX_DIGESTSIZE 160 +#define SEG6_HMAC_RING_SIZE 256 + +struct seg6_hmac_info { + struct rhash_head node; + struct rcu_head rcu; + + u32 hmackeyid; + char secret[SEG6_HMAC_SECRET_LEN]; + u8 slen; + u8 alg_id; +}; + +struct seg6_hmac_algo { + u8 alg_id; + char name[64]; + struct crypto_shash * __percpu *tfms; + struct shash_desc * __percpu *shashs; +}; + +extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, + struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, + u8 *output); +extern struct seg6_hmac_info *seg6_hmac_info_lookup(struct net *net, u32 key); +extern int seg6_hmac_info_add(struct net *net, u32 key, + struct seg6_hmac_info *hinfo); +extern int seg6_hmac_info_del(struct net *net, u32 key); +extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, + struct ipv6_sr_hdr *srh); +extern bool seg6_hmac_validate_skb(struct sk_buff *skb); +extern int seg6_hmac_init(void); +extern void seg6_hmac_exit(void); +extern int seg6_hmac_net_init(struct net *net); +extern void seg6_hmac_net_exit(struct net *net); + +#endif -- cgit v1.2.3 From 613fa3ca9e9e6af57927dab238121010c510fe4c Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Tue, 8 Nov 2016 14:59:20 +0100 Subject: ipv6: add source address argument for ipv6_push_nfrag_opts This patch prepares for insertion of SRH through setsockopt(). The new source address argument is used when an HMAC field is present in the SRH, which must be filled. The HMAC signature process requires the source address as input text. Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8fed1cd78658..0a3622bf086f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -932,7 +932,8 @@ int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); */ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, - u8 *proto, struct in6_addr **daddr_p); + u8 *proto, struct in6_addr **daddr_p, + struct in6_addr *saddr); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto); -- cgit v1.2.3 From f41cd11d64b2b21012eb4abffbe579bc0b90467f Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Tue, 8 Nov 2016 17:24:03 +0200 Subject: tc_act: Remove tcf_act macro tc_act macro addressed a non existing field, and was not used in the kernel source. Signed-off-by: Yotam Gigi Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 82f3c912a5b1..d8eae87ea778 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -42,7 +42,6 @@ struct tc_action { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; }; -#define tcf_act common.tcfa_act #define tcf_head common.tcfa_head #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt -- cgit v1.2.3 From 98e4321b97cc790c6aac3fb7c92bbf13212452ee Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 13 Nov 2016 12:14:59 -0500 Subject: genetlink: Make family a signed integer. The idr_alloc(), idr_remove(), et al. routines all expect IDs to be signed integers. Therefore make the genl_family member 'id' signed too. Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 3ec87bacc0f5..a34275be3600 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -48,7 +48,7 @@ struct genl_info; * @n_ops: number of operations supported by this family */ struct genl_family { - unsigned int id; /* private */ + int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; -- cgit v1.2.3 From 7e416ad7416307e22871a0ef0f0f14e2bb66a0d1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Nov 2016 14:17:01 +0100 Subject: netfilter: conntrack: remove unused netns_ct member since 23014011ba420 ('netfilter: conntrack: support a fixed size of 128 distinct labels') this isn't needed anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index e469e85de3f9..3d06d94d2e52 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -91,7 +91,6 @@ struct netns_ct { struct nf_ip_net nf_ct_proto; #if defined(CONFIG_NF_CONNTRACK_LABELS) unsigned int labels_used; - u8 label_words; #endif }; #endif -- cgit v1.2.3 From d9dc8b0f8b4ec8cdc48ad5a20a3105387138be82 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 11 Nov 2016 10:20:50 -0800 Subject: net: fix sleeping for sk_wait_event() Similar to commit 14135f30e33c ("inet: fix sleeping inside inet_wait_for_connect()"), sk_wait_event() needs to fix too, because release_sock() is blocking, it changes the process state back to running after sleep, which breaks the previous prepare_to_wait(). Switch to the new wait API. Cc: Eric Dumazet Cc: Peter Zijlstra Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sock.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index cf617ee16723..9d905ed0cd25 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -915,14 +915,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk) #endif } -#define sk_wait_event(__sk, __timeo, __condition) \ +#define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ - *(__timeo) = schedule_timeout(*(__timeo)); \ + *(__timeo) = wait_woken(__wait, \ + TASK_INTERRUPTIBLE, \ + *(__timeo)); \ } \ - sched_annotate_sleep(); \ + sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __condition; \ __rc; \ -- cgit v1.2.3 From c915fe13cbaae5c7aa7b44f367d05addd60c9008 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 15 Nov 2016 16:37:53 +0100 Subject: udplite: fix NULL pointer dereference The commit 850cbaddb52d ("udp: use it's own memory accounting schema") assumes that the socket proto has memory accounting enabled, but this is not the case for UDPLITE. Fix it enabling memory accounting for UDPLITE and performing fwd allocated memory reclaiming on socket shutdown. UDP and UDPLITE share now the same memory accounting limits. Also drop the backlog receive operation, since is no more needed. Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema") Reported-by: Andrei Vagin Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/udp.h | 1 + include/net/udplite.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/udp.h b/include/net/udp.h index e6e4e19be387..1661791e8ca1 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -246,6 +246,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, } /* net/ipv4/udp.c */ +void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/udplite.h b/include/net/udplite.h index 80761938b9a7..36097d388219 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -27,6 +27,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, static inline int udplite_sk_init(struct sock *sk) { udp_sk(sk)->pcflag = UDPLITE_BIT; + sk->sk_destruct = udp_destruct_sock; return 0; } -- cgit v1.2.3 From 9efdb92d68c726e70066e5b4189c1186c9b6f90c Mon Sep 17 00:00:00 2001 From: pravin shelar Date: Sun, 13 Nov 2016 20:43:58 -0800 Subject: vxlan: remove unsed vxlan_dev_dst_port() Signed-off-by: Pravin B Shelar Acked-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 308adc4154f4..49a59202f85e 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -281,16 +281,6 @@ struct vxlan_dev { struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); -static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan, - unsigned short family) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (family == AF_INET6) - return inet_sk(vxlan->vn6_sock->sock->sk)->inet_sport; -#endif - return inet_sk(vxlan->vn4_sock->sock->sk)->inet_sport; -} - static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) { -- cgit v1.2.3 From 21cb84c48ca0619181106f0f44f3802a989de024 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2016 10:15:12 -0800 Subject: net: busy-poll: remove need_resched() from sk_can_busy_loop() Now sk_busy_loop() can schedule by itself, we can remove need_resched() check from sk_can_busy_loop() Also add a const to its struct sock parameter. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Adam Belay Cc: Tariq Toukan Cc: Yuval Mintz Cc: Ariel Elior Signed-off-by: David S. Miller --- include/net/busy_poll.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 2fbeb1313c0f..965e52b9b5a3 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -58,10 +58,9 @@ static inline unsigned long busy_loop_end_time(void) return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll); } -static inline bool sk_can_busy_loop(struct sock *sk) +static inline bool sk_can_busy_loop(const struct sock *sk) { - return sk->sk_ll_usec && sk->sk_napi_id && - !need_resched() && !signal_pending(current); + return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current); } -- cgit v1.2.3 From a23a8f5bc17e6209eefadcd1cb3d3e28a2cdf530 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Wed, 16 Nov 2016 10:05:46 +0100 Subject: lwtunnel: subtract tunnel headroom from mtu on output redirect This patch changes the lwtunnel_headroom() function which is called in ipv4_mtu() and ip6_mtu(), to also return the correct headroom value when the lwtunnel state is OUTPUT_REDIRECT. This patch enables e.g. SR-IPv6 encapsulations to work without manually setting the route mtu. Acked-by: Roopa Prabhu Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 82e76fe1c1f7..d4c1c75b8862 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -94,7 +94,8 @@ static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { - if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu) + if ((lwtunnel_xmit_redirect(lwtstate) || + lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu) return lwtstate->headroom; return 0; -- cgit v1.2.3 From 7fda702f9315e6f4a74fee155c540750788a2d66 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2016 23:23:11 +0800 Subject: sctp: use new rhlist interface on sctp transport rhashtable Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key to hash a node to one chain. If in one host thousands of assocs connect to one server with the same lport and different laddrs (although it's not a normal case), all the transports would be hashed into the same chain. It may cause to keep returning -EBUSY when inserting a new node, as the chain is too long and sctp inserts a transport node in a loop, which could even lead to system hangs there. The new rhlist interface works for this case that there are many nodes with the same key in one chain. It puts them into a list then makes this list be as a node of the chain. This patch is to replace rhashtable_ interface with rhltable_ interface. Since a chain would not be too long and it would not return -EBUSY with this fix when inserting a node, the reinsert loop is also removed here. Signed-off-by: Xin Long Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- include/net/sctp/structs.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 31acc3f4f132..f0dcaebebddb 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -164,7 +164,7 @@ void sctp_backlog_migrate(struct sctp_association *assoc, struct sock *oldsk, struct sock *newsk); int sctp_transport_hashtable_init(void); void sctp_transport_hashtable_destroy(void); -void sctp_hash_transport(struct sctp_transport *t); +int sctp_hash_transport(struct sctp_transport *t); void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index bd4a3ded7c87..92daabdc007d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -124,7 +124,7 @@ extern struct sctp_globals { /* This is the sctp port control hash. */ struct sctp_bind_hashbucket *port_hashtable; /* This is the hash of all transports. */ - struct rhashtable transport_hashtable; + struct rhltable transport_hashtable; /* Sizes of above hashtables. */ int ep_hashsize; @@ -761,7 +761,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet) struct sctp_transport { /* A list of transports. */ struct list_head transports; - struct rhash_head node; + struct rhlist_head node; /* Reference counting. */ atomic_t refcnt; -- cgit v1.2.3 From e68b6e50fa359cc5aad4d2f8ac2bdbc1a8f4fd59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Nov 2016 09:10:42 -0800 Subject: udp: enable busy polling for all sockets UDP busy polling is restricted to connected UDP sockets. This is because sk_busy_loop() only takes care of one NAPI context. There are cases where it could be extended. 1) Some hosts receive traffic on a single NIC, with one RX queue. 2) Some applications use SO_REUSEPORT and associated BPF filter to split the incoming traffic on one UDP socket per RX queue/thread/cpu 3) Some UDP sockets are used to send/receive traffic for one flow, but they do not bother with connect() This patch records the napi_id of first received skb, giving more reach to busy polling. Tested: lpaa23:~# echo 70 >/proc/sys/net/core/busy_read lpaa24:~# echo 70 >/proc/sys/net/core/busy_read lpaa23:~# for f in `seq 1 10`; do ./super_netperf 1 -H lpaa24 -t UDP_RR -l 5; done Before patch : 27867 28870 37324 41060 41215 36764 36838 44455 41282 43843 After patch : 73920 73213 70147 74845 71697 68315 68028 75219 70082 73707 Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/busy_poll.h | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 965e52b9b5a3..d73b849e29a6 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -80,11 +80,6 @@ static inline void skb_mark_napi_id(struct sk_buff *skb, skb->napi_id = napi->napi_id; } -/* used in the protocol hanlder to propagate the napi_id to the socket */ -static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb) -{ - sk->sk_napi_id = skb->napi_id; -} #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) @@ -107,10 +102,6 @@ static inline void skb_mark_napi_id(struct sk_buff *skb, { } -static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb) -{ -} - static inline bool busy_loop_timeout(unsigned long end_time) { return true; @@ -122,4 +113,23 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock) } #endif /* CONFIG_NET_RX_BUSY_POLL */ + +/* used in the protocol hanlder to propagate the napi_id to the socket */ +static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + sk->sk_napi_id = skb->napi_id; +#endif +} + +/* variant used for unconnected sockets */ +static inline void sk_mark_napi_id_once(struct sock *sk, + const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (!sk->sk_napi_id) + sk->sk_napi_id = skb->napi_id; +#endif +} + #endif /* _LINUX_NET_BUSY_POLL_H */ -- cgit v1.2.3 From c7d03a00b56fc23c3a01a8353789ad257363e281 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 17 Nov 2016 04:58:21 +0300 Subject: netns: make struct pernet_operations::id unsigned int Make struct pernet_operations::id unsigned. There are 2 reasons to do so: 1) This field is really an index into an zero based array and thus is unsigned entity. Using negative value is out-of-bound access by definition. 2) On x86_64 unsigned 32-bit data which are mixed with pointers via array indexing or offsets added or subtracted to pointers are preffered to signed 32-bit data. "int" being used as an array index needs to be sign-extended to 64-bit before being used. void f(long *p, int i) { g(p[i]); } roughly translates to movsx rsi, esi mov rdi, [rsi+...] call g MOVSX is 3 byte instruction which isn't necessary if the variable is unsigned because x86_64 is zero extending by default. Now, there is net_generic() function which, you guessed it right, uses "int" as an array index: static inline void *net_generic(const struct net *net, int id) { ... ptr = ng->ptr[id - 1]; ... } And this function is used a lot, so those sign extensions add up. Patch snipes ~1730 bytes on allyesconfig kernel (without all junk messing with code generation): add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) Unfortunately some functions actually grow bigger. This is a semmingly random artefact of code generation with register allocator being used differently. gcc decides that some variable needs to live in new r8+ registers and every access now requires REX prefix. Or it is shifted into r12, so [r12+0] addressing mode has to be used which is longer than [r8] However, overall balance is in negative direction: add/remove: 0/0 grow/shrink: 70/598 up/down: 396/-2126 (-1730) function old new delta nfsd4_lock 3886 3959 +73 tipc_link_build_proto_msg 1096 1140 +44 mac80211_hwsim_new_radio 2776 2808 +32 tipc_mon_rcv 1032 1058 +26 svcauth_gss_legacy_init 1413 1429 +16 tipc_bcbase_select_primary 379 392 +13 nfsd4_exchange_id 1247 1260 +13 nfsd4_setclientid_confirm 782 793 +11 ... put_client_renew_locked 494 480 -14 ip_set_sockfn_get 730 716 -14 geneve_sock_add 829 813 -16 nfsd4_sequence_done 721 703 -18 nlmclnt_lookup_host 708 686 -22 nfsd4_lockt 1085 1063 -22 nfs_get_client 1077 1050 -27 tcf_bpf_init 1106 1076 -30 nfsd4_encode_fattr 5997 5930 -67 Total: Before=154856051, After=154854321, chg -0.00% Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/bonding.h | 2 +- include/net/ip_tunnels.h | 6 +++--- include/net/net_namespace.h | 2 +- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- include/net/netfilter/nf_conntrack_synproxy.h | 2 +- include/net/netns/generic.h | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index f32f7ef8a23a..3c857778a6ca 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -681,7 +681,7 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) } /* exported from bond_main.c */ -extern int bond_net_id; +extern unsigned int bond_net_id; extern const struct bond_parm_tbl bond_lacp_tbl[]; extern const struct bond_parm_tbl xmit_hashtype_tbl[]; extern const struct bond_parm_tbl arp_validate_tbl[]; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 59557c07904b..e893fe43dd13 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -129,7 +129,7 @@ struct ip_tunnel { #endif struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ unsigned int prl_count; /* # of entries in PRL */ - int ip_tnl_net_id; + unsigned int ip_tnl_net_id; struct gro_cells gro_cells; bool collect_md; bool ignore_df; @@ -248,7 +248,7 @@ void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); -int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, +int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); @@ -275,7 +275,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); -void ip_tunnel_setup(struct net_device *dev, int net_id); +void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index fc4f757107df..d7149e93a60a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,7 +291,7 @@ struct pernet_operations { int (*init)(struct net *net); void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); - int *id; + unsigned int *id; size_t size; }; diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 2152b70626d5..e7b836590f0b 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -98,7 +98,7 @@ struct nf_conntrack_l4proto { const struct nla_policy *nla_policy; } ctnl_timeout; #endif - int *net_id; + unsigned int *net_id; /* Init l4proto pernet data */ int (*init_net)(struct net *net, u_int16_t proto); diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index e6937318546c..b0ca402c1f72 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -54,7 +54,7 @@ struct synproxy_net { struct synproxy_stats __percpu *stats; }; -extern int synproxy_net_id; +extern unsigned int synproxy_net_id; static inline struct synproxy_net *synproxy_pernet(struct net *net) { return net_generic(net, synproxy_net_id); diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 70e158551704..d315786bcfd7 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -31,7 +31,7 @@ struct net_generic { void *ptr[0]; }; -static inline void *net_generic(const struct net *net, int id) +static inline void *net_generic(const struct net *net, unsigned int id) { struct net_generic *ng; void *ptr; -- cgit v1.2.3 From 3b2c75d371740fb0dcd0c9eac545ab1dd28b4706 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 19 Nov 2016 03:54:35 +0300 Subject: netlink: use "unsigned int" in nla_next() ->nla_len is unsigned entity (it's length after all) and u16, thus it can't overflow when being aligned into int/unsigned int. (nlmsg_next has the same code, but I didn't yet convince myself it is correct to do so). There is pointer arithmetic in this function and offset being unsigned is better: add/remove: 0/0 grow/shrink: 1/64 up/down: 5/-309 (-304) function old new delta nl80211_set_wiphy 1444 1449 +5 team_nl_cmd_options_set 997 995 -2 tcf_em_tree_validate 872 870 -2 switchdev_port_bridge_setlink 352 350 -2 switchdev_port_br_afspec 312 310 -2 rtm_to_fib_config 428 426 -2 qla4xxx_sysfs_ddb_set_param 2193 2191 -2 qla4xxx_iface_set_param 4470 4468 -2 ovs_nla_free_flow_actions 152 150 -2 output_userspace 518 516 -2 ... nl80211_set_reg 654 649 -5 validate_scan_freqs 148 142 -6 validate_linkmsg 288 282 -6 nl80211_parse_connkeys 489 483 -6 nlattr_set 231 224 -7 nf_tables_delsetelem 267 260 -7 do_setlink 3416 3408 -8 netlbl_cipsov4_add_std 1672 1659 -13 nl80211_parse_sched_scan 2902 2888 -14 nl80211_trigger_scan 1738 1720 -18 do_execute_actions 2821 2738 -83 Total: Before=154865355, After=154865051, chg -0.00% Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index a34f53acb6d6..d3938f11ae52 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -713,7 +713,7 @@ static inline int nla_ok(const struct nlattr *nla, int remaining) */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { - int totlen = NLA_ALIGN(nla->nla_len); + unsigned int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; return (struct nlattr *) ((char *) nla + totlen); -- cgit v1.2.3 From e97991832a4ea4a5f47d65f068a4c966a2eb5730 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 21 Nov 2016 14:18:38 +0100 Subject: tcp: make undo_cwnd mandatory for congestion modules The undo_cwnd fallback in the stack doubles cwnd based on ssthresh, which un-does reno halving behaviour. It seems more appropriate to let congctl algorithms pair .ssthresh and .undo_cwnd properly. Add a 'tcp_reno_undo_cwnd' function and wire it up for all congestion algorithms that used to rely on the fallback. Cc: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 123979fe12bf..7de80739adab 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -958,6 +958,7 @@ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); +u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; -- cgit v1.2.3 From 59bfde01fab0c4550778cd53e8d266f1dfddf7b7 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 22 Nov 2016 23:09:57 +0200 Subject: devlink: Add E-Switch inline mode control Some HWs need the VF driver to put part of the packet headers on the TX descriptor so the e-switch can do proper matching and steering. The supported modes: none, link, network, transport. Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 211bd3c37028..d29e5fc82582 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -92,6 +92,8 @@ struct devlink_ops { int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode); int (*eswitch_mode_set)(struct devlink *devlink, u16 mode); + int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode); + int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode); }; static inline void *devlink_priv(struct devlink *devlink) -- cgit v1.2.3 From 21fcf572716f61926f5d23a358e79d06a9d4d54f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 5 Oct 2016 22:51:42 +0200 Subject: Bluetooth: __ variants of u8 and friends are not neccessary inside kernel bluetooth.h is not part of user API, so __ variants are not neccessary here. Signed-off-by: Pavel Machek Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 0a1e21d7bce1..01487192f628 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -197,7 +197,7 @@ typedef struct { #define BDADDR_LE_PUBLIC 0x01 #define BDADDR_LE_RANDOM 0x02 -static inline bool bdaddr_type_is_valid(__u8 type) +static inline bool bdaddr_type_is_valid(u8 type) { switch (type) { case BDADDR_BREDR: @@ -209,7 +209,7 @@ static inline bool bdaddr_type_is_valid(__u8 type) return false; } -static inline bool bdaddr_type_is_le(__u8 type) +static inline bool bdaddr_type_is_le(u8 type) { switch (type) { case BDADDR_LE_PUBLIC: @@ -279,15 +279,16 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock); /* Skb helpers */ struct l2cap_ctrl { - __u8 sframe:1, + u8 sframe:1, poll:1, final:1, fcs:1, sar:2, super:2; - __u16 reqseq; - __u16 txseq; - __u8 retries; + + u16 reqseq; + u16 txseq; + u8 retries; __le16 psm; bdaddr_t bdaddr; struct l2cap_chan *chan; @@ -303,7 +304,7 @@ typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, #define HCI_REQ_SKB BIT(1) struct hci_ctrl { - __u16 opcode; + u16 opcode; u8 req_flags; u8 req_event; union { @@ -313,10 +314,10 @@ struct hci_ctrl { }; struct bt_skb_cb { - __u8 pkt_type; - __u8 force_active; - __u16 expect; - __u8 incoming:1; + u8 pkt_type; + u8 force_active; + u16 expect; + u8 incoming:1; union { struct l2cap_ctrl l2cap; struct hci_ctrl hci; @@ -366,7 +367,7 @@ out: return NULL; } -int bt_to_errno(__u16 code); +int bt_to_errno(u16 code); void hci_sock_set_flag(struct sock *sk, int nr); void hci_sock_clear_flag(struct sock *sk, int nr); -- cgit v1.2.3 From 05b055e89121394058c75dc354e9a46e1e765579 Mon Sep 17 00:00:00 2001 From: Francis Yan Date: Sun, 27 Nov 2016 23:07:13 -0800 Subject: tcp: instrument tcp sender limits chronographs This patch implements the skeleton of the TCP chronograph instrumentation on sender side limits: 1) idle (unspec) 2) busy sending data other than 3-4 below 3) rwnd-limited 4) sndbuf-limited The limits are enumerated 'tcp_chrono'. Since a connection in theory can idle forever, we do not track the actual length of this uninteresting idle period. For the rest we track how long the sender spends in each limit. At any point during the life time of a connection, the sender must be in one of the four states. If there are multiple conditions worthy of tracking in a chronograph then the highest priority enum takes precedence over the other conditions. So that if something "more interesting" starts happening, stop the previous chrono and start a new one. The time unit is jiffy(u32) in order to save space in tcp_sock. This implies application must sample the stats no longer than every 49 days of 1ms jiffy. Signed-off-by: Francis Yan Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7de80739adab..e5ff4083870d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1516,6 +1516,20 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; +/* Latencies incurred by various limits for a sender. They are + * chronograph-like stats that are mutually exclusive. + */ +enum tcp_chrono { + TCP_CHRONO_UNSPEC, + TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ + TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ + TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ + __TCP_CHRONO_MAX, +}; + +void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); +void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); + /* write queue abstraction */ static inline void tcp_write_queue_purge(struct sock *sk) { -- cgit v1.2.3 From 0f87230d1a6c253681550c6064715d06a32be73d Mon Sep 17 00:00:00 2001 From: Francis Yan Date: Sun, 27 Nov 2016 23:07:14 -0800 Subject: tcp: instrument how long TCP is busy sending This patch measures TCP busy time, which is defined as the period of time when sender has data (or FIN) to send. The time starts when data is buffered and stops when the write queue is flushed by ACKs or error events. Note the busy time does not include SYN time, unless data is included in SYN (i.e. Fast Open). It does include FIN time even if the FIN carries no payload. Excluding pure FIN is possible but would incur one additional test in the fast path, which may not be worth it. Signed-off-by: Francis Yan Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index e5ff4083870d..3e097e39d4d2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1535,6 +1535,7 @@ static inline void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) sk_wmem_free_skb(sk, skb); sk_mem_reclaim(sk); @@ -1593,8 +1594,10 @@ static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff * static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) { - if (sk->sk_send_head == skb_unlinked) + if (sk->sk_send_head == skb_unlinked) { sk->sk_send_head = NULL; + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + } if (tcp_sk(sk)->highest_sack == skb_unlinked) tcp_sk(sk)->highest_sack = NULL; } @@ -1616,6 +1619,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb /* Queue it, remembering where we must start sending. */ if (sk->sk_send_head == NULL) { sk->sk_send_head = skb; + tcp_chrono_start(sk, TCP_CHRONO_BUSY); if (tcp_sk(sk)->highest_sack == NULL) tcp_sk(sk)->highest_sack = skb; -- cgit v1.2.3 From 95a22caee396cef0bb2ca8fafdd82966a49367bb Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Dec 2016 11:32:06 +0100 Subject: tcp: randomize tcp timestamp offsets for each connection jiffies based timestamps allow for easy inference of number of devices behind NAT translators and also makes tracking of hosts simpler. commit ceaa1fef65a7c2e ("tcp: adding a per-socket timestamp offset") added the main infrastructure that is needed for per-connection ts randomization, in particular writing/reading the on-wire tcp header format takes the offset into account so rest of stack can use normal tcp_time_stamp (jiffies). So only two items are left: - add a tsoffset for request sockets - extend the tcp isn generator to also return another 32bit number in addition to the ISN. Re-use of ISN generator also means timestamps are still monotonically increasing for same connection quadruple, i.e. PAWS will still work. Includes fixes from Eric Dumazet. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/secure_seq.h | 8 ++++---- include/net/tcp.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 3f36d45b714a..0caee631a836 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -6,10 +6,10 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); -__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport); +u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport, u32 *tsoff); +u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport, u32 *tsoff); u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, diff --git a/include/net/tcp.h b/include/net/tcp.h index 3e097e39d4d2..207147b4c6b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1827,7 +1827,7 @@ struct tcp_request_sock_ops { struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req, bool *strict); - __u32 (*init_seq)(const struct sk_buff *skb); + __u32 (*init_seq)(const struct sk_buff *skb, u32 *tsoff); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, -- cgit v1.2.3 From 55330f05969437c5d22fcc2ae2e54810b5236b7b Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 1 Dec 2016 14:06:33 +0200 Subject: net/sched: Add separate check for skip_hw flag Creating a difference between two possible cases: 1. Not offloading tc rule since the user sets 'skip_hw' flag. 2. Not offloading tc rule since the device doesn't support offloading. This patch doesn't add any new functionality. Signed-off-by: Hadar Hen Zion Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 767b03a3fe67..45ad9aab9bba 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -425,16 +425,14 @@ struct tc_cls_u32_offload { }; }; -static inline bool tc_should_offload(const struct net_device *dev, - const struct tcf_proto *tp, u32 flags) +static inline bool tc_can_offload(const struct net_device *dev, + const struct tcf_proto *tp) { const struct Qdisc *sch = tp->q; const struct Qdisc_class_ops *cops = sch->ops->cl_ops; if (!(dev->features & NETIF_F_HW_TC)) return false; - if (flags & TCA_CLS_FLAGS_SKIP_HW) - return false; if (!dev->netdev_ops->ndo_setup_tc) return false; if (cops && cops->tcf_cl_offload) @@ -443,6 +441,19 @@ static inline bool tc_should_offload(const struct net_device *dev, return true; } +static inline bool tc_skip_hw(u32 flags) +{ + return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; +} + +static inline bool tc_should_offload(const struct net_device *dev, + const struct tcf_proto *tp, u32 flags) +{ + if (tc_skip_hw(flags)) + return false; + return tc_can_offload(dev, tp); +} + static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; -- cgit v1.2.3 From 255cb30425c0ced57d6d85f3e7cddb99b9576046 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 1 Dec 2016 14:06:36 +0200 Subject: net/sched: act_mirred: Add new tc_action_ops get_dev() Adding support to a new tc_action_ops. get_dev is a general option which allows to get the underline device when trying to offload a tc rule. In case of mirred action the returned device is the mirred (egress) device. Signed-off-by: Hadar Hen Zion Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index d8eae87ea778..9dddf77a69cc 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -119,6 +119,8 @@ struct tc_action_ops { int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); + int (*get_dev)(const struct tc_action *a, struct net *net, + struct net_device **mirred_dev); }; struct tc_action_net { -- cgit v1.2.3 From 7091d8c7055d7310339435ae3af2fb490a92524d Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 1 Dec 2016 14:06:37 +0200 Subject: net/sched: cls_flower: Add offload support using egress Hardware device In order to support hardware offloading when the device given by the tc rule is different from the Hardware underline device, extract the mirred (egress) device from the tc action when a filter is added, using the new tc_action_ops, get_dev(). Flower caches the information about the mirred device and use it for calling ndo_setup_tc in filter change, update stats and delete. Calling ndo_setup_tc of the mirred (egress) device instead of the ingress device will allow a resolution between the software ingress device and the underline hardware device. The resolution will take place inside the offloading driver using 'egress_device' flag added to tc_to_netdev struct which is provided to the offloading driver. Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 45ad9aab9bba..f0a051480c6c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -171,6 +171,8 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); +int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, + struct net_device **hw_dev); /** * struct tcf_pkt_info - packet information -- cgit v1.2.3 From aa4c1037a30f4e88f444e83d42c2befbe0d5caf5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 1 Dec 2016 08:48:06 -0800 Subject: bpf: Add support for reading socket family, type, protocol Add socket family, type and protocol to bpf_sock allowing bpf programs read-only access. Add __sk_flags_offset[0] to struct sock before the bitfield to programmtically determine the offset of the unsigned int containing protocol and type. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/sock.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 442cbb118a07..69afda6bea15 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -389,6 +389,21 @@ struct sock { * Because of non atomicity rules, all * changes are protected by socket lock. */ + unsigned int __sk_flags_offset[0]; +#ifdef __BIG_ENDIAN_BITFIELD +#define SK_FL_PROTO_SHIFT 16 +#define SK_FL_PROTO_MASK 0x00ff0000 + +#define SK_FL_TYPE_SHIFT 0 +#define SK_FL_TYPE_MASK 0x0000ffff +#else +#define SK_FL_PROTO_SHIFT 8 +#define SK_FL_PROTO_MASK 0x0000ff00 + +#define SK_FL_TYPE_SHIFT 16 +#define SK_FL_TYPE_MASK 0xffff0000 +#endif + kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 2, sk_no_check_tx : 1, -- cgit v1.2.3 From 4f7df337fe79bba1e4c2d525525d63b5ba186bbd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Dec 2016 03:59:06 +0300 Subject: netlink: 2-clause nla_ok() nla_ok() consists of 3 clauses: 1) int rem >= (int)sizeof(struct nlattr) 2) u16 nla_len >= sizeof(struct nlattr) 3) u16 nla_len <= int rem The statement is that clause (1) is redundant. What it does is ensuring that "rem" is a positive number, so that in clause (3) positive number will be compared to positive number with no problems. However, "u16" fully fits into "int" and integers do not change value when upcasting even to signed type. Negative integers will be rejected by clause (3) just fine. Small positive integers will be rejected by transitivity of comparison operator. NOTE: all of the above DOES NOT apply to nlmsg_ok() where ->nlmsg_len is u32(!), so 3 clauses AND A CAST TO INT are necessary. Obligatory space savings report: -1.6 KB $ ./scripts/bloat-o-meter ../vmlinux-000* ../vmlinux-001* add/remove: 0/0 grow/shrink: 3/63 up/down: 35/-1692 (-1657) function old new delta validate_scan_freqs 142 155 +13 tcf_em_tree_validate 867 879 +12 dcbnl_ieee_del 328 338 +10 netlbl_cipsov4_add_common.isra 218 215 -3 ... ovs_nla_put_actions 888 806 -82 netlbl_cipsov4_add_std 1648 1566 -82 nl80211_parse_sched_scan 2889 2780 -109 ip_tun_from_nlattr 3086 2945 -141 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/netlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index d3938f11ae52..dd657a33f8c3 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -698,8 +698,7 @@ static inline int nla_len(const struct nlattr *nla) */ static inline int nla_ok(const struct nlattr *nla, int remaining) { - return remaining >= (int) sizeof(*nla) && - nla->nla_len >= sizeof(*nla) && + return nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } -- cgit v1.2.3 From 9bfc7b9969dbb800460e2577f1dea59336269ce4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Dec 2016 04:12:58 +0300 Subject: netns: add dummy struct inside "struct net_generic" This is precursor to fixing "[id - 1]" bloat inside net_generic(). Name "s" is chosen to complement name "u" often used for dummy unions. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/netns/generic.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index d315786bcfd7..65ccce69b040 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -25,8 +25,10 @@ */ struct net_generic { - unsigned int len; - struct rcu_head rcu; + struct { + unsigned int len; + struct rcu_head rcu; + } s; void *ptr[0]; }; -- cgit v1.2.3 From 6af2d5fff2fdcd481cb9a4f354a0880142b17c60 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Dec 2016 04:21:32 +0300 Subject: netns: fix net_generic() "id - 1" bloat net_generic() function is both a) inline and b) used ~600 times. It has the following code inside ... ptr = ng->ptr[id - 1]; ... "id" is never compile time constant so compiler is forced to subtract 1. And those decrements or LEA [r32 - 1] instructions add up. We also start id'ing from 1 to catch bugs where pernet sybsystem id is not initialized and 0. This is quite pointless idea (nothing will work or immediate interference with first registered subsystem) in general but it hints what needs to be done for code size reduction. Namely, overlaying allocation of pointer array and fixed part of structure in the beginning and using usual base-0 addressing. Ids are just cookies, their exact values do not matter, so lets start with 3 on x86_64. Code size savings (oh boy): -4.2 KB As usual, ignore the initial compiler stupidity part of the table. add/remove: 0/0 grow/shrink: 12/670 up/down: 89/-4297 (-4208) function old new delta tipc_nametbl_insert_publ 1250 1270 +20 nlmclnt_lookup_host 686 703 +17 nfsd4_encode_fattr 5930 5941 +11 nfs_get_client 1050 1061 +11 register_pernet_operations 333 342 +9 tcf_mirred_init 843 849 +6 tcf_bpf_init 1143 1149 +6 gss_setup_upcall 990 994 +4 idmap_name_to_id 432 434 +2 ops_init 274 275 +1 nfsd_inject_forget_client 259 260 +1 nfs4_alloc_client 612 613 +1 tunnel_key_walker 164 163 -1 ... tipc_bcbase_select_primary 392 360 -32 mac80211_hwsim_new_radio 2808 2767 -41 ipip6_tunnel_ioctl 2228 2186 -42 tipc_bcast_rcv 715 672 -43 tipc_link_build_proto_msg 1140 1089 -51 nfsd4_lock 3851 3796 -55 tipc_mon_rcv 1012 956 -56 Total: Before=156643951, After=156639743, chg -0.00% Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/netns/generic.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 65ccce69b040..f15daaa89385 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -25,12 +25,14 @@ */ struct net_generic { - struct { - unsigned int len; - struct rcu_head rcu; - } s; - - void *ptr[0]; + union { + struct { + unsigned int len; + struct rcu_head rcu; + } s; + + void *ptr[0]; + }; }; static inline void *net_generic(const struct net *net, unsigned int id) @@ -40,7 +42,7 @@ static inline void *net_generic(const struct net *net, unsigned int id) rcu_read_lock(); ng = rcu_dereference(net->gen); - ptr = ng->ptr[id - 1]; + ptr = ng->ptr[id]; rcu_read_unlock(); return ptr; -- cgit v1.2.3 From 1c677b3d2828a84e0360e1a07d1204531540dc03 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:44:59 +0100 Subject: ipv4: fib: Add fib_info_hold() helper As explained in the previous commit, modules are going to need to take a reference on fib info and then drop it using fib_info_put(). Add the fib_info_hold() helper to make the code more readable and also symmetric with fib_info_put(). Signed-off-by: Ido Schimmel Suggested-by: Jiri Pirko Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f390c3bb05c5..6c67b9391fc0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -397,6 +397,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) void free_fib_info(struct fib_info *fi); +static inline void fib_info_hold(struct fib_info *fi) +{ + atomic_inc(&fi->fib_clntref); +} + static inline void fib_info_put(struct fib_info *fi) { if (atomic_dec_and_test(&fi->fib_clntref)) -- cgit v1.2.3 From cacaad11f43aefbbe5fca00af3b9c16e6aee1ba4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:06 +0100 Subject: ipv4: fib: Allow for consistent FIB dumping The next patch will enable listeners of the FIB notification chain to request a dump of the FIB tables. However, since RTNL isn't taken during the dump, it's possible for the FIB tables to change mid-dump, which will result in inconsistency between the listener's table and the kernel's. Allow listeners to know about changes that occurred mid-dump, by adding a change sequence counter to each net namespace. The counter is incremented just before a notification is sent in the FIB chain. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7adf4386ac8f..f0cf5a1b777e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,9 @@ struct netns_ipv4 { #ifdef CONFIG_IP_ROUTE_MULTIPATH int sysctl_fib_multipath_use_neigh; #endif + + unsigned int fib_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif -- cgit v1.2.3 From c3852ef7f2f8f75a9f85a864bec1f6f5a3068eea Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:07 +0100 Subject: ipv4: fib: Replay events when registering FIB notifier Commit b90eb7549499 ("fib: introduce FIB notification infrastructure") introduced a new notification chain to notify listeners (f.e., switchdev drivers) about addition and deletion of routes. However, upon registration to the chain the FIB tables can already be populated, which means potential listeners will have an incomplete view of the tables. Solve that by dumping the FIB tables and replaying the events to the passed notification block. The dump itself is done using RCU in order not to starve consumers that need RTNL to make progress. The integrity of the dump is ensured by reading the FIB change sequence counter before and after the dump under RTNL. This allows us to avoid the problematic situation in which the dumping process sends a ENTRY_ADD notification following ENTRY_DEL generated by another process holding RTNL. Callers of the registration function may pass a callback that is executed in case the dump was inconsistent with current FIB tables. The number of retries until a consistent dump is achieved is set to a fixed number to prevent callers from looping for long periods of time. In case current limit proves to be problematic in the future, it can be easily converted to be configurable using a sysctl. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6c67b9391fc0..5f376af377c7 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -221,7 +221,8 @@ enum fib_event_type { FIB_EVENT_RULE_DEL, }; -int register_fib_notifier(struct notifier_block *nb); +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)); int unregister_fib_notifier(struct notifier_block *nb); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); -- cgit v1.2.3 From adc176c5472214971d77c1a61c83db9b01e9cdc7 Mon Sep 17 00:00:00 2001 From: Erik Nordmark Date: Fri, 2 Dec 2016 14:00:08 -0800 Subject: ipv6 addrconf: Implemented enhanced DAD (RFC7527) Implemented RFC7527 Enhanced DAD. IPv6 duplicate address detection can fail if there is some temporary loopback of Ethernet frames. RFC7527 solves this by including a random nonce in the NS messages used for DAD, and if an NS is received with the same nonce it is assumed to be a looped back DAD probe and is ignored. RFC7527 is enabled by default. Can be disabled by setting both of conf/{all,interface}/enhanced_dad to zero. Signed-off-by: Erik Nordmark Signed-off-by: Bob Gilligan Reviewed-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 + include/net/ndisc.h | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index b0576cb2ab25..0fa4c324b713 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -55,6 +55,7 @@ struct inet6_ifaddr { __u8 stable_privacy_retry; __u16 scope; + __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ diff --git a/include/net/ndisc.h b/include/net/ndisc.h index be1fe2283254..d562a2fe4860 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -31,6 +31,7 @@ enum { ND_OPT_PREFIX_INFO = 3, /* RFC2461 */ ND_OPT_REDIRECT_HDR = 4, /* RFC2461 */ ND_OPT_MTU = 5, /* RFC2461 */ + ND_OPT_NONCE = 14, /* RFC7527 */ __ND_OPT_ARRAY_MAX, ND_OPT_ROUTE_INFO = 24, /* RFC4191 */ ND_OPT_RDNSS = 25, /* RFC5006 */ @@ -121,6 +122,7 @@ struct ndisc_options { #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] #define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] #define nd_opts_mtu nd_opt_array[ND_OPT_MTU] +#define nd_opts_nonce nd_opt_array[ND_OPT_NONCE] #define nd_802154_opts_src_lladdr nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR] #define nd_802154_opts_tgt_lladdr nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR] @@ -398,7 +400,8 @@ void ndisc_cleanup(void); int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + u64 nonce); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); -- cgit v1.2.3 From 0c4e966eafff8253bec545d8c27b9efa231c1f62 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 20 Oct 2016 18:33:01 +0200 Subject: netfilter: built-in NAT support for DCCP CONFIG_NF_NAT_PROTO_DCCP is no more a tristate. When set to y, NAT support for DCCP protocol is built-in into nf_nat.ko. footprint test: (nf_nat_proto_) | dccp || nf_nat --------------------------+--------++-------- no builtin | 409800 || 2241312 DCCP builtin | - || 2578968 Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l4proto.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 12f4cc841b6e..92b147be00ef 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -54,6 +54,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_udp; extern const struct nf_nat_l4proto nf_nat_l4proto_icmp; extern const struct nf_nat_l4proto nf_nat_l4proto_icmpv6; extern const struct nf_nat_l4proto nf_nat_l4proto_unknown; +#ifdef CONFIG_NF_NAT_PROTO_DCCP +extern const struct nf_nat_l4proto nf_nat_l4proto_dccp; +#endif bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, -- cgit v1.2.3 From 7a2dd28c703408ef27d6fe6a4fcd7c58968ce3bf Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 20 Oct 2016 18:33:02 +0200 Subject: netfilter: built-in NAT support for SCTP CONFIG_NF_NAT_PROTO_SCTP is no more a tristate. When set to y, NAT support for SCTP protocol is built-in into nf_nat.ko. footprint test: (nf_nat_proto_) | sctp || nf_nat --------------------------+--------++-------- no builtin | 428344 || 2241312 SCTP builtin | - || 2597032 Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l4proto.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 92b147be00ef..2cbaf3856e21 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -57,6 +57,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_unknown; #ifdef CONFIG_NF_NAT_PROTO_DCCP extern const struct nf_nat_l4proto nf_nat_l4proto_dccp; #endif +#ifdef CONFIG_NF_NAT_PROTO_SCTP +extern const struct nf_nat_l4proto nf_nat_l4proto_sctp; +#endif bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, -- cgit v1.2.3 From b8ad652f9779976d0300ae199961e413859d5378 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 20 Oct 2016 18:33:03 +0200 Subject: netfilter: built-in NAT support for UDPlite CONFIG_NF_NAT_PROTO_UDPLITE is no more a tristate. When set to y, NAT support for UDPlite protocol is built-in into nf_nat.ko. footprint test: (nf_nat_proto_) |udplite || nf_nat --------------------------+--------++-------- no builtin | 408048 || 2241312 UDPLITE builtin | - || 2577256 Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_l4proto.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat_l4proto.h b/include/net/netfilter/nf_nat_l4proto.h index 2cbaf3856e21..3923150f2a1e 100644 --- a/include/net/netfilter/nf_nat_l4proto.h +++ b/include/net/netfilter/nf_nat_l4proto.h @@ -60,6 +60,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_dccp; #ifdef CONFIG_NF_NAT_PROTO_SCTP extern const struct nf_nat_l4proto nf_nat_l4proto_sctp; #endif +#ifdef CONFIG_NF_NAT_PROTO_UDPLITE +extern const struct nf_nat_l4proto nf_nat_l4proto_udplite; +#endif bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, -- cgit v1.2.3 From 673ab46f345557e9d741e97ca0301280360d1af1 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 14 Nov 2016 22:39:25 +0800 Subject: netfilter: nf_log: do not assume ethernet header in netdev family In netdev family, we will handle non ethernet packets, so using eth_hdr(skb)->h_proto is incorrect. Meanwhile, we can use socket(AF_PACKET...) to sending packets, so skb->protocol is not always set in bridge family. Add an extra parameter into nf_log_l2packet to solve this issue. Fixes: 1fddf4bad0ac ("netfilter: nf_log: add packet logging for netdev family") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index a559aa41253c..450f87f95415 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -109,7 +109,9 @@ void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, const struct net_device *out, const struct nf_loginfo *loginfo, const char *prefix); -void nf_log_l2packet(struct net *net, u_int8_t pf, unsigned int hooknum, +void nf_log_l2packet(struct net *net, u_int8_t pf, + __be16 protocol, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, -- cgit v1.2.3 From c51d39010a1bccc9c1294e2d7c00005aefeb2b5c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 15 Nov 2016 15:08:25 +0100 Subject: netfilter: conntrack: built-in support for DCCP CONFIG_NF_CT_PROTO_DCCP is no more a tristate. When set to y, connection tracking support for DCCP protocol is built-in into nf_conntrack.ko. footprint test: $ ls -l net/netfilter/nf_conntrack{_proto_dccp,}.ko \ net/ipv4/netfilter/nf_conntrack_ipv4.ko \ net/ipv6/netfilter/nf_conntrack_ipv6.ko (builtin)|| dccp | ipv4 | ipv6 | nf_conntrack ---------++--------+--------+--------+-------------- none || 469140 | 828755 | 828676 | 6141434 DCCP || - | 830566 | 829935 | 6533526 Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 +++ include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 3 +++ include/net/netns/conntrack.h | 14 ++++++++++++++ 3 files changed, 20 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 981c327374da..c2f155fd9299 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -15,6 +15,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; +#ifdef CONFIG_NF_CT_PROTO_DCCP +extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4; +#endif int nf_conntrack_ipv4_compat_init(void); void nf_conntrack_ipv4_compat_fini(void); diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index a4c993685795..5ec66c0d21c4 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -6,6 +6,9 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; +#ifdef CONFIG_NF_CT_PROTO_DCCP +extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6; +#endif #include extern struct ctl_table nf_ct_ipv6_sysctl_table[]; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 3d06d94d2e52..440b781baf0b 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -6,6 +6,9 @@ #include #include #include +#ifdef CONFIG_NF_CT_PROTO_DCCP +#include +#endif #include struct ctl_table_header; @@ -48,12 +51,23 @@ struct nf_icmp_net { unsigned int timeout; }; +#ifdef CONFIG_NF_CT_PROTO_DCCP +struct nf_dccp_net { + struct nf_proto_net pn; + int dccp_loose; + unsigned int dccp_timeout[CT_DCCP_MAX + 1]; +}; +#endif + struct nf_ip_net { struct nf_generic_net generic; struct nf_tcp_net tcp; struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; +#ifdef CONFIG_NF_CT_PROTO_DCCP + struct nf_dccp_net dccp; +#endif }; struct ct_pcpu { -- cgit v1.2.3 From a85406afeb3e045b001b2aac5b4f89f4266fede3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 15 Nov 2016 15:08:26 +0100 Subject: netfilter: conntrack: built-in support for SCTP CONFIG_NF_CT_PROTO_SCTP is no more a tristate. When set to y, connection tracking support for SCTP protocol is built-in into nf_conntrack.ko. footprint test: $ ls -l net/netfilter/nf_conntrack{_proto_sctp,}.ko \ net/ipv4/netfilter/nf_conntrack_ipv4.ko \ net/ipv6/netfilter/nf_conntrack_ipv6.ko (builtin)|| sctp | ipv4 | ipv6 | nf_conntrack ---------++--------+--------+--------+-------------- none || 498243 | 828755 | 828676 | 6141434 SCTP || - | 829254 | 829175 | 6547872 Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 +++ include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 3 +++ include/net/netns/conntrack.h | 13 +++++++++++++ 3 files changed, 19 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index c2f155fd9299..5f1fc15a51fb 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -18,6 +18,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; #ifdef CONFIG_NF_CT_PROTO_DCCP extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4; #endif +#ifdef CONFIG_NF_CT_PROTO_SCTP +extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4; +#endif int nf_conntrack_ipv4_compat_init(void); void nf_conntrack_ipv4_compat_fini(void); diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index 5ec66c0d21c4..f70d191a8820 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -9,6 +9,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; #ifdef CONFIG_NF_CT_PROTO_DCCP extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6; #endif +#ifdef CONFIG_NF_CT_PROTO_SCTP +extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6; +#endif #include extern struct ctl_table nf_ct_ipv6_sysctl_table[]; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 440b781baf0b..17724c62de97 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -9,6 +9,9 @@ #ifdef CONFIG_NF_CT_PROTO_DCCP #include #endif +#ifdef CONFIG_NF_CT_PROTO_SCTP +#include +#endif #include struct ctl_table_header; @@ -59,6 +62,13 @@ struct nf_dccp_net { }; #endif +#ifdef CONFIG_NF_CT_PROTO_SCTP +struct nf_sctp_net { + struct nf_proto_net pn; + unsigned int timeouts[SCTP_CONNTRACK_MAX]; +}; +#endif + struct nf_ip_net { struct nf_generic_net generic; struct nf_tcp_net tcp; @@ -68,6 +78,9 @@ struct nf_ip_net { #ifdef CONFIG_NF_CT_PROTO_DCCP struct nf_dccp_net dccp; #endif +#ifdef CONFIG_NF_CT_PROTO_SCTP + struct nf_sctp_net sctp; +#endif }; struct ct_pcpu { -- cgit v1.2.3 From 9b91c96c5d1f9da79438292f8c82f65cbf078645 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 15 Nov 2016 15:08:27 +0100 Subject: netfilter: conntrack: built-in support for UDPlite CONFIG_NF_CT_PROTO_UDPLITE is no more a tristate. When set to y, connection tracking support for UDPlite protocol is built-in into nf_conntrack.ko. footprint test: $ ls -l net/netfilter/nf_conntrack{_proto_udplite,}.ko \ net/ipv4/netfilter/nf_conntrack_ipv4.ko \ net/ipv6/netfilter/nf_conntrack_ipv6.ko (builtin)|| udplite| ipv4 | ipv6 |nf_conntrack ---------++--------+--------+--------+-------------- none || 432538 | 828755 | 828676 | 6141434 UDPlite || - | 829649 | 829362 | 6498204 Signed-off-by: Davide Caratti Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 +++ include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 3 +++ include/net/netns/conntrack.h | 16 ++++++++++++++++ 3 files changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 5f1fc15a51fb..919e4e8af327 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -21,6 +21,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4; #ifdef CONFIG_NF_CT_PROTO_SCTP extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4; #endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE +extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4; +#endif int nf_conntrack_ipv4_compat_init(void); void nf_conntrack_ipv4_compat_fini(void); diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index f70d191a8820..eaea968f8657 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -12,6 +12,9 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6; #ifdef CONFIG_NF_CT_PROTO_SCTP extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6; #endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE +extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6; +#endif #include extern struct ctl_table nf_ct_ipv6_sysctl_table[]; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 17724c62de97..cf799fc3fdec 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -69,6 +69,19 @@ struct nf_sctp_net { }; #endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE +enum udplite_conntrack { + UDPLITE_CT_UNREPLIED, + UDPLITE_CT_REPLIED, + UDPLITE_CT_MAX +}; + +struct nf_udplite_net { + struct nf_proto_net pn; + unsigned int timeouts[UDPLITE_CT_MAX]; +}; +#endif + struct nf_ip_net { struct nf_generic_net generic; struct nf_tcp_net tcp; @@ -81,6 +94,9 @@ struct nf_ip_net { #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net sctp; #endif +#ifdef CONFIG_NF_CT_PROTO_UDPLITE + struct nf_udplite_net udplite; +#endif }; struct ct_pcpu { -- cgit v1.2.3 From a379854d91b2cb0af07b0f62845449f4dacbd673 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Nov 2016 21:36:39 +0100 Subject: netfilter: conntrack: remove unused init_net hook since adf0516845bcd0 ("netfilter: remove ip_conntrack* sysctl compat code") the only user (ipv4 tracker) sets this to an empty stub function. After this change nf_ct_l3proto_pernet_register() is also empty, but this will change in a followup patch to add conditional register of the hooks. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index 8992e4229da9..cf8f3dfd810d 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -63,9 +63,6 @@ struct nf_conntrack_l3proto { size_t nla_size; - /* Init l3proto pernet data */ - int (*init_net)(struct net *net); - /* Module (if any) which this is connected to. */ struct module *me; }; -- cgit v1.2.3 From ecb2421b5ddf48e6e116fced7f74c985bb546138 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Nov 2016 21:36:40 +0100 Subject: netfilter: add and use nf_ct_netns_get/put currently aliased to try_module_get/_put. Will be changed in next patch when we add functions to make use of ->net argument to store usercount per l3proto tracker. This is needed to avoid registering the conntrack hooks in all netns and later only enable connection tracking in those that need conntrack. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d9d52c020a70..5916aa9ab3f0 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -181,6 +181,10 @@ static inline void nf_ct_put(struct nf_conn *ct) int nf_ct_l3proto_try_module_get(unsigned short l3proto); void nf_ct_l3proto_module_put(unsigned short l3proto); +/* load module; enable/disable conntrack in this namespace */ +int nf_ct_netns_get(struct net *net, u8 nfproto); +void nf_ct_netns_put(struct net *net, u8 nfproto); + /* * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) -- cgit v1.2.3 From 0c66dc1ea3f0366221f8a5a16c73f01ea9259678 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Nov 2016 21:36:43 +0100 Subject: netfilter: conntrack: register hooks in netns when needed by ruleset This makes use of nf_ct_netns_get/put added in previous patch. We add get/put functions to nf_conntrack_l3proto structure, ipv4 and ipv6 then implement use-count to track how many users (nft or xtables modules) have a dependency on ipv4 and/or ipv6 connection tracking functionality. When count reaches zero, the hooks are unregistered. This delays activation of connection tracking inside a namespace until stateful firewall rule or nat rule gets added. This patch breaks backwards compatibility in the sense that connection tracking won't be active anymore when the protocol tracker module is loaded. This breaks e.g. setups that ctnetlink for flow accounting and the like, without any '-m conntrack' packet filter rules. Followup patch restores old behavour and makes new delayed scheme optional via sysctl. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index cf8f3dfd810d..e7dcd72be21c 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -52,6 +52,10 @@ struct nf_conntrack_l3proto { int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); + /* Called when netns wants to use connection tracking */ + int (*net_ns_get)(struct net *); + void (*net_ns_put)(struct net *); + /* * Calculate size of tuple nlattr */ -- cgit v1.2.3 From 481fa3734769b67f00ed09a42f2a6a8cbd00b869 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Nov 2016 21:36:44 +0100 Subject: netfilter: conntrack: add nf_conntrack_default_on sysctl This switch (default on) can be used to disable automatic registration of connection tracking functionality in newly created network namespaces. This means that when net namespace goes down (or the tracker protocol module is unloaded) we *might* have to unregister the hooks. We can either add another per-netns variable that tells if the hooks got registered by default, or, alternatively, just call the protocol _put() function and have the callee deal with a possible 'extra' put() operation that doesn't pair with a get() one. This uses the latter approach, i.e. a put() without a get has no effect. Conntrack is still enabled automatically regardless of the new sysctl setting if the new net namespace requires connection tracking, e.g. when NAT rules are created. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l3proto.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index e7dcd72be21c..e01559b4d781 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -73,9 +73,18 @@ struct nf_conntrack_l3proto { extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX]; +#ifdef CONFIG_SYSCTL /* Protocol pernet registration. */ int nf_ct_l3proto_pernet_register(struct net *net, struct nf_conntrack_l3proto *proto); +#else +static inline int nf_ct_l3proto_pernet_register(struct net *n, + struct nf_conntrack_l3proto *p) +{ + return 0; +} +#endif + void nf_ct_l3proto_pernet_unregister(struct net *net, struct nf_conntrack_l3proto *proto); -- cgit v1.2.3 From 9115e8cd2a0c6eaaa900c462721f12e1d45f326c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 3 Dec 2016 11:14:56 -0800 Subject: net: reorganize struct sock for better data locality Group fields used in TX path, and keep some cache lines mostly read to permit sharing among cpus. Gained two 4 bytes holes on 64bit arches. Added a place holder for tcp tsq_flags, next to sk_wmem_alloc to speed up tcp_wfree() in the following patch. I have not added ____cacheline_aligned_in_smp, this might be done later. I prefer doing this once inet and tcp/udp sockets reorg is also done. Tested with both TCP and UDP. UDP receiver performance under flood increased by ~20 % : Accessing sk_filter/sk_wq/sk_napi_id no longer stalls because sk_drops was moved away from a critical cache line, now mostly read and shared. /* --- cacheline 4 boundary (256 bytes) --- */ unsigned int sk_napi_id; /* 0x100 0x4 */ int sk_rcvbuf; /* 0x104 0x4 */ struct sk_filter * sk_filter; /* 0x108 0x8 */ union { struct socket_wq * sk_wq; /* 0x8 */ struct socket_wq * sk_wq_raw; /* 0x8 */ }; /* 0x110 0x8 */ struct xfrm_policy * sk_policy[2]; /* 0x118 0x10 */ struct dst_entry * sk_rx_dst; /* 0x128 0x8 */ struct dst_entry * sk_dst_cache; /* 0x130 0x8 */ atomic_t sk_omem_alloc; /* 0x138 0x4 */ int sk_sndbuf; /* 0x13c 0x4 */ /* --- cacheline 5 boundary (320 bytes) --- */ int sk_wmem_queued; /* 0x140 0x4 */ atomic_t sk_wmem_alloc; /* 0x144 0x4 */ long unsigned int sk_tsq_flags; /* 0x148 0x8 */ struct sk_buff * sk_send_head; /* 0x150 0x8 */ struct sk_buff_head sk_write_queue; /* 0x158 0x18 */ __s32 sk_peek_off; /* 0x170 0x4 */ int sk_write_pending; /* 0x174 0x4 */ long int sk_sndtimeo; /* 0x178 0x8 */ Signed-off-by: Eric Dumazet Tested-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sock.h | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 69afda6bea15..6dfe3aa22b97 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -343,6 +343,9 @@ struct sock { #define sk_rxhash __sk_common.skc_rxhash socket_lock_t sk_lock; + atomic_t sk_drops; + int sk_rcvlowat; + struct sk_buff_head sk_error_queue; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -359,14 +362,13 @@ struct sock { struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc - int sk_forward_alloc; - __u32 sk_txhash; + int sk_forward_alloc; #ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int sk_napi_id; unsigned int sk_ll_usec; + /* ===== mostly read cache line ===== */ + unsigned int sk_napi_id; #endif - atomic_t sk_drops; int sk_rcvbuf; struct sk_filter __rcu *sk_filter; @@ -379,11 +381,30 @@ struct sock { #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; - /* Note: 32bit hole on 64bit arches */ - atomic_t sk_wmem_alloc; atomic_t sk_omem_alloc; int sk_sndbuf; + + /* ===== cache line for TX ===== */ + int sk_wmem_queued; + atomic_t sk_wmem_alloc; + unsigned long sk_tsq_flags; + struct sk_buff *sk_send_head; struct sk_buff_head sk_write_queue; + __s32 sk_peek_off; + int sk_write_pending; + long sk_sndtimeo; + struct timer_list sk_timer; + __u32 sk_priority; + __u32 sk_mark; + u32 sk_pacing_rate; /* bytes per second */ + u32 sk_max_pacing_rate; + struct page_frag sk_frag; + netdev_features_t sk_route_caps; + netdev_features_t sk_route_nocaps; + int sk_gso_type; + unsigned int sk_gso_max_size; + gfp_t sk_allocation; + __u32 sk_txhash; /* * Because of non atomicity rules, all @@ -414,42 +435,24 @@ struct sock { #define SK_PROTOCOL_MAX U8_MAX kmemcheck_bitfield_end(flags); - int sk_wmem_queued; - gfp_t sk_allocation; - u32 sk_pacing_rate; /* bytes per second */ - u32 sk_max_pacing_rate; - netdev_features_t sk_route_caps; - netdev_features_t sk_route_nocaps; - int sk_gso_type; - unsigned int sk_gso_max_size; u16 sk_gso_max_segs; - int sk_rcvlowat; unsigned long sk_lingertime; - struct sk_buff_head sk_error_queue; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - __u32 sk_priority; - __u32 sk_mark; kuid_t sk_uid; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; - long sk_sndtimeo; - struct timer_list sk_timer; ktime_t sk_stamp; u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; struct socket *sk_socket; void *sk_user_data; - struct page_frag sk_frag; - struct sk_buff *sk_send_head; - __s32 sk_peek_off; - int sk_write_pending; #ifdef CONFIG_SECURITY void *sk_security; #endif -- cgit v1.2.3 From 1c0d32fde5bdf1184bc274f864c09799278a1114 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 4 Dec 2016 09:48:16 -0800 Subject: net_sched: gen_estimator: complete rewrite of rate estimators 1) Old code was hard to maintain, due to complex lock chains. (We probably will be able to remove some kfree_rcu() in callers) 2) Using a single timer to update all estimators does not scale. 3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity is not supposed to work well) In this rewrite : - I removed the RB tree that had to be scanned in gen_estimator_active(). qdisc dumps should be much faster. - Each estimator has its own timer. - Estimations are maintained in net_rate_estimator structure, instead of dirtying the qdisc. Minor, but part of the simplification. - Reading the estimator uses RCU and a seqcount to provide proper support for 32bit kernels. - We reduce memory need when estimators are not used, since we store a pointer, instead of the bytes/packets counters. - xt_rateest_mt() no longer has to grab a spinlock. (In the future, xt_rateest_tg() could be switched to per cpu counters) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/gen_stats.h | 17 +++++++++-------- include/net/netfilter/xt_rateest.h | 10 +++++++--- include/net/sch_generic.h | 2 +- 4 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9dddf77a69cc..1d716449209e 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -36,7 +36,7 @@ struct tc_action { struct tcf_t tcfa_tm; struct gnet_stats_basic_packed tcfa_bstats; struct gnet_stats_queue tcfa_qstats; - struct gnet_stats_rate_est64 tcfa_rate_est; + struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct rcu_head tcfa_rcu; struct gnet_stats_basic_cpu __percpu *cpu_bstats; diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 231e121cc7d9..8b7aa370e7a4 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu { struct u64_stats_sync syncp; }; +struct net_rate_estimator; + struct gnet_dump { spinlock_t * lock; struct sk_buff * skb; @@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); int gnet_stats_copy_rate_est(struct gnet_dump *d, - const struct gnet_stats_basic_packed *b, - struct gnet_stats_rate_est64 *r); + struct net_rate_estimator __rcu **ptr); int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); @@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d); int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, - struct gnet_stats_rate_est64 *rate_est, + struct net_rate_estimator __rcu **rate_est, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt); -void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_rate_est64 *rate_est); +void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, - struct gnet_stats_rate_est64 *rate_est, + struct net_rate_estimator __rcu **ptr, spinlock_t *stats_lock, seqcount_t *running, struct nlattr *opt); -bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, - const struct gnet_stats_rate_est64 *rate_est); +bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); +bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, + struct gnet_stats_rate_est64 *sample); #endif diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index 79f45e19f31e..130e58361f99 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -1,19 +1,23 @@ #ifndef _XT_RATEEST_H #define _XT_RATEEST_H +#include + struct xt_rateest { /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ struct gnet_stats_basic_packed bstats; spinlock_t lock; - /* keep rstats and lock on same cache line to speedup xt_rateest_mt() */ - struct gnet_stats_rate_est64 rstats; + /* following fields not accessed in hot path */ + unsigned int refcnt; struct hlist_node list; char name[IFNAMSIZ]; - unsigned int refcnt; struct gnet_estimator params; struct rcu_head rcu; + + /* keep this field far away to speedup xt_rateest_mt() */ + struct net_rate_estimator __rcu *rate_est; }; struct xt_rateest *xt_rateest_lookup(const char *name); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e6aa0a249672..498f81b229a4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -76,7 +76,7 @@ struct Qdisc { struct netdev_queue *dev_queue; - struct gnet_stats_rate_est64 rate_est; + struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; -- cgit v1.2.3 From 834184b1f3a4635efbdfdae5fb437f109f6605fa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Nov 2016 21:36:45 +0100 Subject: netfilter: defrag: only register defrag functionality if needed nf_defrag modules for ipv4 and ipv6 export an empty stub function. Any module that needs the defragmentation hooks registered simply 'calls' this empty function to create a phony module dependency -- modprobe will then load the defrag module too. This extends netfilter ipv4/ipv6 defragmentation modules to delay the hook registration until the functionality is requested within a network namespace instead of module load time for all namespaces. Hooks are only un-registered on module unload or when a namespace that used such defrag functionality exits. We have to use struct net for this as the register hooks can be called before netns initialization here from the ipv4/ipv6 conntrack module init path. There is no unregister functionality support, defrag will always be active once it was requested inside a net namespace. The reason is that defrag has impact on nft and iptables rulesets (without defrag we might see framents). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_defrag_ipv4.h | 3 ++- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 3 ++- include/net/netns/netfilter.h | 6 ++++++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h index f01ef208dff6..db405f70e538 100644 --- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h +++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h @@ -1,6 +1,7 @@ #ifndef _NF_DEFRAG_IPV4_H #define _NF_DEFRAG_IPV4_H -void nf_defrag_ipv4_enable(void); +struct net; +int nf_defrag_ipv4_enable(struct net *); #endif /* _NF_DEFRAG_IPV4_H */ diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index ddf162f7966f..7664efe37974 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -1,7 +1,8 @@ #ifndef _NF_DEFRAG_IPV6_H #define _NF_DEFRAG_IPV6_H -void nf_defrag_ipv6_enable(void); +struct net; +int nf_defrag_ipv6_enable(struct net *); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 58487b1cc99a..cea396b53a60 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -17,5 +17,11 @@ struct netns_nf { struct ctl_table_header *nf_log_dir_header; #endif struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) + bool defrag_ipv4; +#endif +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + bool defrag_ipv6; +#endif }; #endif -- cgit v1.2.3 From 1814096980bbe546c4384b7b064126cbe7d40d30 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 24 Nov 2016 12:04:55 +0100 Subject: netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields This patch adds a new flag that signals the kernel to update layer 4 checksum if the packet field belongs to the layer 4 pseudoheader. This implicitly provides stateless NAT 1:1 that is useful under very specific usecases. Since rules mangling layer 3 fields that are part of the pseudoheader may potentially convey any layer 4 packet, we have to deal with the layer 4 checksum adjustment using protocol specific code. This patch adds support for TCP, UDP and ICMPv6, since they include the pseudoheader in the layer 4 checksum calculation. ICMP doesn't, so we can skip it. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 862373d4ea9d..8f690effec37 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -45,6 +45,7 @@ struct nft_payload_set { enum nft_registers sreg:8; u8 csum_type; u8 csum_offset; + u8 csum_flags; }; extern const struct nft_expr_ops nft_payload_fast_ops; -- cgit v1.2.3 From 3bf3276119455bd0fc7a7e31be2823118e613842 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Nov 2016 11:40:06 +0100 Subject: netfilter: add and use nf_fwd_netdev_egress ... so we can use current skb instead of working with a clone. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_dup_netdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h index 397dcae349f9..3e919356bedf 100644 --- a/include/net/netfilter/nf_dup_netdev.h +++ b/include/net/netfilter/nf_dup_netdev.h @@ -2,5 +2,6 @@ #define _NF_DUP_NETDEV_H_ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif); +void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif); #endif -- cgit v1.2.3 From e50092404c1bc7aaeb0a0f4077fa6f07b073a20f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:04:32 +0100 Subject: netfilter: nf_tables: add stateful objects This patch augments nf_tables to support stateful objects. This new infrastructure allows you to create, dump and delete stateful objects, that are identified by a user-defined name. This patch adds the generic infrastructure, follow up patches add support for two stateful objects: counters and quotas. This patch provides a native infrastructure for nf_tables to replace nfacct, the extended accounting infrastructure for iptables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 79 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 32970cba184a..903cd618f50e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -875,6 +875,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @list: used internally * @chains: chains in the table * @sets: sets in the table + * @objects: stateful objects in the table * @hgenerator: handle generator state * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) @@ -885,6 +886,7 @@ struct nft_table { struct list_head list; struct list_head chains; struct list_head sets; + struct list_head objects; u64 hgenerator; u32 use; u16 flags:14, @@ -934,6 +936,73 @@ void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); +/** + * struct nft_object - nf_tables stateful object + * + * @list: table stateful object list node + * @type: pointer to object type + * @data: pointer to object data + * @name: name of this stateful object + * @genmask: generation mask + * @use: number of references to this stateful object + * @data: object data, layout depends on type + */ +struct nft_object { + struct list_head list; + char name[NFT_OBJ_MAXNAMELEN]; + u32 genmask:2, + use:30; + /* runtime data below here */ + const struct nft_object_type *type ____cacheline_aligned; + unsigned char data[] + __attribute__((aligned(__alignof__(u64)))); +}; + +static inline void *nft_obj_data(const struct nft_object *obj) +{ + return (void *)obj->data; +} + +#define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) + +struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask); + +/** + * struct nft_object_type - stateful object type + * + * @eval: stateful object evaluation function + * @list: list node in list of object types + * @type: stateful object numeric type + * @size: stateful object size + * @owner: module owner + * @maxattr: maximum netlink attribute + * @policy: netlink attribute policy + * @init: initialize object from netlink attributes + * @destroy: release existing stateful object + * @dump: netlink dump stateful object + */ +struct nft_object_type { + void (*eval)(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt); + struct list_head list; + u32 type; + unsigned int size; + unsigned int maxattr; + struct module *owner; + const struct nla_policy *policy; + int (*init)(const struct nlattr * const tb[], + struct nft_object *obj); + void (*destroy)(struct nft_object *obj); + int (*dump)(struct sk_buff *skb, + const struct nft_object *obj); +}; + +int nft_register_obj(struct nft_object_type *obj_type); +void nft_unregister_obj(struct nft_object_type *obj_type); + /** * struct nft_traceinfo - nft tracing information and state * @@ -981,6 +1050,9 @@ void nft_trace_notify(struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_SET() \ MODULE_ALIAS("nft-set") +#define MODULE_ALIAS_NFT_OBJ(type) \ + MODULE_ALIAS("nft-obj-" __stringify(type)) + /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations @@ -1157,4 +1229,11 @@ struct nft_trans_elem { #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) +struct nft_trans_obj { + struct nft_object *obj; +}; + +#define nft_trans_obj(trans) \ + (((struct nft_trans_obj *)trans->data)->obj) + #endif /* _NET_NF_TABLES_H */ -- cgit v1.2.3 From 43da04a593d8b2626f1cf4b56efe9402f6b53652 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:44 +0100 Subject: netfilter: nf_tables: atomic dump and reset for stateful objects This patch adds a new NFT_MSG_GETOBJ_RESET command perform an atomic dump-and-reset of the stateful object. This also comes with add support for atomic dump and reset for counter and quota objects. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 903cd618f50e..6f7d6a1dc09c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -997,7 +997,8 @@ struct nft_object_type { struct nft_object *obj); void (*destroy)(struct nft_object *obj); int (*dump)(struct sk_buff *skb, - const struct nft_object *obj); + struct nft_object *obj, + bool reset); }; int nft_register_obj(struct nft_object_type *obj_type); -- cgit v1.2.3 From 2599e98934c5ad166ad184b3682e38aadcb63fb3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:48 +0100 Subject: netfilter: nf_tables: notify internal updates of stateful objects Introduce nf_tables_obj_notify() to notify internal state changes in stateful objects. This is used by the quota object to report depletion in a follow up patch. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 6f7d6a1dc09c..339e374c28b5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -969,6 +969,10 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); +int nft_obj_notify(struct net *net, struct nft_table *table, + struct nft_object *obj, u32 portid, u32 seq, + int event, int family, int report, gfp_t gfp); + /** * struct nft_object_type - stateful object type * -- cgit v1.2.3 From 1896531710abcd9a961a17d0c5c6a9f537d479b6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:05:56 +0100 Subject: netfilter: nft_quota: add depleted flag for objects Notify on depleted quota objects. The NFT_QUOTA_F_DEPLETED flag indicates we have reached overquota. Add pointer to table from nft_object, so we can use it when sending the depletion notification to userspace. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 339e374c28b5..ce6fb6e83b32 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -940,6 +940,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, * struct nft_object - nf_tables stateful object * * @list: table stateful object list node + * @table: table this object belongs to * @type: pointer to object type * @data: pointer to object data * @name: name of this stateful object @@ -950,6 +951,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, struct nft_object { struct list_head list; char name[NFT_OBJ_MAXNAMELEN]; + struct nft_table *table; u32 genmask:2, use:30; /* runtime data below here */ -- cgit v1.2.3 From 8aeff920dcc9b3f8cf43042a76428582634d9208 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Nov 2016 00:06:00 +0100 Subject: netfilter: nf_tables: add stateful object reference to set elements This patch allows you to refer to stateful objects from set elements. This provides the infrastructure to create maps where the right hand side of the mapping is a stateful object. This allows us to build dictionaries of stateful objects, that you can use to perform fast lookups using any arbitrary key combination. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index ce6fb6e83b32..85f0f03f1e87 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -326,6 +326,7 @@ void nft_unregister_set(struct nft_set_ops *ops); * @name: name of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) + * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -347,6 +348,7 @@ struct nft_set { char name[NFT_SET_MAXNAMELEN]; u32 ktype; u32 dtype; + u32 objtype; u32 size; atomic_t nelems; u32 ndeact; @@ -416,6 +418,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, * @NFT_SET_EXT_EXPIRATION: element expiration time * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPR: expression assiociated with the element + * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { @@ -426,6 +429,7 @@ enum nft_set_extensions { NFT_SET_EXT_EXPIRATION, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPR, + NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; @@ -554,6 +558,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, return elem + set->ops->elemsize; } +static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) +{ + return nft_set_ext(ext, NFT_SET_EXT_OBJREF); +} + void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, -- cgit v1.2.3 From 8411b6442e59810fe0750a2f321b9dcb7d0a3d17 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 5 Dec 2016 23:35:50 +0100 Subject: netfilter: nf_tables: support for set flushing This patch adds support for set flushing, that consists of walking over the set elements if the NFTA_SET_ELEM_LIST_ELEMENTS attribute is set. This patch requires the following changes: 1) Add set->ops->deactivate_one() operation: This allows us to deactivate an element from the set element walk path, given we can skip the lookup that happens in ->deactivate(). 2) Add a new nft_trans_alloc_gfp() function since we need to allocate transactions using GFP_ATOMIC given the set walk path happens with held rcu_read_lock. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 85f0f03f1e87..924325c46aab 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -259,7 +259,8 @@ struct nft_expr; * @lookup: look up an element within the set * @insert: insert new element into set * @activate: activate new element in the next generation - * @deactivate: deactivate element in the next generation + * @deactivate: lookup for element and deactivate it in the next generation + * @deactivate_one: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elemeennts * @privsize: function to return size of set private data @@ -294,6 +295,9 @@ struct nft_set_ops { void * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); + bool (*deactivate_one)(const struct net *net, + const struct nft_set *set, + void *priv); void (*remove)(const struct nft_set *set, const struct nft_set_elem *elem); void (*walk)(const struct nft_ctx *ctx, -- cgit v1.2.3 From 5b8e2f61b9df529ca4af057daf7bfb1de348bdf1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Dec 2016 19:32:50 -0800 Subject: net: sock_rps_record_flow() is for connected sockets Paolo noticed a cache line miss in UDP recvmsg() to access sk_rxhash, sharing a cache line with sk_drops. sk_drops might be heavily incremented by cpus handling a flood targeting this socket. We might place sk_drops on a separate cache line, but lets try to avoid wasting 64 bytes per socket just for this, since we have other bottlenecks to take care of. sock_rps_record_flow() should only access sk_rxhash for connected flows. Testing sk_state for TCP_ESTABLISHED covers most of the cases for connected sockets, for a zero cost, since system calls using sock_rps_record_flow() also access sk->sk_prot which is on the same cache line. A follow up patch will provide a static_key (Jump Label) since most hosts do not even use RFS. Signed-off-by: Eric Dumazet Reported-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sock.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 6dfe3aa22b97..1749e38d0301 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -913,7 +913,17 @@ static inline void sock_rps_record_flow_hash(__u32 hash) static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - sock_rps_record_flow_hash(sk->sk_rxhash); + /* Reading sk->sk_rxhash might incur an expensive cache line miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) + sock_rps_record_flow_hash(sk->sk_rxhash); #endif } -- cgit v1.2.3 From 972d3876faa8a9195122b2d2bcd3155f904fff37 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 7 Dec 2016 13:48:27 +0100 Subject: flow dissector: ICMP support Allow dissection of ICMP(V6) type and code. This should only occur if a packet is ICMP(V6) and the dissector has FLOW_DISSECTOR_KEY_ICMP set. There are currently no users of FLOW_DISSECTOR_KEY_ICMP. A follow-up patch will allow FLOW_DISSECTOR_KEY_ICMP to be used by the flower classifier. Signed-off-by: Simon Horman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index c4f31666afd2..d896a33e00d4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -104,6 +104,22 @@ struct flow_dissector_key_ports { }; }; +/** + * flow_dissector_key_icmp: + * @ports: type and code of ICMP header + * icmp: ICMP type (high) and code (low) + * type: ICMP type + * code: ICMP code + */ +struct flow_dissector_key_icmp { + union { + __be16 icmp; + struct { + u8 type; + u8 code; + }; + }; +}; /** * struct flow_dissector_key_eth_addrs: @@ -122,6 +138,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ + FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ -- cgit v1.2.3 From 13bfff25c081f4e060af761c4082b5a96f756810 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Dec 2016 08:29:10 -0800 Subject: net: rfs: add a jump label RFS is not commonly used, so add a jump label to avoid some conditionals in fast path. Signed-off-by: Eric Dumazet Cc: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sock.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 1749e38d0301..2729e77950b7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -913,17 +913,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash) static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - /* Reading sk->sk_rxhash might incur an expensive cache line miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) - sock_rps_record_flow_hash(sk->sk_rxhash); + if (static_key_false(&rfs_needed)) { + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) + sock_rps_record_flow_hash(sk->sk_rxhash); + } #endif } -- cgit v1.2.3 From 3665f3817cd354ab7a811b3a4f282c4f5cb1a0d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Dec 2016 10:05:36 -0800 Subject: net: do not read sk_drops if application does not care sk_drops can be an often written field, do not read it unless application showed interest. Note that sk_drops can be read via inet_diag, so applications can avoid getting this info from every received packet. In the future, 'reading' sk_drops might require folding per node or per cpu fields, and thus become even more expensive than today. Signed-off-by: Eric Dumazet Cc: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 2729e77950b7..e17aa3de2b4d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2163,7 +2163,8 @@ struct sock_skb_cb { static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { - SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops); + SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? + atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) -- cgit v1.2.3 From e6f462df9acd2a3295e5d34eb29e2823220cf129 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 8 Dec 2016 17:22:09 +0100 Subject: cfg80211/mac80211: fix BSS leaks when abandoning assoc attempts When mac80211 abandons an association attempt, it may free all the data structures, but inform cfg80211 and userspace about it only by sending the deauth frame it received, in which case cfg80211 has no link to the BSS struct that was used and will not cfg80211_unhold_bss() it. Fix this by providing a way to inform cfg80211 of this with the BSS entry passed, so that it can clean up properly, and use this ability in the appropriate places in mac80211. This isn't ideal: some code is more or less duplicated and tracing is missing. However, it's a fairly small change and it's thus easier to backport - cleanups can come later. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2019310cf135..814be4b4200c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4685,6 +4685,17 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, */ void cfg80211_assoc_timeout(struct net_device *dev, struct cfg80211_bss *bss); +/** + * cfg80211_abandon_assoc - notify cfg80211 of abandoned association attempt + * @dev: network device + * @bss: The BSS entry with which association was abandoned. + * + * Call this whenever - for reasons reported through other API, like deauth RX, + * an association attempt was abandoned. + * This function may sleep. The caller must hold the corresponding wdev's mutex. + */ +void cfg80211_abandon_assoc(struct net_device *dev, struct cfg80211_bss *bss); + /** * cfg80211_tx_mlme_mgmt - notification of transmitted deauth/disassoc frame * @dev: network device -- cgit v1.2.3