From 23a1f8d44c0bca48f04fc2a2f1edafd826ce6133 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 16:04:31 +0200 Subject: mac80211: process and save VHT MU-MIMO group frame The Group ID Management frame is an Action frame of category VHT. It is transmitted by the AP to assign or change the user position of a STA for one or more group IDs. Process and save the group membership data. Notify underlying driver of changes. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7c30faff245f..8da483b2c067 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -298,6 +298,7 @@ struct ieee80211_vif_chanctx_switch { * note that this is only called when it changes after the channel * context had been assigned. * @BSS_CHANGED_OCB: OCB join status changed + * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -323,6 +324,7 @@ enum ieee80211_bss_change { BSS_CHANGED_BEACON_INFO = 1<<20, BSS_CHANGED_BANDWIDTH = 1<<21, BSS_CHANGED_OCB = 1<<22, + BSS_CHANGED_MU_GROUPS = 1<<23, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -435,6 +437,19 @@ struct ieee80211_event { } u; }; +/** + * struct ieee80211_mu_group_data - STA's VHT MU-MIMO group data + * + * This structure describes the group id data of VHT MU-MIMO + * + * @membership: 64 bits array - a bit is set if station is member of the group + * @position: 2 bits per group id indicating the position in the group + */ +struct ieee80211_mu_group_data { + u8 membership[WLAN_MEMBERSHIP_LEN]; + u8 position[WLAN_USER_POSITION_LEN]; +}; + /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * @@ -477,6 +492,7 @@ struct ieee80211_event { * @enable_beacon: whether beaconing should be enabled or 
not * @chandef: Channel definition for this BSS -- the hardware might be * configured a higher bandwidth than this BSS uses, for example. + * @mu_group: VHT MU-MIMO group membership data * @ht_operation_mode: HT operation mode like in &struct ieee80211_ht_operation. * This field is only valid when the channel is a wide HT/VHT channel. * Note that with TDLS this can be the case (channel is HT, protection must @@ -535,6 +551,7 @@ struct ieee80211_bss_conf { s32 cqm_rssi_thold; u32 cqm_rssi_hyst; struct cfg80211_chan_def chandef; + struct ieee80211_mu_group_data mu_group; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; int arp_addr_cnt; bool qos; -- cgit v1.2.3 From f9cfa5f354b11e56cd8f019c12e14a42585586cd Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 16:04:33 +0200 Subject: mac80211: add flag for duplication check Add an option for driver to check for packet duplication by itself. This is needed for example by the iwlwifi driver which parallelizes the RX path and does the duplication check per queue. 
Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 8da483b2c067..ecab934dc8d9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1063,7 +1063,7 @@ enum mac80211_rx_flags { RX_FLAG_HT_GF = BIT(13), RX_FLAG_AMPDU_DETAILS = BIT(14), RX_FLAG_PN_VALIDATED = BIT(15), - /* bit 16 free */ + RX_FLAG_DUP_VALIDATED = BIT(16), RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), RX_FLAG_AMPDU_IS_LAST = BIT(18), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), -- cgit v1.2.3 From fad471860c097844432c7cf5d3ae6a0a059c2bdc Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 16:04:34 +0200 Subject: mac80211: pass RX aggregation window size to driver Currently mac80211 does not inform the driver of the window size when starting an RX aggregation session. To enable managing the reorder buffer in the driver or hardware the window size is needed. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ecab934dc8d9..a990338a766e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3047,9 +3047,11 @@ enum ieee80211_reconfig_type { * ieee80211_ampdu_mlme_action. Starting sequence number (@ssn) * is the first frame we expect to perform the action on. Notice * that TX/RX_STOP can pass NULL for this parameter. 
- * The @buf_size parameter is only valid when the action is set to - * %IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's reorder - * buffer size (number of subframes) for this session -- the driver + * The @buf_size parameter is valid only when the action is set to + * %IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and + * indicates the reorder buffer size (number of subframes) for this + * session. + * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver * may neither send aggregates containing more subframes than this * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be -- cgit v1.2.3 From 4352a4d7f6bfd0aed0276a13fa4993db35714db4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 8 Dec 2015 16:04:35 +0200 Subject: mac80211: document status.freq restrictions It's not always necessary to set the status.freq field, for example when this would be an expensive calculation. It must be set for all management frames (as they might be reported to userspace), but for data frames it's not really required. Document this. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a990338a766e..bdee1cc19c7e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1108,6 +1108,8 @@ enum mac80211_rx_vht_flags { * it but can store it and pass it back to the driver for synchronisation * @band: the active band when this frame was received * @freq: frequency the radio was tuned to when receiving this frame, in MHz + * This field must be set for management frames, but isn't strictly needed + * for data (other) frames - for those it only affects radiotap reporting. 
* @signal: signal strength when receiving this frame, either in dBm, in dB or * unspecified depending on the hardware capabilities flags * @IEEE80211_HW_SIGNAL_* -- cgit v1.2.3 From 50ea05efaf3bed7dd34bcc2635a8b3f53bd0ccc1 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Wed, 30 Dec 2015 16:06:04 +0200 Subject: mac80211: pass block ack session timeout to driver Currently mac80211 does not inform the driver of the session block ack timeout when starting a rx aggregation session. Drivers that manage the reorder buffer need to know this parameter. Seeing that there are now too many arguments for the drv_ampdu_action() function, wrap them inside a structure. Signed-off-by: Sara Sharon Signed-off-by: Johannes Berg --- include/net/mac80211.h | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bdee1cc19c7e..6c9c559394b0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2702,6 +2702,33 @@ enum ieee80211_ampdu_mlme_action { IEEE80211_AMPDU_TX_OPERATIONAL, }; +/** + * struct ieee80211_ampdu_params - AMPDU action parameters + * + * @action: the ampdu action, value from %ieee80211_ampdu_mlme_action. + * @sta: peer of this AMPDU session + * @tid: tid of the BA session + * @ssn: start sequence number of the session. TX/RX_STOP can pass 0. When + * action is set to %IEEE80211_AMPDU_RX_START the driver passes back the + * actual ssn value used to start the session and writes the value here. + * @buf_size: reorder buffer size (number of subframes). Valid only when the + * action is set to %IEEE80211_AMPDU_RX_START or + * %IEEE80211_AMPDU_TX_OPERATIONAL + * @amsdu: indicates the peer's ability to receive A-MSDU within A-MPDU. + * valid when the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL + * @timeout: BA session timeout.
Valid only when the action is set to + * %IEEE80211_AMPDU_RX_START + */ +struct ieee80211_ampdu_params { + enum ieee80211_ampdu_mlme_action action; + struct ieee80211_sta *sta; + u16 tid; + u16 ssn; + u8 buf_size; + bool amsdu; + u16 timeout; +}; + /** * enum ieee80211_frame_release_type - frame release reason * @IEEE80211_FRAME_RELEASE_PSPOLL: frame released for PS-Poll @@ -3046,15 +3073,9 @@ enum ieee80211_reconfig_type { * @ampdu_action: Perform a certain A-MPDU action * The RA/TID combination determines the destination and TID we want * the ampdu action to be performed for. The action is defined through - * ieee80211_ampdu_mlme_action. Starting sequence number (@ssn) - * is the first frame we expect to perform the action on. Notice - * that TX/RX_STOP can pass NULL for this parameter. - * The @buf_size parameter is valid only when the action is set to - * %IEEE80211_AMPDU_RX_START or %IEEE80211_AMPDU_TX_OPERATIONAL and - * indicates the reorder buffer size (number of subframes) for this - * session. + * ieee80211_ampdu_mlme_action. * When the action is set to %IEEE80211_AMPDU_TX_OPERATIONAL the driver - * may neither send aggregates containing more subframes than this + * may neither send aggregates containing more subframes than @buf_size * nor send aggregates in a way that lost frames would exceed the * buffer size. If just limiting the aggregate size, this would be * possible with a buf_size of 8: @@ -3065,9 +3086,6 @@ enum ieee80211_reconfig_type { * buffer size of 8. Correct ways to retransmit #1 would be: * - TX: 1 or 18 or 81 * Even "189" would be wrong since 1 could be lost again. - * The @amsdu parameter is valid when the action is set to - * %IEEE80211_AMPDU_TX_OPERATIONAL and indicates the peer's ability - * to receive A-MSDU within A-MPDU. * * Returns a negative error code on failure. * The callback can sleep. 
@@ -3409,9 +3427,7 @@ struct ieee80211_ops { int (*tx_last_beacon)(struct ieee80211_hw *hw); int (*ampdu_action)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - enum ieee80211_ampdu_mlme_action action, - struct ieee80211_sta *sta, u16 tid, u16 *ssn, - u8 buf_size, bool amsdu); + struct ieee80211_ampdu_params *params); int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); -- cgit v1.2.3 From 61d2bcae99f66a640b3dd9632180209143fb5512 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 1 Feb 2016 21:03:07 -0800 Subject: tcp: fastopen: accept data/FIN present in SYNACK message RFC 7413 (TCP Fast Open) 4.2.2 states that the SYNACK message MAY include data and/or FIN. This patch adds support for the client side: If we receive a SYNACK with payload or FIN, queue the skb instead of ignoring it. Since we already support the same for SYN, we refactor the existing code and reuse it. Note we need to clone the skb, so this operation might fail under memory pressure. Sara Dickinson pointed out FreeBSD server Fast Open implementation was planned to generate such SYNACK in the future. The server side might be implemented on linux later. Reported-by: Sara Dickinson Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S.
Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f6f8f032c73e..27f4c733116d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1437,6 +1437,7 @@ void tcp_free_fastopen_req(struct tcp_sock *tp); extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, -- cgit v1.2.3 From e3e17b773bfe45462b7f3fae20c550025975cb13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Feb 2016 11:16:28 -0800 Subject: tcp: fastopen: call tcp_fin() if FIN present in SYNACK When we acknowledge a FIN, it is not enough to ack the sequence number and queue the skb into receive queue. We also have to call tcp_fin() to properly update socket state and send proper poll() notifications. It seems we also had the problem if we received a SYN packet with the FIN flag set, but it does not seem an urgent issue, as no known implementation can do that. Fixes: 61d2bcae99f6 ("tcp: fastopen: accept data/FIN present in SYNACK message") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: David S. 
Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 27f4c733116d..479d535609fd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -568,6 +568,7 @@ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); +void tcp_fin(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); -- cgit v1.2.3 From 0e715d6fbd2a4a1dcd215d6d51091346e6a3d3fa Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 2 Feb 2016 18:09:11 +0100 Subject: vxlan: cleanup types include/net/vxlan.h is a kernel header, no need to prefix fixed size types with double underscore. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0fb86442544b..5c64250619c5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -30,15 +30,15 @@ * [0] https://tools.ietf.org/html/draft-smith-vxlan-group-policy */ struct vxlanhdr_gbp { - __u8 vx_flags; + u8 vx_flags; #ifdef __LITTLE_ENDIAN_BITFIELD - __u8 reserved_flags1:3, + u8 reserved_flags1:3, policy_applied:1, reserved_flags2:2, dont_learn:1, reserved_flags3:1; #elif defined(__BIG_ENDIAN_BITFIELD) - __u8 reserved_flags1:1, + u8 reserved_flags1:1, dont_learn:1, reserved_flags2:2, policy_applied:1, @@ -138,10 +138,10 @@ struct vxlan_config { int remote_ifindex; int mtu; __be16 dst_port; - __u16 port_min; - __u16 port_max; - __u8 tos; - __u8 ttl; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; u32 flags; unsigned long age_interval; unsigned int addrmax; -- cgit v1.2.3 From 427bc465bf9fcdab749f6997ff7a4eecaef4ca40 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 2 Feb 2016 18:09:12 +0100 Subject: 
vxlan: remove duplicated macros VNI_HASH_BITS and VNI_HASH_SIZE are defined twice. Remove the extra definitions. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 5c64250619c5..234bf1ef2737 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -9,9 +9,6 @@ #include #include -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1< Date: Tue, 2 Feb 2016 18:09:13 +0100 Subject: vxlan: restructure vxlan.h definitions RCO and GBP are VXLAN extensions, not specified in RFC 7348. Because of that, they need to be explicitly enabled when creating vxlan interface. By default, those extensions are not used and plain VXLAN header is sent and received. Reflect this in vxlan.h: first, the plain VXLAN header is defined. Following it, RCO is documented and defined, and likewise for GBP. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 104 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 41 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 234bf1ef2737..25bd919c9ef0 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -9,14 +9,71 @@ #include #include +/* VXLAN protocol (RFC 7348) header: + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |R|R|R|R|I|R|R|R| Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | VXLAN Network Identifier (VNI) | Reserved | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * I = VXLAN Network Identifier (VNI) present. + */ +struct vxlanhdr { + __be32 vx_flags; + __be32 vx_vni; +}; + +/* VXLAN header flags. 
*/ +#define VXLAN_HF_VNI BIT(27) + +#define VXLAN_N_VID (1u << 24) +#define VXLAN_VID_MASK (VXLAN_N_VID - 1) +#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) + +#define VNI_HASH_BITS 10 +#define VNI_HASH_SIZE (1<mark mapping @@ -59,44 +119,6 @@ struct vxlanhdr_gbp { #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) -/* VXLAN protocol header: - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |G|R|R|R|I|R|R|C| Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * G = 1 Group Policy (VXLAN-GBP) - * I = 1 VXLAN Network Identifier (VNI) present - * C = 1 Remote checksum offload (RCO) - */ -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; -}; - -/* VXLAN header flags. */ -#define VXLAN_HF_RCO BIT(21) -#define VXLAN_HF_VNI BIT(27) -#define VXLAN_HF_GBP BIT(31) - -/* Remote checksum offload header option */ -#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ -#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ -#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ -#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) -#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) - -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) - -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1< Date: Wed, 3 Feb 2016 09:46:49 +0200 Subject: ipv4: Namespaceify tcp syn retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2b7907a35568..b7b5bd64df35 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,6 +98,8 @@ struct netns_ipv4 { int sysctl_tcp_keepalive_probes; int sysctl_tcp_keepalive_intvl; + int sysctl_tcp_syn_retries; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; diff --git a/include/net/tcp.h b/include/net/tcp.h index 479d535609fd..825485c7cc1a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_syn_retries; extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; -- cgit v1.2.3 From 7c083ecb3ba4583a625d5ff9655d1a819e374493 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:50 +0200 Subject: ipv4: Namespaceify tcp synack retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b7b5bd64df35..9e83084ab8c1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -99,6 +99,7 @@ struct netns_ipv4 { int sysctl_tcp_keepalive_intvl; int sysctl_tcp_syn_retries; + int sysctl_tcp_synack_retries; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 825485c7cc1a..05659e860039 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; -- cgit v1.2.3 From 12ed8244ed8b31b023ea6d2851fd8b15f2999e9b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:51 +0200 Subject: ipv4: Namespaceify tcp syncookies sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e83084ab8c1..ac000fccdf0f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -101,6 +101,8 @@ struct netns_ipv4 { int sysctl_tcp_syn_retries; int sysctl_tcp_synack_retries; + int sysctl_tcp_syncookies; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; diff --git a/include/net/tcp.h b/include/net/tcp.h index 05659e860039..1fb23b70d237 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ extern int sysctl_tcp_fin_timeout; extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; -extern int sysctl_tcp_syncookies; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; -- cgit v1.2.3 From 1043e25ff96a1efc7bd34d11f5f32203a28a3bd7 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:52 +0200 Subject: ipv4: Namespaceify tcp reordering sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 +- include/net/tcp.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ac000fccdf0f..eb4cd0a3c296 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -100,8 +100,8 @@ struct netns_ipv4 { int sysctl_tcp_syn_retries; int sysctl_tcp_synack_retries; - int sysctl_tcp_syncookies; + int sysctl_tcp_reordering; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 1fb23b70d237..7e9a147cabae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -961,9 +961,11 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) */ static inline void tcp_enable_early_retrans(struct tcp_sock *tp) { + struct net *net = sock_net((struct sock *)tp); + tp->do_early_retrans = sysctl_tcp_early_retrans && sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && - sysctl_tcp_reordering == 3; + net->ipv4.sysctl_tcp_reordering == 3; } static inline void tcp_disable_early_retrans(struct tcp_sock *tp) -- cgit v1.2.3 From ae5c3f406cffe15ffd2aa544961b7cd027468d46 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:53 +0200 Subject: ipv4: Namespaceify tcp_retries1 sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb4cd0a3c296..dee6ba647461 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -102,6 +102,7 @@ struct netns_ipv4 { int sysctl_tcp_synack_retries; int sysctl_tcp_syncookies; int sysctl_tcp_reordering; + int sysctl_tcp_retries1; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7e9a147cabae..da96b9af3e5f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_retries1; extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_fastopen; -- cgit v1.2.3 From c6214a97c86c660de4f7ddb8eed925192e646161 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:54 +0200 Subject: ipv4: Namespaceify tcp_retries2 sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index dee6ba647461..d92c8e5d0fbc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -103,6 +103,7 @@ struct netns_ipv4 { int sysctl_tcp_syncookies; int sysctl_tcp_reordering; int sysctl_tcp_retries1; + int sysctl_tcp_retries2; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index da96b9af3e5f..a786cfa6301b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_retries2; extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; -- cgit v1.2.3 From c402d9beffb6141ab2e4d2ad8be71128803a28ca Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:55 +0200 Subject: ipv4: Namespaceify tcp_orphan_retries sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d92c8e5d0fbc..080230321985 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -104,6 +104,7 @@ struct netns_ipv4 { int sysctl_tcp_reordering; int sysctl_tcp_retries1; int sysctl_tcp_retries2; + int sysctl_tcp_orphan_retries; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index a786cfa6301b..71f840b89c76 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; extern int sysctl_tcp_fin_timeout; -extern int sysctl_tcp_orphan_retries; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; -- cgit v1.2.3 From 1e579caa18b96f9eb18f4f5416658cd15f37c062 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:56 +0200 Subject: ipv4: Namespaceify tcp_fin_timeout sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 080230321985..de5ff4385e84 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -105,6 +105,7 @@ struct netns_ipv4 { int sysctl_tcp_retries1; int sysctl_tcp_retries2; int sysctl_tcp_orphan_retries; + int sysctl_tcp_fin_timeout; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 71f840b89c76..3f160c2e6960 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -239,7 +239,6 @@ extern struct inet_timewait_death_row tcp_death_row; extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; -extern int sysctl_tcp_fin_timeout; extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; @@ -1249,7 +1248,7 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) static inline int tcp_fin_time(const struct sock *sk) { - int fin_timeout = tcp_sk(sk)->linger2 ? : sysctl_tcp_fin_timeout; + int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) -- cgit v1.2.3 From 4979f2d9f7262b9b180bc83de8d70f7a7721c085 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 3 Feb 2016 09:46:57 +0200 Subject: ipv4: Namespaceify tcp_notsent_lowat sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index de5ff4385e84..4d6ec3f6fafe 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -106,6 +106,7 @@ struct netns_ipv4 { int sysctl_tcp_retries2; int sysctl_tcp_orphan_retries; int sysctl_tcp_fin_timeout; + unsigned int sysctl_tcp_notsent_lowat; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3f160c2e6960..9b2cb0c8d876 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -267,7 +267,6 @@ extern int sysctl_tcp_thin_dupack; extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; -extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; @@ -1682,7 +1681,8 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { - return tp->notsent_lowat ?: sysctl_tcp_notsent_lowat; + struct net *net = sock_net((struct sock *)tp); + return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; } static inline bool tcp_stream_memory_free(const struct sock *sk) -- cgit v1.2.3 From 5ee14e6d336f1daacf5ba73e831029c5ab7ae329 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 3 Feb 2016 13:17:01 +0100 Subject: bonding: 3ad: apply ad_actor settings changes immediately Currently the bonding allows to set ad_actor_system and prio while the bond device is down, but these are actually applied only if there aren't any slaves yet (applied to bond device when first slave shows up, and to slaves at 3ad bind time). 
After this patch changes are applied immediately and the new values can be used/seen after the bond's upped so it's not necessary anymore to release all and enslave again to see the changes. CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/net/bond_3ad.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index f1fbc3b11962..f358ad5e4214 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -306,5 +306,6 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_ad_actor_settings(struct bonding *bond); #endif /* _NET_BOND_3AD_H */ -- cgit v1.2.3 From 086c653f5862591a9cfe2386f5650d03adacc33a Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:35 -0500 Subject: sock: struct proto hash function may error In order to support fast reuseport lookups in TCP, the hash function defined in struct proto must be capable of returning an error code. This patch changes the function signature of all related hash functions to return an integer and handles or propagates this return value at all call sites. Signed-off-by: Craig Gallek Signed-off-by: David S. 
Miller --- include/net/inet_hashtables.h | 2 +- include/net/phonet/phonet.h | 2 +- include/net/ping.h | 2 +- include/net/raw.h | 2 +- include/net/sock.h | 6 +++--- include/net/udp.h | 3 ++- 6 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index de2e3ade6102..554440e7f83d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -208,7 +208,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk); -void inet_hash(struct sock *sk); +int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 68e509750caa..039cc29cb4a8 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -51,7 +51,7 @@ void pn_sock_init(void); struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa); void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb); void phonet_get_local_port_range(int *min, int *max); -void pn_sock_hash(struct sock *sk); +int pn_sock_hash(struct sock *sk); void pn_sock_unhash(struct sock *sk); int pn_sock_get_port(struct sock *sk, unsigned short sport); diff --git a/include/net/ping.h b/include/net/ping.h index ac80cb45e630..5fd7cc244833 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -65,7 +65,7 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -void ping_hash(struct sock *sk); +int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); diff --git a/include/net/raw.h b/include/net/raw.h index 6a40c6562dd2..3e789008394d 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -57,7 +57,7 @@ int 
raw_seq_open(struct inode *ino, struct file *file, #endif -void raw_hash_sk(struct sock *sk); +int raw_hash_sk(struct sock *sk); void raw_unhash_sk(struct sock *sk); struct raw_sock { diff --git a/include/net/sock.h b/include/net/sock.h index f5ea148853e2..255d3e03727b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -984,7 +984,7 @@ struct proto { void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. */ - void (*hash)(struct sock *sk); + int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); @@ -1194,10 +1194,10 @@ static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. */ -static inline void __sk_prot_rehash(struct sock *sk) +static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); + return sk->sk_prot->hash(sk); } void sk_prot_clear_portaddr_nulls(struct sock *sk, int size); diff --git a/include/net/udp.h b/include/net/udp.h index 2842541e28e7..92927f729ac8 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -177,9 +177,10 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ -static inline void udp_lib_hash(struct sock *sk) +static inline int udp_lib_hash(struct sock *sk) { BUG(); + return 0; } void udp_lib_unhash(struct sock *sk); -- cgit v1.2.3 From 496611d7b5eaf59c03440c8f2def1d9988ad2459 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:36 -0500 Subject: inet: create IPv6-equivalent inet_hash function In order to support fast lookups for TCP sockets with SO_REUSEPORT, the function that adds sockets to the listening hash set needs to be able to check receive address equality. 
Since this equality check is different for IPv4 and IPv6, we will need two different socket hashing functions. This patch adds inet6_hash identical to the existing inet_hash function and updates the appropriate references. A following patch will differentiate the two by passing different comparison functions to __inet_hash. Additionally, in order to use the IPv6 address equality function from inet6_hashtables (which is compiled as a built-in object when IPv6 is enabled) it also needs to be in a built-in object file as well. This moves ipv6_rcv_saddr_equal into inet_hashtables to accomplish this. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 7ff588ca6817..b3c28a9dfbf1 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -96,6 +96,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); + +int inet6_hash(struct sock *sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \ -- cgit v1.2.3 From a583636a83ea383fd07517e5a7a2eedbc5d90fb1 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:38 -0500 Subject: inet: refactor inet[6]_lookup functions to take skb This is a preliminary step to allow fast socket lookup of SO_REUSEPORT groups. Doing so with a BPF filter will require access to the skb in question. This change plumbs the skb (and offset to payload data) through the call stack to the listening socket lookup implementations where it will be used in a following patch. Signed-off-by: Craig Gallek Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 2 ++ include/net/inet6_hashtables.h | 11 +++++++---- include/net/inet_hashtables.h | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 47f52d3cd8df..730d856683e5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -87,6 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); +int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index b3c28a9dfbf1..28332bdac333 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -53,6 +53,7 @@ struct sock *__inet6_lookup_established(struct net *net, struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -60,6 +61,7 @@ struct sock *inet6_lookup_listener(struct net *net, static inline struct sock *__inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -71,12 +73,12 @@ static inline struct sock *__inet6_lookup(struct net *net, if (sk) return sk; - return inet6_lookup_listener(net, hashinfo, saddr, sport, + return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif); } static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, + struct sk_buff *skb, int doff, const __be16 sport, const __be16 
dport, int iif) @@ -86,13 +88,14 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - &ipv6_hdr(skb)->saddr, sport, + return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, &ipv6_hdr(skb)->saddr, sport, &ipv6_hdr(skb)->daddr, ntohs(dport), iif); } struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 554440e7f83d..82403390af58 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -213,6 +213,7 @@ void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, @@ -220,10 +221,11 @@ struct sock *__inet_lookup_listener(struct net *net, static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __inet_lookup_listener(net, hashinfo, saddr, sport, + return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif); } @@ -299,6 +301,7 @@ static inline struct sock * static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) @@ -307,12 +310,13 @@ static inline struct sock *__inet_lookup(struct net *net, struct sock *sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif); - return sk ? 
: __inet_lookup_listener(net, hashinfo, saddr, sport, - daddr, hnum, dif); + return sk ? : __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + sport, daddr, hnum, dif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) @@ -320,7 +324,8 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; local_bh_disable(); - sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif); + sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + dport, dif); local_bh_enable(); return sk; @@ -328,6 +333,7 @@ static inline struct sock *inet_lookup(struct net *net, static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, + int doff, const __be16 sport, const __be16 dport) { @@ -337,8 +343,8 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; else - return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - iph->saddr, sport, + return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb, + doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb)); } -- cgit v1.2.3 From c125e80b88687b25b321795457309eaaee4bf270 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 10 Feb 2016 11:50:40 -0500 Subject: soreuseport: fast reuseport TCP socket selection This change extends the fast SO_REUSEPORT socket lookup implemented for UDP to TCP. Listener sockets with SO_REUSEPORT and the same receive address are additionally added to an array for faster random access. This means that only a single socket from the group must be found in the listener list before any socket in the group can be used to receive a packet. Previously, every socket in the group needed to be considered before handing off the incoming packet. 
This feature also exposes the ability to use a BPF program when selecting a socket from a reuseport group. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 82403390af58..50f635c2c536 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -207,7 +207,10 @@ void inet_hashinfo_init(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); -void __inet_hash(struct sock *sk, struct sock *osk); +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); -- cgit v1.2.3 From 815c52700746cdcc0874a33390bac334a4b90107 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Feb 2016 23:29:21 +0200 Subject: igmp: Namespaceify igmp_max_memberships sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4d6ec3f6fafe..759cf624eec2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -108,6 +108,8 @@ struct netns_ipv4 { int sysctl_tcp_fin_timeout; unsigned int sysctl_tcp_notsent_lowat; + int sysctl_igmp_max_memberships; + struct ping_group_range ping_group_range; atomic_t dev_addr_genid; -- cgit v1.2.3 From 166b6b2d6f01be67a83b87ab5c91350a68b17115 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Feb 2016 23:29:22 +0200 Subject: igmp: Namespaceify igmp_max_msf sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 759cf624eec2..522a2cfe1ad9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -109,6 +109,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_notsent_lowat; int sysctl_igmp_max_memberships; + int sysctl_igmp_max_msf; struct ping_group_range ping_group_range; -- cgit v1.2.3 From 87a8a2ae65b7721893c7922f963502be8fa01c94 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 9 Feb 2016 00:13:50 +0200 Subject: igmp: Namespaceify igmp_llm_reports sysctl knob This was initially introduced in df2cf4a78e488d26 ("IGMP: Inhibit reports for local multicast groups") by defining the sysctl in the ipv4_net_table array, however it was never implemented to be namespace aware. Fix this by changing the code accordingly. Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 522a2cfe1ad9..cbbf8115e8a7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -110,6 +110,7 @@ struct netns_ipv4 { int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; + int sysctl_igmp_llm_reports; struct ping_group_range ping_group_range; -- cgit v1.2.3 From 165094afcee79e4d5b6e94032a5d3be157460b4a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 8 Feb 2016 23:29:24 +0200 Subject: igmp: Namespacify igmp_qrv sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index cbbf8115e8a7..848fe8056534 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,7 @@ struct netns_ipv4 { int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; int sysctl_igmp_llm_reports; + int sysctl_igmp_qrv; struct ping_group_range ping_group_range; -- cgit v1.2.3 From 21e2e7f9b5fefdbf94a107a9b24d74baa5148ef3 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 11 Feb 2016 20:50:44 +0000 Subject: net: enable LCO for udp_tunnel_handle_offloads() users The only protocol affected at present is Geneve. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index cca2ad3082c3..734c15662ea9 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -103,7 +103,8 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - return iptunnel_handle_offloads(skb, udp_csum, type); + /* As we're a UDP tunnel, we support LCO, so don't need csum_help */ + return iptunnel_handle_offloads(skb, false, type); } static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3 From 6fa79666e24d32be1b709f5269af41ed9e829e7e Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 11 Feb 2016 21:02:31 +0000 Subject: net: ip_tunnel: remove 'csum_help' argument to iptunnel_handle_offloads All users now pass false, so we can remove it, and remove the code that was conditional upon it. Signed-off-by: Edward Cree Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 3 +-- include/net/udp_tunnel.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6db96ea0144f..bc439f32baa9 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -279,8 +279,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); -struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, - int gso_type_mask); +struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 734c15662ea9..97f5adb121a6 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -103,8 +103,7 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, { int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - /* As we're a UDP tunnel, we support LCO, so don't need csum_help */ - return iptunnel_handle_offloads(skb, false, type); + return iptunnel_handle_offloads(skb, type); } static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) -- cgit v1.2.3 From 911362c70df5b766c243dc297fadeaced786ffd8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:53 +0100 Subject: net: add dst_cache support This patch adds a generic, lockless dst cache implementation. The need for a lock is avoided by updating the dst cache fields only in per-cpu scope, and by requiring that the cache manipulation functions are invoked with the local bh disabled. The refresh_ts and reset_ts fields are used to ensure cache consistency in case of a concurrent cache update (dst_cache_set*) and reset operation (dst_cache_reset).
Consider the following scenario: CPU1: CPU2: dst_cache_reset() dst_cache_set() The dst entry set passed to dst_cache_set() should not be used for later dst cache lookup, because it's obtained using old configuration values. Since the refresh_ts is updated only on dst_cache lookup, the cached value in the above scenario will be discarded on the next lookup. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/dst_cache.h | 97 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 include/net/dst_cache.h (limited to 'include/net') diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h new file mode 100644 index 000000000000..151accae708b --- /dev/null +++ b/include/net/dst_cache.h @@ -0,0 +1,97 @@ +#ifndef _NET_DST_CACHE_H +#define _NET_DST_CACHE_H + +#include +#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif + +struct dst_cache { + struct dst_cache_pcpu __percpu *cache; + unsigned long reset_ts; +}; + +/** + * dst_cache_get - perform cache lookup + * @dst_cache: the cache + * + * The caller should use dst_cache_get_ip4() if it need to retrieve the + * source address to be used when xmitting to the cached dst. + * local BH must be disabled. + */ +struct dst_entry *dst_cache_get(struct dst_cache *dst_cache); + +/** + * dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * local BH must be disabled. + */ +struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr); + +/** + * dst_cache_set_ip4 - store the ipv4 dst into the cache + * @dst_cache: the cache + * @dst: the entry to be cached + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled. 
+ */ +void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, + __be32 saddr); + +#if IS_ENABLED(CONFIG_IPV6) + +/** + * dst_cache_set_ip6 - store the ipv6 dst into the cache + * @dst_cache: the cache + * @dst: the entry to be cached + * @saddr: the source address to be stored inside the cache + * + * local BH must be disabled. + */ +void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, + const struct in6_addr *addr); + +/** + * dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address + * @dst_cache: the cache + * @saddr: return value for the retrieved source address + * + * local BH must be disabled. + */ +struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, + struct in6_addr *saddr); +#endif + +/** + * dst_cache_reset - invalidate the cache contents + * @dst_cache: the cache + * + * This do not free the cached dst to avoid races and contentions. + * the dst will be freed on later cache lookup. + */ +static inline void dst_cache_reset(struct dst_cache *dst_cache) +{ + dst_cache->reset_ts = jiffies; +} + +/** + * dst_cache_init - initialize the cache, allocating the required storage + * @dst_cache: the cache + * @gfp: allocation flags + */ +int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp); + +/** + * dst_cache_destroy - empty the cache and free the allocated storage + * @dst_cache: the cache + * + * No synchronization is enforced: it must be called only when the cache + * is unsed. + */ +void dst_cache_destroy(struct dst_cache *dst_cache); + +#endif -- cgit v1.2.3 From 607f725f6f7d5ec3759fbc16224afb60e2152a5b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:54 +0100 Subject: net: replace dst_cache ip6_tunnel implementation with the generic one This also fixes a potential race in the existing tunnel code, which could lead to the wrong dst being permanently cached: CPU1: CPU2: dst = ip6_route_output(...)
dst_cache_reset() // no effect, // the cache is empty dst_cache_set() // the wrong dst // is permanently stored // into the cache With the new dst implementation the above race is not possible since the first cache lookup after dst_cache_reset will fail due to the timestamp check. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 0d0ce0b2d870..499a707765ea 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -6,6 +6,7 @@ #include #include #include +#include #define IP6TUNNEL_ERR_TIMEO (30*HZ) @@ -33,12 +34,6 @@ struct __ip6_tnl_parm { __be32 o_key; }; -struct ip6_tnl_dst { - seqlock_t lock; - struct dst_entry __rcu *dst; - u32 cookie; -}; - /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -46,7 +41,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ + struct dst_cache dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -66,11 +61,6 @@ struct ipv6_tlv_tnl_enc_lim { __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; -struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); -int ip6_tnl_dst_init(struct ip6_tnl *t); -void ip6_tnl_dst_destroy(struct ip6_tnl *t); -void ip6_tnl_dst_reset(struct ip6_tnl *t); -void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, -- cgit v1.2.3 From e09acddf873bf775b208b452a4c3a3fd26fa9427 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb
2016 15:43:55 +0100 Subject: ip_tunnel: replace dst_cache with generic implementation The current ip_tunnel cache implementation is prone to a race that will cause the wrong dst to be cached on a concurrent dst cache miss and ip tunnel update via netlink. Replacing it with the generic implementation fixes the issue. Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index bc439f32baa9..fd36936d85a6 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -13,6 +13,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -85,11 +86,6 @@ struct ip_tunnel_prl_entry { struct rcu_head rcu_head; }; -struct ip_tunnel_dst { - struct dst_entry __rcu *dst; - __be32 saddr; -}; - struct metadata_dst; struct ip_tunnel { @@ -108,7 +104,7 @@ struct ip_tunnel { int tun_hlen; /* Precalculated header length */ int mlink; - struct ip_tunnel_dst __percpu *dst_cache; + struct dst_cache dst_cache; struct ip_tunnel_parm parms; @@ -247,7 +243,6 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); void ip_tunnel_setup(struct net_device *dev, int net_id); -void ip_tunnel_dst_reset_all(struct ip_tunnel *t); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap); -- cgit v1.2.3 From 0c1d70af924b966cc71e9e48920b2b635441aa50 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:56 +0100 Subject: net: use dst_cache for vxlan device In case of UDP traffic with datagram length below MTU this gives about a 3% performance increase when tunneling over ipv4 and about 70% when tunneling over ipv6.
Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/vxlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 25bd919c9ef0..b314e4af89c5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -148,6 +148,7 @@ struct vxlan_rdst { u32 remote_ifindex; struct list_head list; struct rcu_head rcu; + struct dst_cache dst_cache; }; struct vxlan_config { -- cgit v1.2.3 From d71785ffc7e7cae3fbdc4ea8a9d05b7a1c59f7b8 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 12 Feb 2016 15:43:57 +0100 Subject: net: add dst_cache to ovs vxlan lwtunnel In case of UDP traffic with datagram length below MTU this give about 2% performance increase when tunneling over ipv4 and about 60% when tunneling over ipv6 Signed-off-by: Paolo Abeni Suggested-and-acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + include/net/ip_tunnels.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 30a56ab2ccfb..84b833af6882 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -62,6 +62,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, sizeof(a->u.tun_info) + a->u.tun_info.options_len); } +void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index fd36936d85a6..87408ab80856 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -58,6 +58,9 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; +#ifdef CONFIG_DST_CACHE + struct dst_cache dst_cache; +#endif u8 options_len; u8 mode; }; -- cgit v1.2.3 From 
fa50d974d104113630d68b7d03233a6686230d0c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:27 +0200 Subject: ipv4: Namespaceify ip_default_ttl sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/route.h | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 848fe8056534..bc8f7f94abcb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -80,6 +80,7 @@ struct netns_ipv4 { int sysctl_tcp_ecn; int sysctl_tcp_ecn_fallback; + int sysctl_ip_default_ttl; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; int sysctl_ip_nonlocal_bind; diff --git a/include/net/route.h b/include/net/route.h index a3b9ef74a389..9b0a523bb428 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -329,14 +329,13 @@ static inline int inet_iif(const struct sk_buff *skb) return skb->skb_iif; } -extern int sysctl_ip_default_ttl; - static inline int ip4_dst_hoplimit(const struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + struct net *net = dev_net(dst->dev); if (hoplimit == 0) - hoplimit = sysctl_ip_default_ttl; + hoplimit = net->ipv4.sysctl_ip_default_ttl; return hoplimit; } -- cgit v1.2.3 From 287b7f38fd6842e534db1783cead3843f7677b79 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:29 +0200 Subject: ipv4: Namespacify ip_dynaddr sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. 
Miller --- include/net/ip.h | 3 --- include/net/netns/ipv4.h | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 1a98f1ca1638..e3fb25d76421 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -248,9 +248,6 @@ extern int inet_peer_maxttl; /* From ip_input.c */ extern int sysctl_ip_early_demux; -/* From ip_output.c */ -extern int sysctl_ip_dynaddr; - void ipfrag_init(void); void ip_static_sysctl_init(void); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index bc8f7f94abcb..b7e3fb2587da 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -84,6 +84,8 @@ struct netns_ipv4 { int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; int sysctl_ip_nonlocal_bind; + /* Shall we try to damage output packets if routing dev changes? */ + int sysctl_ip_dynaddr; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; -- cgit v1.2.3 From e21145a9871aa5ae07e01926105bb8e523d64095 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:30 +0200 Subject: ipv4: namespacify ip_early_demux sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/ip.h | 3 --- include/net/netns/ipv4.h | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index e3fb25d76421..cbb134b2f0e4 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -245,9 +245,6 @@ extern int inet_peer_threshold; extern int inet_peer_minttl; extern int inet_peer_maxttl; -/* From ip_input.c */ -extern int sysctl_ip_early_demux; - void ipfrag_init(void); void ip_static_sysctl_init(void); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b7e3fb2587da..a69cde3ce460 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -86,6 +86,7 @@ struct netns_ipv4 { int sysctl_ip_nonlocal_bind; /* Shall we try to damage output packets if routing dev changes? 
*/ int sysctl_ip_dynaddr; + int sysctl_ip_early_demux; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; -- cgit v1.2.3 From 0fbf4cb27e061204c8cee8e7eb2870416bdf30fd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 15 Feb 2016 12:11:31 +0200 Subject: ipv4: namespacify ip fragment max dist sysctl knob Signed-off-by: Nikolay Borisov Signed-off-by: David S. Miller --- include/net/inet_frag.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 12aac0fd6ee7..909972aa3acd 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -13,6 +13,7 @@ struct netns_frags { int timeout; int high_thresh; int low_thresh; + int max_dist; }; /** -- cgit v1.2.3 From a1b7c5fd7fe98f51fbbc393ee1fc4c1cdb2f0119 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:17:09 -0800 Subject: net: sched: add cls_u32 offload hooks for netdevs This patch allows netdev drivers to consume cls_u32 offloads via the ndo_setup_tc ndo op. This aligns with how network drivers have been doing qdisc offloads for mqprio. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bc49967e1a68..59789ca6e2c8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -358,4 +358,38 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +struct tc_cls_u32_knode { + struct tcf_exts *exts; + u8 fshift; + u32 handle; + u32 val; + u32 mask; + u32 link_handle; + struct tc_u32_sel *sel; +}; + +struct tc_cls_u32_hnode { + u32 handle; + u32 prio; + unsigned int divisor; +}; + +enum tc_clsu32_command { + TC_CLSU32_NEW_KNODE, + TC_CLSU32_REPLACE_KNODE, + TC_CLSU32_DELETE_KNODE, + TC_CLSU32_NEW_HNODE, + TC_CLSU32_REPLACE_HNODE, + TC_CLSU32_DELETE_HNODE, +}; + +struct tc_cls_u32_offload { + /* knode values */ + enum tc_clsu32_command command; + union { + struct tc_cls_u32_knode knode; + struct tc_cls_u32_hnode hnode; + }; +}; + #endif -- cgit v1.2.3 From 3b01cf56daf96acf9b155d6201d94bc8b4de218e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Feb 2016 21:18:03 -0800 Subject: net: tc: helper functions to query action types This is a helper function drivers can use to learn if the action type is a drop action. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/tc_act/tc_gact.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 592a6bc02b0b..04a31830711b 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -2,6 +2,7 @@ #define __NET_TC_GACT_H #include +#include struct tcf_gact { struct tcf_common common; @@ -15,4 +16,19 @@ struct tcf_gact { #define to_gact(a) \ container_of(a->priv, struct tcf_gact, common) +#ifdef CONFIG_NET_CLS_ACT +static inline bool is_tcf_gact_shot(const struct tc_action *a) +{ + struct tcf_gact *gact; + + if (a->ops && a->ops->type != TCA_ACT_GACT) + return false; + + gact = a->priv; + if (gact->tcf_action == TC_ACT_SHOT) + return true; + + return false; +} +#endif #endif /* __NET_TC_GACT_H */ -- cgit v1.2.3 From 1cd4d5c4326a7ed3bb0e346bd7d20f5057a80ae6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Feb 2016 14:28:05 +0800 Subject: sctp: remove the unused sctp_datamsg_free() Since commit 8b570dc9f7b6 ("sctp: only drop the reference on the datamsg after sending a msg") used sctp_datamsg_put in sctp_sendmsg, instead of sctp_datamsg_free, this function has no use in sctp. So we will remove it. Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- include/net/sctp/structs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 205630bb5010..d05b56641abc 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -535,7 +535,6 @@ struct sctp_datamsg { struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); -void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); -- cgit v1.2.3 From e014860e31e2a66b1a94088504360a6ebc023564 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 17 Feb 2016 14:59:30 -0800 Subject: net: pack tc_cls_u32_knode struct slighter better By packing the structure we can remove a few holes as Jamal suggests. before: struct tc_cls_u32_knode { struct tcf_exts * exts; /* 0 8 */ u8 fshift; /* 8 1 */ /* XXX 3 bytes hole, try to pack */ u32 handle; /* 12 4 */ u32 val; /* 16 4 */ u32 mask; /* 20 4 */ u32 link_handle; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ struct tc_u32_sel * sel; /* 32 8 */ /* size: 40, cachelines: 1, members: 7 */ /* sum members: 33, holes: 2, sum holes: 7 */ /* last cacheline: 40 bytes */ }; after: struct tc_cls_u32_knode { struct tcf_exts * exts; /* 0 8 */ struct tc_u32_sel * sel; /* 8 8 */ u32 handle; /* 16 4 */ u32 val; /* 20 4 */ u32 mask; /* 24 4 */ u32 link_handle; /* 28 4 */ u8 fshift; /* 32 1 */ /* size: 40, cachelines: 1, members: 7 */ /* padding: 7 */ /* last cacheline: 40 bytes */ }; Suggested-by: Jamal Hadi Salim Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 59789ca6e2c8..2121df574262 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -360,12 +360,12 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) struct tc_cls_u32_knode { struct tcf_exts *exts; - u8 fshift; + struct tc_u32_sel *sel; u32 handle; u32 val; u32 mask; u32 link_handle; - struct tc_u32_sel *sel; + u8 fshift; }; struct tc_cls_u32_hnode { -- cgit v1.2.3 From d4ac05ff3697e036dcb0e2e284c5f7eb77cc0966 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 16 Feb 2016 21:58:57 +0100 Subject: vxlan: introduce vxlan_hdr Currently, pointer to the vxlan header is kept in a local variable. It has to be reloaded whenever the pskb pull operations are performed which usually happens somewhere deep in called functions. Create a vxlan_hdr function and use it to reference the vxlan header instead. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index b314e4af89c5..3f38b40ec4aa 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -262,6 +262,11 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, /* IPv6 header + UDP + VXLAN + Ethernet header */ #define VXLAN6_HEADROOM (40 + 8 + 8 + 14) +static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) +{ + return (struct vxlanhdr *)(udp_hdr(skb) + 1); +} + #if IS_ENABLED(CONFIG_VXLAN) void vxlan_get_rx_port(struct net_device *netdev); #else -- cgit v1.2.3 From 54bfd872bf16d40b61bd0cd9b769b2fef67dd272 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 16 Feb 2016 21:58:58 +0100 Subject: vxlan: keep flags and vni in network byte order Prevent repeated conversions from and to network order in the fast path. 
To achieve this, define all flag constants in big endian order and store VNI as __be32. To prevent confusion between the actual VNI value and the VNI field from the header (which contains additional reserved byte), strictly distinguish between "vni" and "vni_field". Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/net/vxlan.h | 70 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 3f38b40ec4aa..1b85a3b40c5a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -24,11 +24,11 @@ struct vxlanhdr { }; /* VXLAN header flags. */ -#define VXLAN_HF_VNI BIT(27) +#define VXLAN_HF_VNI cpu_to_be32(BIT(27)) #define VXLAN_N_VID (1u << 24) #define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8) +#define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VNI_HASH_BITS 10 @@ -55,14 +55,14 @@ struct vxlanhdr { */ /* VXLAN-RCO header flags. */ -#define VXLAN_HF_RCO BIT(21) +#define VXLAN_HF_RCO cpu_to_be32(BIT(21)) /* Remote checksum offload header option */ -#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */ -#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set *) */ -#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ +#define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ +#define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ +#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ #define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) -#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT) +#define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) /* * VXLAN Group Based Policy Extension (VXLAN_F_GBP): @@ -105,9 +105,9 @@ struct vxlanhdr_gbp { }; /* VXLAN-GBP header flags. 
*/ -#define VXLAN_HF_GBP BIT(31) +#define VXLAN_HF_GBP cpu_to_be32(BIT(31)) -#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF) +#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) /* skb->mark mapping * @@ -144,7 +144,7 @@ union vxlan_addr { struct vxlan_rdst { union vxlan_addr remote_ip; __be16 remote_port; - u32 remote_vni; + __be32 remote_vni; u32 remote_ifindex; struct list_head list; struct rcu_head rcu; @@ -154,7 +154,7 @@ struct vxlan_rdst { struct vxlan_config { union vxlan_addr remote_ip; union vxlan_addr saddr; - u32 vni; + __be32 vni; int remote_ifindex; int mtu; __be16 dst_port; @@ -267,6 +267,54 @@ static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) return (struct vxlanhdr *)(udp_hdr(skb) + 1); } +static inline __be32 vxlan_vni(__be32 vni_field) +{ +#if defined(__BIG_ENDIAN) + return vni_field >> 8; +#else + return (vni_field & VXLAN_VNI_MASK) << 8; +#endif +} + +static inline __be32 vxlan_vni_field(__be32 vni) +{ +#if defined(__BIG_ENDIAN) + return vni << 8; +#else + return vni >> 8; +#endif +} + +static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) +{ +#if defined(__BIG_ENDIAN) + return tun_id; +#else + return tun_id >> 32; +#endif +} + +static inline size_t vxlan_rco_start(__be32 vni_field) +{ + return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; +} + +static inline size_t vxlan_rco_offset(__be32 vni_field) +{ + return (vni_field & VXLAN_RCO_UDP) ? 
+ offsetof(struct udphdr, check) : + offsetof(struct tcphdr, check); +} + +static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) +{ + __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); + + if (offset == offsetof(struct udphdr, check)) + vni_field |= VXLAN_RCO_UDP; + return vni_field; +} + #if IS_ENABLED(CONFIG_VXLAN) void vxlan_get_rx_port(struct net_device *netdev); #else -- cgit v1.2.3 From 263ea09084d172cac6e40459a690babe8de8e448 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Feb 2016 15:03:26 +0100 Subject: Revert "genl: Add genlmsg_new_unicast() for unicast message allocation" This reverts commit bb9b18fb55b0 ("genl: Add genlmsg_new_unicast() for unicast message allocation")'. Nothing wrong with it; its no longer needed since this was only for mmapped netlink support. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/genetlink.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 43c0e771f417..8d4608ce8716 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -83,7 +83,6 @@ struct genl_family { * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers - * @dst_sk: destination socket */ struct genl_info { u32 snd_seq; @@ -94,7 +93,6 @@ struct genl_info { struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; - struct sock * dst_sk; }; static inline struct net *genl_info_net(struct genl_info *info) @@ -188,8 +186,6 @@ int genl_unregister_family(struct genl_family *family); void genl_notify(struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); -struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info, - gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, struct genl_family *family, int flags, u8 cmd); -- cgit v1.2.3 From 07dabf20d9867710b90b91108b2adcd448773e25 Mon Sep 17 
00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 19:19:29 +0100 Subject: vxlan: tun_id is 64bit, not 32bit The tun_id field in struct ip_tunnel_key is __be64, not __be32. We need to convert the vni to tun_id correctly. Fixes: 54bfd872bf16 ("vxlan: keep flags and vni in network byte order") Reported-by: Paolo Abeni Tested-by: Paolo Abeni Signed-off-by: Jiri Benc Acked-by: Thadeu Lima de Souza Cascardo Signed-off-by: David S. Miller --- include/net/vxlan.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 1b85a3b40c5a..748083de367a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -294,6 +294,15 @@ static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) #endif } +static inline __be64 vxlan_vni_to_tun_id(__be32 vni) +{ +#if defined(__BIG_ENDIAN) + return (__be64)vni; +#else + return (__be64)vni << 32; +#endif +} + static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; -- cgit v1.2.3 From 7f290c94352e59b1d720055fce760a69a63bd0a1 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 18 Feb 2016 11:22:52 +0100 Subject: iptunnel: scrub packet in iptunnel_pull_header Part of skb_scrub_packet was open coded in iptunnel_pull_header. Let it call skb_scrub_packet directly instead. Signed-off-by: Jiri Benc Signed-off-by: David S. 
Miller --- include/net/ip_tunnels.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 87408ab80856..4dd616376fec 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -270,7 +270,8 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, return INET_ECN_encapsulate(tos, inner); } -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); -- cgit v1.2.3 From e550785c30f639b3cc6ca70c489a6463ff298453 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Wed, 17 Feb 2016 16:20:33 -0800 Subject: ipv6: Annotate change of locking mechanism for np->opt follows up commit 45f6fad84cc3 ("ipv6: add complete rcu protection around np->opt") which added mixed rcu/refcount protection to np->opt. Given the current implementation of rcu_pointer_handoff(), this has no effect at runtime. Signed-off-by: Benjamin Poirier Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/ipv6.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6570f379aba2..f3c9857c645d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -259,8 +259,12 @@ static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) rcu_read_lock(); opt = rcu_dereference(np->opt); - if (opt && !atomic_inc_not_zero(&opt->refcnt)) - opt = NULL; + if (opt) { + if (!atomic_inc_not_zero(&opt->refcnt)) + opt = NULL; + else + opt = rcu_pointer_handoff(opt); + } rcu_read_unlock(); return opt; } -- cgit v1.2.3 From 745041e2aaf1d668f293aaab4b0f6ad7daa056a5 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Fri, 19 Feb 2016 09:43:16 +0000 Subject: lwtunnel: autoload of lwt modules The lwt implementations using net devices can autoload using the existing mechanism using IFLA_INFO_KIND. However, there's no mechanism that lwt modules not using net devices can use. Therefore, add the ability to autoload modules registering lwt operations for lwt implementations not using a net device so that users don't have to manually load the modules. Only users with the CAP_NET_ADMIN capability can cause modules to be loaded, which is ensured by rtnetlink_rcv_msg rejecting non-RTM_GETxxx messages for users without this capability, and by lwtunnel_build_state not being called in response to RTM_GETxxx messages. Signed-off-by: Robert Shearman Signed-off-by: David S. 
Miller --- include/net/lwtunnel.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 66350ce3e955..e9f116e29c22 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -170,6 +170,8 @@ static inline int lwtunnel_input(struct sk_buff *skb) return -EOPNOTSUPP; } -#endif +#endif /* CONFIG_LWTUNNEL */ + +#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */ -- cgit v1.2.3 From 6ceb31ca5f65acff299dbc3da5854d54e147b7d8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 19 Feb 2016 11:26:31 -0800 Subject: VXLAN: Support outer IPv4 Tx checksums by default This change makes it so that if UDP CSUM is not specified we will default to enabling it. The main motivation behind this is the fact that with the use of outer checksum we can greatly improve the performance for VXLAN tunnels on devices that don't know how to parse tunnel headers. Signed-off-by: Alexander Duyck Acked-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 748083de367a..6eda4ed4d78b 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -197,7 +197,7 @@ struct vxlan_dev { #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 #define VXLAN_F_IPV6 0x20 -#define VXLAN_F_UDP_CSUM 0x40 +#define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 #define VXLAN_F_REMCSUM_TX 0x200 -- cgit v1.2.3 From 6d5d2ee63cee7025badda3b74ae2ef7ab097acfa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 Jan 2016 19:28:58 +0100 Subject: Bluetooth: add LED trigger for indicating HCI is powered up Add support for LED triggers to the Bluetooth subsystem and add kernel config symbol BT_LEDS for it. 
For now one trigger for indicating "HCI is powered up" is supported. Signed-off-by: Heiner Kallweit Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index d4f82edb5cff..dc71473462ac 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -25,6 +25,7 @@ #ifndef __HCI_CORE_H #define __HCI_CORE_H +#include #include #include @@ -396,6 +397,8 @@ struct hci_dev { struct delayed_work rpa_expired; bdaddr_t rpa; + struct led_trigger *power_led; + int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); -- cgit v1.2.3 From 07b0188adf7298bf80a9890d3e90f27e973623d3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 19 Feb 2016 09:59:11 +0100 Subject: mac802154: fix mac header length check I got a report that the WARN_ON sometimes triggers there, which should never happen. I came to the conclusion that the mac header is there but inside the headroom of skb. The skb->len information doesn't contain the information about the headroom length and skb->len is less than two. We now check whether the skb_mac_header pointer is set and check the room between the mac header pointer and the tail pointer. 
Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index da574bbdc333..2e3cdd2048d2 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -247,8 +247,9 @@ struct ieee802154_ops { */ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) { - /* return some invalid fc on failure */ - if (unlikely(skb->len < 2)) { + /* check if we can fc at skb_mac_header of sk buffer */ + if (unlikely(!skb_mac_header_was_set(skb) || + (skb_tail_pointer(skb) - skb_mac_header(skb)) < 2)) { WARN_ON(1); return cpu_to_le16(0); } -- cgit v1.2.3 From 5609c185f24dffca5f6a9c127106869da150be03 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 22 Feb 2016 09:13:54 +0100 Subject: 6lowpan: iphc: add support for stateful compression This patch introduces support for IPHC stateful address compression. It will offer the context table via one debugfs entry. This debugfs entry has a directory for each cid entry of the context table. Inside each cid directory there exist the following files: - "active": If the entry is added or deleted. The context table is originally a list implementation; this flag indicates whether the context is part of the list or not. - "prefix": The ipv6 prefix. - "prefix_length": The prefix length for the prefix. - "compression": The compression flag according to RFC6775. This part should be moved into sysfs after some testing time. Also the debugfs entry contains a "show" file which is a pretty-printout for the current context table information. 
Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/6lowpan.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/net') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index 2f6a3f2233ed..da3a77d25fcb 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -75,6 +75,8 @@ #define LOWPAN_IPHC_MAX_HC_BUF_LEN (sizeof(struct ipv6hdr) + \ LOWPAN_IPHC_MAX_HEADER_LEN + \ LOWPAN_NHC_MAX_HDR_LEN) +/* SCI/DCI is 4 bit width, so we have maximum 16 entries */ +#define LOWPAN_IPHC_CTX_TABLE_SIZE (1 << 4) #define LOWPAN_DISPATCH_IPV6 0x41 /* 01000001 = 65 */ #define LOWPAN_DISPATCH_IPHC 0x60 /* 011xxxxx = ... */ @@ -98,9 +100,39 @@ enum lowpan_lltypes { LOWPAN_LLTYPE_IEEE802154, }; +enum lowpan_iphc_ctx_flags { + LOWPAN_IPHC_CTX_FLAG_ACTIVE, + LOWPAN_IPHC_CTX_FLAG_COMPRESSION, +}; + +struct lowpan_iphc_ctx { + u8 id; + struct in6_addr pfx; + u8 plen; + unsigned long flags; +}; + +struct lowpan_iphc_ctx_table { + spinlock_t lock; + const struct lowpan_iphc_ctx_ops *ops; + struct lowpan_iphc_ctx table[LOWPAN_IPHC_CTX_TABLE_SIZE]; +}; + +static inline bool lowpan_iphc_ctx_is_active(const struct lowpan_iphc_ctx *ctx) +{ + return test_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags); +} + +static inline bool +lowpan_iphc_ctx_is_compression(const struct lowpan_iphc_ctx *ctx) +{ + return test_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags); +} + struct lowpan_priv { enum lowpan_lltypes lltype; struct dentry *iface_debugfs; + struct lowpan_iphc_ctx_table ctx; /* must be last */ u8 priv[0] __aligned(sizeof(void *)); -- cgit v1.2.3 From a6692754d61a6b3735803783f394880805675f99 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 12 Feb 2016 12:09:39 -0500 Subject: net: dsa: pass bridge down to drivers Some DSA drivers may or may not support multiple software bridges on top of an hardware switch. 
It is more convenient for them to access the bridge's net_device for finer configuration. Removing the need to craft and access a bitmask also simplifies the code. This patch changes the signature of bridge related functions, update DSA drivers, and removes dsa_slave_br_port_mask. Signed-off-by: Vivien Didelot Tested-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 26a0e86e611e..1c845d7bf0b2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -297,9 +297,8 @@ struct dsa_switch_driver { * Bridge integration */ int (*port_join_bridge)(struct dsa_switch *ds, int port, - u32 br_port_mask); - int (*port_leave_bridge)(struct dsa_switch *ds, int port, - u32 br_port_mask); + struct net_device *bridge); + int (*port_leave_bridge)(struct dsa_switch *ds, int port); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); -- cgit v1.2.3 From 412a6d800c7380c1b87c11080c7da905c27cfea8 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 8 Dec 2015 19:09:05 +0200 Subject: mac80211: support hw managing reorder logic Enable driver to manage the reordering logic itself. This is needed for example for the iwlwifi driver that will support hardware assisted reordering. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6c9c559394b0..ee6305a52251 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1929,6 +1929,11 @@ struct ieee80211_txq { * by just its MAC address; this prevents, for example, the same station * from connecting to two virtual AP interfaces at the same time. 
* + * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the + * reordering buffer internally, guaranteeing mac80211 receives frames in + * order and does not need to manage its own reorder buffer or BA session + * timeout. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -1965,6 +1970,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_AMSDU_IN_AMPDU, IEEE80211_HW_BEACON_TX_STATUS, IEEE80211_HW_NEEDS_UNIQUE_STA_ADDR, + IEEE80211_HW_SUPPORTS_REORDERING_BUFFER, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 178830481eee5eea147a1c8fab67a96e09d80345 Mon Sep 17 00:00:00 2001 From: Grzegorz Bajorski Date: Fri, 11 Dec 2015 14:39:46 +0100 Subject: mac80211: allow drivers to report (non-)monitor frames Some drivers offload some frames internally (e.g. AddBa). Reporting such frames to mac80211 would only confuse MLME. However it would be useful to be able to pass such frames to monitor interfaces for sniffing purposes, e.g. when running AP + monitor. To do that allow drivers to tell mac80211 whether a given frame should be: - processed but not delivered to any monitor vif - not processed but delievered to monitor vifs only Signed-off-by: Grzegorz Bajorski Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ee6305a52251..5910085af9e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1031,6 +1031,14 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_DELIM_CRC_KNOWN: The delimiter CRC field is known (the CRC * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_LDPC: LDPC was used + * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without + * processing it in any regular way. 
+ * This is useful if drivers offload some frames but still want to report + * them for sniffing purposes. + * @RX_FLAG_SKIP_MONITOR: Process and report frame to all interfaces except + * monitor interfaces. + * This is useful if drivers offload some frames but still want to report + * them for sniffing purposes. * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 * @RX_FLAG_10MHZ: 10 MHz (half channel) was used * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used @@ -1071,6 +1079,8 @@ enum mac80211_rx_flags { RX_FLAG_MACTIME_END = BIT(21), RX_FLAG_VHT = BIT(22), RX_FLAG_LDPC = BIT(23), + RX_FLAG_ONLY_MONITOR = BIT(24), + RX_FLAG_SKIP_MONITOR = BIT(25), RX_FLAG_STBC_MASK = BIT(26) | BIT(27), RX_FLAG_10MHZ = BIT(28), RX_FLAG_5MHZ = BIT(29), @@ -1089,6 +1099,7 @@ enum mac80211_rx_flags { * @RX_VHT_FLAG_160MHZ: 160 MHz was used * @RX_VHT_FLAG_BF: packet was beamformed */ + enum mac80211_rx_vht_flags { RX_VHT_FLAG_80MHZ = BIT(0), RX_VHT_FLAG_160MHZ = BIT(1), -- cgit v1.2.3 From 506bcfa8abebdbcebdc17b03e96e38dc0b8ce765 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 13 Dec 2015 15:41:05 +0200 Subject: mac80211: limit the A-MSDU Tx based on peer's capabilities In VHT, the specification allows to limit the number of MSDUs in an A-MSDU in the Extended Capabilities IE. There is also a limitation on the byte size in the VHT IE. In HT, the only limitation is on the byte size. Parse the capabilities from the peer and make them available to the driver. In HT, there is another limitation when a BA agreement is active: the byte size can't be greater than 4095. This is not enforced here. 
Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 5910085af9e6..df5698ed8052 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1714,6 +1714,18 @@ struct ieee80211_sta_rates { * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. * @mfp: indicates whether the STA uses management frame protection or not. + * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single + * A-MSDU. Taken from the Extended Capabilities element. 0 means + * unlimited. + * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. This + * field is always valid for packets with a VHT preamble. For packets + * with a HT preamble, additional limits apply: + * + If the skb is transmitted as part of a BA agreement, the + * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. + * + If the skb is not part of a BA aggreement, the A-MSDU maximal + * size is min(max_amsdu_len, 7935) bytes. + * Both additional HT limits must be enforced by the low level driver. + * This is defined by the spec (IEEE 802.11-2012 section 8.3.2.2 NOTE 2). * @txq: per-TID data TX queues (if driver uses the TXQ abstraction) */ struct ieee80211_sta { @@ -1732,6 +1744,8 @@ struct ieee80211_sta { bool tdls; bool tdls_initiator; bool mfp; + u8 max_amsdu_subframes; + u16 max_amsdu_len; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS]; -- cgit v1.2.3 From 538dc9045251d3d6b5c0216a5c61c32bd9cedac9 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 24 Dec 2015 00:33:26 -0800 Subject: mac80211: Make addr const in SET_IEEE80211_PERM_ADDR() Make the addr parameter const in SET_IEEE80211_PERM_ADDR() to save clients from having to cast away a const qualifier. 
Signed-off-by: Bjorn Andersson Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index df5698ed8052..566df20dc957 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2217,7 +2217,7 @@ static inline void SET_IEEE80211_DEV(struct ieee80211_hw *hw, struct device *dev * @hw: the &struct ieee80211_hw to set the MAC address for * @addr: the address to set */ -static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, u8 *addr) +static inline void SET_IEEE80211_PERM_ADDR(struct ieee80211_hw *hw, const u8 *addr) { memcpy(hw->wiphy->perm_addr, addr, ETH_ALEN); } -- cgit v1.2.3 From f4a0f0c5264e72d9279fbf9cf48a061526e8f788 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 25 Jan 2016 15:46:34 +0200 Subject: mac80211: add RX_FLAG_MACTIME_PLCP_START The timestamp given by iwlwifi is at the beginning of the frame over the air, at (or during) the SYNC field. Allow such timestamps to be given to mac80211, at least (for now) for frames with non-HT/VHT preambles. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 566df20dc957..31337f81ec03 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1010,6 +1010,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_MACTIME_END: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the last symbol of the MPDU * (including FCS) was received. + * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime + * field) is valid and contains the time the SYNC preamble was received. 
* @RX_FLAG_SHORTPRE: Short preamble was used for this frame * @RX_FLAG_HT: HT MCS was used and rate_idx is MCS index * @RX_FLAG_VHT: VHT MCS was used and rate_index is MCS index @@ -1058,6 +1060,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), + RX_FLAG_MACTIME_PLCP_START = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), -- cgit v1.2.3 From dfdfc2beb0dd7e3a067d2eeacb4623cb48e77658 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 26 Jan 2016 17:11:13 +0100 Subject: mac80211: Parse legacy and HT rate in injected frames Drivers/devices without their own rate control algorithm can get the information what rates they should use from either the radiotap header of injected frames or from the rate control algorithm. But the parsing of the legacy rate information from the radiotap header was removed in commit e6a9854b05c1 ("mac80211/drivers: rewrite the rate control API"). The removal of this feature heavily reduced the usefulness of frame injection when wanting to simulate specific transmission behavior. Having rate parsing together with MCS rates and retry support allows a fine grained selection of the tx behavior of injected frames for these kind of tests. Signed-off-by: Sven Eckelmann Cc: Simon Wunderlich Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 31337f81ec03..dbcd69a6bfda 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -708,12 +708,14 @@ enum mac80211_tx_info_flags { * protocol frame (e.g. EAP) * @IEEE80211_TX_CTRL_PS_RESPONSE: This frame is a response to a poll * frame (PS-Poll or uAPSD). + * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * * These flags are used in tx_info->control.flags. 
*/ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_PORT_CTRL_PROTO = BIT(0), IEEE80211_TX_CTRL_PS_RESPONSE = BIT(1), + IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), }; /* -- cgit v1.2.3 From f2ac7e301ae6397669ff3f79e691942a9b5d2f39 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Wed, 27 Jan 2016 15:26:12 +0100 Subject: mac80211: expose txq queue depth and size to drivers This will allow drivers to make more educated decisions whether to defer transmission or not. Relying on wake_tx_queue() call count implicitly was not possible because it could be called without queued frame count actually changing on software tx aggregation start/stop code paths. It was also not possible to know how long byte-wise queue was without dequeueing. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index dbcd69a6bfda..fd35fc4d7127 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5596,4 +5596,19 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); */ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); + +/** + * ieee80211_txq_get_depth - get pending frame/byte count of given txq + * + * The values are not guaranteed to be coherent with regard to each other, i.e. + * txq state can change half-way of this function and the caller may end up + * with "new" frame_cnt and "old" byte_cnt or vice-versa. 
+ * + * @txq: pointer obtained from station or virtual interface + * @frame_cnt: pointer to store frame count + * @byte_cnt: pointer to store byte count + */ +void ieee80211_txq_get_depth(struct ieee80211_txq *txq, + unsigned long *frame_cnt, + unsigned long *byte_cnt); #endif /* MAC80211_H */ -- cgit v1.2.3 From 06470f7468c8b6c95e72ebda803a61a99f4ee446 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Thu, 28 Jan 2016 16:19:25 +0200 Subject: mac80211: add API to allow filtering frames in BA sessions If any frames are dropped that are part of a BA session, the reorder buffer will "indefinitely" (until the timeout) wait for them to come in (or a BAR moving the window) and won't release frames after them. This means it isn't possible to filter frames within a BA session in firmware. Introduce an API function that allows such filtering. Calling this function will move the BA window forward to the new SSN, and allows marking frames after the SSN as having been filtered, so any future reordering activity will release frames while skipping the holes. 
Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fd35fc4d7127..57147749ae42 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -5193,6 +5193,24 @@ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw); void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, const u8 *addr); +/** + * ieee80211_mark_rx_ba_filtered_frames - move RX BA window and mark filtered + * @pubsta: station struct + * @tid: the session's TID + * @ssn: starting sequence number of the bitmap, all frames before this are + * assumed to be out of the window after the call + * @filtered: bitmap of filtered frames, BIT(0) is the @ssn entry etc. + * @received_mpdus: number of received mpdus in firmware + * + * This function moves the BA window and releases all frames before @ssn, and + * marks frames marked in the bitmap as having been filtered. Afterwards, it + * checks if any frames in the window starting from @ssn can now be released + * (in case they were only waiting for frames that were filtered.) 
+ */ +void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, + u16 ssn, u64 filtered, + u16 received_mpdus); + /** * ieee80211_send_bar - send a BlockAckReq frame * -- cgit v1.2.3 From 34d505193bd10668acf1caba02d2f66bddc23fea Mon Sep 17 00:00:00 2001 From: Lior David Date: Thu, 28 Jan 2016 10:58:25 +0200 Subject: cfg80211: basic support for PBSS network type PBSS (Personal Basic Service Set) is a new BSS type for DMG networks. It is similar to infrastructure BSS, having an AP-like entity called PCP (PBSS Control Point), but it has few differences. PBSS support is mandatory for 11ad devices. Add support for PBSS by introducing a new PBSS flag attribute. The PBSS flag is used in the START_AP command to request starting a PCP instead of an AP, and in the CONNECT command to request connecting to a PCP instead of an AP. Signed-off-by: Lior David Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9bcaaf7cd15a..9e1b24c29f0c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -712,6 +712,8 @@ struct cfg80211_acl_data { * @p2p_opp_ps: P2P opportunistic PS * @acl: ACL configuration used by the drivers which has support for * MAC address based access control + * @pbss: If set, start as a PCP instead of AP. Relevant for DMG + * networks. */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -730,6 +732,7 @@ struct cfg80211_ap_settings { u8 p2p_ctwindow; bool p2p_opp_ps; const struct cfg80211_acl_data *acl; + bool pbss; }; /** @@ -1888,6 +1891,8 @@ struct cfg80211_ibss_params { * @ht_capa_mask: The bits of ht_capa which are to be used. * @vht_capa: VHT Capability overrides * @vht_capa_mask: The bits of vht_capa which are to be used. + * @pbss: if set, connect to a PCP instead of AP. Valid for DMG + * networks. 
*/ struct cfg80211_connect_params { struct ieee80211_channel *channel; @@ -1910,6 +1915,7 @@ struct cfg80211_connect_params { struct ieee80211_ht_cap ht_capa_mask; struct ieee80211_vht_cap vht_capa; struct ieee80211_vht_cap vht_capa_mask; + bool pbss; }; /** @@ -3489,6 +3495,7 @@ struct cfg80211_cached_keys; * registered for unexpected class 3 frames (AP mode) * @conn: (private) cfg80211 software SME connection state machine data * @connect_keys: (private) keys to set after connection is established + * @conn_bss_type: connecting/connected BSS type * @ibss_fixed: (private) IBSS is using fixed BSSID * @ibss_dfs_possible: (private) IBSS may change to a DFS channel * @event_list: (private) list for internal event processing @@ -3519,6 +3526,7 @@ struct wireless_dev { u8 ssid_len, mesh_id_len, mesh_id_up_len; struct cfg80211_conn *conn; struct cfg80211_cached_keys *connect_keys; + enum ieee80211_bss_type conn_bss_type; struct list_head event_list; spinlock_t event_lock; -- cgit v1.2.3 From f8079d43cf0f1f0171606e75fcef6fe17bb183f2 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Sun, 14 Feb 2016 13:56:35 +0200 Subject: mac80211: move TKIP TX IVs to public part of key struct Some drivers/devices might want to set the IVs by themselves (and still let mac80211 generate MMIC). Specifically, this is needed when the device does offloading at certain times, and the driver has to make sure that the IVs of new tx frames (from the host) are synchronized with IVs that were potentially used during the offloading. Similarly to CCMP, move the TX IVs of TKIP keys to the public part of the key struct, and export a function to add the IV right into the crypto header. The public tx_pn field is defined as atomic64, so define TKIP_PN_TO_IV16/32 helper macros to convert it to iv16/32 when needed. Since the iv32 used for the p1k cache is taken directly from the frame, we can safely remove iv16/32 from being protected by tkip.txlock. 
Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 57147749ae42..15879b49baad 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1521,9 +1521,8 @@ enum ieee80211_key_flags { * wants to be given when a frame is transmitted and needs to be * encrypted in hardware. * @cipher: The key's cipher suite selector. - * @tx_pn: PN used for TX on non-TKIP keys, may be used by the driver - * as well if it needs to do software PN assignment by itself - * (e.g. due to TSO) + * @tx_pn: PN used for TX keys, may be used by the driver as well if it + * needs to do software PN assignment by itself (e.g. due to TSO) * @flags: key flags, see &enum ieee80211_key_flags. * @keyidx: the key index (0-3) * @keylen: key material length @@ -1549,6 +1548,9 @@ struct ieee80211_key_conf { #define IEEE80211_MAX_PN_LEN 16 +#define TKIP_PN_TO_IV16(pn) ((u16)(pn & 0xffff)) +#define TKIP_PN_TO_IV32(pn) ((u32)((pn >> 16) & 0xffffffff)) + /** * struct ieee80211_key_seq - key sequence counter * @@ -4446,6 +4448,21 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u8 *p2k); +/** + * ieee80211_tkip_add_iv - write TKIP IV and Ext. IV to pos + * + * @pos: start of crypto header + * @keyconf: the parameter passed with the set key + * @pn: PN to add + * + * Returns: pointer to the octet following IVs (i.e. 
beginning of + * the packet payload) + * + * This function writes the tkip IV value to pos (which should + * point to the crypto header) + */ +u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn); + /** * ieee80211_get_key_tx_seq - get key TX sequence counter * -- cgit v1.2.3 From ca48ebbc7ea7e82e3ae4b55aacead0cdb54ff008 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Mon, 15 Feb 2016 12:34:10 +0200 Subject: mac80211: remove ieee80211_get_key_tx_seq/ieee80211_set_key_tx_seq Since the PNs of all the tx keys are now tracked in the public part of the key struct (with atomic counter), we no longer need these functions. dvm and vt665{5,6} are currently the only users of these functions, so update them accordingly. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 15879b49baad..66155d3ad7e6 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4463,23 +4463,6 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, */ u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn); -/** - * ieee80211_get_key_tx_seq - get key TX sequence counter - * - * @keyconf: the parameter passed with the set key - * @seq: buffer to receive the sequence data - * - * This function allows a driver to retrieve the current TX IV/PN - * for the given key. It must not be called if IV generation is - * offloaded to the device. - * - * Note that this function may only be called when no TX processing - * can be done concurrently, for example when queues are stopped - * and the stop has been synchronized. 
- */ -void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq); - /** * ieee80211_get_key_rx_seq - get key RX sequence counter * @@ -4499,23 +4482,6 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); -/** - * ieee80211_set_key_tx_seq - set key TX sequence counter - * - * @keyconf: the parameter passed with the set key - * @seq: new sequence data - * - * This function allows a driver to set the current TX IV/PNs for the - * given key. This is useful when resuming from WoWLAN sleep and the - * device may have transmitted frames using the PTK, e.g. replies to - * ARP requests. - * - * Note that this function may only be called when no TX processing - * can be done concurrently. - */ -void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, - struct ieee80211_key_seq *seq); - /** * ieee80211_set_key_rx_seq - set key RX sequence counter * -- cgit v1.2.3 From 65554d07adfc22bb9e14f6df8c609a646f869a74 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 16 Feb 2016 12:48:17 +0200 Subject: mac80211: provide interface to driver to set VHT MU-MIMO data Provide an interface to the lower level driver to set the VHT MU-MIMO data. This is needed for example when there is an update of the group data during low power state, where the management frame will not be passed to the host at all. 
Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 66155d3ad7e6..23f2a5ecf669 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5445,6 +5445,21 @@ ieee80211_vif_type_p2p(struct ieee80211_vif *vif) return ieee80211_iftype_p2p(vif->type, vif->p2p); } +/** + * ieee80211_update_mu_groups - set the VHT MU-MIMO group data + * + * @vif: the specified virtual interface + * @membership: 64 bits array - a bit is set if station is member of the group + * @position: 2 bits per group id indicating the position in the group + * + * Note: This function assumes that the given vif is valid and the position and + * membership data is of the correct size and are in the same byte order as the + * matching GroupId management frame. + * Calls to this function need to be serialized with RX path. + */ +void ieee80211_update_mu_groups(struct ieee80211_vif *vif, + const u8 *membership, const u8 *position); + void ieee80211_enable_rssi_reports(struct ieee80211_vif *vif, int rssi_min_thold, int rssi_max_thold); -- cgit v1.2.3 From b5a33d52595f0cb153f09bf45a5dcd66a7418dbb Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Tue, 16 Feb 2016 12:48:18 +0200 Subject: mac80211: move MU_MIMO_OWNER flag to ieee80211_vif Drivers may need to track which vif is using VHT MU-MIMO. Move the flag indicating the ownership of MU_MIMO to ieee80211_vif.
Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 23f2a5ecf669..0c09da34b67a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1382,6 +1382,7 @@ enum ieee80211_vif_flags { * @csa_active: marks whether a channel switch is going on. Internally it is * write-protected by sdata_lock and local->mtx so holding either is fine * for read access. + * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed @@ -1408,6 +1409,7 @@ struct ieee80211_vif { u8 addr[ETH_ALEN]; bool p2p; bool csa_active; + bool mu_mimo_owner; u8 cab_queue; u8 hw_queue[IEEE80211_NUM_ACS]; -- cgit v1.2.3 From 1d4150c02c5709fdfd80f10368a31867de35e72e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 22 Feb 2016 15:57:52 -0800 Subject: net_sched: prepare tcf_hashinfo_destroy() for netns support We only release the memory of the hashtable itself, not its entries inside. This is not a problem yet since we only call it in module release path, and module is refcount'ed by actions. This would be a problem after we move the per module hinfo into per netns in the latter patch. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/act_api.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9d446f136607..8c4e3ff723fb 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -65,11 +65,6 @@ static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask) return 0; } -static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf) -{ - kfree(hf->htab); -} - /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. */ -- cgit v1.2.3 From ddf97ccdd7cb7e00daba465a5c947b8d941dc2a4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 22 Feb 2016 15:57:53 -0800 Subject: net_sched: add network namespace support for tc actions Currently tc actions are stored in a per-module hashtable, therefore are visible to all network namespaces. This is probably the last part of the tc subsystem which is not aware of netns now. This patch makes them per-netns, several tc action API's need to be adjusted for this. The tc action API code is ugly due to historical reasons, we need to refactor that code in the future. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/net/act_api.h | 58 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8c4e3ff723fb..342be6c5ab5c 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -7,6 +7,8 @@ #include #include +#include +#include struct tcf_common { struct hlist_node tcfc_head; @@ -87,31 +89,65 @@ struct tc_action { __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ __u32 order; struct list_head list; + struct tcf_hashinfo *hinfo; }; struct tc_action_ops { struct list_head head; - struct tcf_hashinfo *hinfo; char kind[IFNAMSIZ]; __u32 type; /* TBD to match kind */ struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *, int bind); - int (*lookup)(struct tc_action *, u32); + int (*lookup)(struct net *, struct tc_action *, u32); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *act, int ovr, int bind); - int (*walk)(struct sk_buff *, struct netlink_callback *, int, struct tc_action *); + int (*walk)(struct net *, struct sk_buff *, + struct netlink_callback *, int, struct tc_action *); +}; + +struct tc_action_net { + struct tcf_hashinfo *hinfo; + const struct tc_action_ops *ops; }; -int tcf_hash_search(struct tc_action *a, u32 index); -u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo); -int tcf_hash_check(u32 index, struct tc_action *a, int bind); -int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind, bool cpustats); +static inline +int tc_action_net_init(struct tc_action_net *tn, const struct tc_action_ops *ops, + unsigned int mask) +{ + int err = 0; + + tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL); + if (!tn->hinfo) + return -ENOMEM; + tn->ops = ops; + err = tcf_hashinfo_init(tn->hinfo, mask); + if 
(err) + kfree(tn->hinfo); + return err; +} + +void tcf_hashinfo_destroy(const struct tc_action_ops *ops, + struct tcf_hashinfo *hinfo); + +static inline void tc_action_net_exit(struct tc_action_net *tn) +{ + tcf_hashinfo_destroy(tn->ops, tn->hinfo); +} + +int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, + struct netlink_callback *cb, int type, + struct tc_action *a); +int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index); +u32 tcf_hash_new_index(struct tc_action_net *tn); +int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a, + int bind); +int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + struct tc_action *a, int size, int bind, bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); -void tcf_hash_insert(struct tc_action *a); +void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a); int __tcf_hash_release(struct tc_action *a, bool bind, bool strict); @@ -120,8 +156,8 @@ static inline int tcf_hash_release(struct tc_action *a, bool bind) return __tcf_hash_release(a, bind, false); } -int tcf_register_action(struct tc_action_ops *a, unsigned int mask); -int tcf_unregister_action(struct tc_action_ops *a); +int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); +int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_action_destroy(struct list_head *actions, int bind); int tcf_action_exec(struct sk_buff *skb, const struct list_head *actions, struct tcf_result *res); -- cgit v1.2.3 From 65aebfc002abc1827ac7c8644a2bba0459ce3ce2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 Feb 2016 12:13:54 -0500 Subject: net: dsa: add port_vlan_dump routine Similar to port_fdb_dump, add a port_vlan_dump function to DSA drivers which gets passed the switchdev VLAN object and callback. 
This function, if implemented, takes precedence over the soon legacy vlan_getnext/port_pvid_get approach. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1c845d7bf0b2..ebc0d9ea96a1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -313,6 +313,9 @@ struct dsa_switch_driver { struct switchdev_trans *trans); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); + int (*port_vlan_dump)(struct dsa_switch *ds, int port, + struct switchdev_obj_port_vlan *vlan, + int (*cb)(struct switchdev_obj *obj)); int (*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid); int (*vlan_getnext)(struct dsa_switch *ds, u16 *vid, unsigned long *ports, unsigned long *untagged); -- cgit v1.2.3 From 477b184526a7f44164029eea720da0e0c888cac6 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 23 Feb 2016 12:13:56 -0500 Subject: net: dsa: drop vlan_getnext The VLAN GetNext operation is specific to some switches, and thus can be complicated to implement for some drivers. Remove the support for the vlan_getnext/port_pvid_get approach in favor of the generic and simpler port_vlan_dump function. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ebc0d9ea96a1..3dd54867174a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -316,9 +316,6 @@ struct dsa_switch_driver { int (*port_vlan_dump)(struct dsa_switch *ds, int port, struct switchdev_obj_port_vlan *vlan, int (*cb)(struct switchdev_obj *obj)); - int (*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid); - int (*vlan_getnext)(struct dsa_switch *ds, u16 *vid, - unsigned long *ports, unsigned long *untagged); /* * Forwarding database -- cgit v1.2.3 From 3f2fb9a834cb1fcddbae22deca7fde136944dc89 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 24 Feb 2016 11:47:02 -0800 Subject: net: l3mdev: address selection should only consider devices in L3 domain David Lamparter noted a use case where the source address selection fails to pick an address from a VRF interface - unnumbered interfaces. Relevant commands from his script: ip addr add 9.9.9.9/32 dev lo ip link set lo up ip link add name vrf0 type vrf table 101 ip rule add oif vrf0 table 101 ip rule add iif vrf0 table 101 ip link set vrf0 up ip addr add 10.0.0.3/32 dev vrf0 ip link add name dummy2 type dummy ip link set dummy2 master vrf0 up --> note dummy2 has no address - unnumbered device ip route add 10.2.2.2/32 dev dummy2 table 101 ip neigh add 10.2.2.2 dev dummy2 lladdr 02:00:00:00:00:02 tcpdump -ni dummy2 & And using ping instead of his socat example: $ ping -I vrf0 -c1 10.2.2.2 ping: Warning: source address might be selected on device other than vrf0. PING 10.2.2.2 (10.2.2.2) from 9.9.9.9 vrf0: 56(84) bytes of data. From tcpdump: 12:57:29.449128 IP 9.9.9.9 > 10.2.2.2: ICMP echo request, id 2491, seq 1, length 64 Note the source address is from lo and is not a VRF local address. With this patch: $ ping -I vrf0 -c1 10.2.2.2 PING 10.2.2.2 (10.2.2.2) from 10.0.0.3 vrf0: 56(84) bytes of data.
From tcpdump: 12:59:25.096426 IP 10.0.0.3 > 10.2.2.2: ICMP echo request, id 2113, seq 1, length 64 Now the source address comes from vrf0. The ipv4 function for selecting source address takes a const argument. Removing the const requires touching a lot of places, so instead l3mdev_master_ifindex_rcu is changed to take a const argument and then do the typecast to non-const as required by netdev_master_upper_dev_get_rcu. This is similar to what l3mdev_fib_table_rcu does. IPv6 for unnumbered interfaces appears to be selecting the addresses properly. Cc: David Lamparter Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 5567d46b3cff..c43a9c73de5e 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -39,7 +39,7 @@ struct l3mdev_ops { #ifdef CONFIG_NET_L3_MASTER_DEV -int l3mdev_master_ifindex_rcu(struct net_device *dev); +int l3mdev_master_ifindex_rcu(const struct net_device *dev); static inline int l3mdev_master_ifindex(struct net_device *dev) { int ifindex; @@ -179,7 +179,7 @@ struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net, #else -static inline int l3mdev_master_ifindex_rcu(struct net_device *dev) +static inline int l3mdev_master_ifindex_rcu(const struct net_device *dev) { return 0; } -- cgit v1.2.3 From 86a7996cc8a078793670d82ed97d5a99bb4e8496 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Feb 2016 14:55:00 -0800 Subject: net_sched: introduce qdisc_replace() helper Remove nearly duplicated code and prepare for the following patch. Cc: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S.
Miller --- include/net/sch_generic.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 636a362a0e03..8fdad9f7a2fb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -707,6 +707,23 @@ static inline void qdisc_reset_queue(struct Qdisc *sch) sch->qstats.backlog = 0; } +static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc **pold) +{ + struct Qdisc *old; + + sch_tree_lock(sch); + old = *pold; + *pold = new; + if (old != NULL) { + qdisc_tree_decrease_qlen(old, old->q.qlen); + qdisc_reset(old); + } + sch_tree_unlock(sch); + + return old; +} + static inline unsigned int __qdisc_queue_drop(struct Qdisc *sch, struct sk_buff_head *list) { -- cgit v1.2.3 From 2ccccf5fb43ff62b2b96cc58d95fc0b3596516e4 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 25 Feb 2016 14:55:01 -0800 Subject: net_sched: update hierarchical backlog too When the bottom qdisc decides to, for example, drop some packet, it calls qdisc_tree_decrease_qlen() to update the queue length for all its ancestors, we need to update the backlog too to keep the stats on root qdisc accurate. Cc: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/codel.h | 4 ++++ include/net/sch_generic.h | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/codel.h b/include/net/codel.h index 267e70210061..d168aca115cc 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -162,12 +162,14 @@ struct codel_vars { * struct codel_stats - contains codel shared variables and stats * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() + * @drop_len: bytes of dropped packets in dequeue() * ecn_mark: number of packets we ECN marked instead of dropping * ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; u32 drop_count; + u32 drop_len; u32 ecn_mark; u32 ce_mark; }; @@ -308,6 +310,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, vars->rec_inv_sqrt); goto end; } + stats->drop_len += qdisc_pkt_len(skb); qdisc_drop(skb, sch); stats->drop_count++; skb = dequeue_func(vars, sch); @@ -330,6 +333,7 @@ static struct sk_buff *codel_dequeue(struct Qdisc *sch, if (params->ecn && INET_ECN_set_ce(skb)) { stats->ecn_mark++; } else { + stats->drop_len += qdisc_pkt_len(skb); qdisc_drop(skb, sch); stats->drop_count++; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 8fdad9f7a2fb..e5bba897d206 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -396,7 +396,8 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); void qdisc_destroy(struct Qdisc *qdisc); -void qdisc_tree_decrease_qlen(struct Qdisc *qdisc, unsigned int n); +void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n, + unsigned int len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, @@ -716,7 +717,7 @@ static inline struct Qdisc 
*qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) { - qdisc_tree_decrease_qlen(old, old->q.qlen); + qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); qdisc_reset(old); } sch_tree_unlock(sch); -- cgit v1.2.3 From 6843e7a2abe7cac10c19702ffec90018df6f040d Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 26 Feb 2016 07:53:49 -0800 Subject: net: sched: consolidate offload decision in cls_u32 The offload decision was originally very basic and tied to if the dev implemented the appropriate ndo op hook. The next step is to allow the user to more flexibly define if any particular rule should be offloaded or not. In order to have this logic in one function lift the current check into a helper routine tc_should_offload(). Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/pkt_cls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e64d20b81047..6096e96fb78b 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -394,6 +394,9 @@ struct tc_cls_u32_offload { static inline bool tc_should_offload(struct net_device *dev) { + if (!(dev->features & NETIF_F_HW_TC)) + return false; + return dev->netdev_ops->ndo_setup_tc; } -- cgit v1.2.3 From 9e8ce79cd711d4dfe09d8bba6822cd9bb7db96bd Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 26 Feb 2016 07:54:39 -0800 Subject: net: sched: cls_u32 add bit to specify software only rules In the initial implementation the only way to stop a rule from being inserted into the hardware table was via the device feature flag. However this doesn't work well when working on an end host system where packets are expected to hit both the hardware and software datapaths. For example we can imagine a rule that will match an IP address and increment a field. If we install this rule in both hardware and software we may increment the field twice. To date we have only added support for the drop action so we have been able to ignore these cases. But as we extend the action support we will hit this example plus more such cases. Arguably these are not even corner cases; in many working systems these cases will be common. To avoid forcing the driver to always abort (i.e. the above example) this patch adds a flag to add a rule in software only. A careful user can use this flag to build software and hardware datapaths that work together. One example we have found particularly useful is to use hardware resources to set the skb->mark on the skb when the match may be expensive to run in software but a mark lookup in a hash table is cheap. The idea here is hardware can do in one lookup what the u32 classifier may need to traverse multiple lists and hash tables to compute. The flag is only passed down on inserts.
On deletion to avoid stale references in hardware we always try to remove a rule if it exists. The flags field is part of the classifier specific options. Although it is tempting to lift this into the generic structure doing this proves difficult due to how the tc netlink attributes are implemented along with how the dump/change routines are called. There is also precedent for putting seemingly generic pieces in the specific classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So although not ideal I've left FLAGS in the u32 options as well as it simplifies the code greatly and user space has already learned how to manage these bits ala 'tc' tool. Another thing: if trying to update a rule, we require the flags to be unchanged. This is to force user space, software u32 and the hardware u32 to keep in sync. Thanks to Simon Horman for catching this case. Signed-off-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6096e96fb78b..bea14eee373e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -392,12 +392,21 @@ struct tc_cls_u32_offload { }; }; -static inline bool tc_should_offload(struct net_device *dev) +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW 1 + +static inline bool tc_should_offload(struct net_device *dev, u32 flags) { if (!(dev->features & NETIF_F_HW_TC)) return false; - return dev->netdev_ops->ndo_setup_tc; + if (flags & TCA_CLS_FLAGS_SKIP_HW) + return false; + + if (!dev->netdev_ops->ndo_setup_tc) + return false; + + return true; } #endif -- cgit v1.2.3 From bfcd3a46617209454cfc0947ab093e37fd1e84ef Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 26 Feb 2016 17:32:23 +0100 Subject: Introduce devlink infrastructure Introduce devlink infrastructure for drivers to register and expose to userspace via generic Netlink
interface. There are two basic objects defined: devlink - one instance for every "parent device", for example switch ASIC devlink port - one instance for every physical port of the device. This initial portion implements basic get/dump of objects to userspace. Also, port splitter and port type setting is implemented. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 include/net/devlink.h (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h new file mode 100644 index 000000000000..c37d257891d6 --- /dev/null +++ b/include/net/devlink.h @@ -0,0 +1,140 @@ +/* + * include/net/devlink.h - Network physical device Netlink interface + * Copyright (c) 2016 Mellanox Technologies. All rights reserved. + * Copyright (c) 2016 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ +#ifndef _NET_DEVLINK_H_ +#define _NET_DEVLINK_H_ + +#include +#include +#include +#include +#include +#include +#include + +struct devlink_ops; + +struct devlink { + struct list_head list; + struct list_head port_list; + const struct devlink_ops *ops; + struct device *dev; + possible_net_t _net; + char priv[0] __aligned(NETDEV_ALIGN); +}; + +struct devlink_port { + struct list_head list; + struct devlink *devlink; + unsigned index; + bool registered; + enum devlink_port_type type; + enum devlink_port_type desired_type; + void *type_dev; + bool split; + u32 split_group; +}; + +struct devlink_ops { + size_t priv_size; + int (*port_type_set)(struct devlink_port *devlink_port, + enum devlink_port_type port_type); + int (*port_split)(struct devlink *devlink, unsigned int port_index, + unsigned int count); + int (*port_unsplit)(struct devlink *devlink, unsigned int port_index); +}; + +static inline void *devlink_priv(struct devlink *devlink) +{ + BUG_ON(!devlink); + return &devlink->priv; +} + +static inline struct devlink *priv_to_devlink(void *priv) +{ + BUG_ON(!priv); + return container_of(priv, struct devlink, priv); +} + +struct ib_device; + +#if IS_ENABLED(CONFIG_NET_DEVLINK) + +struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); +int devlink_register(struct devlink *devlink, struct device *dev); +void devlink_unregister(struct devlink *devlink); +void devlink_free(struct devlink *devlink); +int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index); +void devlink_port_unregister(struct devlink_port *devlink_port); +void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev); +void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev); +void devlink_port_type_clear(struct devlink_port *devlink_port); +void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group); + +#else + +static inline 
struct devlink *devlink_alloc(const struct devlink_ops *ops, + size_t priv_size) +{ + return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL); +} + +static inline int devlink_register(struct devlink *devlink, struct device *dev) +{ + return 0; +} + +static inline void devlink_unregister(struct devlink *devlink) +{ +} + +static inline void devlink_free(struct devlink *devlink) +{ + kfree(devlink); +} + +static inline int devlink_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index) +{ + return 0; +} + +static inline void devlink_port_unregister(struct devlink_port *devlink_port) +{ +} + +static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port, + struct net_device *netdev) +{ +} + +static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port, + struct ib_device *ibdev) +{ +} + +static inline void devlink_port_type_clear(struct devlink_port *devlink_port) +{ +} + +static inline void devlink_port_split_set(struct devlink_port *devlink_port, + u32 split_group) +{ +} + +#endif + +#endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From fb2dabad69f099fb9c03a44276778911da50ba29 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 26 Feb 2016 13:16:00 -0500 Subject: net: dsa: support VLAN filtering switchdev attr When a user explicitly requests VLAN filtering with something like: # echo 1 > /sys/class/net//bridge/vlan_filtering Switchdev propagates a SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING port attribute. Add support for it in the DSA layer with a new port_vlan_filtering function to let drivers toggle 802.1Q filtering on user demand. Signed-off-by: Vivien Didelot Signed-off-by: David S. 
Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3dd54867174a..26c0a3fa009a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -305,6 +305,8 @@ struct dsa_switch_driver { /* * VLAN support */ + int (*port_vlan_filtering)(struct dsa_switch *ds, int port, + bool vlan_filtering); int (*port_vlan_prepare)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans); -- cgit v1.2.3 From ef6980b6becb1afd9d82a4f043749a10ae81bf14 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 27 Feb 2016 08:08:54 -0500 Subject: introduce IFE action This action allows for a sending side to encapsulate arbitrary metadata which is decapsulated by the receiving end. The sender runs in encoding mode and the receiver in decode mode. Both sender and receiver must specify the same ethertype. At some point we hope to have a registered ethertype and we'll then provide a default so the user doesn't have to specify it. For now we require the user to specify it. Let's show example usage where we encode icmp from a sender towards a receiver with an skbmark of 17; both sender and receiver use ethertype of 0xdead to interop.
YYYY: Lets start with Receiver-side policy config: xxx: add an ingress qdisc sudo tc qdisc add dev $ETH ingress xxx: any packets with ethertype 0xdead will be subjected to ife decoding xxx: we then restart the classification so we can match on icmp at prio 3 sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xdead \ u32 match u32 0 0 flowid 1:1 \ action ife decode reclassify xxx: on restarting the classification from above if it was an icmp xxx: packet, then match it here and continue to the next rule at prio 4 xxx: which will match based on skb mark of 17 sudo tc filter add dev $ETH parent ffff: prio 3 protocol ip \ u32 match ip protocol 1 0xff flowid 1:1 \ action continue xxx: match on skbmark of 0x11 (decimal 17) and accept sudo tc filter add dev $ETH parent ffff: prio 4 protocol ip \ handle 0x11 fw flowid 1:1 \ action ok xxx: Lets show the decoding policy sudo tc -s filter ls dev $ETH parent ffff: protocol 0xdead xxx: filter pref 2 u32 filter pref 2 u32 fh 800: ht divisor 1 filter pref 2 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1 (rule hit 0 success 0) match 00000000/00000000 at 0 (success 0 ) action order 1: ife decode action reclassify index 1 ref 1 bind 1 installed 14 sec used 14 sec type: 0x0 Metadata: allow mark allow hash allow prio allow qmap Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 xxx: Observe that above lists all metadatum it can decode. Typically these submodules will already be compiled into a monolithic kernel or loaded as modules YYYY: Lets show the sender side now .. 
xxx: Add an egress qdisc on the sender netdev sudo tc qdisc add dev $ETH root handle 1: prio xxx: xxx: Match all icmp packets to 192.168.122.237/24, then xxx: tag the packet with skb mark of decimal 17, then xxx: Encode it with: xxx: ethertype 0xdead xxx: add skb->mark to whitelist of metadatum to send xxx: rewrite target dst MAC address to 02:15:15:15:15:15 xxx: sudo $TC filter add dev $ETH parent 1: protocol ip prio 10 u32 \ match ip dst 192.168.122.237/24 \ match ip protocol 1 0xff \ flowid 1:2 \ action skbedit mark 17 \ action ife encode \ type 0xDEAD \ allow mark \ dst 02:15:15:15:15:15 xxx: Lets show the encoding policy sudo tc -s filter ls dev $ETH parent 1: protocol ip xxx: filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:2 (rule hit 0 success 0) match c0a87aed/ffffffff at 16 (success 0 ) match 00010000/00ff0000 at 8 (success 0 ) action order 1: skbedit mark 17 index 6 ref 1 bind 1 Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 action order 2: ife encode action pipe index 3 ref 1 bind 1 dst MAC: 02:15:15:15:15:15 type: 0xDEAD Metadata: allow mark Action statistics: Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 xxx: test by sending ping from sender to destination Signed-off-by: Jamal Hadi Salim Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/tc_act/tc_ife.h | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 include/net/tc_act/tc_ife.h (limited to 'include/net') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h new file mode 100644 index 000000000000..dc9a09aefb33 --- /dev/null +++ b/include/net/tc_act/tc_ife.h @@ -0,0 +1,61 @@ +#ifndef __NET_TC_IFE_H +#define __NET_TC_IFE_H + +#include +#include +#include +#include + +#define IFE_METAHDRLEN 2 +struct tcf_ife_info { + struct tcf_common common; + u8 eth_dst[ETH_ALEN]; + u8 eth_src[ETH_ALEN]; + u16 eth_type; + u16 flags; + /* list of metaids allowed */ + struct list_head metalist; +}; +#define to_ife(a) \ + container_of(a->priv, struct tcf_ife_info, common) + +struct tcf_meta_info { + const struct tcf_meta_ops *ops; + void *metaval; + u16 metaid; + struct list_head metalist; +}; + +struct tcf_meta_ops { + u16 metaid; /*Maintainer provided ID */ + u16 metatype; /*netlink attribute type (look at net/netlink.h) */ + const char *name; + const char *synopsis; + struct list_head list; + int (*check_presence)(struct sk_buff *, struct tcf_meta_info *); + int (*encode)(struct sk_buff *, void *, struct tcf_meta_info *); + int (*decode)(struct sk_buff *, void *, u16 len); + int (*get)(struct sk_buff *skb, struct tcf_meta_info *mi); + int (*alloc)(struct tcf_meta_info *, void *); + void (*release)(struct tcf_meta_info *); + int (*validate)(void *val, int len); + struct module *owner; +}; + +#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ifemeta" __stringify_1(metan)) + +int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi); +int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi); +int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, + const void *dval); +int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval); +int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval); +int ife_check_meta_u32(u32 metaval, struct 
tcf_meta_info *mi); +int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi); +int ife_validate_meta_u32(void *val, int len); +int ife_validate_meta_u16(void *val, int len); +void ife_release_meta_gen(struct tcf_meta_info *mi); +int register_ife_op(struct tcf_meta_ops *mops); +int unregister_ife_op(struct tcf_meta_ops *mops); + +#endif /* __NET_TC_IFE_H */ -- cgit v1.2.3 From 822c868532cae2cc1c51f4f18ab61c194d98aaf6 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 27 Feb 2016 00:32:15 -0800 Subject: net: ipv4: Convert IP network timestamps to be y2038 safe ICMP timestamp messages and IP source route options require timestamps to be in milliseconds modulo 24 hours from midnight UT format. Add inet_current_timestamp() function to support this. The function returns the required timestamp in network byte order. Timestamp calculation is also changed to call ktime_get_real_ts64() which uses struct timespec64. struct timespec64 is y2038 safe. Previously it called getnstimeofday() which uses struct timespec. struct timespec is not y2038 safe. Signed-off-by: Deepa Dinamani Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: James Morris Cc: Patrick McHardy Acked-by: YOSHIFUJI Hideaki Acked-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- include/net/ip.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index cbb134b2f0e4..fad74d323bd6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -240,6 +240,8 @@ static inline int inet_is_local_reserved_port(struct net *net, int port) } #endif +__be32 inet_current_timestamp(void); + /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; -- cgit v1.2.3 From 8a6bf5da1aefdafd60b73d9122c7af9fd2d7bb9c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 1 Mar 2016 19:55:14 +0100 Subject: netfilter: nft_masq: support port range Complete masquerading support by allowing port range selection. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_masq.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h index e2a518b60e19..a3f3c11b2526 100644 --- a/include/net/netfilter/nft_masq.h +++ b/include/net/netfilter/nft_masq.h @@ -2,7 +2,9 @@ #define _NFT_MASQ_H_ struct nft_masq { - u32 flags; + u32 flags; + enum nft_registers sreg_proto_min:8; + enum nft_registers sreg_proto_max:8; }; extern const struct nla_policy nft_masq_policy[]; -- cgit v1.2.3 From 1f27cde313d72d6b44a73ba89c8b2c6a99c628cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Mar 2016 08:21:43 -0800 Subject: net: sched: use pfifo_fast for non real queues Some devices declare a high number of TX queues, then set a much lower real_num_tx_queues. This causes setups using fq_codel, sfq or fq as the default qdisc to consume more memory than really needed. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/sch_generic.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e5bba897d206..46e55f0202a6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -345,6 +345,12 @@ extern struct Qdisc_ops pfifo_fast_ops; extern struct Qdisc_ops mq_qdisc_ops; extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; +static inline const struct Qdisc_ops * +get_default_qdisc_ops(const struct net_device *dev, int ntx) +{ + return ntx < dev->real_num_tx_queues ? + default_qdisc_ops : &pfifo_fast_ops; +} struct Qdisc_class_common { u32 classid; -- cgit v1.2.3 From f719e3754ee2f7275437e61a6afd520181fdd43b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 5 Mar 2016 15:03:22 +0200 Subject: ipvs: drop first packet to redirect conntrack Jiri Bohac reported a problem where the attempt to reschedule an existing connection to another real server needs a proper redirect for the conntrack used by the IPVS connection. For example, when an IPVS connection is created to a NAT-ed real server we alter the reply direction of conntrack. If we later decide to select a different real server we cannot alter the conntrack again. And if we expire the old connection, the new connection is left without conntrack. So, the only way to redirect both the IPVS connection and the Netfilter's conntrack is to drop the SYN packet that hits the existing connection, to wait for the next jiffy to expire the old connection and its conntrack and to rely on the client's retransmission to create a new connection as usual. Jiri Bohac provided a fix that drops all SYNs on rescheduling, I extended his patch to do such drops only for connections that use conntrack.
Here is the original report from Jiri Bohac: Since commit dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead"), new connections to dead servers are redistributed immediately to new servers. The old connection is expired using ip_vs_conn_expire_now() which sets the connection timer to expire immediately. However, before the timer callback, ip_vs_conn_expire(), is run to clean the connection's conntrack entry, the new redistributed connection may already be established and its conntrack removed instead. Fix this by dropping the first packet of the new connection instead, like we do when the destination server is not available. The timer will have deleted the old conntrack entry long before the first packet of the new connection is retransmitted. Fixes: dc7b3eb900aa ("ipvs: Fix reuse connection if real server is dead") Signed-off-by: Jiri Bohac Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0816c872b689..a6cc576fd467 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1588,6 +1588,23 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) } #endif /* CONFIG_IP_VS_NFCT */ +/* Really using conntrack? 
*/ +static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, + struct sk_buff *skb) +{ +#ifdef CONFIG_IP_VS_NFCT + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + return false; + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct)) + return true; +#endif + return false; +} + static inline int ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) { -- cgit v1.2.3 From 8050c0f0274a15841756968857cfb07b3ab809ae Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:02 +0100 Subject: bpf: allow bpf_csum_diff to feed bpf_l3_csum_replace as well Commit 7d672345ed29 ("bpf: add generic bpf_csum_diff helper") added a generic checksum diff helper that can feed bpf_l4_csum_replace() with a target __wsum diff that is to be applied to the L4 checksum. This facility is very flexible, can be cascaded, allows for adding, removing, or diffing data, or for calculating the pseudo header checksum from scratch, but it can also be reused for working with the IPv4 header checksum. Thus, analogous to bpf_l4_csum_replace(), add a case for header field value of 0 to change the checksum at a given offset through a new helper csum_replace_by_diff(). Also, in addition to that, this provides an easy to use interface for feeding precalculated diffs f.e. coming from a map. It nicely complements bpf_l3_csum_replace() that currently allows only for csum updates of 2 and 4 byte diffs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/net/checksum.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 10a16b5bd1c7..abffc64e7300 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -120,6 +120,11 @@ static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) #define CSUM_MANGLED_0 ((__force __sum16)0xffff) +static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +{ + *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); +} + static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); -- cgit v1.2.3 From db3c6139e6ead91b42e7c2ad044ed8beaee884e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Mar 2016 15:15:07 +0100 Subject: bpf, vxlan, geneve, gre: fix usage of dst_cache on xmit The assumptions from commit 0c1d70af924b ("net: use dst_cache for vxlan device"), 468dfffcd762 ("geneve: add dst caching support") and 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") on dst_cache usage when ip_tunnel_info is used are unfortunately not always valid as assumed. While it seems correct for ip_tunnel_info front-ends such as OVS, eBPF however can fill in ip_tunnel_info for consumers like vxlan, geneve or gre with different remote dsts, tos, etc, therefore they cannot be assumed as packet independent. Right now vxlan, geneve, gre would cache the dst for eBPF and every packet would reuse the same entry that was first created on the initial route lookup. eBPF doesn't store/cache the ip_tunnel_info, so each skb may have a different one. Fix it by adding a flag that checks the ip_tunnel_info. Also the !tos test in vxlan needs to be handled differently in this context as it is currently inferred from ip_tunnel_info as well if present. ip_tunnel_dst_cache_usable() helper is added for the three tunnel cases, which checks if we can use dst cache.
Fixes: 0c1d70af924b ("net: use dst_cache for vxlan device") Fixes: 468dfffcd762 ("geneve: add dst caching support") Fixes: 3c1cb4d2604c ("net/ipv4: add dst cache support for gre lwtunnels") Signed-off-by: Daniel Borkmann Acked-by: Paolo Abeni Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5f28b606633e..e1395d70fb48 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -140,6 +140,7 @@ struct ip_tunnel { #define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) #define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) +#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_OPTIONS_PRESENT (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT) @@ -206,6 +207,20 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } +static inline bool +ip_tunnel_dst_cache_usable(const struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + if (skb->mark) + return false; + if (!info) + return true; + if (info->key.tun_flags & TUNNEL_NOCACHE) + return false; + + return true; +} + static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) { -- cgit v1.2.3 From 9a03cd8f38efb83c13fbe62aff50eea4efff93da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Tue, 8 Mar 2016 14:44:35 +0100 Subject: ipv6: per netns fib6 walkers The IPv6 FIB data structures are separated per network namespace but there is still only one global walkers list and one global walker list lock. This means changes in one namespace unnecessarily interfere with walkers in other namespaces. Replace the global list with per-netns lists (and give each its own lock). Signed-off-by: Michal Kubecek Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/netns/ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c0368db6df54..f0109b973648 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -58,7 +58,9 @@ struct netns_ipv6 { struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; struct fib6_table *fib6_main_tbl; + struct list_head fib6_walkers; struct dst_ops ip6_dst_ops; + rwlock_t fib6_walker_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From 3dc94f93be161ec4203673de9a34b7362d8985b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Tue, 8 Mar 2016 14:44:45 +0100 Subject: ipv6: per netns FIB garbage collection One of our customers observed issues with FIB6 garbage collectors running in different network namespaces blocking each other, resulting in soft lockups (fib6_run_gc() initiated from timer runs always in forced mode). Now that FIB6 walkers are separated per namespace, there is no more need for instances of fib6_run_gc() in different namespaces blocking each other. There is still a call to icmp6_dst_gc() which operates on shared data but this function is protected by its own shared lock. Signed-off-by: Michal Kubecek Reviewed-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index f0109b973648..10d0848f5b8a 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -61,6 +61,7 @@ struct netns_ipv6 { struct list_head fib6_walkers; struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; + spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES -- cgit v1.2.3 From e28e87ed474c5a0b378c66fb85efc8e487f4f63f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Mar 2016 23:36:03 +0100 Subject: ip_tunnel, bpf: ip_tunnel_info_opts_{get, set} depends on CONFIG_INET Helpers like ip_tunnel_info_opts_{get,set}() are only available if CONFIG_INET is set, thus add an empty definition into the header for the !CONFIG_INET case, where other empty inline helpers are already defined. This avoids ifdef kludge inside filter.c, but also vxlan and geneve themselves where this facility can only be used with, depend on INET being set. For the !INET case TUNNEL_OPTIONS_PRESENT would never be set in flags. Fixes: 14ca0751c96f ("bpf: support for access to tunnel options") Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/net/ip_tunnels.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index e1395d70fb48..0acd80fadb32 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -369,6 +369,17 @@ static inline void ip_tunnel_unneed_metadata(void) { } +static inline void ip_tunnel_info_opts_get(void *to, + const struct ip_tunnel_info *info) +{ +} + +static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, + const void *from, int len) +{ + info->options_len = 0; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ -- cgit v1.2.3 From 473bd239b808a8af5241ce9996a16d283d88ddff Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:05 -0800 Subject: tcp: Add tcp_inq to get available receive bytes on socket Create a common kernel function to get the number of bytes available on a TCP socket. This is based on code in INQ getsockopt and we now call the function for that getsockopt. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/tcp.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index e90db8546806..0302636af98c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1816,4 +1816,28 @@ static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) skb->truesize = 2; } +static inline int tcp_inq(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int answ; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + answ = 0; + } else if (sock_flag(sk, SOCK_URGINLINE) || + !tp->urg_data || + before(tp->urg_seq, tp->copied_seq) || + !before(tp->urg_seq, tp->rcv_nxt)) { + + answ = tp->rcv_nxt - tp->copied_seq; + + /* Subtract 1, if FIN was received */ + if (answ && sock_flag(sk, SOCK_DONE)) + answ--; + } else { + answ = tp->urg_seq - tp->copied_seq; + } + + return answ; +} + #endif /* _TCP_H */ -- cgit v1.2.3 From ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:06 -0800 Subject: kcm: Kernel Connection Multiplexor module This module implements the Kernel Connection Multiplexor. Kernel Connection Multiplexor (KCM) is a facility that provides a message based interface over TCP for generic application protocols. With KCM an application can efficiently send and receive application protocol messages over TCP using datagram sockets. For more information see the included Documentation/networking/kcm.txt Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/kcm.h | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 include/net/kcm.h (limited to 'include/net') diff --git a/include/net/kcm.h b/include/net/kcm.h new file mode 100644 index 000000000000..1bcae39070ec --- /dev/null +++ b/include/net/kcm.h @@ -0,0 +1,125 @@ +/* + * Kernel Connection Multiplexor + * + * Copyright (c) 2016 Tom Herbert + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#ifndef __NET_KCM_H_ +#define __NET_KCM_H_ + +#include +#include +#include + +extern unsigned int kcm_net_id; + +struct kcm_tx_msg { + unsigned int sent; + unsigned int fragidx; + unsigned int frag_offset; + unsigned int msg_flags; + struct sk_buff *frag_skb; + struct sk_buff *last_skb; +}; + +struct kcm_rx_msg { + int full_len; + int accum_len; + int offset; +}; + +/* Socket structure for KCM client sockets */ +struct kcm_sock { + struct sock sk; + struct kcm_mux *mux; + struct list_head kcm_sock_list; + int index; + u32 done : 1; + struct work_struct done_work; + + /* Transmit */ + struct kcm_psock *tx_psock; + struct work_struct tx_work; + struct list_head wait_psock_list; + struct sk_buff *seq_skb; + + /* Don't use bit fields here, these are set under different locks */ + bool tx_wait; + bool tx_wait_more; + + /* Receive */ + struct kcm_psock *rx_psock; + struct list_head wait_rx_list; /* KCMs waiting for receiving */ + bool rx_wait; + u32 rx_disabled : 1; +}; + +struct bpf_prog; + +/* Structure for an attached lower socket */ +struct kcm_psock { + struct sock *sk; + struct kcm_mux *mux; + int index; + + u32 tx_stopped : 1; + u32 rx_stopped : 1; + u32 done : 1; + u32 unattaching : 1; + + void (*save_state_change)(struct sock *sk); + void (*save_data_ready)(struct sock *sk); + void (*save_write_space)(struct sock *sk); + + struct list_head 
psock_list; + + /* Receive */ + struct sk_buff *rx_skb_head; + struct sk_buff **rx_skb_nextp; + struct sk_buff *ready_rx_msg; + struct list_head psock_ready_list; + struct work_struct rx_work; + struct delayed_work rx_delayed_work; + struct bpf_prog *bpf_prog; + struct kcm_sock *rx_kcm; + + /* Transmit */ + struct kcm_sock *tx_kcm; + struct list_head psock_avail_list; +}; + +/* Per net MUX list */ +struct kcm_net { + struct mutex mutex; + struct list_head mux_list; + int count; +}; + +/* Structure for a MUX */ +struct kcm_mux { + struct list_head kcm_mux_list; + struct rcu_head rcu; + struct kcm_net *knet; + + struct list_head kcm_socks; /* All KCM sockets on MUX */ + int kcm_socks_cnt; /* Total KCM socket count for MUX */ + struct list_head psocks; /* List of all psocks on MUX */ + int psocks_cnt; /* Total attached sockets */ + + /* Receive */ + spinlock_t rx_lock ____cacheline_aligned_in_smp; + struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */ + struct list_head psocks_ready; /* List of psocks with a msg ready */ + struct sk_buff_head rx_hold_queue; + + /* Transmit */ + spinlock_t lock ____cacheline_aligned_in_smp; /* TX and mux locking */ + struct list_head psocks_avail; /* List of available psocks */ + struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */ +}; + +#endif /* __NET_KCM_H_ */ -- cgit v1.2.3 From cd6e111bf5be5c70aef96a86d791ee7be0c0e137 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:07 -0800 Subject: kcm: Add statistics and proc interfaces This patch adds various counters for KCM. These include counters for messages and bytes received or sent, as well as counters for number of attached/unattached TCP sockets and other error or edge events. The statistics are exposed via a proc interface. /proc/net/kcm provides statistics per KCM socket and per psock (attached TCP sockets). /proc/net/kcm_stats provides aggregate statistics. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/kcm.h | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'include/net') diff --git a/include/net/kcm.h b/include/net/kcm.h index 1bcae39070ec..39c7abe98552 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -17,6 +17,42 @@ extern unsigned int kcm_net_id; +#define KCM_STATS_ADD(stat, count) ((stat) += (count)) +#define KCM_STATS_INCR(stat) ((stat)++) + +struct kcm_psock_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned long long tx_msgs; + unsigned long long tx_bytes; + unsigned int rx_aborts; + unsigned int rx_mem_fail; + unsigned int rx_need_more_hdr; + unsigned int rx_bad_hdr_len; + unsigned long long reserved; + unsigned long long unreserved; + unsigned int tx_aborts; +}; + +struct kcm_mux_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned long long tx_msgs; + unsigned long long tx_bytes; + unsigned int rx_ready_drops; + unsigned int tx_retries; + unsigned int psock_attach; + unsigned int psock_unattach_rsvd; + unsigned int psock_unattach; +}; + +struct kcm_stats { + unsigned long long rx_msgs; + unsigned long long rx_bytes; + unsigned long long tx_msgs; + unsigned long long tx_bytes; +}; + struct kcm_tx_msg { unsigned int sent; unsigned int fragidx; @@ -41,6 +77,8 @@ struct kcm_sock { u32 done : 1; struct work_struct done_work; + struct kcm_stats stats; + /* Transmit */ struct kcm_psock *tx_psock; struct work_struct tx_work; @@ -77,6 +115,8 @@ struct kcm_psock { struct list_head psock_list; + struct kcm_psock_stats stats; + /* Receive */ struct sk_buff *rx_skb_head; struct sk_buff **rx_skb_nextp; @@ -86,15 +126,21 @@ struct kcm_psock { struct delayed_work rx_delayed_work; struct bpf_prog *bpf_prog; struct kcm_sock *rx_kcm; + unsigned long long saved_rx_bytes; + unsigned long long saved_rx_msgs; /* Transmit */ struct kcm_sock *tx_kcm; struct list_head psock_avail_list; + unsigned long long saved_tx_bytes; + unsigned long 
long saved_tx_msgs; }; /* Per net MUX list */ struct kcm_net { struct mutex mutex; + struct kcm_psock_stats aggregate_psock_stats; + struct kcm_mux_stats aggregate_mux_stats; struct list_head mux_list; int count; }; @@ -110,6 +156,9 @@ struct kcm_mux { struct list_head psocks; /* List of all psocks on MUX */ int psocks_cnt; /* Total attached sockets */ + struct kcm_mux_stats stats; + struct kcm_psock_stats aggregate_psock_stats; + /* Receive */ spinlock_t rx_lock ____cacheline_aligned_in_smp; struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */ @@ -122,4 +171,49 @@ struct kcm_mux { struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */ }; +#ifdef CONFIG_PROC_FS +int kcm_proc_init(void); +void kcm_proc_exit(void); +#else +static int kcm_proc_init(void) { return 0; } +static void kcm_proc_exit(void) { } +#endif + +static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, + struct kcm_psock_stats *agg_stats) +{ + /* Save psock statistics in the mux when psock is being unattached. */ + +#define SAVE_PSOCK_STATS(_stat) (agg_stats->_stat += stats->_stat) + SAVE_PSOCK_STATS(rx_msgs); + SAVE_PSOCK_STATS(rx_bytes); + SAVE_PSOCK_STATS(rx_aborts); + SAVE_PSOCK_STATS(rx_mem_fail); + SAVE_PSOCK_STATS(rx_need_more_hdr); + SAVE_PSOCK_STATS(rx_bad_hdr_len); + SAVE_PSOCK_STATS(tx_msgs); + SAVE_PSOCK_STATS(tx_bytes); + SAVE_PSOCK_STATS(reserved); + SAVE_PSOCK_STATS(unreserved); + SAVE_PSOCK_STATS(tx_aborts); +#undef SAVE_PSOCK_STATS +} + +static inline void aggregate_mux_stats(struct kcm_mux_stats *stats, + struct kcm_mux_stats *agg_stats) +{ + /* Save psock statistics in the mux when psock is being unattached. 
*/ + +#define SAVE_MUX_STATS(_stat) (agg_stats->_stat += stats->_stat) + SAVE_MUX_STATS(rx_msgs); + SAVE_MUX_STATS(rx_bytes); + SAVE_MUX_STATS(tx_msgs); + SAVE_MUX_STATS(tx_bytes); + SAVE_MUX_STATS(rx_ready_drops); + SAVE_MUX_STATS(psock_attach); + SAVE_MUX_STATS(psock_unattach_rsvd); + SAVE_MUX_STATS(psock_unattach); +#undef SAVE_MUX_STATS +} + #endif /* __NET_KCM_H_ */ -- cgit v1.2.3 From 7ced95ef525c329f947c424859cf2b0a3b731f8c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:10 -0800 Subject: kcm: Add memory limit for receive message construction Message assembly is performed on the TCP socket. This is logically equivalent to an application that performs a peek on the socket to find out how much memory is needed for a receive buffer. The receive socket buffer also provides the maximum message size which is checked. The receive algorithm is something like: 1) Receive the first skbuf for a message (or skbufs if multiple are needed to determine message length). 2) Check the message length against the number of bytes in the TCP receive queue (tcp_inq()). - If all the bytes of the message are in the queue (including the skbuf received), then proceed with message assembly (it should complete with the tcp_read_sock) - Else, mark the psock with the number of bytes needed to complete the message. 3) In TCP data ready function, if the psock indicates that we are waiting for the rest of the bytes of a message, check the number of queued bytes against that. - If there are still not enough bytes for the message, just return - Else, clear the waiting bytes and proceed to receive the skbufs. The message should now be received in one tcp_read_sock Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/kcm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/kcm.h b/include/net/kcm.h index 39c7abe98552..d892956ff552 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -28,6 +28,7 @@ struct kcm_psock_stats { unsigned int rx_aborts; unsigned int rx_mem_fail; unsigned int rx_need_more_hdr; + unsigned int rx_msg_too_big; unsigned int rx_bad_hdr_len; unsigned long long reserved; unsigned long long unreserved; @@ -66,6 +67,7 @@ struct kcm_rx_msg { int full_len; int accum_len; int offset; + int early_eaten; }; /* Socket structure for KCM client sockets */ @@ -128,6 +130,7 @@ struct kcm_psock { struct kcm_sock *rx_kcm; unsigned long long saved_rx_bytes; unsigned long long saved_rx_msgs; + unsigned int rx_need_bytes; /* Transmit */ struct kcm_sock *tx_kcm; @@ -190,6 +193,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, SAVE_PSOCK_STATS(rx_aborts); SAVE_PSOCK_STATS(rx_mem_fail); SAVE_PSOCK_STATS(rx_need_more_hdr); + SAVE_PSOCK_STATS(rx_msg_too_big); SAVE_PSOCK_STATS(rx_bad_hdr_len); SAVE_PSOCK_STATS(tx_msgs); SAVE_PSOCK_STATS(tx_bytes); -- cgit v1.2.3 From 29152a34f72cb4d7ab32885ad2f20a482c92a8f3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 7 Mar 2016 14:11:11 -0800 Subject: kcm: Add receive message timeout This patch adds receive timeout for message assembly on the attached TCP sockets. The timeout is set when a new message is started and the whole message has not been received by TCP (not in the receive queue). If the complete message is subsequently received, the timer is cancelled; if the timer expires, the RX side is aborted. The timeout value is taken from the socket timeout (SO_RCVTIMEO) that is set on a TCP socket (i.e. set by setsockopt before attaching a TCP socket to KCM). Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/kcm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/kcm.h b/include/net/kcm.h index d892956ff552..95c425ca97b6 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -29,6 +29,7 @@ struct kcm_psock_stats { unsigned int rx_mem_fail; unsigned int rx_need_more_hdr; unsigned int rx_msg_too_big; + unsigned int rx_msg_timeouts; unsigned int rx_bad_hdr_len; unsigned long long reserved; unsigned long long unreserved; @@ -130,6 +131,7 @@ struct kcm_psock { struct kcm_sock *rx_kcm; unsigned long long saved_rx_bytes; unsigned long long saved_rx_msgs; + struct timer_list rx_msg_timer; unsigned int rx_need_bytes; /* Transmit */ @@ -194,6 +196,7 @@ static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, SAVE_PSOCK_STATS(rx_mem_fail); SAVE_PSOCK_STATS(rx_need_more_hdr); SAVE_PSOCK_STATS(rx_msg_too_big); + SAVE_PSOCK_STATS(rx_msg_timeouts); SAVE_PSOCK_STATS(rx_bad_hdr_len); SAVE_PSOCK_STATS(tx_msgs); SAVE_PSOCK_STATS(tx_bytes); -- cgit v1.2.3 From f16089209e1029d45ae78dd238b6ab9b2c9a886c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 4 Mar 2016 10:10:20 +0100 Subject: mac802154: use put and get unaligned functions This patch removes the swap pointer and memmove functionality. Instead we use the well known put/get unaligned access with specific byte order handling. 
Signed-off-by: Alexander Aring Suggested-by: Marc Kleine-Budde Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 2e3cdd2048d2..6cd7a70706a9 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -16,10 +16,10 @@ #ifndef NET_MAC802154_H #define NET_MAC802154_H +#include #include #include #include -#include #include @@ -254,7 +254,7 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) return cpu_to_le16(0); } - return (__force __le16)__get_unaligned_memmove16(skb_mac_header(skb)); + return get_unaligned_le16(skb_mac_header(skb)); } /** @@ -264,7 +264,7 @@ static inline __le16 ieee802154_get_fc_from_skb(const struct sk_buff *skb) */ static inline void ieee802154_be64_to_le64(void *le64_dst, const void *be64_src) { - __put_unaligned_memmove64(swab64p(be64_src), le64_dst); + put_unaligned_le64(get_unaligned_be64(be64_src), le64_dst); } /** @@ -274,7 +274,7 @@ static inline void ieee802154_be64_to_le64(void *le64_dst, const void *be64_src) */ static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src) { - __put_unaligned_memmove64(swab64p(le64_src), be64_dst); + put_unaligned_be64(get_unaligned_le64(le64_src), be64_dst); } /** @@ -284,7 +284,7 @@ static inline void ieee802154_le64_to_be64(void *be64_dst, const void *le64_src) */ static inline void ieee802154_le16_to_be16(void *be16_dst, const void *le16_src) { - __put_unaligned_memmove16(swab16p(le16_src), be16_dst); + put_unaligned_be16(get_unaligned_le16(le16_src), be16_dst); } /** -- cgit v1.2.3 From 82a37adeedd38880940e2772ec1ae27a09353e5a Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Wed, 9 Mar 2016 17:30:34 +0200 Subject: Bluetooth: Add support for limited privacy mode Introduce a limited privacy mode indicated by value 0x02 to the mgmt Set Privacy command. 
With value 0x02 the kernel will use privacy mode with a resolvable private address. In case the controller is bondable and discoverable the identity address will be used. Signed-off-by: Johan Hedberg Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 339ea57be423..5d38d980b89d 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -233,6 +233,7 @@ enum { HCI_SC_ENABLED, HCI_SC_ONLY, HCI_PRIVACY, + HCI_LIMITED_PRIVACY, HCI_RPA_EXPIRED, HCI_RPA_RESOLVING, HCI_HS_ENABLED, -- cgit v1.2.3 From f720d0caa0af2c33ad15310974c7320345ab4468 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 10 Mar 2016 19:31:12 +0100 Subject: kcm: mark helper functions inline The stub helper functions for the newly added kcm_proc_init/exit interfaces are defined as 'static' in a header file, which leads to build warnings for each file that includes them without calling them: include/net/kcm.h:183:12: error: 'kcm_proc_init' defined but not used [-Werror=unused-function] include/net/kcm.h:184:13: error: 'kcm_proc_exit' defined but not used [-Werror=unused-function] This marks the two functions as 'static inline' instead, which avoids the warnings and is obviously what was meant here. Signed-off-by: Arnd Bergmann Fixes: cd6e111bf5be ("kcm: Add statistics and proc interfaces") Signed-off-by: David S. 
Miller --- include/net/kcm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/kcm.h b/include/net/kcm.h index 95c425ca97b6..2840b5825dcc 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -180,8 +180,8 @@ struct kcm_mux { int kcm_proc_init(void); void kcm_proc_exit(void); #else -static int kcm_proc_init(void) { return 0; } -static void kcm_proc_exit(void) { } +static inline int kcm_proc_init(void) { return 0; } +static inline void kcm_proc_exit(void) { } #endif static inline void aggregate_psock_stats(struct kcm_psock_stats *stats, -- cgit v1.2.3 From 5b33f48842fa1e13e9c0ea8cc59c1d0df19042db Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:29 +0200 Subject: net/flower: Introduce hardware offload support This patch is based on a patch made by John Fastabend. It adds support for offloading cls_flower. When NETIF_F_HW_TC is on: flags = 0 => Rule will be processed twice - by hardware, and if still relevant, by software. flags = SKIP_HW => Rule will be processed by software only. If hardware fails or is not capable of applying the rule, the operation will NOT fail. Filter will be processed by SW only. Acked-by: Jiri Pirko Suggested-by: John Fastabend Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bea14eee373e..5b4e8f08b8f0 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -409,4 +409,18 @@ static inline bool tc_should_offload(struct net_device *dev, u32 flags) return true; } +enum tc_fl_command { + TC_CLSFLOWER_REPLACE, + TC_CLSFLOWER_DESTROY, +}; + +struct tc_cls_flower_offload { + enum tc_fl_command command; + u64 cookie; + struct flow_dissector *dissector; + struct fl_flow_key *mask; + struct fl_flow_key *key; + struct tcf_exts *exts; +}; + #endif -- cgit v1.2.3 From 8de2d793daf784f8f109565bcc023a6d198bad85 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:30 +0200 Subject: net/flow_dissector: Make dissector_uses_key() and skb_flow_dissector_target() public Will be used in a following patch to query if a key is being used, and what its value is in the target object. Acked-by: John Fastabend Acked-by: Jiri Pirko Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 8c8548cf5888..d3d60dccd19f 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -184,4 +184,17 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys); +static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + return flow_dissector->used_keys & (1 << key_id); +} + +static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id, + void *target_container) +{ + return ((char *)target_container) + flow_dissector->offset[key_id]; +} + #endif -- cgit v1.2.3 From 00175aec941e9c306d8a5ce930b2d91f7c04468c Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:31 +0200 Subject: net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef Introduce the macros tc_no_actions and tc_for_each_action to make code clearer. Extracted struct tc_action out of the ifdef to make calls to is_tcf_gact_shot() and similar functions valid, even when it is a nop. Acked-by: Jiri Pirko Acked-by: John Fastabend Suggested-by: Jiri Pirko Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/net/act_api.h | 21 ++++++++++++++++----- include/net/tc_act/tc_gact.h | 4 ++-- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 342be6c5ab5c..2a19fe111c78 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -78,11 +78,6 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) tm->lastuse = now; } -#ifdef CONFIG_NET_CLS_ACT - -#define ACT_P_CREATED 1 -#define ACT_P_DELETED 1 - struct tc_action { void *priv; const struct tc_action_ops *ops; @@ -92,6 +87,11 @@ struct tc_action { struct tcf_hashinfo *hinfo; }; +#ifdef CONFIG_NET_CLS_ACT + +#define ACT_P_CREATED 1 +#define ACT_P_DELETED 1 + struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; @@ -171,5 +171,16 @@ int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); + +#define tc_no_actions(_exts) \ + (list_empty(&(_exts)->actions)) + +#define tc_for_each_action(_a, _exts) \ + list_for_each_entry(a, &(_exts)->actions, list) +#else /* CONFIG_NET_CLS_ACT */ + +#define tc_no_actions(_exts) true +#define tc_for_each_action(_a, _exts) while (0) + #endif /* CONFIG_NET_CLS_ACT */ #endif diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 04a31830711b..93c520b83d10 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -16,9 +16,9 @@ struct tcf_gact { #define to_gact(a) \ container_of(a->priv, struct tcf_gact, common) -#ifdef CONFIG_NET_CLS_ACT static inline bool is_tcf_gact_shot(const struct tc_action *a) { +#ifdef CONFIG_NET_CLS_ACT struct tcf_gact *gact; if (a->ops && a->ops->type != TCA_ACT_GACT) @@ -28,7 +28,7 @@ static inline bool is_tcf_gact_shot(const struct tc_action *a) if (gact->tcf_action 
== TC_ACT_SHOT) return true; +#endif return false; } -#endif #endif /* __NET_TC_GACT_H */ -- cgit v1.2.3 From 519afb1813eab066a0c9995a08861fd0af75d5ae Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Tue, 8 Mar 2016 12:42:32 +0200 Subject: net/act_skbedit: Utility functions for mark action Enable device drivers to query the action, if and only if is a mark action and what value to use for marking. Acked-by: Jiri Pirko Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 0df9a0db4a8e..b496d5ad7d42 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -20,6 +20,7 @@ #define __NET_TC_SKBEDIT_H #include +#include struct tcf_skbedit { struct tcf_common common; @@ -32,4 +33,19 @@ struct tcf_skbedit { #define to_skbedit(a) \ container_of(a->priv, struct tcf_skbedit, common) +/* Return true iff action is mark */ +static inline bool is_tcf_skbedit_mark(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) + return to_skbedit(a)->flags == SKBEDIT_F_MARK; +#endif + return false; +} + +static inline u32 tcf_skbedit_mark(const struct tc_action *a) +{ + return to_skbedit(a)->mark; +} + #endif /* __NET_TC_SKBEDIT_H */ -- cgit v1.2.3 From 8208d21bf309551686b7a76d19059ae182a956d0 Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Fri, 11 Mar 2016 11:08:45 +0200 Subject: net/flower: Fix pointer cast Cast pointer to unsigned long instead of u64, to fix compilation warning on 32 bit arch, spotted by 0day build. Fixes: 5b33f48 ("net/flower: Introduce hardware offload support") Signed-off-by: Amir Vadai Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5b4e8f08b8f0..caa5e18636df 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -416,7 +416,7 @@ enum tc_fl_command { struct tc_cls_flower_offload { enum tc_fl_command command; - u64 cookie; + unsigned long cookie; struct flow_dissector *dissector; struct fl_flow_key *mask; struct fl_flow_key *key; -- cgit v1.2.3 From 134611446dc657e1bbc73ca0e4e6b599df687db0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:02 +0100 Subject: ip_tunnel: add support for setting flow label via collect metadata This patch extends udp_tunnel6_xmit_skb() to pass in the IPv6 flow label from call sites. Currently, there's no such option and it's always set to zero when writing ip6_flow_hdr(). Add a label member to ip_tunnel_key, so that flow-based tunnels via collect metadata frontends can make use of it. vxlan and geneve will be converted to add flow label support separately. Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/net/dst_metadata.h | 5 ++++- include/net/ip_tunnels.h | 4 +++- include/net/udp_tunnel.h | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 84b833af6882..5db9f5910428 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -126,7 +126,7 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, ip_tunnel_key_init(&tun_dst->u.tun_info.key, iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, 0, tunnel_id, flags); + 0, 0, 0, tunnel_id, flags); return tun_dst; } @@ -152,8 +152,11 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.u.ipv6.src = ip6h->saddr; info->key.u.ipv6.dst = ip6h->daddr; + info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; + info->key.label = ip6_flowlabel(ip6h); + return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0acd80fadb32..5dc2e454f866 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -48,6 +48,7 @@ struct ip_tunnel_key { __be16 tun_flags; u8 tos; /* TOS for IPv4, TC for IPv6 */ u8 ttl; /* TTL for IPv4, HL for IPv6 */ + __be32 label; /* Flow Label for IPv6 */ __be16 tp_src; __be16 tp_dst; }; @@ -181,7 +182,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, + u8 tos, u8 ttl, __be32 label, __be16 tp_src, __be16 tp_dst, __be64 tun_id, __be16 tun_flags) { @@ -192,6 +193,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); key->tos = tos; key->ttl = ttl; + key->label = label; key->tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 97f5adb121a6..b83114077cee 100644 --- 
a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -88,8 +88,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck); + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck); #endif void udp_tunnel_sock_release(struct socket *sock); -- cgit v1.2.3 From e7f70af111f086a20800ad2e17f544b2e3e0f375 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 9 Mar 2016 03:00:03 +0100 Subject: vxlan: support setting IPv6 flow label This work adds support for setting the IPv6 flow label for vxlan per device and through collect metadata (ip_tunnel_key) frontends. The vxlan dst cache does not need any special considerations here, for the cases where caches can be used, the label is static per cache. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/vxlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6eda4ed4d78b..a763c96ecde4 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -162,6 +162,7 @@ struct vxlan_config { u16 port_max; u8 tos; u8 ttl; + __be32 label; u32 flags; unsigned long age_interval; unsigned int addrmax; -- cgit v1.2.3 From 338039635d01524090e7bd706a3e555e20d5b337 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 9 Mar 2016 09:25:26 -0800 Subject: csum: Update csum_block_add to use rotate instead of byteswap The code for csum_block_add was doing a funky byteswap to swap the even and odd bytes of the checksum if the offset was odd. Instead of doing this we can save ourselves some trouble and just shift by 8 as this should have the same effect in terms of the final checksum value and only requires one instruction. In addition we can update csum_block_sub to just use csum_block_add with a inverse value for csum2. 
This way we follow the same code path as csum_block_add without having to duplicate it. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/checksum.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/checksum.h b/include/net/checksum.h index abffc64e7300..5c30891e84e5 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -88,8 +88,11 @@ static inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { u32 sum = (__force u32)csum2; - if (offset&1) - sum = ((sum&0xFF00FF)<<8)+((sum>>8)&0xFF00FF); + + /* rotate sum to align it with a 16b boundary */ + if (offset & 1) + sum = ror32(sum, 8); + return csum_add(csum, (__force __wsum)sum); } @@ -102,10 +105,7 @@ csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) static inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { - u32 sum = (__force u32)csum2; - if (offset&1) - sum = ((sum&0xFF00FF)<<8)+((sum>>8)&0xFF00FF); - return csum_sub(csum, (__force __wsum)sum); + return csum_block_add(csum, ~csum2, offset); } static inline __wsum csum_unfold(__sum16 n) -- cgit v1.2.3 From cea8768f333e3f0bc231d8b815aa4a9e63fa990c Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 10 Mar 2016 18:33:07 -0300 Subject: sctp: allow sctp_transmit_packet and others to use gfp Currently sctp_sendmsg() triggers some calls that will allocate memory with GFP_ATOMIC even when not necessary. In the case of sctp_packet_transmit it will allocate a linear skb that will be used to construct the packet and this may cause sends to fail due to ENOMEM more often than anticipated specially with big MTUs. This patch thus allows it to inherit gfp flags from upper calls so that it can use GFP_KERNEL if it was triggered by a sctp_sendmsg call or similar. All others, like retransmits or flushes started from BH, are still allocated using GFP_ATOMIC. 
In netperf tests this didn't result in any performance drawbacks when memory is not too fragmented and made it trigger ENOMEM way less often. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 +- include/net/sctp/structs.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 487ef34bbd63..efc01743b9d6 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -201,7 +201,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *, struct sctp_chunk * sctp_make_datafrag_empty(struct sctp_association *, const struct sctp_sndrcvinfo *sinfo, int len, const __u8 flags, - __u16 ssn); + __u16 ssn, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *, const __u32); struct sctp_chunk *sctp_make_sack(const struct sctp_association *); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d05b56641abc..9d237669c52c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -655,7 +655,7 @@ void sctp_chunk_free(struct sctp_chunk *); void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data); struct sctp_chunk *sctp_chunkify(struct sk_buff *, const struct sctp_association *, - struct sock *); + struct sock *, gfp_t gfp); void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); @@ -717,10 +717,10 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *, __u16 sport, __u16 dport); struct sctp_packet *sctp_packet_config(struct sctp_packet *, __u32 vtag, int); sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *, - struct sctp_chunk *, int); + struct sctp_chunk *, int, gfp_t); sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *, struct sctp_chunk *); -int sctp_packet_transmit(struct sctp_packet *); +int sctp_packet_transmit(struct 
sctp_packet *, gfp_t); void sctp_packet_free(struct sctp_packet *); static inline int sctp_packet_empty(struct sctp_packet *packet) @@ -1053,7 +1053,7 @@ struct sctp_outq { void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); -int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk); +int sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_outq_restart(struct sctp_outq *); @@ -1061,7 +1061,7 @@ void sctp_outq_restart(struct sctp_outq *); void sctp_retransmit(struct sctp_outq *, struct sctp_transport *, sctp_retransmit_reason_t); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); -int sctp_outq_uncork(struct sctp_outq *); +int sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); /* Uncork and flush an outqueue. */ static inline void sctp_outq_cork(struct sctp_outq *q) { -- cgit v1.2.3 From 1e94082963747b551b129528714827f76a090e93 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 11 Mar 2016 14:05:41 -0800 Subject: ipv6: Pass proto to csum_ipv6_magic as __u8 instead of unsigned short This patch updates csum_ipv6_magic so that it correctly recognizes that protocol is an unsigned 8-bit value. This will allow us to better understand what limitations may or may not be present in how we handle the data. For example, there are a number of places that call htonl on the protocol value. This is likely not necessary and can be replaced with a multiplication by ntohl(1) which will be converted to a shift by the compiler. Signed-off-by: Alexander Duyck Signed-off-by: David S. 
Miller --- include/net/ip6_checksum.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index 1a49b73f7f6e..cca840584c88 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -37,8 +37,7 @@ #ifndef _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, - __u32 len, unsigned short proto, - __wsum csum); + __u32 len, __u8 proto, __wsum csum); #endif static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto) -- cgit v1.2.3 From 8cb2d8bf57e6e004c37db2fb4ce74f4d032b7cd0 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Mon, 14 Mar 2016 09:39:04 +0100 Subject: net: add a hardware buffer management helper API This basic implementation allows sharing code between drivers using hardware buffer management. As the code is hardware agnostic, there are few helpers; most of the optimization brought by a HW BM has to be done at driver level. Tested-by: Sebastian Careba Signed-off-by: Gregory CLEMENT Signed-off-by: David S. 
Miller --- include/net/hwbm.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 include/net/hwbm.h (limited to 'include/net') diff --git a/include/net/hwbm.h b/include/net/hwbm.h new file mode 100644 index 000000000000..47d08662501b --- /dev/null +++ b/include/net/hwbm.h @@ -0,0 +1,28 @@ +#ifndef _HWBM_H +#define _HWBM_H + +struct hwbm_pool { + /* Capacity of the pool */ + int size; + /* Size of the buffers managed */ + int frag_size; + /* Number of buffers currently used by this pool */ + int buf_num; + /* constructor called during alocation */ + int (*construct)(struct hwbm_pool *bm_pool, void *buf); + /* protect acces to the buffer counter*/ + spinlock_t lock; + /* private data */ + void *priv; +}; +#ifdef CONFIG_HWBM +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf); +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp); +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp); +#else +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) {} +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) { return 0; } +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp) +{ return 0; } +#endif /* CONFIG_HWBM */ +#endif /* _HWBM_H */ -- cgit v1.2.3 From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 14 Mar 2016 10:52:15 -0700 Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In Per RFC4898, they count segments sent/received containing a positive length data segment (that includes retransmission segments carrying data). Unlike tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments carrying no data (e.g. pure ack). The patch also updates the segs_in in tcp_fastopen_add_skb() so that segs_in >= data_segs_in property is kept. Together with retransmission data, tcpi_data_segs_out gives a better signal on the rxmit rate. 
v6: Rebase on the latest net-next v5: Eric pointed out that checking skb->len is still needed in tcp_fastopen_add_skb() because skb can carry a FIN without data. Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in() helper is used. Comment is added to the fastopen case to explain why segs_in has to be reset and tcp_segs_in() has to be called before __skb_pull(). v4: Add comment to the changes in tcp_fastopen_add_skb() and also add remark on this case in the commit message. v3: Add const modifier to the skb parameter in tcp_segs_in() v2: Rework based on recent fix by Eric: commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment") Signed-off-by: Martin KaFai Lau Cc: Chris Rapier Cc: Eric Dumazet Cc: Marcelo Ricardo Leitner Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0302636af98c..c8dbd293daae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1840,4 +1840,14 @@ static inline int tcp_inq(struct sock *sk) return answ; } +static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) +{ + u16 segs_in; + + segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tp->segs_in += segs_in; + if (skb->len > tcp_hdrlen(skb)) + tp->data_segs_in += segs_in; +} + #endif /* _TCP_H */ -- cgit v1.2.3 From 71327a4e7d997276d49db92fd3d30008389ee6d5 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sun, 13 Mar 2016 16:21:32 -0400 Subject: net: dsa: rename port_*_bridge routines Rename DSA port_join_bridge and port_leave_bridge routines to respectively port_bridge_join and port_bridge_leave in order to respect an implicit Port::Bridge namespace. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/dsa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 26c0a3fa009a..004e034184c1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,9 +296,9 @@ struct dsa_switch_driver { /* * Bridge integration */ - int (*port_join_bridge)(struct dsa_switch *ds, int port, + int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); - int (*port_leave_bridge)(struct dsa_switch *ds, int port); + int (*port_bridge_leave)(struct dsa_switch *ds, int port); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); -- cgit v1.2.3 From 16bfa7024eba5e36aff38ba62086b9027373007d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sun, 13 Mar 2016 16:21:33 -0400 Subject: net: dsa: make port_bridge_leave return void netdev_upper_dev_unlink() which notifies NETDEV_CHANGEUPPER, returns void, as well as del_nbp(). So there's no advantage to catch an eventual error from the port_bridge_leave routine at the DSA level. Make this routine void for the DSA layer and its existing drivers. Signed-off-by: Vivien Didelot Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 004e034184c1..6463bb2863ac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -298,7 +298,7 @@ struct dsa_switch_driver { */ int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); - int (*port_bridge_leave)(struct dsa_switch *ds, int port); + void (*port_bridge_leave)(struct dsa_switch *ds, int port); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); -- cgit v1.2.3 From 808c1b697c3c4dd2a7132882424c390b0d0acfb9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Mar 2016 01:42:50 +0100 Subject: bpf, dst: add and use dst_tclassid helper We can just add a small helper dst_tclassid() for retrieving the dst->tclassid value. It makes the code a bit better in that we can get rid of the ifdef from filter.c by moving this into the header. Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/net/dst.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index c7329dcd90cc..5c98443c1c9e 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -398,6 +398,18 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, __skb_tunnel_rx(skb, dev, net); } +static inline u32 dst_tclassid(const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_ROUTE_CLASSID + const struct dst_entry *dst; + + dst = skb_dst(skb); + if (dst) + return dst->tclassid; +#endif + return 0; +} + int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static inline int dst_discard(struct sk_buff *skb) { -- cgit v1.2.3 From fca5fdf67de9e092fda23c9eb059ba968e7b5267 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Mar 2016 01:42:51 +0100 Subject: ip_tunnels, bpf: define IP_TUNNEL_OPTS_MAX and use it eBPF defines this as BPF_TUNLEN_MAX and OVS just uses the hard-coded value inside struct sw_flow_key. Thus, add and use IP_TUNNEL_OPTS_MAX for this, which makes the code a bit more generic and allows to remove BPF_TUNLEN_MAX from eBPF code. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 5dc2e454f866..c35dda9ec991 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -7,6 +7,8 @@ #include #include #include +#include + #include #include #include @@ -57,6 +59,11 @@ struct ip_tunnel_key { #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ #define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ +/* Maximum tunnel options length. 
*/ +#define IP_TUNNEL_OPTS_MAX \ + GENMASK((FIELD_SIZEOF(struct ip_tunnel_info, \ + options_len) * BITS_PER_BYTE) - 1, 0) + struct ip_tunnel_info { struct ip_tunnel_key key; #ifdef CONFIG_DST_CACHE -- cgit v1.2.3 From fe30937b65354c7fec244caebbdaae68e28ca797 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Mar 2016 17:23:36 -0700 Subject: bonding: fix bond_get_stats() bond_get_stats() can be called from rtnetlink (with RTNL held) or from /proc/net/dev seq handler (with RCU held) The logic added in commit 5f0c5f73e5ef ("bonding: make global bonding stats more reliable") kind of assumed only one cpu could run there. If multiple threads are reading /proc/net/dev, stats can be really messed up after a while. A second problem is that some fields are 32bit, so we need to properly handle the wrap around problem. Given that RTNL is not always held, we need to use bond_for_each_slave_rcu(). Fixes: 5f0c5f73e5ef ("bonding: make global bonding stats more reliable") Signed-off-by: Eric Dumazet Cc: Andy Gospodarek Cc: Jay Vosburgh Cc: Veaceslav Falico Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/bonding.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index ee6c52053aa3..791800ddd6d9 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -215,6 +215,7 @@ struct bonding { * ALB mode (6) - to sync the use and modifications of its hash table */ spinlock_t mode_lock; + spinlock_t stats_lock; u8 send_peer_notif; u8 igmp_retrans; #ifdef CONFIG_PROC_FS -- cgit v1.2.3