From 7016e0627171878810798a842a416dddee4e3329 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 13 Sep 2017 13:58:15 -0700 Subject: net: Convert int functions to bool Global function ipv6_rcv_saddr_equal and static functions ipv6_rcv_saddr_equal and ipv4_rcv_saddr_equal currently return int. bool is slightly more descriptive for these functions so change their return type from int to bool. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index f44ff2476758..87981cd63180 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -94,8 +94,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); -int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, - bool match_wildcard); +bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, + bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); -- cgit v1.2.3 From bffa72cf7f9df842f0016ba03586039296b4caaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 05:14:24 -0700 Subject: net: sk_buff rbnode reorg skb->rbnode shares space with skb->next, skb->prev and skb->tstamp Current uses (TCP receive ofo queue and netem) need to save/restore tstamp, while skb->dev is either NULL (TCP) or a constant for a given queue (netem). Since we plan using an RB tree for TCP retransmit queue to speedup SACK processing with large BDP, this patch exchanges skb->dev and skb->tstamp. This saves some overhead in both TCP and netem. v2: removes the swtstamp field from struct tcp_skb_cb Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Cc: Wei Wang Cc: Willem de Bruijn Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index b510f284427a..49a8a46466f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -797,12 +797,6 @@ struct tcp_skb_cb { u16 tcp_gso_segs; u16 tcp_gso_size; }; - - /* Used to stash the receive timestamp while this skb is in the - * out of order queue, as skb->tstamp is overwritten by the - * rbnode. - */ - ktime_t swtstamp; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ -- cgit v1.2.3 From f5619866592c65adc087364cc1a3ba709201ea26 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 19 Sep 2017 11:56:57 -0400 Subject: net: dsa: remove copy of master ethtool_ops There is no need to store a copy of the master ethtool ops, storing the original pointer in DSA and the new one in the master netdev itself is enough. In the meantime, set orig_ethtool_ops to NULL when restoring the master ethtool ops and check the presence of the master original ethtool ops as well as its needed functions before calling them. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index dd44d6ce1097..8dee216a5a9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -188,7 +188,6 @@ struct dsa_port { /* * Original copy of the master netdev ethtool_ops */ - struct ethtool_ops ethtool_ops; const struct ethtool_ops *orig_ethtool_ops; }; -- cgit v1.2.3 From 752fbcc33405d6f8249465e4b2c4e420091bb825 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 19 Sep 2017 13:15:42 -0700 Subject: net_sched: no need to free qdisc in RCU callback gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), the caller no longer needs to wait for a grace period. So this patch gets rid of it. Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 135f5a2dd931..684d8ed27eaa 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -93,7 +93,6 @@ struct Qdisc { unsigned long state; struct Qdisc *next_sched; struct sk_buff *skb_bad_txq; - struct rcu_head rcu_head; int padded; refcount_t refcnt; -- cgit v1.2.3 From a90c9347e90ed1e9323d71402ed18023bc910cd8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:06 -0700 Subject: ipv6: addrlabel: per netns list Having a global list of labels do not scale to thousands of netns in the cloud era. This causes quadratic behavior on netns creation and deletion. This is time having a per netns list of ~10 labels. Tested: $ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done) [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ] real 0m20.837s # instead of 0m24.227s user 0m0.328s sys 0m20.338s # instead of 0m23.753s 16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered 12.30% ip [kernel.kallsyms] [k] netlink_has_listeners 6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave 5.78% ip [kernel.kallsyms] [k] memset_erms 5.77% ip [kernel.kallsyms] [k] kobject_uevent_env 5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test 4.96% ip [kernel.kallsyms] [k] _raw_read_lock 3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero 3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 2.11% ip [kernel.kallsyms] [k] unmap_page_range 1.77% ip [kernel.kallsyms] [k] __wake_up 1.69% ip [kernel.kallsyms] [k] strlen 1.17% ip [kernel.kallsyms] [k] __wake_up_common 1.09% ip [kernel.kallsyms] [k] insert_header 1.04% ip [kernel.kallsyms] [k] page_remove_rmap 1.01% ip [kernel.kallsyms] [k] consume_skb 0.98% ip [kernel.kallsyms] [k] netlink_trim 0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling 0.51% ip [kernel.kallsyms] [k] filemap_map_pages 0.46% ip [kernel.kallsyms] [k] memcpy_erms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2544f9760a42..2ea1ed341ef8 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -89,6 +89,11 @@ struct netns_ipv6 { atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; + struct { + struct hlist_head head; + spinlock_t lock; + u32 seq; + } ip6addrlbl_table; }; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) -- cgit v1.2.3 From 64bc17811b72758753e2b64cd8f2a63812c61fe1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Sep 2017 16:27:09 -0700 Subject: ipv4: speedup ipv6 tunnels dismantle Implement exit_batch() method to dismantle more devices per round. (rtnl_lock() ... unregister_netdevice_many() ... rtnl_unlock()) Tested: $ cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait ; grep net_namespace /proc/slabinfo Before patch : $ time ./add_del_unshare.sh net_namespace 126 282 5504 1 2 : tunables 8 4 0 : slabdata 126 282 0 real 1m38.965s user 0m0.688s sys 0m37.017s After patch: $ time ./add_del_unshare.sh net_namespace 135 291 5504 1 2 : tunables 8 4 0 : slabdata 135 291 0 real 0m22.117s user 0m0.728s sys 0m35.328s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 992652856fe8..b41a1e057fce 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -258,7 +258,8 @@ int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); -void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); +void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, + struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); -- cgit v1.2.3 From 2512b1b18d0748d867bb22387db7c86b903291ad Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sat, 5 Aug 2017 11:44:31 +0300 Subject: mac80211: extend ieee80211_ie_split to support EXTENSION Current ieee80211_ie_split() implementation doesn't account for elements that are sub-elements of the EXTENSION IE. To extend support to these IEs as well, treat the WLAN_EID_EXTENSION ids in the %ids array as indicating that the next id in the array is a sub-element of the EXTENSION IE. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f12fa5245a45..aa9d993e519a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5934,7 +5934,8 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, * @ies: the IE buffer * @ielen: the length of the IE buffer * @ids: an array with element IDs that are allowed before - * the split + * the split. A WLAN_EID_EXTENSION value means that the next + * EID in the list is a sub-element of the EXTENSION IE. * @n_ids: the size of the element ID array * @after_ric: array IE types that come after the RIC element * @n_after_ric: size of the @after_ric array @@ -5965,7 +5966,8 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, * @ies: the IE buffer * @ielen: the length of the IE buffer * @ids: an array with element IDs that are allowed before - * the split + * the split. A WLAN_EID_EXTENSION value means that the next + * EID in the list is a sub-element of the EXTENSION IE. * @n_ids: the size of the element ID array * @offset: offset where to start splitting in the buffer * -- cgit v1.2.3 From 1272c5d89b597995cb10db87dd4a1adc91d36006 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 18 Aug 2017 15:33:56 +0300 Subject: mac80211: add documentation to ieee80211_rx_ba_offl() Add documentation to ieee80211_rx_ba_offl() function and, while at it, rename the bit argument to tid, for consistency. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 885690fa39c8..cc9073e45be9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5441,8 +5441,14 @@ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, */ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn); +/** + * ieee80211_manage_rx_ba_offl - helper to queue an RX BA work + * @vif: &struct ieee80211_vif pointer from the add_interface callback + * @addr: station mac address + * @tid: the rx tid + */ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, const u8 *addr, - unsigned int bit); + unsigned int tid); /** * ieee80211_start_rx_ba_session_offl - start a Rx BA session -- cgit v1.2.3 From a6bcda44843c6dfced0fb973e2607c2a98addfa9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Sep 2017 11:52:43 +0200 Subject: cfg80211: remove unused function ieee80211_data_from_8023() This function hasn't been used since the removal of iwmc3200wifi in 2012. It also appears to have a bug when qos=True, since then it'll copy uninitialized stack memory to the SKB. Just remove the function entirely. Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index aa9d993e519a..cc1996081463 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4346,19 +4346,6 @@ static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype); } -/** - * ieee80211_data_from_8023 - convert an 802.3 frame to 802.11 - * @skb: the 802.3 frame - * @addr: the device MAC address - * @iftype: the virtual interface type - * @bssid: the network bssid (used only for iftype STATION and ADHOC) - * @qos: build 802.11 QoS data frame - * Return: 0 on success, or a negative error code. - */ -int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, - enum nl80211_iftype iftype, const u8 *bssid, - bool qos); - /** * ieee80211_amsdu_to_8023s - decode an IEEE 802.11n A-MSDU frame * -- cgit v1.2.3 From 6e617de84e87d626d1e976fc30e1322239fd4d2d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Sep 2017 18:26:53 +0200 Subject: net: avoid a full fib lookup when rp_filter is disabled. Since commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") a full fib lookup is needed even if the rp_filter is disabled, if accept_local is false - which is the default. What we really need in the above scenario is just checking that the source IP address is not local, and in most case we can do that is a cheaper way looking up the ifaddr hash table. This commit adds a helper for such lookup, and uses it to validate the src address when rp_filter is disabled and no 'local' routes are created by the user space in the relevant namespace. A new ipv4 netns flag is added to account for such routes. We need that to preserve the same behavior we had before this patch. It also drops the checks to bail early from __fib_validate_source, added by the commit 1dced6a85482 ("ipv4: Restore accept_local behaviour in fib_validate_source()") they do not give any measurable performance improvement: if we do the lookup with are on a slower path. This improves UDP performances for unconnected sockets when rp_filter is disabled by 5% and also gives small but measurable performance improvement for TCP flood scenarios. v1 -> v2: - use the ifaddr lookup helper in __ip_dev_find(), as suggested by Eric - fall-back to full lookup if custom local routes are present Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20d061c805e3..20720721da4b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; + bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif -- cgit v1.2.3 From a1f3316dd7b5ce740c774697c664e2e60d095794 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Sep 2017 18:18:23 -0700 Subject: ipv4: Move fib_has_custom_local_routes outside of IP_MULTIPLE_TABLES. > net/ipv4/fib_frontend.c: In function 'fib_validate_source': > net/ipv4/fib_frontend.c:411:16: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > if (net->ipv4.fib_has_custom_local_routes) > ^ > net/ipv4/fib_frontend.c: In function 'inet_rtm_newroute': > net/ipv4/fib_frontend.c:773:12: error: 'struct netns_ipv4' has no member named 'fib_has_custom_local_routes' > net->ipv4.fib_has_custom_local_routes = true; > ^ Fixes: 6e617de84e87 ("net: avoid a full fib lookup when rp_filter is disabled.") Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 20720721da4b..8387f099115e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,10 +49,10 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; - bool fib_has_custom_local_routes; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif + bool fib_has_custom_local_routes; #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; #endif -- cgit v1.2.3 From 373b8eeb0c15d4ce58f62afb12f213b1b5bbc3d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:45:43 +0300 Subject: xfrm: make aead_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f002a2c5e33c..0be4c547e383 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1764,7 +1764,7 @@ static inline int xfrm_acquire_is_on(struct net *net) } #endif -static inline int aead_len(struct xfrm_algo_aead *alg) +static inline unsigned int aead_len(struct xfrm_algo_aead *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -- cgit v1.2.3 From 06cd22f830f28023b82455c82c7db65fc6cf9c16 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:46:30 +0300 Subject: xfrm: make xfrm_alg_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0be4c547e383..2abc0e117f11 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1769,7 +1769,7 @@ static inline unsigned int aead_len(struct xfrm_algo_aead *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_alg_len(const struct xfrm_algo *alg) +static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -- cgit v1.2.3 From 1bd963a72e859d194d87a5a2a8839efee7e23102 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:09 +0300 Subject: xfrm: make xfrm_alg_auth_len() return unsigned int Key lengths can't be negative. Comparison with nla_len() is left signed just in case negative value can sneak in there. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2abc0e117f11..5d5e11b653eb 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1774,7 +1774,7 @@ static inline unsigned int xfrm_alg_len(const struct xfrm_algo *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) +static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) { return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -- cgit v1.2.3 From 5e708e47c44366453c33373940455a75fd33f635 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 21 Sep 2017 23:47:50 +0300 Subject: xfrm: make xfrm_replay_state_esn_len() return unsigned int Replay detection bitmaps can't have negative length. Comparisons with nla_len() are left signed just in case negative value can sneak in there. Propagate unsignedness for code size savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-38 (-38) function old new delta xfrm_state_construct 1802 1800 -2 xfrm_update_ae_params 295 289 -6 xfrm_state_migrate 1345 1339 -6 xfrm_replay_notify_esn 349 337 -12 xfrm_replay_notify_bmp 345 333 -12 Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5d5e11b653eb..3cb618bbcfa5 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1779,7 +1779,7 @@ static inline unsigned int xfrm_alg_auth_len(const struct xfrm_algo_auth *alg) return sizeof(*alg) + ((alg->alg_key_len + 7) / 8); } -static inline int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) +static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_esn *replay_esn) { return sizeof(*replay_esn) + replay_esn->bmp_len * sizeof(__u32); } -- cgit v1.2.3 From e451ae8e4f6b3f6bd3b83a5595657b5421b3bf69 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:01:06 +0300 Subject: neigh: make struct neigh_table::entry_size unsigned int Neigh entry size can't be negative. Space savings: add/remove: 0/0 grow/shrink: 0/5 up/down: 0/-7 (-7) function old new delta lowpan_neigh_construct 25 24 -1 clip_seq_sub_iter 152 151 -1 clip_ioctl 1475 1474 -1 clip_constructor 93 92 -1 __neigh_create 2455 2452 -3 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9816df225af3..9a25512e0a6e 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -190,7 +190,7 @@ struct neigh_hash_table { struct neigh_table { int family; - int entry_size; + unsigned int entry_size; int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, -- cgit v1.2.3 From 01ccdf126ca5f9d4fe0889f65ee67afac910f19c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 23 Sep 2017 23:03:04 +0300 Subject: neigh: make strucrt neigh_table::entry_size unsigned int Key length can't be negative. Leave comparisons against nla_len() signed just in case truncated attribute can sneak in there. Space savings: add/remove: 0/0 grow/shrink: 0/7 up/down: 0/-7 (-7) function old new delta pneigh_delete 273 272 -1 mlx5e_rep_netevent_event 1415 1414 -1 mlx5e_create_encap_header_ipv6 1194 1193 -1 mlx5e_create_encap_header_ipv4 1071 1070 -1 cxgb4_l2t_get 1104 1103 -1 __pneigh_lookup 69 68 -1 __neigh_create 2452 2451 -1 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a25512e0a6e..2492000e1035 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -191,7 +191,7 @@ struct neigh_hash_table { struct neigh_table { int family; unsigned int entry_size; - int key_len; + unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, -- cgit v1.2.3 From 3b8e9238a8d194e82f0202a5fb68a63686ebe420 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 25 Sep 2017 10:58:21 +0200 Subject: net: sched: introduce helper to identify gact pass action Introduce a helper called is_tcf_gact_pass which could be used to tell if the action is gact pass or not. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 41afe1ce7b16..d979a0d48f9e 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -33,6 +33,11 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, return false; } +static inline bool is_tcf_gact_ok(const struct tc_action *a) +{ + return __is_tcf_gact_act(a, TC_ACT_OK, false); +} + static inline bool is_tcf_gact_shot(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_SHOT, false); -- cgit v1.2.3 From 85e482285bbbd508483cbe08de69c8fe00cdbbfe Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:11 +0200 Subject: fib: notifier: Add VIF add and delete event types In order for an interface to forward packets according to the kernel multicast routing table, it must be configured with a VIF index according to the mroute user API. The VIF index is then used to refer to that interface in the mroute user API, for example, to set the iif and oifs of an MFC entry. In order to allow drivers to be aware and offload multicast routes, they have to be aware of the VIF add and delete notifications. Due to the fact that a specific VIF can be deleted and re-added pointing to another netdevice, and the MFC routes that point to it will forward the matching packets to the new netdevice, a driver willing to offload MFC cache entries must be aware of the VIF add and delete events in addition to MFC routes notifications. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 669b9716dc7a..54cd6b839d2f 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -20,6 +20,8 @@ enum fib_event_type { FIB_EVENT_RULE_DEL, FIB_EVENT_NH_ADD, FIB_EVENT_NH_DEL, + FIB_EVENT_VIF_ADD, + FIB_EVENT_VIF_DEL, }; struct fib_notifier_ops { -- cgit v1.2.3 From 4d65b9487831170e699b2fc64a91b839d729bd78 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Wed, 27 Sep 2017 08:23:13 +0200 Subject: ipmr: Add FIB notification access functions Make the ipmr module register as a FIB notifier. To do that, implement both the ipmr_seq_read and ipmr_dump ops. The ipmr_seq_read op returns a sequence counter that is incremented on every notification related operation done by the ipmr. To implement that, add a sequence counter in the netns_ipv4 struct and increment it whenever a new MFC route or VIF are added or deleted. The sequence operations are protected by the RTNL lock. The ipmr_dump iterates the list of MFC routes and the list of VIF entries and sends notifications about them. The entries dump is done under RCU where the VIF dump uses the mrt_lock too, as the vif->dev field can change under RCU. Signed-off-by: Yotam Gigi Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8387f099115e..abc84d986da4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -163,6 +163,9 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* protected by rtnl_mutex */ + struct fib_notifier_ops *ipmr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif -- cgit v1.2.3 From c7c3e5913bf18eda3cf38932bebdce48351baac9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Sep 2017 19:08:00 -0700 Subject: net: ipv4: remove fib_weight fib_weight in fib_info is set but not used. Remove it and the helpers for setting it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1a7f7e424320..f80524396c06 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -122,9 +122,6 @@ struct fib_info { #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_weight; -#endif struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev -- cgit v1.2.3 From 152402483ed75b167d5628d414e876ffa7a6d4c4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:18 -0400 Subject: net: dsa: add tagging ops to port The DSA tagging protocol operations are specific to each CPU port, thus the dsa_device_ops pointer belongs to the dsa_port structure. >From now on assign a slave's xmit copy from its CPU port tagging operations. This will ease the future support for multiple CPU ports. Also keep the tag_ops at the beginning of the dsa_port structure so that we ensure copies for hot path are in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8dee216a5a9b..4d1df2f086e8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -175,6 +175,9 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* CPU port tagging operations used by master or slave devices */ + const struct dsa_device_ops *tag_ops; + struct dsa_switch *ds; unsigned int index; const char *name; -- cgit v1.2.3 From 3e41f93b358a8800194b87995ad076fc50919719 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:19 -0400 Subject: net: dsa: prepare master receive hot path In preparation to make DSA master devices point to their corresponding CPU port instead of the whole tree, add copies of dst and rcv in the dsa_port structure so that we keep fast access in the receive hot path. Also keep the copies at the beginning of the dsa_port structure in order to ensure they are available in cacheline 1. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4d1df2f086e8..6bda01fa5747 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -178,6 +178,11 @@ struct dsa_port { /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; + /* Copies for faster access in master receive hot path */ + struct dsa_switch_tree *dst; + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt); + struct dsa_switch *ds; unsigned int index; const char *name; -- cgit v1.2.3 From aa193d9b1d7ea6893ce24a9d141f676950563987 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 29 Sep 2017 17:19:21 -0400 Subject: net: dsa: remove tag ops from the switch tree Now that the dsa_ptr is a dsa_port instance, there is no need to keep the tag operations in the dsa_switch_tree structure. Remove it. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6bda01fa5747..10dceccd9ce8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -130,11 +130,6 @@ struct dsa_switch_tree { */ struct dsa_platform_data *pd; - /* Copy of tag_ops->rcv for faster access in hot path */ - struct sk_buff * (*rcv)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt); - /* * The switch port to which the CPU is attached. */ @@ -144,12 +139,6 @@ struct dsa_switch_tree { * Data for the individual switch chips. */ struct dsa_switch *ds[DSA_MAX_SWITCHES]; - - /* - * Tagging protocol operations for adding and removing an - * encapsulation tag. - */ - const struct dsa_device_ops *tag_ops; }; /* TC matchall action types, only mirroring for now */ -- cgit v1.2.3 From e1cfcbe82b4534bd0f99fef92a6d33843fd85e0e Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:40 +0800 Subject: ipv4: Namespaceify tcp_fastopen knob Different namespace application might require enable TCP Fast Open feature independently of the host. This patch series continues making more of the TCP Fast Open related sysctl knobs be per net-namespace. Reported-by: Luca BRUNO Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index abc84d986da4..16420ccaef15 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; + int sysctl_tcp_fastopen; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index 770b608c8439..9e414a99034f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -240,7 +240,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_fastopen; extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; -- cgit v1.2.3 From dd000598a39b6937fcefdf143720ec9fb5250e72 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:41 +0800 Subject: ipv4: Remove the 'publish' logic in tcp_fastopen_init_key_once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'publish' logic is not necessary after commit dfea2aa65424 ("tcp: Do not call tcp_fastopen_reset_cipher from interrupt context"), because in tcp_fastopen_cookie_gen,it wouldn't call tcp_fastopen_init_key_once. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9e414a99034f..d9376e2458e9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,7 +1555,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(bool publish); +void tcp_fastopen_init_key_once(void); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); -- cgit v1.2.3 From 437138485656c41e32b8c63c0987cfa0348be0e6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:42 +0800 Subject: ipv4: Namespaceify tcp_fastopen_key knob Different namespace application might require different tcp_fastopen_key independently of the host. David Miller pointed out there is a leak without releasing the context of tcp_fastopen_key during netns teardown. So add the release action in exit_batch path. Tested: 1. Container namespace: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 2817fff2-f803cf97-eadfd1f3-78c0992b cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: 1e5dd82a8c492ca9 2. Host: # cat /proc/sys/net/ipv4/tcp_fastopen_key: 107d7c5f-68eb2ac7-02fb06e6-ed341702 cookie key in tcp syn packets: Fast Open Cookie Kind: TCP Fast Open Cookie (34) Length: 10 Fast Open Cookie: e213c02bf0afbc8a Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 4 ++++ include/net/tcp.h | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 16420ccaef15..7bb9603ff66c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -36,6 +36,8 @@ struct inet_timewait_death_row { int sysctl_max_tw_buckets; }; +struct tcp_fastopen_context; + struct netns_ipv4 { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; @@ -129,6 +131,8 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; + spinlock_t tcp_fastopen_ctx_lock; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/include/net/tcp.h b/include/net/tcp.h index d9376e2458e9..6d25d8305054 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1549,13 +1549,13 @@ struct tcp_fastopen_request { }; void tcp_free_fastopen_req(struct tcp_sock *tp); -extern struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; -int tcp_fastopen_reset_cipher(void *key, unsigned int len); +void tcp_fastopen_ctx_destroy(struct net *net); +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc); -void tcp_fastopen_init_key_once(void); +void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); -- cgit v1.2.3 From 3733be14a32bae288b61ed28341e593baba983af Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 27 Sep 2017 11:35:43 +0800 Subject: ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob Different namespace application might require different time period in second to disable Fastopen on active TCP sockets. Tested: Simulate following similar situation that the server's data gets dropped after 3WHS. C ---- syn-data ---> S C <--- syn/ack ----- S C ---- ack --------> S S (accept & write) C? X <- data ------ S [retry and timeout] And then print netstat of TCPFastOpenBlackhole, the counter increased as expected when the firewall blackhole issue is detected and active TFO is disabled. # cat /proc/net/netstat | awk '{print $91}' TCPFastOpenBlackhole 1 Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7bb9603ff66c..2c4222a5d102 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,9 @@ struct netns_ipv4 { int sysctl_tcp_fastopen; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; + unsigned int sysctl_tcp_fastopen_blackhole_timeout; + atomic_t tfo_active_disable_times; + unsigned long tfo_active_disable_stamp; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; -- cgit v1.2.3 From b80ccfe9bbcac70e66fdfaef73f0988a27f9a68c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 26 Sep 2017 20:37:22 -0700 Subject: net-ipv6: remove unused IP6_ECN_clear() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function is unused, and furthermore it is buggy since it suffers from the same issue that requires IP6_ECN_set_ce() to take a pointer to the skb so that it may (in case of CHECKSUM_COMPLETE) update skb->csum Instead of fixing it, let's just outright remove it. Tested: builds, and 'git grep IP6_ECN_clear' comes up empty Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/net/inet_ecn.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index dce2d586d9ce..f5ff16d72fe6 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -133,11 +133,6 @@ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) return 1; } -static inline void IP6_ECN_clear(struct ipv6hdr *iph) -{ - *(__be32*)iph &= ~htonl(INET_ECN_MASK << 20); -} - static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; -- cgit v1.2.3 From 503c1fb98ba3859c13863957c7c65c92371a9e50 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 29 Sep 2017 14:21:49 +0200 Subject: cfg80211/nl80211: add a port authorized event Add an event that indicates that a connection is authorized (i.e. the 4 way handshake was performed by the driver). This event should be sent by the driver after sending a connect/roamed event. This is useful for networks that require 802.1X authentication. In cases that the driver supports 4 way handshake offload, but the 802.1X authentication is managed by user space, the driver needs to inform user space right after the 802.11 association was completed so user space can initialize its 802.1X state machine etc. However, it is also possible that the AP will choose to skip the 802.1X authentication (e.g. when PMKSA caching is used) and proceed with the 4 way handshake immediately. In this case the driver needs to inform user space that 802.1X authentication is no longer required (e.g. to prevent user space from disconnecting since it did not get any EAPOLs from the AP). This is also useful for roaming, in which case it is possible that the driver used the Fast Transition protocol so 802.1X is not required. Since there will now be a dedicated notification indicating that the connection is authorized, the authorized flag can be removed from the roamed event. Drivers can send the new port authorized event right after sending the roamed event to indicate the new AP is already authorized. This therefore reserves the old PORT_AUTHORIZED attribute. Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cc1996081463..8b8118a7fadb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5428,9 +5428,6 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length - * @authorized: true if the 802.1X authentication was done by the driver or is - * not needed (e.g., when Fast Transition protocol was used), false - * otherwise. Ignored for networks that don't use 802.1X authentication. */ struct cfg80211_roam_info { struct ieee80211_channel *channel; @@ -5440,7 +5437,6 @@ struct cfg80211_roam_info { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; - bool authorized; }; /** @@ -5464,6 +5460,23 @@ struct cfg80211_roam_info { void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp); +/** + * cfg80211_port_authorized - notify cfg80211 of successful security association + * + * @dev: network device + * @bssid: the BSSID of the AP + * @gfp: allocation flags + * + * This function should be called by a driver that supports 4 way handshake + * offload after a security association was successfully established (i.e., + * the 4 way handshake was completed successfully). The call to this function + * should be preceded with a call to cfg80211_connect_result(), + * cfg80211_connect_done(), cfg80211_connect_bss() or cfg80211_roamed() to + * indicate the 802.11 association. + */ +void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, + gfp_t gfp); + /** * cfg80211_disconnected - notify cfg80211 that connection was dropped * -- cgit v1.2.3 From 32f16369e59fcc505c5ed93a6a8cad3d5636b463 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 2 Oct 2017 10:41:15 +0200 Subject: net/dst: Make skb parameter of skb{metadata_dst, tunnel_info}() const Make the skb parameter of skb_metadata_dst() and skb_tunnel_info() const as they are not modified. This is in preparation for using them in call-sites where skb is const. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a803129a4849..9fba2ebf6dda 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -24,7 +24,7 @@ struct metadata_dst { } u; }; -static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +static inline struct metadata_dst *skb_metadata_dst(const struct sk_buff *skb) { struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); @@ -34,7 +34,8 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info * +skb_tunnel_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dst_entry *dst; -- cgit v1.2.3 From f952be79cebd49d04154781d99408867a069d375 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:11 -0300 Subject: sctp: introduce struct sctp_stream_out_ext With the stream schedulers, sctp_stream_out will become too big to be allocated by kmalloc and as we need to allocate with BH disabled, we cannot use __vmalloc in sctp_stream_init(). This patch moves out the stats from sctp_stream_out to sctp_stream_out_ext, which will be allocated only when the application tries to sendmsg something on it. Just the introduction of sctp_stream_out_ext would already fix the issue described above by splitting the allocation in two. Moving the stats to it also reduces the pressure on the allocator as we will ask for less memory atomically when creating the socket and we will use GFP_KERNEL later. Then, for stream schedulers, we will just use sctp_stream_out_ext. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 0477945de1a3..9b2b30b3ba4d 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,6 +84,7 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; +struct sctp_stream_out; #include @@ -380,6 +381,7 @@ struct sctp_sender_hb_info { int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); @@ -1315,11 +1317,15 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_out_ext { + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; +}; + struct sctp_stream_out { __u16 ssn; __u8 state; - __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; - __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct sctp_stream_out_ext *ext; }; struct sctp_stream_in { -- cgit v1.2.3 From 2fc019f790312e703efa1a44204c586112a430dc Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:12 -0300 Subject: sctp: introduce sctp_chunk_stream_no Add a helper to fetch the stream number from a given chunk. Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 9b2b30b3ba4d..c48f7999fe9b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -642,6 +642,11 @@ void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); +static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch) +{ + return ntohs(ch->subh.data_hdr->stream); +} + enum { SCTP_ADDR_NEW, /* new address added to assoc/ep */ SCTP_ADDR_SRC, /* address can be used as source */ -- cgit v1.2.3 From 5bbbbe32a43199c2b9ea5ea66fab6241c64beb51 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:13 -0300 Subject: sctp: introduce stream scheduler foundations This patch introduces the hooks necessary to do stream scheduling, as per RFC Draft ndata. It also introduces the first scheduler, which is what we do today but now factored out: first come first served (FCFS). With stream scheduling now we have to track which chunk was enqueued on which stream and be able to select another other than the in front of the main outqueue. So we introduce a list on sctp_stream_out_ext structure for this purpose. We reuse sctp_chunk->transmitted_list space for the list above, as the chunk cannot belong to the two lists at the same time. By using the union in there, we can have distinct names for these moments. sctp_sched_ops are the operations expected to be implemented by each scheduler. The dequeueing is a bit particular to this implementation but it is to match how we dequeue packets today. We first dequeue and then check if it fits the packet and if not, we requeue it at head. Thus why we don't have a peek operation but have dequeue_done instead, which is called once the chunk can be safely considered as transmitted. The check removed from sctp_outq_flush is now performed by sctp_stream_outq_migrate, which is only called during assoc setup. (sctp_sendmsg() also checks for it) The only operation that is foreseen but not yet added here is a way to signalize that a new packet is starting or that the packet is done, for round robin scheduler per packet, but is intentionally left to the patch that actually implements it. Support for I-DATA chunks, also described in this RFC, with user message interleaving is straightforward as it just requires the schedulers to probe for the feature and ignore datamsg boundaries when dequeueing. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_sched.h | 72 +++++++++++++++++++++++++++++++++++++++++ include/net/sctp/structs.h | 15 +++++++-- 2 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 include/net/sctp/stream_sched.h (limited to 'include/net') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h new file mode 100644 index 000000000000..c676550a4c7d --- /dev/null +++ b/include/net/sctp/stream_sched.h @@ -0,0 +1,72 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * These are definitions used by the stream schedulers, defined in RFC + * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11) + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresses: + * lksctp developers + * + * Written or modified by: + * Marcelo Ricardo Leitner + */ + +#ifndef __sctp_stream_sched_h__ +#define __sctp_stream_sched_h__ + +struct sctp_sched_ops { + /* Property handling for a given stream */ + int (*set)(struct sctp_stream *stream, __u16 sid, __u16 value, + gfp_t gfp); + int (*get)(struct sctp_stream *stream, __u16 sid, __u16 *value); + + /* Init the specific scheduler */ + int (*init)(struct sctp_stream *stream); + /* Init a stream */ + int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); + /* Frees the entire thing */ + void (*free)(struct sctp_stream *stream); + + /* Enqueue a chunk */ + void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); + /* Dequeue a chunk */ + struct sctp_chunk *(*dequeue)(struct sctp_outq *q); + /* Called only if the chunk fit the packet */ + void (*dequeue_done)(struct sctp_outq *q, struct sctp_chunk *chunk); + /* Sched all chunks already enqueued */ + void (*sched_all)(struct sctp_stream *steam); + /* Unched all chunks already enqueued */ + void (*unsched_all)(struct sctp_stream *steam); +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched); +int sctp_sched_get_sched(struct sctp_association *asoc); +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp); +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value); +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); + +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); + +#endif /* __sctp_stream_sched_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index c48f7999fe9b..3c22a30fd71b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -84,7 +84,6 @@ struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; -struct sctp_stream_out; #include @@ -531,8 +530,12 @@ struct sctp_chunk { /* How many times this chunk have been sent, for prsctp RTX policy */ int sent_count; - /* This is our link to the per-transport transmitted list. */ - struct list_head transmitted_list; + union { + /* This is our link to the per-transport transmitted list. */ + struct list_head transmitted_list; + /* List in specific stream outq */ + struct list_head stream_list; + }; /* This field is used by chunks that hold fragmented data. * For the first fragment this is the list that holds the rest of @@ -1019,6 +1022,9 @@ struct sctp_outq { /* Data pending that has never been transmitted. */ struct list_head out_chunk_list; + /* Stream scheduler being used */ + struct sctp_sched_ops *sched; + unsigned int out_qlen; /* Total length of queued data chunks. */ /* Error of send failed, may used in SCTP_SEND_FAILED event. */ @@ -1325,6 +1331,7 @@ struct sctp_inithdr_host { struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; + struct list_head outq; /* chunks enqueued by this stream */ }; struct sctp_stream_out { @@ -1342,6 +1349,8 @@ struct sctp_stream { struct sctp_stream_in *in; __u16 outcnt; __u16 incnt; + /* Current stream being sent, if any */ + struct sctp_stream_out *out_curr; }; #define SCTP_STREAM_CLOSED 0x00 -- cgit v1.2.3 From 637784ade221a3c8a7ecd0f583eddd95d6276b9a Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:16 -0300 Subject: sctp: introduce priority based stream scheduler This patch introduces RFC Draft ndata section 3.4 Priority Based Scheduler (SCTP_SS_PRIO). It works by having a struct sctp_stream_priority for each priority configured. This struct is then enlisted on a queue ordered per priority if, and only if, there is a stream with data queued, so that dequeueing is very straightforward: either finish current datamsg or simply dequeue from the highest priority queued, which is the next stream pointed, and that's it. If there are multiple streams assigned with the same priority and with data queued, it will do round robin amongst them while respecting datamsgs boundaries (when not using idata chunks), to be reasonably fair. We intentionally don't maintain a list of priorities nor a list of all streams with the same priority to save memory. The first would mean at least 2 other pointers per priority (which, for 1000 priorities, that can mean 16kB) and the second would also mean 2 other pointers but per stream. As SCTP supports up to 65535 streams on a given asoc, that's 1MB. This impacts when giving a priority to some stream, as we have to find out if the new priority is already being used and if we can free the old one, and also when tearing down. The new fields in struct sctp_stream_out_ext and sctp_stream are added under a union because that memory is to be shared with other schedulers. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 3c22a30fd71b..40eb8d66a37c 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1328,10 +1328,27 @@ struct sctp_inithdr_host { __u32 initial_tsn; }; +struct sctp_stream_priorities { + /* List of priorities scheduled */ + struct list_head prio_sched; + /* List of streams scheduled */ + struct list_head active; + /* The next stream stream in line */ + struct sctp_stream_out_ext *next; + __u16 prio; +}; + struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; struct list_head outq; /* chunks enqueued by this stream */ + union { + struct { + /* Scheduled streams list */ + struct list_head prio_list; + struct sctp_stream_priorities *prio_head; + }; + }; }; struct sctp_stream_out { @@ -1351,6 +1368,13 @@ struct sctp_stream { __u16 incnt; /* Current stream being sent, if any */ struct sctp_stream_out *out_curr; + union { + /* Fields used by priority scheduler */ + struct { + /* List of priorities scheduled */ + struct list_head prio_list; + }; + }; }; #define SCTP_STREAM_CLOSED 0x00 -- cgit v1.2.3 From ac1ed8b82cd60ba8e7d84103ac1414b8c577c485 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Tue, 3 Oct 2017 19:20:17 -0300 Subject: sctp: introduce round robin stream scheduler This patch introduces RFC Draft ndata section 3.2 Priority Based Scheduler (SCTP_SS_RR). Works by maintaining a list of enqueued streams and tracking the last one used to send data. When the datamsg is done, it switches to the next stream. See-also: https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-13 Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 40eb8d66a37c..16f949eef52f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1348,6 +1348,10 @@ struct sctp_stream_out_ext { struct list_head prio_list; struct sctp_stream_priorities *prio_head; }; + /* Fields used by RR scheduler */ + struct { + struct list_head rr_list; + }; }; }; @@ -1374,6 +1378,13 @@ struct sctp_stream { /* List of priorities scheduled */ struct list_head prio_list; }; + /* Fields used by RR scheduler */ + struct { + /* List of streams scheduled */ + struct list_head rr_list; + /* The next stream stream in line */ + struct sctp_stream_out_ext *rr_next; + }; }; }; -- cgit v1.2.3 From e774d96b7d2c3489bfb5bbdc2b65ed41cd68d3d5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:55:29 +0200 Subject: rtnetlink: remove slave_validate callback no users in the tree. Signed-off-by: Florian Westphal Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 21837ca68ecc..6520993ff449 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -93,9 +93,6 @@ struct rtnl_link_ops { int slave_maxtype; const struct nla_policy *slave_policy; - int (*slave_validate)(struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack); int (*slave_changelink)(struct net_device *dev, struct net_device *slave_dev, struct nlattr *tb[], -- cgit v1.2.3 From 5c45121dc39026ab2139910e57cf933fd57d30f2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Oct 2017 15:58:49 +0200 Subject: rtnetlink: remove __rtnl_af_unregister switch the only caller to rtnl_af_unregister. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 6520993ff449..e3ca8e2e3103 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -151,8 +151,6 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void __rtnl_af_unregister(struct rtnl_af_ops *ops); - void rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); -- cgit v1.2.3 From 33eaf2a6eb48ebf00374aaaf4b1b43f9950dcbe4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 4 Oct 2017 17:48:46 -0700 Subject: net: Add extack to ndo_add_slave Pass extack to do_set_master and down to ndo_add_slave Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/bonding.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index b2e68657a216..2860cc66c2bb 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -596,7 +596,8 @@ void bond_destroy_sysfs(struct bond_net *net); void bond_prepare_sysfs_group(struct bonding *bond); int bond_sysfs_slave_add(struct slave *slave); void bond_sysfs_slave_del(struct slave *slave); -int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev); +int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack); int bond_release(struct net_device *bond_dev, struct net_device *slave_dev); u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb); int bond_set_carrier(struct bonding *bond); -- cgit v1.2.3 From 44f209807ee87a5eddf6c0432f3fb63cec27bad8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:50 -0400 Subject: VSOCK: export socket tables for sock_diag interface The socket table symbols need to be exported from vsock.ko so that the vsock_diag.ko module will be able to traverse sockets. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f9fb566e75cf..30cba806e344 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -27,6 +27,11 @@ #define LAST_RESERVED_PORT 1023 +#define VSOCK_HASH_SIZE 251 +extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +extern spinlock_t vsock_table_lock; + #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) -- cgit v1.2.3 From bf359b8127719535f88494adb3c2b73c06667dcd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:51 -0400 Subject: VSOCK: move __vsock_in_bound/connected_table() to af_vsock.h The vsock_diag.ko module will need to check socket table membership. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 30cba806e344..3dd217718a2f 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -180,6 +180,18 @@ const struct vsock_transport *vsock_core_get_transport(void); /**** UTILS ****/ +/* vsock_table_lock must be held */ +static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->bound_table); +} + +/* vsock_table_lock must be held */ +static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) +{ + return !list_empty(&vsk->connected_table); +} + void vsock_release_pending(struct sock *pending); void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); -- cgit v1.2.3 From 3b4477d2dcf2709d0be89e2a8dced3d0f4a017f2 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Thu, 5 Oct 2017 16:46:52 -0400 Subject: VSOCK: use TCP state constants for sk_state There are two state fields: socket->state and sock->sk_state. The socket->state field uses SS_UNCONNECTED, SS_CONNECTED, etc while the sock->sk_state typically uses values that match TCP state constants (TCP_CLOSE, TCP_ESTABLISHED). AF_VSOCK does not follow this convention and instead uses SS_* constants for both fields. The sk_state field will be exposed to userspace through the vsock_diag interface for ss(8), netstat(8), and other programs. This patch switches sk_state to TCP state constants so that the meaning of this field is consistent with other address families. Not just AF_INET and AF_INET6 use the TCP constants, AF_UNIX and others do too. The following mapping was used to convert the code: SS_FREE -> TCP_CLOSE SS_UNCONNECTED -> TCP_CLOSE SS_CONNECTING -> TCP_SYN_SENT SS_CONNECTED -> TCP_ESTABLISHED SS_DISCONNECTING -> TCP_CLOSING VSOCK_SS_LISTEN -> TCP_LISTEN In __vsock_create() the sk_state initialization was dropped because sock_init_data() already initializes sk_state to TCP_CLOSE. Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- include/net/af_vsock.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 3dd217718a2f..9324ac2d9ff2 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -22,9 +22,6 @@ #include "vsock_addr.h" -/* vsock-specific sock->sk_state constants */ -#define VSOCK_SS_LISTEN 255 - #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 -- cgit v1.2.3 From 27204aaa9dc67b833b77179fdac556288ec3a4bf Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 4 Oct 2017 10:03:44 -0700 Subject: tcp: uniform the set up of sockets after successful connection Currently in the TCP code, the initialization sequence for cached metrics, congestion control, BPF, etc, after successful connection is very inconsistent. This introduces inconsistent bevhavior and is prone to bugs. The current call sequence is as follows: (1) for active case (tcp_finish_connect() case): tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); (2) for passive case (tcp_rcv_state_process() TCP_SYN_RECV case): icsk->icsk_af_ops->rebuild_header(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_mtup_init(sk); tcp_init_buffer_space(sk); tcp_init_metrics(sk); (3) for TFO passive case (tcp_fastopen_create_child()): inet_csk(child)->icsk_af_ops->rebuild_header(child); tcp_init_congestion_control(child); tcp_mtup_init(child); tcp_init_metrics(child); tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tcp_init_buffer_space(child); This commit uniforms the above functions to have the following sequence: tcp_mtup_init(sk); icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE/PASSIVE_ESTABLISHED_CB); tcp_init_congestion_control(sk); tcp_init_buffer_space(sk); This sequence is the same as the (1) active case. We pick this sequence because this order correctly allows BPF to override the settings including congestion control module and initial cwnd, etc from the route, and then allows the CC module to see those settings. Suggested-by: Neal Cardwell Tested-by: Neal Cardwell Signed-off-by: Wei Wang Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 7a3a8af56fd6..426c2e986016 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -416,6 +416,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); +void tcp_init_transfer(struct sock *sk, int bpf_op); unsigned int tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, -- cgit v1.2.3 From e2080072ed2d98a55ae69d95dea60ff7a17cddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Oct 2017 12:59:58 -0700 Subject: tcp: new list for sent but unacked skbs for RACK recovery This patch adds a new queue (list) that tracks the sent but not yet acked or SACKed skbs for a TCP connection. The list is chronologically ordered by skb->skb_mstamp (the head is the oldest sent skb). This list will be used to optimize TCP Rack recovery, which checks an skb's timestamp to judge if it has been lost and needs to be retransmitted. Since TCP write queue is ordered by sequence instead of sent time, RACK has to scan over the write queue to catch all eligible packets to detect lost retransmission, and iterates through SACKed skbs repeatedly. Special cares for rare events: 1. TCP repair fakes skb transmission so the send queue needs adjusted 2. SACK reneging would require re-inserting SACKed skbs into the send queue. For now I believe it's not worth the complexity to make RACK work perfectly on SACK reneging, so we do nothing here. 3. Fast Open: currently for non-TFO, send-queue correctly queues the pure SYN packet. For TFO which queues a pure SYN and then a data packet, send-queue only queues the data packet but not the pure SYN due to the structure of TFO code. This is okay because the SYN receiver would never respond with a SACK on a missing SYN (i.e. SYN is never fast-retransmitted by SACK/RACK). In order to not grow sk_buff, we use an union for the new list and _skb_refdst/destructor fields. This is a bit complicated because we need to make sure _skb_refdst and destructor are properly zeroed before skb is cloned/copied at transmit, and before being freed. Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 426c2e986016..3b16f353b539 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1589,14 +1589,34 @@ enum tcp_chrono { void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); +/* This helper is needed, because skb->tcp_tsorted_anchor uses + * the same memory storage than skb->destructor/_skb_refdst + */ +static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) +{ + skb->destructor = NULL; + skb->_skb_refdst = 0UL; +} + +#define tcp_skb_tsorted_save(skb) { \ + unsigned long _save = skb->_skb_refdst; \ + skb->_skb_refdst = 0UL; + +#define tcp_skb_tsorted_restore(skb) \ + skb->_skb_refdst = _save; \ +} + /* write queue abstraction */ static inline void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); sk_wmem_free_skb(sk, skb); + } + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); } @@ -1711,6 +1731,8 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + list_del(&skb->tcp_tsorted_anchor); + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -- cgit v1.2.3 From 4e64b1ed15e25b8dcc2819c6d43dab72eb0bea26 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 5 Oct 2017 23:46:14 -0700 Subject: net/ipv6: Convert icmpv6_push_pending_frames to void commit cc71b7b07119 ("net/ipv6: remove unused err variable on icmpv6_push_pending_frames") exposed icmpv6_push_pending_frames return value not being used. Remove now unnecessary int err declarations and uses. Miscellanea: o Remove unnecessary goto and out: labels o Realign arguments Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6eac5cf8f1e6..3cda3b521c36 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len); +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); -- cgit v1.2.3 From ac3f09ba3e496bd7cc780ead05b1d1bb5f33aedb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:22 -0700 Subject: tcp: uninline tcp_write_queue_purge() Since the upcoming rtx rbtree will add some extra code, it is time to not inline this fat function anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b16f353b539..744559b72784 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1606,20 +1606,7 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) skb->_skb_refdst = _save; \ } -/* write queue abstraction */ -static inline void tcp_write_queue_purge(struct sock *sk) -{ - struct sk_buff *skb; - - tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - tcp_skb_tsorted_anchor_cleanup(skb); - sk_wmem_free_skb(sk, skb); - } - INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); - tcp_clear_all_retrans_hints(tcp_sk(sk)); -} +void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { -- cgit v1.2.3 From 75c119afe14f74b4dd967d75ed9f57ab6c0ef045 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Oct 2017 22:21:27 -0700 Subject: tcp: implement rb-tree based retransmit queue Using a linear list to store all skbs in write queue has been okay for quite a while : O(N) is not too bad when N < 500. Things get messy when N is the order of 100,000 : Modern TCP stacks want 10Gbit+ of throughput even with 200 ms RTT flows. 40 ns per cache line miss means a full scan can use 4 ms, blowing away CPU caches. SACK processing often can use various hints to avoid parsing whole retransmit queue. But with high packet losses and/or high reordering, hints no longer work. Sender has to process thousands of unfriendly SACK, accumulating a huge socket backlog, burning a cpu and massively dropping packets. Using an rb-tree for retransmit queue has been avoided for years because it added complexity and overhead, but now is the time to be more resistant and say no to quadratic behavior. 1) RTX queue is no longer part of the write queue : already sent skbs are stored in one rb-tree. 2) Since reaching the head of write queue no longer needs sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue Tested: On receiver : netem on ingress : delay 150ms 200us loss 1 GRO disabled to force stress and SACK storms. for f in `seq 1 10` do ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1 done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}' Before patch : 323.87 351.48 339.59 338.62 306.72 204.07 304.93 291.88 202.47 176.88 2840 After patch: 1700.83 2207.98 2070.17 1544.26 2114.76 2124.89 1693.14 1080.91 2216.82 1299.94 18053 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 7 +++-- include/net/tcp.h | 89 ++++++++++++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 44 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..4827094f1db4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -60,7 +60,7 @@ #include #include #include - +#include #include #include #include @@ -397,7 +397,10 @@ struct sock { int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; - struct sk_buff *sk_send_head; + union { + struct sk_buff *sk_send_head; + struct rb_root tcp_rtx_queue; + }; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; diff --git a/include/net/tcp.h b/include/net/tcp.h index 744559b72784..5a95e5886b55 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); -int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); +enum tcp_queue { + TCP_FRAG_IN_WRITE_QUEUE, + TCP_FRAG_IN_RTX_QUEUE, +}; +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, + unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); @@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) void tcp_write_queue_purge(struct sock *sk); +static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) +{ + return skb_rb_first(&sk->tcp_rtx_queue); +} + static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); @@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, return skb_queue_prev(&sk->sk_write_queue, skb); } -#define tcp_for_write_queue(skb, sk) \ - skb_queue_walk(&(sk)->sk_write_queue, skb) - -#define tcp_for_write_queue_from(skb, sk) \ - skb_queue_walk_from(&(sk)->sk_write_queue, skb) - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { - return sk->sk_send_head; + return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, @@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } -static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) +static inline bool tcp_write_queue_empty(const struct sock *sk) { - if (tcp_skb_is_last(sk, skb)) - sk->sk_send_head = NULL; - else - sk->sk_send_head = tcp_write_queue_next(sk, skb); + return skb_queue_empty(&sk->sk_write_queue); +} + +static inline bool tcp_rtx_queue_empty(const struct sock *sk) +{ + return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); +} + +static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) +{ + return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) { - if (sk->sk_send_head == skb_unlinked) { - sk->sk_send_head = NULL; + if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - } + if (tcp_sk(sk)->highest_sack == skb_unlinked) tcp_sk(sk)->highest_sack = NULL; } -static inline void tcp_init_send_head(struct sock *sk) -{ - sk->sk_send_head = NULL; -} - static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); @@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_send_head == NULL) { - sk->sk_send_head = skb; + if (sk->sk_write_queue.next == skb) { tcp_chrono_start(sk, TCP_CHRONO_BUSY); if (tcp_sk(sk)->highest_sack == NULL) @@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s __skb_queue_head(&sk->sk_write_queue, skb); } -/* Insert buff after skb on the write queue of sk. */ -static inline void tcp_insert_write_queue_after(struct sk_buff *skb, - struct sk_buff *buff, - struct sock *sk) -{ - __skb_queue_after(&sk->sk_write_queue, skb, buff); -} - /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); - - if (sk->sk_send_head == skb) - sk->sk_send_head = new; } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { - list_del(&skb->tcp_tsorted_anchor); - tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -static inline bool tcp_write_queue_empty(struct sock *sk) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); + +static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + tcp_skb_tsorted_anchor_cleanup(skb); + rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); +} + +static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) +{ + list_del(&skb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) @@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL : - tcp_write_queue_next(sk, skb); + struct sk_buff *next = skb_rb_next(skb); + + tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk); + struct sk_buff *skb = tcp_rtx_queue_head(sk); + + tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); } /* Called when old skb is about to be deleted (to be combined with new skb) */ @@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { - const struct sk_buff *skb = tcp_write_queue_head(sk); + const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); -- cgit v1.2.3 From 180ca444b985c42948fa26abd278e616b5ce7eb2 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:56 -0700 Subject: ipv6: introduce a new function fib6_update_sernum() This function takes a route as input and tries to update the sernum in the fib6_node this route is associated with. It will be used in later commit when adding a cached route into the exception table under that route. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d060d711a624..152b7b14a5a5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -358,6 +358,8 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); +void fib6_update_sernum(struct rt6_info *rt); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); -- cgit v1.2.3 From 35732d01fe311ec13c4e42936878b782b8e7ea85 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:05:57 -0700 Subject: ipv6: introduce a hash table to store dst cache Add a hash table into struct rt6_info in order to store dst caches created by pmtu discovery and ip redirect in ipv6 routing code. APIs to add dst cache, delete dst cache, find dst cache and update dst cache in the hash table are implemented and will be used in later commits. This is a preparation work to move all cache routes into the exception table instead of getting inserted into the fib6 tree. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 19 +++++++++++++++++++ include/net/ip6_route.h | 3 +++ 2 files changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 152b7b14a5a5..c4864c1e8f13 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -98,6 +98,22 @@ struct rt6key { struct fib6_table; +struct rt6_exception_bucket { + struct hlist_head chain; + int depth; +}; + +struct rt6_exception { + struct hlist_node hlist; + struct rt6_info *rt6i; + unsigned long stamp; + struct rcu_head rcu; +}; + +#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 +#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) +#define FIB6_MAX_DEPTH 5 + struct rt6_info { struct dst_entry dst; @@ -134,12 +150,15 @@ struct rt6_info { struct inet6_dev *rt6i_idev; struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 exception_bucket_flushed:1, + unused:7; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ee96f402cb75..3315605f34c9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -95,6 +95,9 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); +void rt6_flush_exceptions(struct rt6_info *rt); +int rt6_remove_exception_rt(struct rt6_info *rt); + static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, unsigned int prefs, -- cgit v1.2.3 From c757faa8bfa26a0dd24b41ff783e0da042156887 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:01 -0700 Subject: ipv6: prepare fib6_age() for exception table If all dst cache entries are stored in the exception table under the main route, we have to go through them during fib6_age() when doing garbage collecting. Introduce a new function rt6_age_exception() which goes through all dst entries in the exception table and remove those entries that are expired. This function is called in fib6_age() so that all dst caches are also garbage collected. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 +++++++++++++ include/net/ip6_route.h | 2 ++ 2 files changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c4864c1e8f13..11a79ef87a28 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -29,6 +29,14 @@ #define FIB6_TABLE_HASHSZ 1 #endif +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) pr_debug(x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif + struct rt6_info; struct fib6_config { @@ -75,6 +83,11 @@ struct fib6_node { struct rcu_head rcu; }; +struct fib6_gc_args { + int timeout; + int more; +}; + #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 3315605f34c9..a0087fb9864b 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -97,6 +97,8 @@ int ip6_del_rt(struct rt6_info *); void rt6_flush_exceptions(struct rt6_info *rt); int rt6_remove_exception_rt(struct rt6_info *rt); +void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, -- cgit v1.2.3 From 38fbeeeeccdb38d0635398e8e344d245f6d8dc52 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:02 -0700 Subject: ipv6: prepare fib6_locate() for exception table fib6_locate() is used to find the fib6_node according to the passed in prefix address key. It currently tries to find the fib6_node with the exact match of the passed in key. However, when we move cached routes into the exception table, fib6_locate() will fail to find the fib6_node for it as the cached routes will be stored in the exception table under the fib6_node with the longest prefix match of the cache's dst addr key. This commit adds a new parameter to let the caller specify if it needs exact match or longest prefix match. Right now, all callers still does exact match when calling fib6_locate(). It will be changed in later commit where exception table is hooked up to store cached routes. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 11a79ef87a28..4497a1eb4d41 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -357,7 +357,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len); + const struct in6_addr *saddr, int src_len, + bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); -- cgit v1.2.3 From 2b760fcf5cfb34e8610df56d83745b2b74ae1379 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:03 -0700 Subject: ipv6: hook up exception table to store dst cache This commit makes use of the exception hash table implementation to store dst caches created by pmtu discovery and ip redirect into the hash table under the rt_info and no longer inserts these routes into fib6 tree. This makes the fib6 tree only contain static configured routes and could now be protected by rcu instead of a rw lock. With this change, in the route lookup related functions, after finding the rt6_info with the longest prefix, we also need to search for the exception table before doing backtracking. In the route delete function, if the route being deleted is not a dst cache, deletion of this route also need to flush the whole hash table under it. If it is a dst cache, then only delete the cached dst in the hash table. Note: for fib6_walk_continue() function, w->root now is always pointing to a root node considering that fib6_prune_clones() is removed from the code. So we add a WARN_ON() msg to make sure w->root always points to a root node and also removed the update of w->root in fib6_repair_tree(). This is a prerequisite for later patch because we don't need to make w->root as rcu protected when replacing rwlock with RCU. Also, we remove all prune related variables as it is no longer used. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 4497a1eb4d41..d0b7283073e3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -280,7 +280,6 @@ struct fib6_walker { struct fib6_node *root, *node; struct rt6_info *leaf; enum fib6_walk_state state; - bool prune; unsigned int skip; unsigned int count; int (*func)(struct fib6_walker *); -- cgit v1.2.3 From bbd63f06d114a52be33f6982fc89ca2768cdeb62 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:07 -0700 Subject: ipv6: update fn_sernum after route is inserted to tree fib6_add() logic currently calls fib6_add_1() to figure out what node should be used for the newly added route and then call fib6_add_rt2node() to insert the route to the node. And during the call of fib6_add_1(), fn_sernum is updated for all nodes that share the same prefix as the new route. This does not have issue in the current code because reader thread will not be able to access the tree while writer thread is inserting new route to it. However, it is not the case once we transition to use RCU. Reader thread could potentially see the new fn_sernum before the new route is inserted. As a result, reader thread's route lookup will return a stale route with the new fn_sernum. In order to solve this issue, we remove all the update of fn_sernum in fib6_add_1(), and instead, introduce a new function that updates fn_sernum for all related nodes and call this functions once the route is successfully inserted to the tree. Also, smp_wmb() is used after a route is successfully inserted into the fib tree and right before the updated of fn->sernum. And smp_rmb() is used right after fn->sernum is accessed in rt6_get_cookie_safe(). This is to guarantee that when the reader thread sees the new fn->sernum, the new route is already inserted in the tree in memory. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d0b7283073e3..6bf929b50951 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -220,6 +220,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, if (fn) { *cookie = fn->fn_sernum; + /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + smp_rmb(); status = true; } -- cgit v1.2.3 From 66f5d6ce53e665477d2a33e8f539d4fa4ca81c83 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:10 -0700 Subject: ipv6: replace rwlock with rcu and spinlock in fib6_table With all the preparation work before, we are now ready to replace rwlock with rcu and spinlock in fib6_table. That means now all fib6_node in fib6_table are protected by rcu. And when freeing fib6_node, call_rcu() is used to wait for the rcu grace period before releasing the memory. When accessing fib6_node, corresponding rcu APIs need to be used. And all previous sessions protected by the write lock will now be protected by the spin lock per table. All previous sessions protected by read lock will now be protected by rcu_read_lock(). A couple of things to note here: 1. As part of the work of replacing rwlock with rcu, the linked list of fn->leaf now has to be rcu protected as well. So both fn->leaf and rt->dst.rt6_next are now __rcu tagged and corresponding rcu APIs are used when manipulating them. 2. For fn->rr_ptr, first of all, it also needs to be rcu protected now and is tagged with __rcu and rcu APIs are used in corresponding places. Secondly, fn->rr_ptr is changed in rt6_select() which is a reader thread. This makes the issue a bit complicated. We think a valid solution for it is to let rt6_select() grab the tb6_lock if it decides to change it. As it is not in the normal operation and only happens when there is no valid neighbor cache for the route, we think the performance impact should be low. 3. fib6_walk_continue() has to be called with tb6_lock held even in the route dumping related functions, e.g. inet6_dump_fib(), fib6_tables_dump() and ipv6_route_seq_ops. It is because fib6_walk_continue() makes modifications to the walker structure, and so are fib6_repair_tree() and fib6_del_route(). In order to do proper syncing between them, we need to let fib6_walk_continue() hold the lock. We may be able to do further improvement on the way we do the tree walk to get rid of the need for holding the spin lock. But not for now. 4. When fib6_del_route() removes a route from the tree, we no longer mark rt->dst.rt6_next to NULL to make simultaneous reader be able to further traverse the list with rcu. However, rt->dst.rt6_next is only valid within this same rcu period. No one should access it later. 5. All the operation of atomic_inc(rt->rt6i_ref) is changed to be performed before we publish this route (either by linking it to fn->leaf or insert it in the list pointed by fn->leaf) just to be safe because as soon as we publish the route, some read thread will be able to access it. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- include/net/ip6_fib.h | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 06a6765da074..204c19e25456 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,7 @@ struct dst_entry { union { struct dst_entry *next; struct rtable __rcu *rt_next; - struct rt6_info *rt6_next; + struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6bf929b50951..0b438b9bcb10 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -68,18 +68,18 @@ struct fib6_config { }; struct fib6_node { - struct fib6_node *parent; - struct fib6_node *left; - struct fib6_node *right; + struct fib6_node __rcu *parent; + struct fib6_node __rcu *left; + struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES - struct fib6_node *subtree; + struct fib6_node __rcu *subtree; #endif - struct rt6_info *leaf; + struct rt6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info *rr_ptr; + struct rt6_info __rcu *rr_ptr; struct rcu_head rcu; }; @@ -91,7 +91,7 @@ struct fib6_gc_args { #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else -#define FIB6_SUBTREE(fn) ((fn)->subtree) +#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif struct mx6_config { @@ -174,6 +174,14 @@ struct rt6_info { unused:7; }; +#define for_each_fib6_node_rt_rcu(fn) \ + for (rt = rcu_dereference((fn)->leaf); rt; \ + rt = rcu_dereference(rt->dst.rt6_next)) + +#define for_each_fib6_walker_rt(w) \ + for (rt = (w)->leaf; rt; \ + rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; @@ -310,7 +318,7 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; -- cgit v1.2.3 From 81eb8447daae3b62247aa66bb17b82f8fef68249 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 6 Oct 2017 12:06:11 -0700 Subject: ipv6: take care of rt6_stats Currently, most of the rt6_stats are not hooked up correctly. As the last part of this patch series, hook up all existing rt6_stats and add one new stat fib_rt_uncache to indicate the number of routes in the uncached list. For details of the stats, please refer to the comments added in include/net/ip6_fib.h. Note: fib_rt_alloc and fib_rt_uncache are not guaranteed to be modified under a lock. So atomic_t is used for them. Signed-off-by: Wei Wang Signed-off-by: Martin KaFai Lau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0b438b9bcb10..10c913816032 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -297,12 +297,15 @@ struct fib6_walker { }; struct rt6_statistics { - __u32 fib_nodes; - __u32 fib_route_nodes; - __u32 fib_rt_alloc; /* permanent routes */ - __u32 fib_rt_entries; /* rt entries in table */ - __u32 fib_rt_cache; /* cache routes */ - __u32 fib_discarded_routes; + __u32 fib_nodes; /* all fib6 nodes */ + __u32 fib_route_nodes; /* intermediate nodes */ + __u32 fib_rt_entries; /* rt entries in fib table */ + __u32 fib_rt_cache; /* cached rt entries in exception table */ + __u32 fib_discarded_routes; /* total number of routes delete */ + + /* The following stats are not protected by any lock */ + atomic_t fib_rt_alloc; /* total number of routes alloced */ + atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 -- cgit v1.2.3 From 548ec114705bb8f0879a0da12abec17f17a7cc26 Mon Sep 17 00:00:00 2001 From: Lin Zhang Date: Fri, 6 Oct 2017 01:40:35 +0800 Subject: net: phonet: mark phonet_protocol as const The phonet_protocol structs don't need to be written by anyone and so can be marked as const. Signed-off-by: Lin Zhang Signed-off-by: David S. Miller --- include/net/phonet/phonet.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 039cc29cb4a8..51e1a2a45d02 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -108,8 +108,10 @@ struct phonet_protocol { int sock_type; }; -int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp); +int phonet_proto_register(unsigned int protocol, + const struct phonet_protocol *pp); +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); -- cgit v1.2.3 From 77041420751fe6d4acf2103b245dcc2b4b7b8360 Mon Sep 17 00:00:00 2001 From: Yotam Gigi Date: Mon, 9 Oct 2017 11:15:31 +0200 Subject: net: bridge: Notify on bridge device mrouter state changes Add the SWITCHDEV_ATTR_ID_BRIDGE_MROUTER switchdev notification type, used to indicate whether the bridge is or isn't mrouter. Notify when the bridge changes its state, similarly to the already existing bridged port mrouter notifications. The notification uses the switchdev_attr.u.mrouter boolean flag to indicate the current bridge mrouter status. Thus, it only indicates whether the bridge is currently used as an mrouter or not, and does not indicate the exact mrouter state of the bridge (learning, permanent, etc.). Signed-off-by: Yotam Gigi Signed-off-by: Jiri Pirko Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d767b7991887..d756fbe46625 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -51,6 +51,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, + SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, }; struct switchdev_attr { -- cgit v1.2.3 From d66f2b91f95b56e31772b9faa0d036cd2e53cb02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Oct 2017 10:30:14 -0700 Subject: bpf: don't rely on the verifier lock for metadata_dst allocation bpf_skb_set_tunnel_*() functions require allocation of per-cpu metadata_dst. The allocation happens upon verification of the first program using those helpers. In preparation for removing the verifier lock, use cmpxchg() to make sure we only allocate the metadata_dsts once. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 9fba2ebf6dda..87a0bb8d449f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -87,6 +87,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); -- cgit v1.2.3 From 8c418b5b15747eda05d086e80fa0a767982fbf37 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 6 Oct 2017 11:53:32 +0200 Subject: fq: support filtering a given tin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add to the FQ API a way to filter a given tin, in order to remove frames that fulfil certain criteria according to a filter function. This will be used by mac80211 to remove frames belonging to an AP VLAN interface that's being removed. Signed-off-by: Johannes Berg Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq.h | 7 +++++ include/net/fq_impl.h | 72 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/fq.h b/include/net/fq.h index 6d8521a30c5c..ac944a686840 100644 --- a/include/net/fq.h +++ b/include/net/fq.h @@ -90,6 +90,13 @@ typedef void fq_skb_free_t(struct fq *, struct fq_flow *, struct sk_buff *); +/* Return %true to filter (drop) the frame. */ +typedef bool fq_skb_filter_t(struct fq *, + struct fq_tin *, + struct fq_flow *, + struct sk_buff *, + void *); + typedef struct fq_flow *fq_flow_get_default_t(struct fq *, struct fq_tin *, int idx, diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 4e6131cd3f43..8b237e4afee6 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -12,24 +12,22 @@ /* functions that are embedded into includer */ -static struct sk_buff *fq_flow_dequeue(struct fq *fq, - struct fq_flow *flow) +static void fq_adjust_removal(struct fq *fq, + struct fq_flow *flow, + struct sk_buff *skb) { struct fq_tin *tin = flow->tin; - struct fq_flow *i; - struct sk_buff *skb; - - lockdep_assert_held(&fq->lock); - - skb = __skb_dequeue(&flow->queue); - if (!skb) - return NULL; tin->backlog_bytes -= skb->len; tin->backlog_packets--; flow->backlog -= skb->len; fq->backlog--; fq->memory_usage -= skb->truesize; +} + +static void fq_rejigger_backlog(struct fq *fq, struct fq_flow *flow) +{ + struct fq_flow *i; if (flow->backlog == 0) { list_del_init(&flow->backlogchain); @@ -43,6 +41,21 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq, list_move_tail(&flow->backlogchain, &i->backlogchain); } +} + +static struct sk_buff *fq_flow_dequeue(struct fq *fq, + struct fq_flow *flow) +{ + struct sk_buff *skb; + + lockdep_assert_held(&fq->lock); + + skb = __skb_dequeue(&flow->queue); + if (!skb) + return NULL; + + fq_adjust_removal(fq, flow, skb); + fq_rejigger_backlog(fq, flow); return skb; } @@ -188,6 +201,45 @@ static void fq_tin_enqueue(struct fq *fq, } } +static void fq_flow_filter(struct fq *fq, + struct fq_flow *flow, + fq_skb_filter_t filter_func, + void *filter_data, + fq_skb_free_t free_func) +{ + struct fq_tin *tin = flow->tin; + struct sk_buff *skb, *tmp; + + lockdep_assert_held(&fq->lock); + + skb_queue_walk_safe(&flow->queue, skb, tmp) { + if (!filter_func(fq, tin, flow, skb, filter_data)) + continue; + + __skb_unlink(skb, &flow->queue); + fq_adjust_removal(fq, flow, skb); + free_func(fq, tin, flow, skb); + } + + fq_rejigger_backlog(fq, flow); +} + +static void fq_tin_filter(struct fq *fq, + struct fq_tin *tin, + fq_skb_filter_t filter_func, + void *filter_data, + fq_skb_free_t free_func) +{ + struct fq_flow *flow; + + lockdep_assert_held(&fq->lock); + + list_for_each_entry(flow, &tin->new_flows, flowchain) + fq_flow_filter(fq, flow, filter_func, filter_data, free_func); + list_for_each_entry(flow, &tin->old_flows, flowchain) + fq_flow_filter(fq, flow, filter_func, filter_data, free_func); +} + static void fq_flow_reset(struct fq *fq, struct fq_flow *flow, fq_skb_free_t free_func) -- cgit v1.2.3 From 4a269818a7eb8577d32d8b2879099c689ddbd856 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Oct 2017 13:27:29 -0700 Subject: tcp: fix tcp_unlink_write_queue() Yury reported crash with this signature : [ 554.034021] [] 0xffff80003ccd5a58 [ 554.034156] [] skb_release_all+0x14/0x30 [ 554.034288] [] __kfree_skb+0x14/0x28 [ 554.034409] [] tcp_sendmsg_locked+0x4dc/0xcc8 [ 554.034541] [] tcp_sendmsg+0x34/0x58 [ 554.034659] [] inet_sendmsg+0x2c/0xf8 [ 554.034783] [] sock_sendmsg+0x18/0x30 [ 554.034928] [] SyS_sendto+0x84/0xf8 Problem is that skb->destructor contains garbage, and this is because I accidentally removed tcp_skb_tsorted_anchor_cleanup() from tcp_unlink_write_queue() This would trigger with a write(fd, , len) attempt, and we will add to packetdrill this capability to avoid future regressions. Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") Reported-by: Yury Norov Tested-by: Yury Norov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a95e5886b55..15163454174b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1712,6 +1712,7 @@ static inline void tcp_insert_write_queue_before(struct sk_buff *new, static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { + tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -- cgit v1.2.3 From 843e79d05addd8eb06992cd6dfafc7b9d53f2bc8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:07 +0200 Subject: net: sched: make tc_action_ops->get_dev return dev and avoid passing net Return dev directly, NULL if not possible. That is enough. Makes no sense to pass struct net * to get_dev op, as there is only one net possible, the one the action was created in. So just store it in mirred priv and use directly. Rename the mirred op callback function. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 3 +-- include/net/tc_act/tc_mirred.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index b944e0eb93be..900168a9901e 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -93,8 +93,7 @@ struct tc_action_ops { int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); - int (*get_dev)(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev); + struct net_device *(*get_dev)(const struct tc_action *a); }; struct tc_action_net { diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 604bc31e23ab..21a656569840 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -10,6 +10,7 @@ struct tcf_mirred { int tcfm_ifindex; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; + struct net *net; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) -- cgit v1.2.3 From b3f55bdda8df55a563005e00b1b71212d8546541 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:08 +0200 Subject: net: sched: introduce per-egress action device callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule and specified device acts as tc action egress device. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 34 ++++++++++++++++++++++++++++++++++ include/net/pkt_cls.h | 2 ++ 2 files changed, 36 insertions(+) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 900168a9901e..f5e8c9048fb0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -174,4 +174,38 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, #endif } +typedef int tc_setup_cb_t(enum tc_setup_type type, + void *type_data, void *cb_priv); + +#ifdef CONFIG_NET_CLS_ACT +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop); +#else +static inline +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ +} + +static inline +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + return 0; +} +#endif + #endif diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e80edd8879ef..6f8149c82571 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -206,6 +206,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, struct net_device **hw_dev); +int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); /** * struct tcf_pkt_info - packet information -- cgit v1.2.3 From 717503b9cf57c0bb7ea4d3a9f5699c9a04adf988 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:09 +0200 Subject: net: sched: convert cls_flower->egress_dev users to tc_setup_cb_egdev infra The only user of cls_flower->egress_dev is mlx5. So do the conversion there alongside with the code originating the call in cls_flower function fl_hw_replace_filter to the newly introduced egress device callback infrastucture. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6f8149c82571..c0bdf5cad727 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -206,8 +206,6 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, struct net_device **hw_dev); -int tcf_exts_egdev_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop); /** * struct tcf_pkt_info - packet information @@ -407,6 +405,9 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; -- cgit v1.2.3 From 7578d7b45ed870b13a8ace57e32feaed623c2a94 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 11 Oct 2017 09:41:10 +0200 Subject: net: sched: remove unused tcf_exts_get_dev helper and cls_flower->egress_dev The helper and the struct field ares no longer used by any code, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c0bdf5cad727..f5263743076b 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -204,8 +204,6 @@ void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev); /** * struct tcf_pkt_info - packet information @@ -517,7 +515,6 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; - bool egress_dev; }; enum tc_matchall_command { -- cgit v1.2.3 From 437d2762ba07f0fc639d5a09acb323fe4106a61f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Oct 2017 20:45:40 -0700 Subject: tcp: remove obsolete helpers Remove three inline helpers that are no longer needed. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 15163454174b..3b3b9b968e2d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1629,18 +1629,6 @@ static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) return skb_peek_tail(&sk->sk_write_queue); } -static inline struct sk_buff *tcp_write_queue_next(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_next(&sk->sk_write_queue, skb); -} - -static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_prev(&sk->sk_write_queue, skb); -} - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) @@ -1697,11 +1685,6 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb } } -static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb) -{ - __skb_queue_head(&sk->sk_write_queue, skb); -} - /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, -- cgit v1.2.3 From 60724d4bae14cd295b27b1610cad9a2720eb0860 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:48 -0700 Subject: net: dsa: Add support for DSA specific notifiers In preparation for communicating a given DSA network device's port number and switch index, create a specialized DSA notifier and two events: DSA_PORT_REGISTER and DSA_PORT_UNREGISTER that communicate: the slave network device (slave_dev), port number and switch number in the tree. This will be later used for network device drivers like bcmsysport which needs to cooperate with its DSA network devices to set-up queue mapping and scheduling. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 10dceccd9ce8..40a709a0754d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -471,4 +471,49 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) } #endif /* CONFIG_PM_SLEEP */ +enum dsa_notifier_type { + DSA_PORT_REGISTER, + DSA_PORT_UNREGISTER, +}; + +struct dsa_notifier_info { + struct net_device *dev; +}; + +struct dsa_notifier_register_info { + struct dsa_notifier_info info; /* must be first */ + struct net_device *master; + unsigned int port_number; + unsigned int switch_number; +}; + +static inline struct net_device * +dsa_notifier_info_to_dev(const struct dsa_notifier_info *info) +{ + return info->dev; +} + +#if IS_ENABLED(CONFIG_NET_DSA) +int register_dsa_notifier(struct notifier_block *nb); +int unregister_dsa_notifier(struct notifier_block *nb); +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info); +#else +static inline int register_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int unregister_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + return NOTIFY_DONE; +} +#endif + #endif -- cgit v1.2.3 From 0a5f14ce67a6e093e651d3cd75e6ac281123d93a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 11 Oct 2017 10:57:49 -0700 Subject: net: dsa: tag_brcm: Indicate to master netdevice port + queue We need to tell the DSA master network device doing the actual transmission what the desired switch port and queue number is for it to resolve that to the internal transmit queue it is mapped to. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 40a709a0754d..ce1d622734d7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -516,4 +516,9 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, } #endif +/* Broadcom tag specific helpers to insert and extract queue/port number */ +#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) +#define BRCM_TAG_GET_PORT(v) ((v) >> 8) +#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + #endif -- cgit v1.2.3 From 8f04748016f3b583e675e0f649d42cfc10812a8b Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 11 Oct 2017 10:50:29 -0400 Subject: net sched actions: change IFE modules alias names Make style of module alias name consistent with other subsystems in kernel, for example net devices. Fixes: 084e2f6566d2 ("Support to encoding decoding skb mark on IFE action") Fixes: 200e10f46936 ("Support to encoding decoding skb prio on IFE action") Fixes: 408fbc22ef1e ("net sched ife action: Introduce skb tcindex metadata encap decap") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 30ba459ddd34..104578f16062 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -40,7 +40,7 @@ struct tcf_meta_ops { struct module *owner; }; -#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ifemeta" __stringify_1(metan)) +#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ife-meta-" metan) int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi); -- cgit v1.2.3 From aa9fd9a325d51fa0b11153b03b8fefff569fa955 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 11 Oct 2017 17:16:08 -0400 Subject: sched: act: ife: update parameters via rcu handling This patch changes the parameter updating via RCU and not protected by a spinlock anymore. This reduce the time that the spinlock is being held. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 104578f16062..c7fb99c3f76c 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -6,12 +6,18 @@ #include #include -struct tcf_ife_info { - struct tc_action common; +struct tcf_ife_params { u8 eth_dst[ETH_ALEN]; u8 eth_src[ETH_ALEN]; u16 eth_type; u16 flags; + + struct rcu_head rcu; +}; + +struct tcf_ife_info { + struct tc_action common; + struct tcf_ife_params __rcu *params; /* list of metaids allowed */ struct list_head metalist; }; -- cgit v1.2.3 From 4e8b86c062695454df0b76f3fee4fab8dc4bb716 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Thu, 7 Sep 2017 04:00:06 -0700 Subject: mqprio: Introduce new hardware offload mode and shaper in mqprio The offload types currently supported in mqprio are 0 (no offload) and 1 (offload only TCs) by setting these values for the 'hw' option. If offloads are supported by setting the 'hw' option to 1, the default offload mode is 'dcb' where only the TC values are offloaded to the device. This patch introduces a new hardware offload mode called 'channel' with 'hw' set to 1 in mqprio which makes full use of the mqprio options, the TCs, the queue configurations and the QoS parameters for the TCs. This is achieved through a new netlink attribute for the 'mode' option which takes values such as 'dcb' (default) and 'channel'. The 'channel' mode also supports QoS attributes for traffic class such as minimum and maximum values for bandwidth rate limits. This patch enables configuring additional HW shaper attributes associated with a traffic class. Currently the shaper for bandwidth rate limiting is supported which takes options such as minimum and maximum bandwidth rates and are offloaded to the hardware in the 'channel' mode. The min and max limits for bandwidth rates are provided by the user along with the TCs and the queue configurations when creating the mqprio qdisc. The interface can be extended to support new HW shapers in future through the 'shaper' attribute. Introduces a new data structure 'tc_mqprio_qopt_offload' for offloading mqprio queue options and use this to be shared between the kernel and device driver. This contains a copy of the existing data structure for mqprio queue options. This new data structure can be extended when adding new attributes for traffic class such as mode, shaper, shaper parameters (bandwidth rate limits). The existing data structure for mqprio queue options will be shared between the kernel and userspace. Example: queues 4@0 4@4 hw 1 mode channel shaper bw_rlimit\ min_rate 1Gbit 2Gbit max_rate 4Gbit 5Gbit To dump the bandwidth rates: qdisc mqprio 804a: root tc 2 map 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 queues:(0:3) (4:7) mode:channel shaper:bw_rlimit min_rate:1Gbit 2Gbit max_rate:4Gbit 5Gbit Signed-off-by: Amritha Nambiar Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/net/pkt_cls.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f5263743076b..60d39789e4f0 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -546,6 +546,15 @@ struct tc_cls_bpf_offload { u32 gen_flags; }; +struct tc_mqprio_qopt_offload { + /* struct tc_mqprio_qopt must always be the first element */ + struct tc_mqprio_qopt qopt; + u16 mode; + u16 shaper; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; +}; /* This structure holds cookie structure that is passed from user * to the kernel for actions and classifiers -- cgit v1.2.3 From 841f4f24053acad69240c6ab7427a1d24bc29491 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 13 Oct 2017 14:18:09 -0400 Subject: net: dsa: remove .set_addr Now that there is no user for the .set_addr function, remove it from DSA. If a switch supports this feature (like mv88e6xxx), the implementation can be done in the driver setup. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ce1d622734d7..2746741f74cf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -291,7 +291,6 @@ struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); int (*setup)(struct dsa_switch *ds); - int (*set_addr)(struct dsa_switch *ds, u8 *addr); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* -- cgit v1.2.3 From 69d78ef25c7b0058674145500efb12255738ba8a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:57 +0200 Subject: net: sched: store Qdisc pointer in struct block Prepare for removal of tp->q and store Qdisc pointer in the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 ++-- include/net/sch_generic.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 60d39789e4f0..e6c9e1e4d711 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -22,7 +22,7 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain); + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); void tcf_block_put(struct tcf_block *block); int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -30,7 +30,7 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, #else static inline int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { return 0; } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 684d8ed27eaa..df4032ca1b7f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -270,6 +270,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct Qdisc *q; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.2.3 From 855319becbcffec6988a4e781a861b69a71c5b58 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:58 +0200 Subject: net: sched: store net pointer in block and introduce qdisc_net helper Store net pointer in the block structure. Along the way, introduce qdisc_net helper which allows to easily obtain net pointer for qdisc instance. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 7 +++++++ include/net/sch_generic.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 259bc191ba59..2d234af15f3e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #define DEFAULT_TX_QUEUE_LEN 1000 @@ -146,4 +148,9 @@ static inline bool is_classid_clsact_egress(u32 classid) TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); } +static inline struct net *qdisc_net(struct Qdisc *q) +{ + return dev_net(q->dev_queue->dev); +} + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index df4032ca1b7f..9b2cb91dc0d9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -270,6 +270,7 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + struct net *net; struct Qdisc *q; }; -- cgit v1.2.3 From 44186460c85a0121562db7cfef132d63c869118f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:00:59 +0200 Subject: net: sched: introduce tcf_block_q and tcf_block_dev helpers These helpers allows to get a q and netdev pointers for given block easily. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e6c9e1e4d711..7bed674ba29a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -24,6 +24,17 @@ void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); void tcf_block_put(struct tcf_block *block); + +static inline struct Qdisc *tcf_block_q(struct tcf_block *block) +{ + return block->q; +} + +static inline struct net_device *tcf_block_dev(struct tcf_block *block) +{ + return tcf_block_q(block)->dev_queue->dev; +} + int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -39,6 +50,16 @@ static inline void tcf_block_put(struct tcf_block *block) { } +static inline struct Qdisc *tcf_block_q(struct tcf_block *block) +{ + return NULL; +} + +static inline struct net_device *tcf_block_dev(struct tcf_block *block) +{ + return NULL; +} + static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { -- cgit v1.2.3 From 34e3759cf86a3e75463e34c1bb9691777406a175 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:00 +0200 Subject: net: sched: teach tcf_bind/unbind_filter to use block->q Whenever the block->q is set, it can be used instead of tp->q as it contains the same value. When it is not set, which can't happen now but it might happen with the follow-up shared blocks introduction, the class is not set in the result. That would lead to a class lookup instead of direct class pointer use for classful qdiscs. However, it is not planned to support classful qdisqs sharing filter blocks, so that may never happen. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 7bed674ba29a..49a143e0fe65 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -74,36 +74,43 @@ __cls_set_class(unsigned long *clp, unsigned long cl) } static inline unsigned long -cls_set_class(struct tcf_proto *tp, unsigned long *clp, - unsigned long cl) +cls_set_class(struct Qdisc *q, unsigned long *clp, unsigned long cl) { unsigned long old_cl; - - tcf_tree_lock(tp); + + sch_tree_lock(q); old_cl = __cls_set_class(clp, cl); - tcf_tree_unlock(tp); - + sch_tree_unlock(q); return old_cl; } static inline void tcf_bind_filter(struct tcf_proto *tp, struct tcf_result *r, unsigned long base) { + struct Qdisc *q = tp->chain->block->q; unsigned long cl; - cl = tp->q->ops->cl_ops->bind_tcf(tp->q, base, r->classid); - cl = cls_set_class(tp, &r->class, cl); + /* Check q as it is not set for shared blocks. In that case, + * setting class is not supported. + */ + if (!q) + return; + cl = q->ops->cl_ops->bind_tcf(q, base, r->classid); + cl = cls_set_class(q, &r->class, cl); if (cl) - tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + q->ops->cl_ops->unbind_tcf(q, cl); } static inline void tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) { + struct Qdisc *q = tp->chain->block->q; unsigned long cl; + if (!q) + return; if ((cl = __cls_set_class(&r->class, 0)) != 0) - tp->q->ops->cl_ops->unbind_tcf(tp->q, cl); + q->ops->cl_ops->unbind_tcf(q, cl); } struct tcf_exts { -- cgit v1.2.3 From 74e3be6021d22df2ffcb691eae1affeb2bd0128e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 13 Oct 2017 14:01:04 +0200 Subject: net: sched: use tcf_block_q helper to get q pointer for sch_tree_lock Use tcf_block_q helper to get q pointer to be used for direct call of sch_tree_lock/unlock instead of tcf_tree_lock/unlock. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9b2cb91dc0d9..0aea9e23e97a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -359,9 +359,6 @@ static inline void sch_tree_unlock(const struct Qdisc *q) spin_unlock_bh(qdisc_root_sleeping_lock(q)); } -#define tcf_tree_lock(tp) sch_tree_lock((tp)->q) -#define tcf_tree_unlock(tp) sch_tree_unlock((tp)->q) - extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; -- cgit v1.2.3 From 0da4af00b2ed3dbe46788623a696c4169447eadc Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 13 Oct 2017 15:08:07 -0700 Subject: ipv6: only update __use and lastusetime once per jiffy at most In order to not dirty the cacheline too often, we try to only update dst->__use and dst->lastusetime at most once per jiffy. As dst->lastusetime is only used by ipv6 garbage collector, it should be good enough time resolution. And __use is only used in ipv6_route_seq_show() to show how many times a dst has been used. And as __use is not atomic_t right now, it does not show the precise number of usage times anyway. So we think it should be OK to only update it at most once per jiffy. According to my latest syn flood test on a machine with intel Xeon 6th gen processor and 2 10G mlx nics bonded together, each with 8 rx queues on 2 NUMA nodes: With this patch, the packet process rate increases from ~3.49Mpps to ~3.75Mpps with a 7% increase rate. Note: dst_use() is being renamed to dst_hold_and_use() to better specify the purpose of the function. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 204c19e25456..5047e8053d6c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -255,17 +255,18 @@ static inline void dst_hold(struct dst_entry *dst) WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); } -static inline void dst_use(struct dst_entry *dst, unsigned long time) +static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - dst_hold(dst); - dst->__use++; - dst->lastuse = time; + if (time != dst->lastuse) { + dst->__use++; + dst->lastuse = time; + } } -static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +static inline void dst_hold_and_use(struct dst_entry *dst, unsigned long time) { - dst->__use++; - dst->lastuse = time; + dst_hold(dst); + dst_use_noref(dst, time); } static inline struct dst_entry *dst_clone(struct dst_entry *dst) -- cgit v1.2.3 From a68f4a27f55f1d54e35c270aff89383da4b1b656 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:36:39 +0100 Subject: rxrpc: Support service upgrade from a kernel service Provide support for a kernel service to make use of the service upgrade facility. This involves: (1) Pass an upgrade request flag to rxrpc_kernel_begin_call(). (2) Make rxrpc_kernel_recv_data() return the call's current service ID so that the caller can detect service upgrade and see what the service was upgraded to. Signed-off-by: David Howells --- include/net/af_rxrpc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 3ac79150291f..820dd365a08e 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -49,12 +49,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, unsigned long, s64, gfp_t, - rxrpc_notify_rx_t); + rxrpc_notify_rx_t, + bool); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, - void *, size_t, size_t *, bool, u32 *); + void *, size_t, size_t *, bool, u32 *, u16 *); bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); -- cgit v1.2.3 From f4d15fb6f99af9b99f688bd87579137be44f85ee Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Oct 2017 11:07:31 +0100 Subject: rxrpc: Provide functions for allowing cleaner handling of signals Provide a couple of functions to allow cleaner handling of signals in a kernel service. They are: (1) rxrpc_kernel_get_rtt() This allows the kernel service to find out the RTT time for a call, so as to better judge how large a timeout to employ. Note, though, that whilst this returns a value in nanoseconds, the timeouts can only actually be in jiffies. (2) rxrpc_kernel_check_life() This returns a number that is updated when ACKs are received from the peer (notably including PING RESPONSE ACKs which we can elicit by sending PING ACKs to see if the call still exists on the server). The caller should compare the numbers of two calls to see if the call is still alive. These can be used to provide an extending timeout rather than returning immediately in the case that a signal occurs that would otherwise abort an RPC operation. The timeout would be extended if the server is still responsive and the call is still apparently alive on the server. For most operations this isn't that necessary - but for FS.StoreData it is: OpenAFS writes the data to storage as it comes in without making a backup, so if we immediately abort it when partially complete on a CTRL+C, say, we have no idea of the state of the file after the abort. Signed-off-by: David Howells --- include/net/af_rxrpc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 820dd365a08e..2b3a6eec4570 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -61,6 +61,7 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); +u64 rxrpc_kernel_get_rtt(struct socket *, struct rxrpc_call *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, rxrpc_user_attach_call_t, unsigned long, gfp_t); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); @@ -68,5 +69,6 @@ int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *, struct key *); int rxrpc_kernel_check_call(struct socket *, struct rxrpc_call *, enum rxrpc_call_completion *, u32 *); +u32 rxrpc_kernel_check_life(struct socket *, struct rxrpc_call *); #endif /* _NET_RXRPC_H */ -- cgit v1.2.3 From f8b8b1cd5aadd221742b45eb0ee3c8a80abf036a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:18 -0400 Subject: net: dsa: split dsa_port's netdev member The dsa_port structure has a "netdev" member, which can be used for either the master device, or the slave device, depending on its type. It is true that today, CPU port are not exposed to userspace, thus the port's netdev member can be used to point to its master interface. But it is still slightly confusing, so split it into more explicit "master" and "slave" members inside an anonymous union. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2746741f74cf..6ed1a17ed1bd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -164,6 +164,14 @@ struct dsa_mall_tc_entry { struct dsa_port { + /* A CPU port is physically connected to a master device. + * A user port exposed to userspace has a slave device. + */ + union { + struct net_device *master; + struct net_device *slave; + }; + /* CPU port tagging operations used by master or slave devices */ const struct dsa_device_ops *tag_ops; @@ -176,7 +184,6 @@ struct dsa_port { unsigned int index; const char *name; struct dsa_port *cpu_dp; - struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; u8 stp_state; -- cgit v1.2.3 From c8652c83bc84ac8db44060ced0036de7628aa5e5 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 16 Oct 2017 11:12:19 -0400 Subject: net: dsa: add dsa_to_port helper The dsa_port structure is part of DSA core data and must only be updated by the later. It is OK and sometimes necessary for the DSA drivers to access this data, but this has to be read only. For that purpose, add a dsa_to_port() helper which returns a const pointer to a dsa_port structure which must be used by DSA drivers from now on instead of digging into ds->ports[] themselves. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6ed1a17ed1bd..38961ef91d3d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -269,6 +269,11 @@ static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); } +static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) +{ + return &ds->ports[p]; +} + static inline u8 dsa_upstream_port(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; -- cgit v1.2.3 From eb4ddaf474285a4c6986f4a1c3205bdb0bed2da9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:28:45 -0700 Subject: net/decnet: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Johannes Berg Cc: David Ahern Cc: linux-decnet-user@lists.sourceforge.net Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/dn.h | 7 ------- include/net/dn_nsp.h | 1 - 2 files changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/dn.h b/include/net/dn.h index 913b73d239f5..4394f7d5cfe8 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -122,13 +122,6 @@ struct dn_scp /* Session Control Port */ unsigned long keepalive; void (*keepalive_fxn)(struct sock *sk); - /* - * This stuff is for the fast timer for delayed acks - */ - struct timer_list delack_timer; - int delack_pending; - void (*delack_fxn)(struct sock *sk); - }; static inline struct dn_scp *DN_SK(struct sock *sk) diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h index 3a3e33d18456..413a15e5339c 100644 --- a/include/net/dn_nsp.h +++ b/include/net/dn_nsp.h @@ -17,7 +17,6 @@ void dn_nsp_send_data_ack(struct sock *sk); void dn_nsp_send_oth_ack(struct sock *sk); -void dn_nsp_delayed_ack(struct sock *sk); void dn_send_conn_ack(struct sock *sk); void dn_send_conn_conf(struct sock *sk, gfp_t gfp); void dn_nsp_send_disc(struct sock *sk, unsigned char type, -- cgit v1.2.3 From 59f379f9046a9e0532ffd19b44e3c32fe79ec51b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:19 -0700 Subject: inet/connection_sock: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Gerrit Renker Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: netdev@vger.kernel.org Cc: dccp@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 13e4c89a8231..0358745ea059 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -169,9 +169,9 @@ enum inet_csk_ack_state_t { }; void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)); + void (*retransmit_handler)(struct timer_list *), + void (*delack_handler)(struct timer_list *), + void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) -- cgit v1.2.3 From 78802011fbe34331bdef6f2dfb1634011f0e4c32 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 16 Oct 2017 17:29:20 -0700 Subject: inet: frags: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Alexander Aring Cc: Stefan Schmidt Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: linux-wpan@vger.kernel.org Cc: netdev@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Kees Cook Acked-by: Stefan Schmidt # for ieee802154 Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index fc59e0775e00..c695807ca707 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -95,7 +95,7 @@ struct inet_frags { void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); - void (*frag_expire)(unsigned long data); + void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; }; -- cgit v1.2.3 From de95e04791a03de5cb681980a3880db6919e3b4a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 18 Oct 2017 09:56:54 -0700 Subject: net: Add extack to validator_info structs used for address notifier Add extack to in_validator_info and in6_validator_info. Update the one user of each, ipvlan, to return an error message for failures. Only manual configuration of an address is plumbed in the IPv6 code path. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/addrconf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 87981cd63180..b8b16437c6d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -55,6 +55,7 @@ struct prefix_info { struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; + struct netlink_ext_ack *extack; }; #define IN6_ADDR_HSIZE_SHIFT 4 -- cgit v1.2.3 From 1fba70e5b6bed53496ba1f1f16127f5be01b5fb6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 18 Oct 2017 11:22:51 -0700 Subject: tcp: socket option to set TCP fast open key New socket option TCP_FASTOPEN_KEY to allow different keys per listener. The listener by default uses the global key until the socket option is set. The key is a 16 bytes long binary data. This option has no effect on regular non-listener TCP sockets. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Christoph Paasch Signed-off-by: David S. Miller --- include/net/request_sock.h | 2 ++ include/net/tcp.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 23e22054aa60..347015515a7d 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -150,6 +150,8 @@ struct fastopen_queue { spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ + + struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b3b9b968e2d..1efe8365cb28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1555,9 +1555,10 @@ struct tcp_fastopen_request { int copied; /* queued in tcp_connect() */ }; void tcp_free_fastopen_req(struct tcp_sock *tp); - +void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); -int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len); +int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, + void *key, unsigned int len); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, -- cgit v1.2.3 From 8c4083b30e56fc71b0e94c26374b32d95d5ea461 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:29 +0200 Subject: net: sched: add block bind/unbind notif. and extended block_get/put Introduce new type of ndo_setup_tc message to propage binding/unbinding of a block to driver. Call this ndo whenever qdisc gets/puts a block. Alongside with this, there's need to propagate binder type from qdisc code down to the notifier. So introduce extended variants of block_get/put in order to pass this info. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 49a143e0fe65..41bc7d774047 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -17,13 +17,27 @@ struct tcf_walker { int register_tcf_proto_ops(struct tcf_proto_ops *ops); int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); +enum tcf_block_binder_type { + TCF_BLOCK_BINDER_TYPE_UNSPEC, +}; + +struct tcf_block_ext_info { + enum tcf_block_binder_type binder_type; +}; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { @@ -46,10 +60,25 @@ int tcf_block_get(struct tcf_block **p_block, return 0; } +static inline +int tcf_block_get_ext(struct tcf_block **p_block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ + return 0; +} + static inline void tcf_block_put(struct tcf_block *block) { } +static inline +void tcf_block_put_ext(struct tcf_block *block, + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct tcf_block_ext_info *ei) +{ +} + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { return NULL; @@ -434,6 +463,17 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, void *type_data, bool err_stop); +enum tc_block_command { + TC_BLOCK_BIND, + TC_BLOCK_UNBIND, +}; + +struct tc_block_offload { + enum tc_block_command command; + enum tcf_block_binder_type binder_type; + struct tcf_block *block; +}; + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; -- cgit v1.2.3 From 6e40cf2d4dee9dc22ff398041ce876bef8172dea Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:30 +0200 Subject: net: sched: use extended variants of block_get/put in ingress and clsact qdiscs Use previously introduced extended variants of block get and put functions. This allows to specify a binder types specific to clsact ingress/egress which is useful for drivers to distinguish who actually got the block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 41bc7d774047..5c50af8f7183 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -19,6 +19,8 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops); enum tcf_block_binder_type { TCF_BLOCK_BINDER_TYPE_UNSPEC, + TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS, + TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS, }; struct tcf_block_ext_info { -- cgit v1.2.3 From acb674428c3d57bccbe3f4a1a7a009f6d73e9f41 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:31 +0200 Subject: net: sched: introduce per-block callbacks Introduce infrastructure that allows drivers to register callbacks that are called whenever tc would offload inserted rule for a specific block. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++ include/net/sch_generic.h | 1 + 2 files changed, 82 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5c50af8f7183..4bc6b1cc245d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -27,6 +27,8 @@ struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; }; +struct tcf_block_cb; + #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); @@ -51,6 +53,21 @@ static inline struct net_device *tcf_block_dev(struct tcf_block *block) return tcf_block_q(block)->dev_queue->dev; } +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb); +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident); +void tcf_block_cb_incref(struct tcf_block_cb *block_cb); +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb); +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv); +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv); +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident); + int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); @@ -91,6 +108,70 @@ static inline struct net_device *tcf_block_dev(struct tcf_block *block) return NULL; } +static inline +int tc_setup_cb_block_register(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_block_unregister(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv) +{ +} + +static inline +void *tcf_block_cb_priv(struct tcf_block_cb *block_cb) +{ + return NULL; +} + +static inline +struct tcf_block_cb *tcf_block_cb_lookup(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ + return NULL; +} + +static inline +void tcf_block_cb_incref(struct tcf_block_cb *block_cb) +{ +} + +static inline +unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) +{ + return 0; +} + +static inline +struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + return NULL; +} + +static inline +int tcf_block_cb_register(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident, + void *cb_priv) +{ + return 0; +} + +static inline +void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +{ +} + +static inline +void tcf_block_cb_unregister(struct tcf_block *block, + tc_setup_cb_t *cb, void *cb_ident) +{ +} + static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0aea9e23e97a..031dffd5836c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -272,6 +272,7 @@ struct tcf_block { struct list_head chain_list; struct net *net; struct Qdisc *q; + struct list_head cb_list; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.2.3 From 208c0f4b5237f1d6611b2c679a8022d6901577d6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:32 +0200 Subject: net: sched: use tc_setup_cb_call to call per-block callbacks Extend the tc_setup_cb_call entrypoint function originally used only for action egress devices callbacks to call per-block callbacks as well. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4bc6b1cc245d..fcca5a9d9880 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -543,8 +543,8 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ -int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, - void *type_data, bool err_stop); +int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts, + enum tc_setup_type type, void *type_data, bool err_stop); enum tc_block_command { TC_BLOCK_BIND, -- cgit v1.2.3 From d58d31a118690b578897749feda48416ac10ca43 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:47 +0200 Subject: net: sched: remove unused classid field from tc_cls_common_offload It is no longer used by the drivers, so remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index fcca5a9d9880..04caa246e747 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -561,7 +561,6 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; - u32 classid; }; static inline void @@ -571,7 +570,6 @@ tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - cls_common->classid = tp->classid; } struct tc_cls_u32_knode { -- cgit v1.2.3 From fa71212e91811ac67014ad19d4fc3b3c3446ccf7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 19 Oct 2017 15:50:48 +0200 Subject: net: sched: remove unused is_classid_clsact_ingress/egress helpers These helpers are no longer in use by drivers, so remove them. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2d234af15f3e..b8ecafce4ba1 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -135,19 +135,6 @@ static inline unsigned int psched_mtu(const struct net_device *dev) return dev->mtu + dev->hard_header_len; } -static inline bool is_classid_clsact_ingress(u32 classid) -{ - /* This also returns true for ingress qdisc */ - return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && - TC_H_MIN(classid) != TC_H_MIN(TC_H_MIN_EGRESS); -} - -static inline bool is_classid_clsact_egress(u32 classid) -{ - return TC_H_MAJ(classid) == TC_H_MAJ(TC_H_CLSACT) && - TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_EGRESS); -} - static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); -- cgit v1.2.3 From 3f27fb23219e75343b094366f2358bff34300493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Oct 2017 16:17:47 -0700 Subject: ipv6: addrconf: add per netns perturbation in inet6_addr_hash() Bring IPv6 in par with IPv4 : - Use net_hash_mix() to spread addresses a bit more. - Use 256 slots hash table instead of 16 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index b8b16437c6d5..15b5ffd7253d 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -58,7 +58,7 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 4 +#define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) int addrconf_init(void); -- cgit v1.2.3 From b6f4f8484d88b69f700907200a9a9ec73806355f Mon Sep 17 00:00:00 2001 From: Tim Hansen Date: Mon, 23 Oct 2017 15:35:58 -0400 Subject: net/sock: Update sk rcu iterator macro. Mark hlist node in sk rcu iterator as protected by the rcu. hlist_next_rcu accomplishes this and silences the warnings sparse throws. Found with make C=1 net/ipv4/udp.o on linux-next tag next-20171009. Signed-off-by: Tim Hansen Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 4827094f1db4..6f1be9726e02 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -737,10 +737,10 @@ static inline void sk_add_bind_node(struct sock *sk, * */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ - for (pos = rcu_dereference((head)->first); \ + for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ - pos = rcu_dereference(pos->next)) + pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(struct sock *sk) { -- cgit v1.2.3 From 71c02379c762cb616c00fd5c4ed253fbf6bbe11b Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 23 Oct 2017 13:22:23 -0700 Subject: tcp: Configure TFO without cookie per socket and/or per route We already allow to enable TFO without a cookie by using the fastopen-sysctl and setting it to TFO_SERVER_COOKIE_NOT_REQD (or TFO_CLIENT_NO_COOKIE). This is safe to do in certain environments where we know that there isn't a malicous host (aka., data-centers) or when the application-protocol already provides an authentication mechanism in the first flight of data. A server however might be providing multiple services or talking to both sides (public Internet and data-center). So, this server would want to enable cookie-less TFO for certain services and/or for connections that go to the data-center. This patch exposes a socket-option and a per-route attribute to enable such fine-grained configurations. Signed-off-by: Christoph Paasch Reviewed-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2c13484704cb..2392f74074e7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1567,7 +1567,8 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct tcp_fastopen_cookie *foc); + struct tcp_fastopen_cookie *foc, + const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); -- cgit v1.2.3 From c4f3db15958277c03d1c324894255ea3ecbf86e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:40 +0200 Subject: netfilter: conntrack: add and use nf_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. As a first step, add and use a new log function for this, similar to nf_ct_helper_log(). Add __cold annotation -- invalid packets should be infrequent so gcc can consider all call paths that lead to such a function as unlikely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 738a0307a96b..6d79a061d360 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -152,8 +152,18 @@ extern const struct nla_policy nf_ct_port_nla_policy[]; #define LOG_INVALID(net, proto) \ ((net)->ct.sysctl_log_invalid == (proto) || \ (net)->ct.sysctl_log_invalid == IPPROTO_RAW) + +__printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, + struct net *net, + u16 pf, u8 protonum, + const char *fmt, ...); #else static inline int LOG_INVALID(struct net *net, int proto) { return 0; } + +static inline __printf(5, 6) __cold +void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, + u16 pf, u8 protonum, const char *fmt, ...) {} #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ -- cgit v1.2.3 From 3d0b527bc9dc0e8c4428eb1a98d4cd27bd1114c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:41 +0200 Subject: netfilter: conntrack: add and use nf_ct_l4proto_log_invalid We currently pass down the l4 protocol to the conntrack ->packet() function, but the only user of this is the debug info decision. Same information can be derived from struct nf_conn. Add a wrapper for the previous patch that extracs the information from nf_conn and passes it to nf_l4proto_log_invalid(). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6d79a061d360..5d51255b5bfb 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -149,21 +149,23 @@ int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -#define LOG_INVALID(net, proto) \ - ((net)->ct.sysctl_log_invalid == (proto) || \ - (net)->ct.sysctl_log_invalid == IPPROTO_RAW) - +__printf(3, 4) __cold +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...); __printf(5, 6) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, u16 pf, u8 protonum, const char *fmt, ...); #else -static inline int LOG_INVALID(struct net *net, int proto) { return 0; } - static inline __printf(5, 6) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, struct net *net, u16 pf, u8 protonum, const char *fmt, ...) {} +static inline __printf(3, 4) __cold +void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #endif /*_NF_CONNTRACK_PROTOCOL_H*/ -- cgit v1.2.3 From eb6fad5a4a328b85d3faa8b301b522e3f316b49d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 11 Oct 2017 10:47:42 +0200 Subject: netfilter: conntrack: remove pf argument from l4 packet functions not needed/used anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 5d51255b5bfb..e06518874144 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -42,7 +42,6 @@ struct nf_conntrack_l4proto { const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - u_int8_t pf, unsigned int *timeouts); /* Called when a new connection for this protocol found; -- cgit v1.2.3 From 28efb0046512e8a13ed9f9bdf0d68d10bbfbe9cf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 12 Oct 2017 09:38:30 +0200 Subject: netfilter: conntrack: make l3proto trackers const previous patches removed all writes to them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 2 +- include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 919e4e8af327..5534ecca7a5d 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -10,7 +10,7 @@ #define _NF_CONNTRACK_IPV4_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; +const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index eaea968f8657..30dc57980866 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -1,7 +1,7 @@ #ifndef _NF_CONNTRACK_IPV6_H #define _NF_CONNTRACK_IPV6_H -extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; +extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; -- cgit v1.2.3 From ef5201c83d1400570a3b6f004ad7a23d71934411 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 24 Oct 2017 13:54:20 +0800 Subject: bonding: remove rtmsg_ifinfo called after bond_lower_state_changed After the patch 'rtnetlink: bring NETDEV_CHANGELOWERSTATE event process back to rtnetlink_event', bond_lower_state_changed would generate NETDEV_CHANGEUPPER event which would send a notification to userspace in rtnetlink_event. There's no need to call rtmsg_ifinfo to send the notification any more. So this patch is to remove it from these places after bond_lower_state_changed. Besides, after this, rtmsg_ifinfo is not needed to be exported. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/bonding.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bonding.h b/include/net/bonding.h index 2860cc66c2bb..f801fc940b29 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -330,7 +330,6 @@ static inline void bond_set_active_slave(struct slave *slave) slave->backup = 0; bond_queue_slave_event(slave); bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -340,7 +339,6 @@ static inline void bond_set_backup_slave(struct slave *slave) slave->backup = 1; bond_queue_slave_event(slave); bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); } } @@ -353,7 +351,6 @@ static inline void bond_set_slave_state(struct slave *slave, slave->backup = slave_state; if (notify) { bond_lower_state_changed(slave); - rtmsg_ifinfo(RTM_NEWLINK, slave->dev, 0, GFP_ATOMIC); bond_queue_slave_event(slave); slave->should_notify = 0; } else { @@ -385,7 +382,6 @@ static inline void bond_slave_state_notify(struct bonding *bond) bond_for_each_slave(bond, tmp, iter) { if (tmp->should_notify) { bond_lower_state_changed(tmp); - rtmsg_ifinfo(RTM_NEWLINK, tmp->dev, 0, GFP_ATOMIC); tmp->should_notify = 0; } } -- cgit v1.2.3 From 9c3b57518363577d4e2ea1964ef4fa03e100beaa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:31 -0700 Subject: net: sctp: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: Vlad Yasevich Cc: Neil Horman Cc: "David S. Miller" Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2db3d3a9ce1d..13cc4963e905 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -72,7 +72,7 @@ typedef enum sctp_disposition (sctp_state_fn_t) ( const union sctp_subtype type, void *arg, struct sctp_cmd_seq *commands); -typedef void (sctp_timer_event_t) (unsigned long); +typedef void (sctp_timer_event_t) (struct timer_list *); struct sctp_sm_table_entry { sctp_state_fn_t *fn; const char *name; @@ -314,10 +314,10 @@ int sctp_do_sm(struct net *net, enum sctp_event event_type, void *event_arg, gfp_t gfp); /* 2nd level prototypes */ -void sctp_generate_t3_rtx_event(unsigned long peer); -void sctp_generate_heartbeat_event(unsigned long peer); -void sctp_generate_reconf_event(unsigned long peer); -void sctp_generate_proto_unreach_event(unsigned long peer); +void sctp_generate_t3_rtx_event(struct timer_list *t); +void sctp_generate_heartbeat_event(struct timer_list *t); +void sctp_generate_reconf_event(struct timer_list *t); +void sctp_generate_proto_unreach_event(struct timer_list *t); void sctp_ootb_pkt_free(struct sctp_packet *packet); -- cgit v1.2.3 From fc8bcaa05160528d56432e4612f522e3ceafc513 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 24 Oct 2017 01:45:48 -0700 Subject: net: LLC: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Hans Liljestrand Cc: "Paul E. McKenney" Cc: "Reshetova, Elena" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/net/llc_c_ac.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/llc_c_ac.h b/include/net/llc_c_ac.h index f3be818e73c1..e766300b3e99 100644 --- a/include/net/llc_c_ac.h +++ b/include/net/llc_c_ac.h @@ -171,10 +171,10 @@ int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_rsp_as_ack(struct sock *sk, struct sk_buff *skb); int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb); -void llc_conn_busy_tmr_cb(unsigned long timeout_data); -void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data); -void llc_conn_ack_tmr_cb(unsigned long timeout_data); -void llc_conn_rej_tmr_cb(unsigned long timeout_data); +void llc_conn_busy_tmr_cb(struct timer_list *t); +void llc_conn_pf_cycle_tmr_cb(struct timer_list *t); +void llc_conn_ack_tmr_cb(struct timer_list *t); +void llc_conn_rej_tmr_cb(struct timer_list *t); void llc_conn_set_p_flag(struct sock *sk, u8 value); #endif /* LLC_C_AC_H */ -- cgit v1.2.3 From 14cd5d4a0125f643350e7fa12f5384f1fc2d3e9d Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:17 -0700 Subject: locking/atomics, net/netlink/netfilter: Convert ACCESS_ONCE() to READ_ONCE()/WRITE_ONCE() For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't currently harmful. However, for some features it is necessary to instrument reads and writes separately, which is not possible with ACCESS_ONCE(). This distinction is critical to correct operation. It's possible to transform the bulk of kernel code using the Coccinelle script below. However, this doesn't handle comments, leaving references to ACCESS_ONCE() instances which have been removed. As a preparatory step, this patch converts netlink and netfilter code and comments to use {READ,WRITE}_ONCE() consistently. ---- virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: David S. Miller Cc: Florian Westphal Cc: Jozsef Kadlecsik Cc: Linus Torvalds Cc: Pablo Neira Ayuso Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-7-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/net/netfilter/nf_tables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f5b12a4ad09..5c68e279eaea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1164,8 +1164,8 @@ static inline u8 nft_genmask_next(const struct net *net) static inline u8 nft_genmask_cur(const struct net *net) { - /* Use ACCESS_ONCE() to prevent refetching the value for atomicity */ - return 1 << ACCESS_ONCE(net->nft.gencursor); + /* Use READ_ONCE() to prevent refetching the value for atomicity */ + return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) -- cgit v1.2.3 From 6aa7de059173a986114ac43b8f50b297a86f09a8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 23 Oct 2017 14:07:29 -0700 Subject: locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- include/net/ip_vs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4f4f786255ef..3fadb6f9982b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -983,12 +983,12 @@ static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) static inline int sysctl_sync_period(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_threshold[1]); + return READ_ONCE(ipvs->sysctl_sync_threshold[1]); } static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_refresh_period); + return READ_ONCE(ipvs->sysctl_sync_refresh_period); } static inline int sysctl_sync_retries(struct netns_ipvs *ipvs) @@ -1013,7 +1013,7 @@ static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs) static inline int sysctl_sync_ports(struct netns_ipvs *ipvs) { - return ACCESS_ONCE(ipvs->sysctl_sync_ports); + return READ_ONCE(ipvs->sysctl_sync_ports); } static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs) -- cgit v1.2.3 From 60e2a7780793bae0debc275a9ccd57f7da0cf195 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 25 Oct 2017 11:01:45 +0200 Subject: tcp: TCP experimental option for SMC The SMC protocol [1] relies on the use of a new TCP experimental option [2, 3]. With this option, SMC capabilities are exchanged between peers during the TCP three way handshake. This patch adds support for this experimental option to TCP. References: [1] SMC-R Informational RFC: http://www.rfc-editor.org/info/rfc7609 [2] Shared Use of TCP Experimental Options RFC 6994: https://tools.ietf.org/rfc/rfc6994.txt [3] IANA ExID SMCR: http://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-exids Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- include/net/inet_sock.h | 3 ++- include/net/tcp.h | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 425752f768d2..c49938d1481a 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -92,7 +92,8 @@ struct inet_request_sock { wscale_ok : 1, ecn_ok : 1, acked : 1, - no_srccheck: 1; + no_srccheck: 1, + smc_ok : 1; kmemcheck_bitfield_end(flags); u32 ir_mark; union { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2392f74074e7..285bc82dea41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -191,6 +191,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 +#define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths @@ -203,6 +204,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 +#define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 @@ -213,6 +215,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 +#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -2108,4 +2111,8 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); } + +#if IS_ENABLED(CONFIG_SMC) +extern struct static_key_false tcp_have_smc; +#endif #endif /* _TCP_H */ -- cgit v1.2.3 From 32d18ab1d44166cbb1dcaf8b359183636734ddb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 24 Oct 2017 12:41:01 +0200 Subject: net: updating dst lastusage is an unlikely event. Since commit 0da4af00b2ed ("ipv6: only update __use and lastusetime once per jiffy at most"), updating the dst lastuse field is an unlikely action: it happens at most once per jiffy, out of potentially millions of calls per second. Mark explicitly the code as such, and let the compiler generate better code. Note: gcc 7.2 and several older versions do actually generate different - better - code when the unlikely() hint is in place, avoid jump in the fast path and keeping better code locality. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 5047e8053d6c..2f53ecc2c296 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -257,7 +257,7 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (time != dst->lastuse) { + if (unlikely(time != dst->lastuse)) { dst->__use++; dst->lastuse = time; } -- cgit v1.2.3 From 2ae21cf527da0e5cf9d7ee14bd5b0909bb9d1a75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:56 -0700 Subject: tcp: Namespace-ify sysctl_tcp_early_retrans Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2c4222a5d102..a7f39e3ea666 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -128,6 +128,7 @@ struct netns_ipv4 { int sysctl_tcp_sack; int sysctl_tcp_window_scaling; int sysctl_tcp_timestamps; + int sysctl_tcp_early_retrans; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 285bc82dea41..a12b71d4118b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,7 +265,6 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern int sysctl_tcp_early_retrans; extern int sysctl_tcp_recovery; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -- cgit v1.2.3 From e20223f1962831d1b1c416d59d259879d0639d68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:57 -0700 Subject: tcp: Namespace-ify sysctl_tcp_recovery Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a7f39e3ea666..d6ed718075d4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -129,6 +129,7 @@ struct netns_ipv4 { int sysctl_tcp_window_scaling; int sysctl_tcp_timestamps; int sysctl_tcp_early_retrans; + int sysctl_tcp_recovery; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index a12b71d4118b..c7f51534fc44 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -265,7 +265,7 @@ extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; extern int sysctl_tcp_thin_linear_timeouts; extern int sysctl_tcp_thin_dupack; -extern int sysctl_tcp_recovery; + #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ extern int sysctl_tcp_limit_output_bytes; -- cgit v1.2.3 From 2c04ac8ae0b61e0780a30b7069a11bb202b21f26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:58 -0700 Subject: tcp: Namespace-ify sysctl_tcp_thin_linear_timeouts Note that sysctl_tcp_thin_dupack was not used, I deleted it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d6ed718075d4..2a9f37b39c45 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -130,6 +130,7 @@ struct netns_ipv4 { int sysctl_tcp_timestamps; int sysctl_tcp_early_retrans; int sysctl_tcp_recovery; + int sysctl_tcp_thin_linear_timeouts; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index c7f51534fc44..063a7a48b7fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -263,8 +263,6 @@ extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; extern int sysctl_tcp_slow_start_after_idle; -extern int sysctl_tcp_thin_linear_timeouts; -extern int sysctl_tcp_thin_dupack; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -- cgit v1.2.3 From b510f0d23a47c3d1f074fe583e7867dc4918fe02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:54:59 -0700 Subject: tcp: Namespace-ify sysctl_tcp_slow_start_after_idle Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2a9f37b39c45..8662692686b3 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -131,6 +131,7 @@ struct netns_ipv4 { int sysctl_tcp_early_retrans; int sysctl_tcp_recovery; int sysctl_tcp_thin_linear_timeouts; + int sysctl_tcp_slow_start_after_idle; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 063a7a48b7fe..cc2ab522eb5c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -262,7 +262,6 @@ extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -extern int sysctl_tcp_slow_start_after_idle; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1308,7 +1307,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || + if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; -- cgit v1.2.3 From e0a1e5b519236dc1662ff25e42560dd1be9e3776 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:00 -0700 Subject: tcp: Namespace-ify sysctl_tcp_retrans_collapse Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8662692686b3..b28c172b10e4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -132,6 +132,7 @@ struct netns_ipv4 { int sysctl_tcp_recovery; int sysctl_tcp_thin_linear_timeouts; int sysctl_tcp_slow_start_after_idle; + int sysctl_tcp_retrans_collapse; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index cc2ab522eb5c..33cc86355b8f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_retrans_collapse; extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; -- cgit v1.2.3 From 3f4c7c6f6a9053493ce7dd8a0f17ed8eafc53893 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:01 -0700 Subject: tcp: Namespace-ify sysctl_tcp_stdurg Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b28c172b10e4..ffa2cf3dc747 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -133,6 +133,7 @@ struct netns_ipv4 { int sysctl_tcp_thin_linear_timeouts; int sysctl_tcp_slow_start_after_idle; int sysctl_tcp_retrans_collapse; + int sysctl_tcp_stdurg; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 33cc86355b8f..cf3fac7008d7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_stdurg; extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; -- cgit v1.2.3 From 625357aa175c688d219da43c8cfaa2e1629e0e1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:02 -0700 Subject: tcp: Namespace-ify sysctl_tcp_rfc1337 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ffa2cf3dc747..968edce38eb5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -134,6 +134,7 @@ struct netns_ipv4 { int sysctl_tcp_slow_start_after_idle; int sysctl_tcp_retrans_collapse; int sysctl_tcp_stdurg; + int sysctl_tcp_rfc1337; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index cf3fac7008d7..2aea2b3373b3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_rfc1337; extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; -- cgit v1.2.3 From 65c9410cf55ecf32da1b720f563365d565d6289a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:03 -0700 Subject: tcp: Namespace-ify sysctl_tcp_abort_on_overflow Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 968edce38eb5..3875fdf6b186 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { int sysctl_tcp_retrans_collapse; int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; + int sysctl_tcp_abort_on_overflow; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 2aea2b3373b3..7331281a2292 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -243,7 +243,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ -extern int sysctl_tcp_abort_on_overflow; extern int sysctl_tcp_max_orphans; extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; -- cgit v1.2.3 From 0bc65a28ae2aeb14aab7f4a930e0d8cf4cad9dc4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:04 -0700 Subject: tcp: Namespace-ify sysctl_tcp_fack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3875fdf6b186..f0e792beeea9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -136,6 +136,7 @@ struct netns_ipv4 { int sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; int sysctl_tcp_abort_on_overflow; + int sysctl_tcp_fack; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7331281a2292..e7b15e9f6e28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_fack; extern int sysctl_tcp_reordering; extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; -- cgit v1.2.3 From 773d4bb96ceca6829ae9928f6b002b93e2e62cdc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:05 -0700 Subject: tcp: remove stale sysctl_tcp_reordering This extern is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index e7b15e9f6e28..fc134ba74c7d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_reordering; extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; -- cgit v1.2.3 From c6e218035913e14952b04ceecf1a543205106fdb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:06 -0700 Subject: tcp: Namespace-ify sysctl_tcp_max_reordering Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f0e792beeea9..3f6844665a2f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -137,6 +137,7 @@ struct netns_ipv4 { int sysctl_tcp_rfc1337; int sysctl_tcp_abort_on_overflow; int sysctl_tcp_fack; + int sysctl_tcp_max_reordering; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index fc134ba74c7d..8cd286226a1e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_max_reordering; extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; -- cgit v1.2.3 From 6496f6bde0c323fba5e8c5b5cbf3a7bf28dad7ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:07 -0700 Subject: tcp: Namespace-ify sysctl_tcp_dsack Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3f6844665a2f..956957a77db9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -138,6 +138,7 @@ struct netns_ipv4 { int sysctl_tcp_abort_on_overflow; int sysctl_tcp_fack; int sysctl_tcp_max_reordering; + int sysctl_tcp_dsack; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8cd286226a1e..8b2ae3e8d79f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -244,7 +244,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; -extern int sysctl_tcp_dsack; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -- cgit v1.2.3 From 0c12654ac6d9004b9538b2a969b2b59e9a5ed831 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:08 -0700 Subject: tcp: Namespace-ify sysctl_tcp_app_win Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 956957a77db9..63f91d52cbc0 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -139,6 +139,7 @@ struct netns_ipv4 { int sysctl_tcp_fack; int sysctl_tcp_max_reordering; int sysctl_tcp_dsack; + int sysctl_tcp_app_win; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b2ae3e8d79f..7aa3d65062a1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; -- cgit v1.2.3 From 94f0893e0c27219f4a726932618505aab6795973 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:09 -0700 Subject: tcp: Namespace-ify sysctl_tcp_adv_win_scale Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 63f91d52cbc0..9dbb07d4eff4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -140,6 +140,7 @@ struct netns_ipv4 { int sysctl_tcp_max_reordering; int sysctl_tcp_dsack; int sysctl_tcp_app_win; + int sysctl_tcp_adv_win_scale; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 7aa3d65062a1..0dc27cd24899 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; @@ -1311,9 +1310,9 @@ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); -static inline int tcp_win_from_space(int space) +static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = sysctl_tcp_adv_win_scale; + int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; return tcp_adv_win_scale <= 0 ? (space>>(-tcp_adv_win_scale)) : @@ -1323,13 +1322,13 @@ static inline int tcp_win_from_space(int space) /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { - return tcp_win_from_space(sk->sk_rcvbuf - + return tcp_win_from_space(sk, sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { - return tcp_win_from_space(sk->sk_rcvbuf); + return tcp_win_from_space(sk, sk->sk_rcvbuf); } extern void tcp_openreq_init_rwin(struct request_sock *req, -- cgit v1.2.3 From af9b69a7a6ca6b817e8d6f416e7aa5b2a5bf1d91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Oct 2017 21:55:10 -0700 Subject: tcp: Namespace-ify sysctl_tcp_frto Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9dbb07d4eff4..f4622e28db3a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -141,6 +141,7 @@ struct netns_ipv4 { int sysctl_tcp_dsack; int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; + int sysctl_tcp_frto; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0dc27cd24899..18f047501f53 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_frto; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; -- cgit v1.2.3 From bff7b688d5b10a8cb8cecefdea5e255408b78f2f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:51 -0400 Subject: net: dsa: add dsa_is_unused_port helper As the comment above the chunk states, the b53 driver attempts to disable the unused ports. But using ds->enabled_port_mask is misleading, because this mask reports in fact the user ports. To avoid confusion and fix this, this patch introduces an explicit dsa_is_unused_port helper which ensures the corresponding bit is not masked in any of the switch port masks. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 38961ef91d3d..6b1bc1c8f7e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -254,6 +254,13 @@ struct dsa_switch { struct dsa_port ports[]; }; +static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +{ + u32 m = ds->enabled_port_mask | ds->dsa_port_mask | ds->cpu_port_mask; + + return !(m & BIT(p)); +} + static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return !!(ds->cpu_port_mask & (1 << p)); -- cgit v1.2.3 From deb8ee0b51204273c120a3b3848efbb5695ad658 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:53 -0400 Subject: net: dsa: fix dsa_is_normal_port helper In order to know if a port is of type user, dsa_is_normal_port checks that the given port is not of type DSA nor CPU. This is not enough because a port can be unused. Without the previous fix, this caused the unused mv88e6xxx ports to be configured in normal mode. The ds->enabled_port_mask reports the user ports, so check this instead. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6b1bc1c8f7e2..4ad432ad2d40 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -273,7 +273,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) { - return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); + return !!(ds->enabled_port_mask & BIT(p)); } static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -- cgit v1.2.3 From 2b3e9891cb607f7c7d5a4b11fb5a6e775e7f3ef4 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:54 -0400 Subject: net: dsa: rename dsa_is_normal_port helper This patch renames dsa_is_normal_port to dsa_is_user_port because "user" is the correct term in the DSA terminology, not "normal". Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4ad432ad2d40..49701d958663 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -271,7 +271,7 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) return !!((ds->dsa_port_mask) & (1 << p)); } -static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) +static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { return !!(ds->enabled_port_mask & BIT(p)); } -- cgit v1.2.3 From 02bc6e546e858b209c3ebe380a13a73b333b1b3f Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:56 -0400 Subject: net: dsa: introduce dsa_user_ports helper Introduce a dsa_user_ports() helper to return the ds->enabled_port_mask mask which is more explicit. This will also minimize diffs when touching this internal mask. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 49701d958663..dc7728062396 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -276,6 +276,11 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) return !!(ds->enabled_port_mask & BIT(p)); } +static inline u32 dsa_user_ports(struct dsa_switch *ds) +{ + return ds->enabled_port_mask; +} + static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { return &ds->ports[p]; -- cgit v1.2.3 From 057cad2c59d73b0c4a6638546f3099d6fb444094 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:57 -0400 Subject: net: dsa: define port types Introduce an enumerated type for ports, which will be way more explicit to identify a port type instead of digging into switch port masks. A port can be of type CPU, DSA, user, or unused by default. This is a static parsed information that cannot be changed at runtime. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index dc7728062396..8da20c4a6552 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -180,6 +180,13 @@ struct dsa_port { struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + enum { + DSA_PORT_TYPE_UNUSED = 0, + DSA_PORT_TYPE_CPU, + DSA_PORT_TYPE_DSA, + DSA_PORT_TYPE_USER, + } type; + struct dsa_switch *ds; unsigned int index; const char *name; -- cgit v1.2.3 From c38c5a66506e4e8223fd03e950b1bde99190701e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:58 -0400 Subject: net: dsa: use new port type in helpers Now that DSA exposes an enumerated type for the ports, we can use them directly instead of checking bitmaps, which is more consistent. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8da20c4a6552..07dfbd7f4fd5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -261,36 +261,41 @@ struct dsa_switch { struct dsa_port ports[]; }; -static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) { - u32 m = ds->enabled_port_mask | ds->dsa_port_mask | ds->cpu_port_mask; + return &ds->ports[p]; +} - return !(m & BIT(p)); +static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) +{ + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; } static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { - return !!(ds->cpu_port_mask & (1 << p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_CPU; } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { - return !!((ds->dsa_port_mask) & (1 << p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_DSA; } static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) { - return !!(ds->enabled_port_mask & BIT(p)); + return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_USER; } static inline u32 dsa_user_ports(struct dsa_switch *ds) { - return ds->enabled_port_mask; -} + u32 mask = 0; + int p; -static inline const struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p) -{ - return &ds->ports[p]; + for (p = 0; p < ds->num_ports; p++) + if (dsa_is_user_port(ds, p)) + mask |= BIT(p); + + return mask; } static inline u8 dsa_upstream_port(struct dsa_switch *ds) -- cgit v1.2.3 From 5749f0f3772b9d98f37e3a92539f49fafaa64eca Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 26 Oct 2017 11:22:59 -0400 Subject: net: dsa: remove port masks Now that DSA core provides port types, there is no need to keep this information at the switch level. This is a static information that is part of a DSA core dsa_port structure. Remove them. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 07dfbd7f4fd5..50e276dc4c01 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -240,9 +240,6 @@ struct dsa_switch { /* * Slave mii_bus and devices for the individual ports. */ - u32 dsa_port_mask; - u32 cpu_port_mask; - u32 enabled_port_mask; u32 phys_mii_mask; struct mii_bus *slave_mii_bus; -- cgit v1.2.3 From 3d0bd028ffb4a4915cb64cfa0d2cee1578cc0321 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 16 Oct 2017 18:01:27 -0700 Subject: net/sched: Add support for HW offloading for CBS This adds support for offloading the CBS algorithm to the controller, if supported. Drivers wanting to support CBS offload must implement the .ndo_setup_tc callback and handle the TC_SETUP_CBS (introduced here) type. Signed-off-by: Vinicius Costa Gomes Tested-by: Henrik Austad Signed-off-by: Jeff Kirsher --- include/net/pkt_sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index b8ecafce4ba1..02f2db26e30c 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -140,4 +140,13 @@ static inline struct net *qdisc_net(struct Qdisc *q) return dev_net(q->dev_queue->dev); } +struct tc_cbs_qopt_offload { + u8 enable; + s32 queue; + s32 hicredit; + s32 locredit; + s32 idleslope; + s32 sendslope; +}; + #endif -- cgit v1.2.3 From ec36e416f06f6a8659281053fdc46ce484ad2211 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:21 -0700 Subject: tcp: Namespace-ify sysctl_tcp_nometrics_save Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f4622e28db3a..9606e2ea1f14 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -142,6 +142,7 @@ struct netns_ipv4 { int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; + int sysctl_tcp_nometrics_save; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 18f047501f53..6ab7fa4154b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -- cgit v1.2.3 From 4540c0cf98b8892a642d2453eec20ae3eb5696fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:22 -0700 Subject: tcp: Namespace-ify sysctl_tcp_moderate_rcvbuf Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9606e2ea1f14..4458a54fe3f4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -143,6 +143,7 @@ struct netns_ipv4 { int sysctl_tcp_adv_win_scale; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; + int sysctl_tcp_moderate_rcvbuf; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ab7fa4154b2..f954e74578ff 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; -- cgit v1.2.3 From d06a99045837d3f4d5431793c4c390b0daf2a08d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:23 -0700 Subject: tcp: Namespace-ify sysctl_tcp_tso_win_divisor Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4458a54fe3f4..60bccda046db 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -144,6 +144,7 @@ struct netns_ipv4 { int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; + int sysctl_tcp_tso_win_divisor; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index f954e74578ff..ed0828dc82f1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -- cgit v1.2.3 From ceef9ab6be7234f9e49f79769e0da88d1dccfcc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:24 -0700 Subject: tcp: Namespace-ify sysctl_tcp_workaround_signed_windows Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 60bccda046db..e74c7c1b0d18 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -145,6 +145,7 @@ struct netns_ipv4 { int sysctl_tcp_nometrics_save; int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; + int sysctl_tcp_workaround_signed_windows; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed0828dc82f1..e338e16178dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -247,7 +247,6 @@ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; -extern int sysctl_tcp_workaround_signed_windows; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ @@ -1302,7 +1301,8 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk) } /* Determine a window scaling and initial window to offer. */ -void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, +void tcp_select_initial_window(const struct sock *sk, int __space, + __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); -- cgit v1.2.3 From 9184d8bb448a3d2c2d9f90f1e2f5de625292e769 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:25 -0700 Subject: tcp: Namespace-ify sysctl_tcp_limit_output_bytes Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e74c7c1b0d18..e98f473bab13 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -146,6 +146,7 @@ struct netns_ipv4 { int sysctl_tcp_moderate_rcvbuf; int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; + int sysctl_tcp_limit_output_bytes; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index e338e16178dd..33f9d30a6905 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_limit_output_bytes; extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; -- cgit v1.2.3 From b530b68148301d73775cd27cc136ce4dd5738ae8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:26 -0700 Subject: tcp: Namespace-ify sysctl_tcp_challenge_ack_limit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e98f473bab13..e9895d40868e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -147,6 +147,7 @@ struct netns_ipv4 { int sysctl_tcp_tso_win_divisor; int sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_challenge_ack_limit; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 33f9d30a6905..afc23596e9aa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_challenge_ack_limit; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; -- cgit v1.2.3 From 26e9596e5b8f11025b57b12e7265df649129ab00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:27 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_tso_segs Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e9895d40868e..a2da3e19a977 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { int sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; + int sysctl_tcp_min_tso_segs; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index afc23596e9aa..0735303a6575 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; -- cgit v1.2.3 From bd239704295c66196e6b77c5717ec4aec076ddd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:28 -0700 Subject: tcp: Namespace-ify sysctl_tcp_min_rtt_wlen Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index a2da3e19a977..1a66af8a0d32 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_tso_segs; + int sysctl_tcp_min_rtt_wlen; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0735303a6575..56f50c9a3e6a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_min_rtt_wlen; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; -- cgit v1.2.3 From 790f00e19f65673c3c169dfc137c09a9236847d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:29 -0700 Subject: tcp: Namespace-ify sysctl_tcp_autocorking Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1a66af8a0d32..537830882149 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -150,6 +150,7 @@ struct netns_ipv4 { int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_autocorking; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 56f50c9a3e6a..0268f1025d9d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; -- cgit v1.2.3 From 4170ba6b589ced82da56c7e4f71cc84b2be036d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:30 -0700 Subject: tcp: Namespace-ify sysctl_tcp_invalid_ratelimit Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 537830882149..e52c2124b32e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -151,6 +151,7 @@ struct netns_ipv4 { int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; int sysctl_tcp_autocorking; + int sysctl_tcp_invalid_ratelimit; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 0268f1025d9d..5869a822ecb1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_invalid_ratelimit; extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; -- cgit v1.2.3 From 23a7102a2d1068508fa2a0ce593a0df7f8fdc0ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:31 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ss_ratio Also remove an obsolete comment about TCP pacing. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e52c2124b32e..eb2dcf1cbe61 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,6 +152,7 @@ struct netns_ipv4 { int sysctl_tcp_min_rtt_wlen; int sysctl_tcp_autocorking; int sysctl_tcp_invalid_ratelimit; + int sysctl_tcp_pacing_ss_ratio; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5869a822ecb1..2a5f8261ca03 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,7 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_pacing_ss_ratio; extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; -- cgit v1.2.3 From c26e91f8b9b8e1fd252e07c1f60e50220cd7ebab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 27 Oct 2017 07:47:32 -0700 Subject: tcp: Namespace-ify sysctl_tcp_pacing_ca_ratio Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb2dcf1cbe61..141ba82b5efb 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -153,6 +153,7 @@ struct netns_ipv4 { int sysctl_tcp_autocorking; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; + int sysctl_tcp_pacing_ca_ratio; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index 2a5f8261ca03..092d606fcc16 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -250,8 +250,6 @@ extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ -extern int sysctl_tcp_pacing_ca_ratio; - extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; -- cgit v1.2.3 From 5b52a4c3acf5f4b4854d1c3ddc8be8770330a79c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 27 Oct 2017 10:01:39 -0700 Subject: tcp: remove unnecessary include two extra #include are not necessary in tcp.h Remove them. Fixes: 40304b2a1567 ("bpf: BPF support for sock_ops") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tcp.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 092d606fcc16..aa1cc90fdc02 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -45,9 +45,6 @@ #include #include - -#include -#include #include extern struct inet_hashinfo tcp_hashinfo; -- cgit v1.2.3 From 1f01d8be0e6a04bd682a55f6d50c14c1679e7571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20Zapa=C5=82owicz?= Date: Tue, 17 Oct 2017 15:53:49 +0200 Subject: Bluetooth: increase timeout for le auto connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch increases the connection timeout for LE connections that are triggered by the advertising report to 4 seconds. It has been observed that devices equipped with wifi+bt combo SoC fail to create a connection with BLE devices due to their coexistence issues. Increasing this timeout gives them enough time to complete the connection with success. Signed-off-by: Konrad Zapałowicz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fe98f0a5bef0..1668211297a9 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -273,7 +273,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_POWER_OFF_TIMEOUT msecs_to_jiffies(5000) /* 5 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ -#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ +#define HCI_LE_AUTOCONN_TIMEOUT msecs_to_jiffies(4000) /* 4 seconds */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 -- cgit v1.2.3 From 2064ee332e4c1b7495cf68b84355c213d8fe71fd Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Mon, 30 Oct 2017 10:42:59 +0100 Subject: Bluetooth: Use bt_dev_err and bt_dev_info when possible In case of using BT_ERR and BT_INFO, convert to bt_dev_err and bt_dev_info when possible. This allows for controller specific reporting. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/bluetooth.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 020142bb9735..e89cff0c4c23 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -147,6 +147,9 @@ void bt_err_ratelimited(const char *fmt, ...); #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) +#define bt_dev_err_ratelimited(hdev, fmt, ...) \ + BT_ERR_RATELIMITED("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + /* Connection and socket states */ enum { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ -- cgit v1.2.3 From e4dca7b7aa08b22893c45485d222b5807c1375ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 17 Oct 2017 19:04:42 -0700 Subject: treewide: Fix function prototypes for module_param_call() Several function prototypes for the set/get functions defined by module_param_call() have a slightly wrong argument types. This fixes those in an effort to clean up the calls when running under type-enforced compiler instrumentation for CFI. This is the result of running the following semantic patch: @match_module_param_call_function@ declarer name module_param_call; identifier _name, _set_func, _get_func; expression _arg, _mode; @@ module_param_call(_name, _set_func, _get_func, _arg, _mode); @fix_set_prototype depends on match_module_param_call_function@ identifier match_module_param_call_function._set_func; identifier _val, _param; type _val_type, _param_type; @@ int _set_func( -_val_type _val +const char * _val , -_param_type _param +const struct kernel_param * _param ) { ... } @fix_get_prototype depends on match_module_param_call_function@ identifier match_module_param_call_function._get_func; identifier _val, _param; type _val_type, _param_type; @@ int _get_func( -_val_type _val +char * _val , -_param_type _param +const struct kernel_param * _param ) { ... } Two additional by-hand changes are included for places where the above Coccinelle script didn't notice them: drivers/platform/x86/thinkpad_acpi.c fs/lockd/svc.c Signed-off-by: Kees Cook Signed-off-by: Jessica Yu --- include/net/netfilter/nf_conntrack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 8f3bd30511de..fd5241ce1fc9 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -284,7 +284,7 @@ static inline bool nf_ct_should_gc(const struct nf_conn *ct) struct kernel_param; -int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp); +int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; -- cgit v1.2.3 From 384c181e3780ddc45e70483e29d84495b484730d Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 27 Oct 2017 02:35:34 -0700 Subject: net: sched: Identify hardware traffic classes using classid This patch offloads the classid to hardware and uses the classid reserved in the range :ffe0 - :ffef to identify hardware traffic classes reported via dev->num_tc. tcf_result structure contains the class ID of the class to which the packet belongs and is offloaded to hardware via flower filter. A new helper function is introduced to represent HW traffic classes 0 through 15 using the reserved classid values :ffe0 - :ffef. Signed-off-by: Amritha Nambiar Acked-by: Shannon Nelson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/net/pkt_cls.h | 1 + include/net/sch_generic.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index bf73e1675519..37c5ef766655 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -666,6 +666,7 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; + u32 classid; }; enum tc_matchall_command { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 07c179dab478..c23e938f5b19 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -411,6 +411,13 @@ qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id) return NULL; } +static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid) +{ + u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY; + + return (hwtc < netdev_get_num_tc(dev)) ? hwtc : -EINVAL; +} + int qdisc_class_hash_init(struct Qdisc_class_hash *); void qdisc_class_hash_insert(struct Qdisc_class_hash *, struct Qdisc_class_common *); -- cgit v1.2.3 From 6c31e5a91fde2e718e59c8a627c56451f88be54c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 27 Oct 2017 17:37:13 -0700 Subject: net: Add extack to fib_notifier_info Add extack to fib_notifier_info and plumb through stack to call_fib_rule_notifiers, call_fib_entry_notifiers and call_fib6_entry_notifiers. This allows notifer handlers to return messages to user. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index 54cd6b839d2f..c91ec732afd6 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -9,6 +9,7 @@ struct fib_notifier_info { struct net *net; int family; + struct netlink_ext_ack *extack; }; enum fib_event_type { -- cgit v1.2.3 From da13c59b9936dfedcf9f2203bd29fbf83ad672bf Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Mon, 30 Oct 2017 19:38:52 -0400 Subject: net: display hw address of source machine during ipv6 DAD failure This patch updates the error messages displayed in kernel log to include hwaddress of the source machine that caused ipv6 duplicate address detection failures. Examples: a) When we receive a NA packet from another machine advertising our address: ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2001:db8:: on eth0! b) When we detect DAD failure during address assignment to an interface: IPv6: eth0: IPv6 duplicate address 2001:db8:: used by 34:ab:cd:56:11:e8 detected! v2: Changed %pI6 to %pI6c in ndisc_recv_na() Chaged the v6 address in the commit message to 2001:db8:: Suggested-by: Igor Lubashev Signed-off-by: Vishwanath Pai Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 15b5ffd7253d..2a616ea53956 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -208,7 +208,7 @@ void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); -void addrconf_dad_failure(struct inet6_ifaddr *ifp); +void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); -- cgit v1.2.3 From 0b5a89caee5c9958c18cd933c7f8891e35b21781 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:38 +0100 Subject: net: sched: remove unused tc_should_offload helper tc_should_offload is no longer used, remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 37c5ef766655..108dcdd96421 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -623,13 +623,6 @@ static inline bool tc_skip_hw(u32 flags) return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; } -static inline bool tc_should_offload(const struct net_device *dev, u32 flags) -{ - if (tc_skip_hw(flags)) - return false; - return tc_can_offload(dev); -} - static inline bool tc_skip_sw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_SW) ? true : false; -- cgit v1.2.3 From 70b5aee46782208c14d93b715e9f62f7fec844f1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Nov 2017 11:47:41 +0100 Subject: net: sched: remove ndo_setup_tc check from tc_can_offload Since tc_can_offload is always called from block callback or egdev callback, no need to check if ndo_setup_tc exists. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 108dcdd96421..d15c40c7bde7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -611,11 +611,7 @@ struct tc_cls_u32_offload { static inline bool tc_can_offload(const struct net_device *dev) { - if (!(dev->features & NETIF_F_HW_TC)) - return false; - if (!dev->netdev_ops->ndo_setup_tc) - return false; - return true; + return dev->features & NETIF_F_HW_TC; } static inline bool tc_skip_hw(u32 flags) -- cgit v1.2.3 From 47d3d7ac656a1ffb9d0f0d3c845663ed6fd7e78d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 30 Oct 2017 14:16:00 -0700 Subject: ipv6: Implement limits on Hop-by-Hop and Destination options RFC 8200 (IPv6) defines Hop-by-Hop options and Destination options extension headers. Both of these carry a list of TLVs which is only limited by the maximum length of the extension header (2048 bytes). By the spec a host must process all the TLVs in these options, however these could be used as a fairly obvious denial of service attack. I think this could in fact be a significant DOS vector on the Internet, one mitigating factor might be that many FWs drop all packets with EH (and obviously this is only IPv6) so an Internet wide attack might not be so effective (yet!). By my calculation, the worse case packet with TLVs in a standard 1500 byte MTU packet that would be processed by the stack contains 1282 invidual TLVs (including pad TLVS) or 724 two byte TLVs. I wrote a quick test program that floods a whole bunch of these packets to a host and sure enough there is substantial time spent in ip6_parse_tlv. These packets contain nothing but unknown TLVS (that are ignored), TLV padding, and bogus UDP header with zero payload length. 25.38% [kernel] [k] __fib6_clean_all 21.63% [kernel] [k] ip6_parse_tlv 4.21% [kernel] [k] __local_bh_enable_ip 2.18% [kernel] [k] ip6_pol_route.isra.39 1.98% [kernel] [k] fib6_walk_continue 1.88% [kernel] [k] _raw_write_lock_bh 1.65% [kernel] [k] dst_release This patch adds configurable limits to Destination and Hop-by-Hop options. There are three limits that may be set: - Limit the number of options in a Hop-by-Hop or Destination options extension header. - Limit the byte length of a Hop-by-Hop or Destination options extension header. - Disallow unrecognized options in a Hop-by-Hop or Destination options extension header. The limits are set in corresponding sysctls: ipv6.sysctl.max_dst_opts_cnt ipv6.sysctl.max_hbh_opts_cnt ipv6.sysctl.max_dst_opts_len ipv6.sysctl.max_hbh_opts_len If a max_*_opts_cnt is less than zero then unknown TLVs are disallowed. The number of known TLVs that are allowed is the absolute value of this number. If a limit is exceeded when processing an extension header the packet is dropped. Default values are set to 8 for options counts, and set to INT_MAX for maximum length. Note the choice to limit options to 8 is an arbitrary guess (roughly based on the fact that the stack supports three HBH options and just one destination option). These limits have being proposed in draft-ietf-6man-rfc6434-bis. Tested (by Martin Lau) I tested out 1 thread (i.e. one raw_udp process). I changed the net.ipv6.max_dst_(opts|hbh)_number between 8 to 2048. With sysctls setting to 2048, the softirq% is packed to 100%. With 8, the softirq% is almost unnoticable from mpstat. v2; - Code and documention cleanup. - Change references of RFC2460 to be RFC8200. - Add reference to RFC6434-bis where the limits will be in standard. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ipv6.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/net/netns/ipv6.h | 4 ++++ 2 files changed, 44 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3cda3b521c36..fb6d67012de6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -51,6 +51,46 @@ #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 +/* Limits on Hop-by-Hop and Destination options. + * + * Per RFC8200 there is no limit on the maximum number or lengths of options in + * Hop-by-Hop or Destination options other then the packet must fit in an MTU. + * We allow configurable limits in order to mitigate potential denial of + * service attacks. + * + * There are three limits that may be set: + * - Limit the number of options in a Hop-by-Hop or Destination options + * extension header + * - Limit the byte length of a Hop-by-Hop or Destination options extension + * header + * - Disallow unknown options + * + * The limits are expressed in corresponding sysctls: + * + * ipv6.sysctl.max_dst_opts_cnt + * ipv6.sysctl.max_hbh_opts_cnt + * ipv6.sysctl.max_dst_opts_len + * ipv6.sysctl.max_hbh_opts_len + * + * max_*_opts_cnt is the number of TLVs that are allowed for Destination + * options or Hop-by-Hop options. If the number is less than zero then unknown + * TLVs are disallowed and the number of known options that are allowed is the + * absolute value. Setting the value to INT_MAX indicates no limit. + * + * max_*_opts_len is the length limit in bytes of a Destination or + * Hop-by-Hop options extension header. Setting the value to INT_MAX + * indicates no length limit. + * + * If a limit is exceeded when processing an extension header the packet is + * silently discarded. + */ + +/* Default limits for Hop-by-Hop and Destination options */ +#define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 +#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ +#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ + /* * Addr type * diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2ea1ed341ef8..600ba1c1befc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -37,6 +37,10 @@ struct netns_sysctl_ipv6 { int idgen_delay; int flowlabel_state_ranges; int flowlabel_reflect; + int max_dst_opts_cnt; + int max_hbh_opts_cnt; + int max_dst_opts_len; + int max_hbh_opts_len; }; struct netns_ipv6 { -- cgit v1.2.3 From 3ae6ec08292f01c6782d1a80be0b2cc675e0ecfc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 2 Nov 2017 17:14:05 +0100 Subject: ipv4: Send a netevent whenever multipath hash policy is changed Devices performing IPv4 forwarding need to update their multipath hash policy whenever it is changed. Inform these devices by generating a netevent. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/netevent.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/netevent.h b/include/net/netevent.h index f440df172b56..e3f0e8f2f6e8 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -25,6 +25,7 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ + NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); -- cgit v1.2.3 From c7eb7d7230509ec862d4144f7a831f995bc5d028 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:24 +0100 Subject: net: sched: introduce chain_head_change callback Add a callback that is to be called whenever head of the chain changes. Also provide a callback for the default case when the caller gets a block using non-extended getter. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 14 ++++++-------- include/net/sch_generic.h | 5 ++++- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d15c40c7bde7..505d4b71975f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -26,6 +26,8 @@ enum tcf_block_binder_type { struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; }; struct tcf_block_cb; @@ -37,12 +39,10 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei); void tcf_block_put(struct tcf_block *block); -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); static inline struct Qdisc *tcf_block_q(struct tcf_block *block) @@ -82,8 +82,7 @@ int tcf_block_get(struct tcf_block **p_block, } static inline -int tcf_block_get_ext(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, struct tcf_block_ext_info *ei) { return 0; @@ -94,8 +93,7 @@ static inline void tcf_block_put(struct tcf_block *block) } static inline -void tcf_block_put_ext(struct tcf_block *block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, +void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei) { } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c23e938f5b19..f230269e0bfb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -260,9 +260,12 @@ struct qdisc_skb_cb { unsigned char data[QDISC_CB_PRIV_LEN]; }; +typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); + struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct tcf_proto __rcu **p_filter_chain; + tcf_chain_head_change_t *chain_head_change; + void *chain_head_change_priv; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ -- cgit v1.2.3 From 46209401f8f6116bd0b2c2d14a63958e83ffca0b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Nov 2017 11:46:25 +0100 Subject: net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath In sch_handle_egress and sch_handle_ingress tp->q is used only in order to update stats. So stats and filter list are the only things that are needed in clsact qdisc fastpath processing. Introduce new mini_Qdisc struct to hold those items. Also, introduce a helper to swap the mini_Qdisc structures in case filter list head changes. This removes need for tp->q usage without added overhead. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f230269e0bfb..c64e62c9450a 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -904,4 +904,36 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +/* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. + * The fast path only needs to access filter list and to update stats + */ +struct mini_Qdisc { + struct tcf_proto *filter_list; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct rcu_head rcu; +}; + +static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, + const struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); +} + +static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) +{ + this_cpu_inc(miniq->cpu_qstats->drops); +} + +struct mini_Qdisc_pair { + struct mini_Qdisc miniq1; + struct mini_Qdisc miniq2; + struct mini_Qdisc __rcu **p_miniq; +}; + +void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, + struct tcf_proto *tp_head); +void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, + struct mini_Qdisc __rcu **p_miniq); + #endif -- cgit v1.2.3 From 27c565ae9d554fa1c00c799754cff43476c8d3b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Nov 2017 08:53:27 -0700 Subject: ipv6: remove IN6_ADDR_HSIZE from addrconf.h IN6_ADDR_HSIZE is private to addrconf.c, move it here to avoid confusion. Signed-off-by: Eric Dumazet Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 3357332ea375..b623b65a79d1 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -59,9 +59,6 @@ struct in6_validator_info { struct netlink_ext_ack *extack; }; -#define IN6_ADDR_HSIZE_SHIFT 8 -#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) - int addrconf_init(void); void addrconf_cleanup(void); -- cgit v1.2.3 From 99feaafcdb566e8f032e7acc2a303713ad6bf196 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:20 -0400 Subject: net: dsa: make switch index unsigned Define the DSA switch index as an unsigned int, because it will never be less than 0. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 50e276dc4c01..fa1c21ab8092 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -209,7 +209,7 @@ struct dsa_switch { * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; - int index; + unsigned int index; /* Listener for switch fabric events */ struct notifier_block nb; -- cgit v1.2.3 From 49463b7f2da1a115404b02c5533bc2c2125833a3 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Fri, 3 Nov 2017 19:05:21 -0400 Subject: net: dsa: make tree index unsigned Similarly to a DSA switch and port, rename the tree index from "tree" to "index" and make it an unsigned int because it isn't supposed to be less than 0. u32 is an OF specific data used to retrieve the value and has no need to be propagated up to the tree index. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index fa1c21ab8092..e54332968417 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,7 +116,7 @@ struct dsa_switch_tree { struct raw_notifier_head nh; /* Tree identifier */ - u32 tree; + unsigned int index; /* Number of switches attached to this tree */ struct kref refcount; -- cgit v1.2.3 From 1f2556916d974cfb62b6af51660186b5f58bd869 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Fri, 3 Nov 2017 16:38:48 -0700 Subject: tcp: higher throughput under reordering with adaptive RACK reordering wnd Currently TCP RACK loss detection does not work well if packets are being reordered beyond its static reordering window (min_rtt/4).Under such reordering it may falsely trigger loss recoveries and reduce TCP throughput significantly. This patch improves that by increasing and reducing the reordering window based on DSACK, which is now supported in major TCP implementations. It makes RACK's reo_wnd adaptive based on DSACK and no. of recoveries. - If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded by srtt), since there is possibility that spurious retransmission was due to reordering delay longer than reo_wnd. - Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) no. of successful recoveries (accounts for full DSACK-based loss recovery undo). After that, reset it to default (min_rtt/4). - At max, reo_wnd is incremented only once per rtt. So that the new DSACK on which we are reacting, is due to the spurious retx (approx) after the reo_wnd has been updated last time. - reo_wnd is tracked in terms of steps (of min_rtt/4), rather than absolute value to account for change in rtt. In our internal testing, we observed significant increase in throughput, in scenarios where reordering exceeds min_rtt/4 (previous static value). Signed-off-by: Priyaranjan Jha Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index c2bf2a822b10..babfd4da1515 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -246,6 +246,7 @@ extern int sysctl_tcp_wmem[3]; extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ +#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1901,6 +1902,7 @@ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) -- cgit v1.2.3 From 5caaed151a68ae36aca2981cc245f5960a0a7603 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Nov 2017 19:41:09 +0100 Subject: netfilter: conntrack: don't cache nlattr_tuple_size result in nla_size We currently call ->nlattr_tuple_size() once at register time and cache result in l4proto->nla_size. nla_size is the only member that is written to, avoiding this would allow to make l4proto trackers const. We can use ->nlattr_tuple_size() at run time, and cache result in the individual trackers instead. This is an intermediate step, next patch removes nlattr_size() callback and computes size at compile time, then removes nla_size. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index e06518874144..46e786ffcf2f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -74,7 +74,7 @@ struct nf_conntrack_l4proto { int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ - int (*nlattr_tuple_size)(void); + unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; @@ -144,7 +144,7 @@ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t); -int nf_ct_port_nlattr_tuple_size(void); +unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL -- cgit v1.2.3 From ba0e4d9917b43dfa746cbbcb4477da59aae73bd6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 9 Oct 2017 19:52:28 +0200 Subject: netfilter: nf_tables: get set elements via netlink This patch adds a new get operation to look up for specific elements in a set via netlink interface. You can also use it to check if an interval already exists. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0f5b12a4ad09..d011e56cc7a9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -311,6 +311,7 @@ struct nft_expr; * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elemeennts + * @get: get set elements * @privsize: function to return size of set private data * @init: initialize private data of new set instance * @destroy: destroy private data of set instance @@ -350,6 +351,10 @@ struct nft_set_ops { void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); + void * (*get)(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem, + unsigned int flags); unsigned int (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); -- cgit v1.2.3 From 602f3baf22188aad24b9a58be3209ab774b97d74 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 6 Nov 2017 07:23:41 +0100 Subject: net_sch: red: Add offload ability to RED qdisc Add the ability to offload RED qdisc by using ndo_setup_tc. There are four commands for RED offloading: * TC_RED_SET: handles set and change. * TC_RED_DESTROY: handle qdisc destroy. * TC_RED_STATS: update the qdiscs counters (given as reference) * TC_RED_XSTAT: returns red xstats. Whether RED is being offloaded is being determined every time dump action is being called because parent change of this qdisc could change its offload state but doesn't require any RED function to be called. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 98fef3221227..03c208d3c922 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -703,4 +703,34 @@ struct tc_cookie { u8 *data; u32 len; }; + +enum tc_red_command { + TC_RED_REPLACE, + TC_RED_DESTROY, + TC_RED_STATS, + TC_RED_XSTATS, +}; + +struct tc_red_qopt_offload_params { + u32 min; + u32 max; + u32 probability; + bool is_ecn; +}; +struct tc_red_qopt_offload_stats { + struct gnet_stats_basic_packed *bstats; + struct gnet_stats_queue *qstats; +}; + +struct tc_red_qopt_offload { + enum tc_red_command command; + u32 handle; + u32 parent; + union { + struct tc_red_qopt_offload_params set; + struct tc_red_qopt_offload_stats stats; + struct red_stats *xstats; + }; +}; + #endif -- cgit v1.2.3 From b2d0f5d5dc53532e6f07bc546a476a55ebdfe0f3 Mon Sep 17 00:00:00 2001 From: Yi Yang Date: Tue, 7 Nov 2017 21:07:02 +0800 Subject: openvswitch: enable NSH support v16->17 - Fixed disputed check code: keep them in nsh_push and nsh_pop but also add them in __ovs_nla_copy_actions v15->v16 - Add csum recalculation for nsh_push, nsh_pop and set_nsh pointed out by Pravin - Move nsh key into the union with ipv4 and ipv6 and add check for nsh key in match_validate pointed out by Pravin - Add nsh check in validate_set and __ovs_nla_copy_actions v14->v15 - Check size in nsh_hdr_from_nlattr - Fixed four small issues pointed out By Jiri and Eric v13->v14 - Rename skb_push_nsh to nsh_push per Dave's comment - Rename skb_pop_nsh to nsh_pop per Dave's comment v12->v13 - Fix NSH header length check in set_nsh v11->v12 - Fix missing changes old comments pointed out - Fix new comments for v11 v10->v11 - Fix the left three disputable comments for v9 but not fixed in v10. v9->v10 - Change struct ovs_key_nsh to struct ovs_nsh_key_base base; __be32 context[NSH_MD1_CONTEXT_SIZE]; - Fix new comments for v9 v8->v9 - Fix build error reported by daily intel build because nsh module isn't selected by openvswitch v7->v8 - Rework nested value and mask for OVS_KEY_ATTR_NSH - Change pop_nsh to adapt to nsh kernel module - Fix many issues per comments from Jiri Benc v6->v7 - Remove NSH GSO patches in v6 because Jiri Benc reworked it as another patch series and they have been merged. - Change it to adapt to nsh kernel module added by NSH GSO patch series v5->v6 - Fix the rest comments for v4. - Add NSH GSO support for VxLAN-gpe + NSH and Eth + NSH. v4->v5 - Fix many comments by Jiri Benc and Eric Garver for v4. v3->v4 - Add new NSH match field ttl - Update NSH header to the latest format which will be final format and won't change per its author's confirmation. - Fix comments for v3. v2->v3 - Change OVS_KEY_ATTR_NSH to nested key to handle length-fixed attributes and length-variable attriubte more flexibly. - Remove struct ovs_action_push_nsh completely - Add code to handle nested attribute for SET_MASKED - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH to transfer NSH header data. - Fix comments and coding style issues by Jiri and Eric v1->v2 - Change encap_nsh and decap_nsh to push_nsh and pop_nsh - Dynamically allocate struct ovs_action_push_nsh for length-variable metadata. OVS master and 2.8 branch has merged NSH userspace patch series, this patch is to enable NSH support in kernel data path in order that OVS can support NSH in compat mode by porting this. Signed-off-by: Yi Yang Acked-by: Jiri Benc Acked-by: Eric Garver Acked-by: Pravin Shelar Signed-off-by: David S. Miller --- include/net/nsh.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/nsh.h b/include/net/nsh.h index a1eaea20be96..350b1ad11c7f 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -304,4 +304,7 @@ static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags, NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK); } +int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh); +int nsh_pop(struct sk_buff *skb); + #endif /* __NET_NSH_H */ -- cgit v1.2.3 From 24a9332a58b7f41a0d36c35a2c6897242bffdbc0 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:43 -0500 Subject: net: dsa: constify cpu_dp member of dsa_port A DSA port has a dedicated CPU port assigned to it, stored in the cpu_dp member. It is not meant to be modified by a port, thus make it const. Signed-off-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index e54332968417..2a8613b5a23d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -190,7 +190,7 @@ struct dsa_port { struct dsa_switch *ds; unsigned int index; const char *name; - struct dsa_port *cpu_dp; + const struct dsa_port *cpu_dp; struct device_node *dn; unsigned int ageing_time; u8 stp_state; -- cgit v1.2.3 From ec15dd4269d0cbf947c9a2dfdcf08a917098fab1 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 6 Nov 2017 16:11:46 -0500 Subject: net: dsa: setup and teardown tree This commit provides better scope for the DSA tree setup and teardown functions. It renames the "applied" bool to "setup" and print a message when the tree is setup, as it is done during teardown. At the same time, check dst->setup in dsa_tree_setup, where it is set to true. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2a8613b5a23d..6c239257309b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -122,7 +122,7 @@ struct dsa_switch_tree { struct kref refcount; /* Has this tree been applied to the hardware? */ - bool applied; + bool setup; /* * Configuration data for the platform device that owns -- cgit v1.2.3 From c7e460ce55724d4e4e22d3126e5c47273819c53a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 6 Nov 2017 13:47:18 -0800 Subject: Revert "net_sched: hold netns refcnt for each action" This reverts commit ceffcc5e254b450e6159f173e4538215cebf1b59. If we hold that refcnt, the netns can never be destroyed until all actions are destroyed by user, this breaks our netns design which we expect all actions are destroyed when we destroy the whole netns. Cc: Lucas Bates Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 1e6df0eb058f..a10a3b1813f3 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -14,7 +14,6 @@ struct tcf_idrinfo { spinlock_t lock; struct idr action_idr; - struct net *net; }; struct tc_action_ops; @@ -106,7 +105,7 @@ struct tc_action_net { static inline int tc_action_net_init(struct tc_action_net *tn, - const struct tc_action_ops *ops, struct net *net) + const struct tc_action_ops *ops) { int err = 0; @@ -114,7 +113,6 @@ int tc_action_net_init(struct tc_action_net *tn, if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; - tn->idrinfo->net = net; spin_lock_init(&tn->idrinfo->lock); idr_init(&tn->idrinfo->action_idr); return err; -- cgit v1.2.3 From e4b95c41df36befcfd117210900cd790bc2cd048 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 6 Nov 2017 13:47:19 -0800 Subject: net_sched: introduce tcf_exts_get_net() and tcf_exts_put_net() Instead of holding netns refcnt in tc actions, we can minimize the holding time by saving it in struct tcf_exts instead. This means we can just hold netns refcnt right before call_rcu() and release it after tcf_exts_destroy() is done. However, because on netns cleanup path we call tcf_proto_destroy() too, obviously we can not hold netns for a zero refcnt, in this case we have to do cleanup synchronously. It is fine for RCU too, the caller cleanup_net() already waits for a grace period. For other cases, refcnt is non-zero and we can safely grab it as normal and release it after we are done. This patch provides two new API for each filter to use: tcf_exts_get_net() and tcf_exts_put_net(). And all filters now can use the following pattern: void __destroy_filter() { tcf_exts_destroy(); tcf_exts_put_net(); // <== release netns refcnt kfree(); } void some_work() { rtnl_lock(); __destroy_filter(); rtnl_unlock(); } void some_rcu_callback() { tcf_queue_work(some_work); } if (tcf_exts_get_net()) // <== hold netns refcnt call_rcu(some_rcu_callback); else __destroy_filter(); Cc: Lucas Bates Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 70ca2437740e..8826747ef83e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -94,6 +94,7 @@ struct tcf_exts { __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ int nr_actions; struct tc_action **actions; + struct net *net; #endif /* Map to export classifier specific extension TLV types to the * generic extensions API. Unsupported extensions must be set to 0. @@ -107,6 +108,7 @@ static inline int tcf_exts_init(struct tcf_exts *exts, int action, int police) #ifdef CONFIG_NET_CLS_ACT exts->type = 0; exts->nr_actions = 0; + exts->net = NULL; exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), GFP_KERNEL); if (!exts->actions) @@ -117,6 +119,28 @@ static inline int tcf_exts_init(struct tcf_exts *exts, int action, int police) return 0; } +/* Return false if the netns is being destroyed in cleanup_net(). Callers + * need to do cleanup synchronously in this case, otherwise may race with + * tc_action_net_exit(). Return true for other cases. + */ +static inline bool tcf_exts_get_net(struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + exts->net = maybe_get_net(exts->net); + return exts->net != NULL; +#else + return true; +#endif +} + +static inline void tcf_exts_put_net(struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->net) + put_net(exts->net); +#endif +} + static inline void tcf_exts_to_list(const struct tcf_exts *exts, struct list_head *actions) { -- cgit v1.2.3 From 47d5b6db2afa766d7af85db684d0b5f092e4fc46 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 9 Nov 2017 23:10:59 +0100 Subject: net: bridge: Add/del switchdev object on host join/leave When the host joins or leaves a multicast group, use switchdev to add an object to the hardware to forward traffic for the group to the host. Signed-off-by: Andrew Lunn Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d756fbe46625..39bc855d7fee 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -76,6 +76,7 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_UNDEFINED, SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, + SWITCHDEV_OBJ_ID_HOST_MDB, }; struct switchdev_obj { -- cgit v1.2.3 From a3dcaf17ee54f1d01d22cc2b22cab0b4f60d78cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:27 -0800 Subject: net: allow per netns sysctl_rmem and sysctl_wmem for protos As we want to gradually implement per netns sysctl_rmem and sysctl_wmem on per protocol basis, add two new fields in struct proto, and two new helpers : sk_get_wmem0() and sk_get_rmem0() First user will be TCP. Then UDP and SCTP can be easily converted, while DECNET probably wont get this support. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 6f1be9726e02..688a823dccc3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1101,8 +1101,12 @@ struct proto { */ unsigned long *memory_pressure; long *sysctl_mem; + int *sysctl_wmem; int *sysctl_rmem; + u32 sysctl_wmem_offset; + u32 sysctl_rmem_offset; + int max_header; bool no_autobind; @@ -2390,4 +2394,22 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) +{ + /* Does this proto have per netns sysctl_wmem ? */ + if (proto->sysctl_wmem_offset) + return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + + return *proto->sysctl_wmem; +} + +static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) +{ + /* Does this proto have per netns sysctl_rmem ? */ + if (proto->sysctl_rmem_offset) + return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + + return *proto->sysctl_rmem; +} + #endif /* _SOCK_H */ -- cgit v1.2.3 From 356d1833b638bd465672aefeb71def3ab93fc17d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 00:29:28 -0800 Subject: tcp: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem Note that when a new netns is created, it inherits its sysctl_tcp_rmem and sysctl_tcp_wmem from initial netns. This change is needed so that we can refine TCP rcvbuf autotuning, to take RTT into consideration. Signed-off-by: Eric Dumazet Cc: Wei Wang Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 ++ include/net/tcp.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 379550f8124a..5e12975fc658 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -155,6 +155,8 @@ struct netns_ipv4 { int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; + int sysctl_tcp_wmem[3]; + int sysctl_tcp_rmem[3]; struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; diff --git a/include/net/tcp.h b/include/net/tcp.h index babfd4da1515..2f2c69ad31b2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; -extern int sysctl_tcp_wmem[3]; -extern int sysctl_tcp_rmem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ -- cgit v1.2.3 From 4c5b9d9642c859f7369338fc42c0f62f4151bef3 Mon Sep 17 00:00:00 2001 From: Manish Kurup Date: Tue, 7 Nov 2017 15:49:05 -0500 Subject: act_vlan: VLAN action rewrite to use RCU lock/unlock and update Using a spinlock in the VLAN action causes performance issues when the VLAN action is used on multiple cores. Rewrote the VLAN action to use RCU read locking for reads and updates instead. All functions now use an RCU dereferenced pointer to access the VLAN action context. Modified helper functions used by other modules, to use the RCU as opposed to directly accessing the structure. Acked-by: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: Manish Kurup Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 46 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index c2090df944ff..22ae260d6869 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -13,12 +13,17 @@ #include #include +struct tcf_vlan_params { + int tcfv_action; + u16 tcfv_push_vid; + __be16 tcfv_push_proto; + u8 tcfv_push_prio; + struct rcu_head rcu; +}; + struct tcf_vlan { struct tc_action common; - int tcfv_action; - u16 tcfv_push_vid; - __be16 tcfv_push_proto; - u8 tcfv_push_prio; + struct tcf_vlan_params __rcu *vlan_p; }; #define to_vlan(a) ((struct tcf_vlan *)a) @@ -33,22 +38,45 @@ static inline bool is_tcf_vlan(const struct tc_action *a) static inline u32 tcf_vlan_action(const struct tc_action *a) { - return to_vlan(a)->tcfv_action; + u32 tcfv_action; + + rcu_read_lock(); + tcfv_action = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_action; + rcu_read_unlock(); + + return tcfv_action; } static inline u16 tcf_vlan_push_vid(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_vid; + u16 tcfv_push_vid; + + rcu_read_lock(); + tcfv_push_vid = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_vid; + rcu_read_unlock(); + + return tcfv_push_vid; } static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_proto; + __be16 tcfv_push_proto; + + rcu_read_lock(); + tcfv_push_proto = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_proto; + rcu_read_unlock(); + + return tcfv_push_proto; } static inline u8 tcf_vlan_push_prio(const struct tc_action *a) { - return to_vlan(a)->tcfv_push_prio; -} + u8 tcfv_push_prio; + rcu_read_lock(); + tcfv_push_prio = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_prio; + rcu_read_unlock(); + + return tcfv_push_prio; +} #endif /* __NET_TC_VLAN_H */ -- cgit v1.2.3 From 8d6e79d3ce13e34957de87f7584cbf1bcde74c57 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 8 Nov 2017 09:59:26 +0100 Subject: tipc: improve link resiliency when rps is activated Currently, the TIPC RPS dissector is based only on the incoming packets' source node address, hence steering all traffic from a node to the same core. We have seen that this makes the links vulnerable to starvation and unnecessary resets when we turn down the link tolerance to very low values. To reduce the risk of this happening, we exempt probe and probe replies packets from the convergence to one core per source node. Instead, we do the opposite, - we try to diverge those packets across as many cores as possible, by randomizing the flow selector key. To make such packets identifiable to the dissector, we add a new 'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is set both for PROBE and PROBE_REPLY messages, and only for those. It should be noted that these packets are not part of any flow anyway, and only constitute a minuscule fraction of all packets sent across a link. Hence, there is no risk that this will affect overall performance. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 12 ++++----- include/net/tipc.h | 62 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) create mode 100644 include/net/tipc.h (limited to 'include/net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 22aba321282d..9a074776f70b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -84,11 +84,11 @@ struct flow_dissector_key_ipv6_addrs { }; /** - * struct flow_dissector_key_tipc_addrs: - * @srcnode: source node address + * struct flow_dissector_key_tipc: + * @key: source node address combined with selector */ -struct flow_dissector_key_tipc_addrs { - __be32 srcnode; +struct flow_dissector_key_tipc { + __be32 key; }; /** @@ -100,7 +100,7 @@ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; - struct flow_dissector_key_tipc_addrs tipcaddrs; + struct flow_dissector_key_tipc tipckey; }; }; @@ -192,7 +192,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ - FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ + FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ diff --git a/include/net/tipc.h b/include/net/tipc.h new file mode 100644 index 000000000000..07670ec022a7 --- /dev/null +++ b/include/net/tipc.h @@ -0,0 +1,62 @@ +/* + * include/net/tipc.h: Include file for TIPC message header routines + * + * Copyright (c) 2017 Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_HDR_H +#define _TIPC_HDR_H + +#include + +#define KEEPALIVE_MSG_MASK 0x0e080000 /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */ + +struct tipc_basic_hdr { + __be32 w[4]; +}; + +static inline u32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr) +{ + u32 w0 = ntohl(hdr->w[0]); + bool keepalive_msg = (w0 & KEEPALIVE_MSG_MASK) == KEEPALIVE_MSG_MASK; + int key; + + /* Return source node identity as key */ + if (likely(!keepalive_msg)) + return hdr->w[3]; + + /* Spread PROBE/PROBE_REPLY messages across the cores */ + get_random_bytes(&key, sizeof(key)); + return key; +} + +#endif -- cgit v1.2.3 From 713bafea92920103cd3d361657406cf04d0e22dd Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 8 Nov 2017 13:01:26 -0800 Subject: tcp: retire FACK loss detection FACK loss detection has been disabled by default and the successor RACK subsumed FACK and can handle reordering better. This patch removes FACK to simplify TCP loss recovery. Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Reviewed-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2f2c69ad31b2..ed71511e67a6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -384,7 +384,6 @@ void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); -void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); @@ -776,7 +775,7 @@ struct tcp_skb_cb { }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ - __u8 sacked; /* State flags for SACK/FACK. */ + __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ @@ -1066,7 +1065,6 @@ void tcp_rate_check_app_limited(struct sock *sk); * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK - * tcp_is_fack - FACK enabled, implies SACK enabled */ static inline int tcp_is_sack(const struct tcp_sock *tp) { @@ -1078,16 +1076,6 @@ static inline bool tcp_is_reno(const struct tcp_sock *tp) return !tcp_is_sack(tp); } -static inline bool tcp_is_fack(const struct tcp_sock *tp) -{ - return tp->rx_opt.sack_ok & TCP_FACK_ENABLED; -} - -static inline void tcp_enable_fack(struct tcp_sock *tp) -{ - tp->rx_opt.sack_ok |= TCP_FACK_ENABLED; -} - static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; -- cgit v1.2.3 From 39b175211053c7a6a4d794c42e225994f1c069c2 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 10 Nov 2017 14:03:51 -0800 Subject: net: Remove unused skb_shared_info member ip6_frag_id was only used by UFO, which has been removed. ipv6_proxy_select_ident() only existed to set ip6_frag_id and has no in-tree callers. Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fb6d67012de6..ec14f0d5a3a1 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,7 +767,6 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); -void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); -- cgit v1.2.3 From 5ed4e3eb021762fee584ce65620bc822131c7aa0 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:52 -0800 Subject: net: dsa: Pass a port to get_tag_protocol() A number of drivers want to check whether the configured CPU port is a possible configuration for enabling tagging, pass down the CPU port number so they verify that. Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6c239257309b..68e232fd4b0f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -321,7 +321,8 @@ struct dsa_switch_ops { struct device *host_dev, int sw_addr, void **priv); - enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); + enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, + int port); int (*setup)(struct dsa_switch *ds); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); -- cgit v1.2.3 From b74b70c44986dee87881fbed3d912e02c5dcf78c Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 10 Nov 2017 15:22:54 -0800 Subject: net: dsa: Support prepended Broadcom tag Add a new type: DSA_TAG_PROTO_PREPEND which allows us to support for the 4-bytes Broadcom tag that we already support, but in a format where it is pre-pended to the packet instead of located between the MAC SA and the Ethertyper (DSA_TAG_PROTO_BRCM). Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 68e232fd4b0f..2a05738570d8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -29,6 +29,7 @@ struct fixed_phy_status; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, DSA_TAG_PROTO_BRCM, + DSA_TAG_PROTO_BRCM_PREPEND, DSA_TAG_PROTO_DSA, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_KSZ, -- cgit v1.2.3 From 3a9b76fd0db9f0d426533f96a68a62a58753a51e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 11 Nov 2017 15:54:12 -0800 Subject: tcp: allow drivers to tweak TSQ logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I had many reports that TSQ logic breaks wifi aggregation. Current logic is to allow up to 1 ms of bytes to be queued into qdisc and drivers queues. But Wifi aggregation needs a bigger budget to allow bigger rates to be discovered by various TCP Congestion Controls algorithms. This patch adds an extra socket field, allowing wifi drivers to select another log scale to derive TCP Small Queue credit from current pacing rate. Initial value is 10, meaning that this patch does not change current behavior. We expect wifi drivers to set this field to smaller values (tests have been done with values from 6 to 9) They would have to use following template : if (skb->sk && skb->sk->sk_pacing_shift != MY_PACING_SHIFT) skb->sk->sk_pacing_shift = MY_PACING_SHIFT; Ref: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1670041 Signed-off-by: Eric Dumazet Cc: Johannes Berg Cc: Toke Høiland-Jørgensen Cc: Kir Kolyshkin Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 688a823dccc3..f8715c5af37d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -267,6 +267,7 @@ struct sock_common { * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments + * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct @@ -451,6 +452,7 @@ struct sock { kmemcheck_bitfield_end(flags); u16 sk_gso_max_segs; + u8 sk_pacing_shift; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; -- cgit v1.2.3 From 6d88207fcfddc002afe3e2e4a455e5201089d5d9 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:45 +0200 Subject: tls: Add function to update the TLS socket configuration The tx configuration is now stored in ctx->tx_conf. And sk->sk_prot is updated trough a function This will simplify things when we add rx and support for different possible tx and rx cross configurations. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index b89d397dd62f..f058a6e08eaa 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -83,6 +83,8 @@ struct tls_context { void *priv_ctx; + u8 tx_conf:2; + u16 prepend_size; u16 tag_size; u16 overhead_size; -- cgit v1.2.3 From ff45d820a2df163957ad8ab459b6eb6976144c18 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:46 +0200 Subject: tls: Fix TLS ulp context leak, when TLS_TX setsockopt is not used. Previously the TLS ulp context would leak if we attached a TLS ulp to a socket but did not use the TLS_TX setsockopt, or did use it but it failed. This patch solves the issue by overriding prot[TLS_BASE_TX].close and fixing tls_sk_proto_close to work properly when its called with ctx->tx_conf == TLS_BASE_TX. This patch also removes ctx->free_resources as we can use ctx->tx_conf to obtain the relevant information. Fixes: 3c4d7559159b ('tls: kernel TLS support') Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index f058a6e08eaa..7cb58a6b8fd0 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,7 +99,6 @@ struct tls_context { u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); - void (*free_resources)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); @@ -124,6 +123,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); +void tls_sw_free_tx_resources(struct sock *sk); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); -- cgit v1.2.3 From 213ef6e7c9c063c482d77f12cc438872628d48ec Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Mon, 13 Nov 2017 10:22:47 +0200 Subject: tls: Move tls_make_aad to header to allow sharing move tls_make_aad as it is going to be reused by the device offload code and rx path. Remove unused recv parameter. Signed-off-by: Ilya Lesokhin Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 7cb58a6b8fd0..70becd0a9299 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -214,6 +214,21 @@ static inline void tls_fill_prepend(struct tls_context *ctx, ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } +static inline void tls_make_aad(char *buf, + size_t size, + char *record_sequence, + int record_sequence_size, + unsigned char record_type) +{ + memcpy(buf, record_sequence, record_sequence_size); + + buf[8] = record_type; + buf[9] = TLS_1_2_VERSION_MAJOR; + buf[10] = TLS_1_2_VERSION_MINOR; + buf[11] = size >> 8; + buf[12] = size & 0xFF; +} + static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); -- cgit v1.2.3 From b9f3eb499d84f8d4adcb2f9212ec655700b28228 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Tue, 14 Nov 2017 06:30:11 +0300 Subject: uapi: fix linux/tls.h userspace compilation error Move inclusion of a private kernel header from uapi/linux/tls.h to its only user - net/tls.h, to fix the following linux/tls.h userspace compilation error: /usr/include/linux/tls.h:41:21: fatal error: net/tcp.h: No such file or directory As to this point uapi/linux/tls.h was totaly unusuable for userspace, cleanup this header file further by moving other redundant includes to net/tls.h. Fixes: 3c4d7559159b ("tls: kernel TLS support") Cc: # v4.13+ Signed-off-by: Dmitry V. Levin Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 70becd0a9299..936cfc5cab7d 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -35,6 +35,10 @@ #define _TLS_OFFLOAD_H #include +#include +#include +#include +#include #include -- cgit v1.2.3 From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make default TCP default congestion control to a per namespace value. This changes default congestion control to a pointer to congestion ops (rather than implicit as first element of available lsit). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 5e12975fc658..44668c29701a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -160,6 +160,7 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; + const struct tcp_congestion_ops __rcu *tcp_congestion_control; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; spinlock_t tcp_fastopen_ctx_lock; unsigned int sysctl_tcp_fastopen_blackhole_timeout; diff --git a/include/net/tcp.h b/include/net/tcp.h index ed71511e67a6..35cc7d0d3d47 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); -int tcp_set_default_congestion_control(const char *name); -void tcp_get_default_congestion_control(char *name); +int tcp_set_default_congestion_control(struct net *net, const char *name); +void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); @@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else -- cgit v1.2.3 From 50895b9de1d3e0258e015e8e55128d835d9a9f19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Nov 2017 21:02:19 -0800 Subject: tcp: highest_sack fix syzbot easily found a regression added in our latest patches [1] No longer set tp->highest_sack to the head of the send queue since this is not logical and error prone. Only sack processing should maintain the pointer to an skb from rtx queue. We might in the future only remember the sequence instead of a pointer to skb, since rb-tree should allow a fast lookup. [1] BUG: KASAN: use-after-free in tcp_highest_sack_seq include/net/tcp.h:1706 [inline] BUG: KASAN: use-after-free in tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 Read of size 4 at addr ffff8801c154faa8 by task syz-executor4/12860 CPU: 0 PID: 12860 Comm: syz-executor4 Not tainted 4.14.0-next-20171113+ #41 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 tcp_highest_sack_seq include/net/tcp.h:1706 [inline] tcp_ack+0x42bb/0x4fd0 net/ipv4/tcp_input.c:3537 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 RIP: 0033:0x452879 RSP: 002b:00007fc9761bfbe8 EFLAGS: 00000212 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452879 RDX: 0000000000000000 RSI: 0000000020917fc8 RDI: 0000000000000015 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000212 R12: 00000000006ee3a0 R13: 00000000ffffffff R14: 00007fc9761c06d4 R15: 0000000000000000 Allocated by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x144/0x760 mm/slab.c:3638 __alloc_skb+0xf1/0x780 net/core/skbuff.c:193 alloc_skb_fclone include/linux/skbuff.h:1023 [inline] sk_stream_alloc_skb+0x11d/0x900 net/ipv4/tcp.c:870 tcp_sendmsg_locked+0x1341/0x3b80 net/ipv4/tcp.c:1299 tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1461 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 SYSC_sendto+0x358/0x5a0 net/socket.c:1749 SyS_sendto+0x40/0x50 net/socket.c:1717 entry_SYSCALL_64_fastpath+0x1f/0x96 Freed by task 12860: save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3492 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3750 kfree_skbmem+0xdd/0x1d0 net/core/skbuff.c:603 __kfree_skb+0x1d/0x20 net/core/skbuff.c:642 sk_wmem_free_skb include/net/sock.h:1419 [inline] tcp_rtx_queue_unlink_and_free include/net/tcp.h:1682 [inline] tcp_clean_rtx_queue net/ipv4/tcp_input.c:3111 [inline] tcp_ack+0x1b17/0x4fd0 net/ipv4/tcp_input.c:3593 tcp_rcv_established+0x672/0x18a0 net/ipv4/tcp_input.c:5439 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1468 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2778 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2048 __sys_sendmsg+0xe5/0x210 net/socket.c:2082 SYSC_sendmsg net/socket.c:2093 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2089 entry_SYSCALL_64_fastpath+0x1f/0x96 The buggy address belongs to the object at ffff8801c154fa80 which belongs to the cache skbuff_fclone_cache of size 456 The buggy address is located 40 bytes inside of 456-byte region [ffff8801c154fa80, ffff8801c154fc48) The buggy address belongs to the page: page:ffffea00070553c0 count:1 mapcount:0 mapping:ffff8801c154f080 index:0x0 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffff8801c154f080 0000000000000000 0000000100000006 raw: ffffea00070a5a20 ffffea0006a18360 ffff8801d9ca0500 0000000000000000 page dumped because: kasan: bad access detected Fixes: 737ff314563c ("tcp: use sequence distance to detect reordering") Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/tcp.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 35cc7d0d3d47..85ea578195d4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1630,9 +1630,6 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli { if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - - if (tcp_sk(sk)->highest_sack == skb_unlinked) - tcp_sk(sk)->highest_sack = NULL; } static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) @@ -1645,12 +1642,8 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_write_queue.next == skb) { + if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); - - if (tcp_sk(sk)->highest_sack == NULL) - tcp_sk(sk)->highest_sack = skb; - } } /* Insert new before skb on the write queue of sk. */ @@ -1708,9 +1701,7 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - struct sk_buff *next = skb_rb_next(skb); - - tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); + tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1720,9 +1711,7 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - struct sk_buff *skb = tcp_rtx_queue_head(sk); - - tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); + tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ -- cgit v1.2.3 From 0a833c29d89656025443cb9f0ebff7052dd95ce0 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Wed, 15 Nov 2017 13:09:32 +0100 Subject: genetlink: fix genlmsg_nlhdr() According to the description, first argument of genlmsg_nlhdr() points to what genlmsg_put() returns, i.e. beginning of user header. Therefore we should only subtract size of genetlink header and netlink message header, not user header. This also means we don't need to pass the pointer to genetlink family and the same is true for genl_dump_check_consistent() which is the only caller of genlmsg_nlhdr(). (Note that at the moment, these functions are only used for families which do not have user header so that they are not affected.) Fixes: 670dc2833d14 ("netlink: advertise incomplete dumps") Signed-off-by: Michal Kubecek Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 5ac169a735f4..decf6012a401 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -154,15 +154,12 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() - * @family: generic netlink family * * Returns pointer to netlink header. */ -static inline struct nlmsghdr * -genlmsg_nlhdr(void *user_hdr, const struct genl_family *family) +static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { return (struct nlmsghdr *)((char *)user_hdr - - family->hdrsize - GENL_HDRLEN - NLMSG_HDRLEN); } @@ -190,16 +187,14 @@ static inline int genlmsg_parse(const struct nlmsghdr *nlh, * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @user_hdr: user header as returned from genlmsg_put() - * @family: generic netlink family * * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it * simpler to use with generic netlink. */ static inline void genl_dump_check_consistent(struct netlink_callback *cb, - void *user_hdr, - const struct genl_family *family) + void *user_hdr) { - nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr, family)); + nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr)); } /** -- cgit v1.2.3 From d50112edde1d0c621520e53747044009f11c656b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:18 -0800 Subject: slab, slub, slob: add slab_flags_t Add sparse-checked slab_flags_t for struct kmem_cache::flags (SLAB_POISON, etc). SLAB is bloated temporarily by switching to "unsigned long", but only temporarily. Link: http://lkml.kernel.org/r/20171021100225.GA22428@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..c577286dbffb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1105,7 +1105,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; - int slab_flags; + slab_flags_t slab_flags; struct percpu_counter *orphan_count; -- cgit v1.2.3 From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASan is a replacement that is able to work without the limitation of kmemcheck (single CPU, slow). KASan is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASan as a suitable replacement). The only objection was that since KASAN wasn't supported by all GCC versions provided by distros at that time we should hold off for 2 years, and try again. Now that 2 years have passed, and all distros provide gcc that supports KASAN, kill kmemcheck again for the very same reasons. This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. [alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/net/inet_sock.h | 3 --- include/net/inet_timewait_sock.h | 4 ---- include/net/sock.h | 3 --- 3 files changed, 10 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index db8162dd8c0b..8e51b4a69088 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -17,7 +17,6 @@ #define _INET_SOCK_H #include -#include #include #include #include @@ -84,7 +83,6 @@ struct inet_request_sock { #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family - kmemcheck_bitfield_begin(flags); u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, @@ -93,7 +91,6 @@ struct inet_request_sock { ecn_ok : 1, acked : 1, no_srccheck: 1; - kmemcheck_bitfield_end(flags); u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 6a75d67a30fd..1356fa6a7566 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -15,8 +15,6 @@ #ifndef _INET_TIMEWAIT_SOCK_ #define _INET_TIMEWAIT_SOCK_ - -#include #include #include #include @@ -69,14 +67,12 @@ struct inet_timewait_sock { /* Socket demultiplex comparisons on incoming packets. */ /* these three are in inet_sock */ __be16 tw_sport; - kmemcheck_bitfield_begin(flags); /* And these are ours. */ unsigned int tw_kill : 1, tw_transparent : 1, tw_flowlabel : 20, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; - kmemcheck_bitfield_end(flags); struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/include/net/sock.h b/include/net/sock.h index c577286dbffb..a63e6a8bb7e0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,7 +436,6 @@ struct sock { #define SK_FL_TYPE_MASK 0xffff0000 #endif - kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, @@ -445,8 +444,6 @@ struct sock { sk_protocol : 8, sk_type : 16; #define SK_PROTOCOL_MAX U8_MAX - kmemcheck_bitfield_end(flags); - u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; -- cgit v1.2.3 From ecca8f88da5c4260cc2bccfefd2a24976704c366 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 17 Nov 2017 14:11:11 +0800 Subject: sctp: set frag_point in sctp_setsockopt_maxseg correctly Now in sctp_setsockopt_maxseg user_frag or frag_point can be set with val >= 8 and val <= SCTP_MAX_CHUNK_LEN. But both checks are incorrect. val >= 8 means frag_point can even be less than SCTP_DEFAULT_MINSEGMENT. Then in sctp_datamsg_from_user(), when it's value is greater than cookie echo len and trying to bundle with cookie echo chunk, the first_len will overflow. The worse case is when it's value is equal as cookie echo len, first_len becomes 0, it will go into a dead loop for fragment later on. In Hangbin syzkaller testing env, oom was even triggered due to consecutive memory allocation in that loop. Besides, SCTP_MAX_CHUNK_LEN is the max size of the whole chunk, it should deduct the data header for frag_point or user_frag check. This patch does a proper check with SCTP_DEFAULT_MINSEGMENT subtracting the sctphdr and datahdr, SCTP_MAX_CHUNK_LEN subtracting datahdr when setting frag_point via sockopt. It also improves sctp_setsockopt_maxseg codes. Suggested-by: Marcelo Ricardo Leitner Reported-by: Hangbin Liu Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index d7d8cba01469..749a42882437 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -444,7 +444,8 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); - frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN)); + frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN - + sizeof(struct sctp_data_chunk))); return frag; } -- cgit v1.2.3 From ed66dfaf236c04d414de1d218441296e57fb2bd2 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 17 Nov 2017 21:06:14 -0500 Subject: tcp: when scheduling TLP, time of RTO should account for current ACK Fix the TLP scheduling logic so that when scheduling a TLP probe, we ensure that the estimated time at which an RTO would fire accounts for the fact that ACKs indicating forward progress should push back RTO times. After the following fix: df92c8394e6e ("tcp: fix xmit timer to only be reset if data ACKed/SACKed") we had an unintentional behavior change in the following kind of scenario: suppose the RTT variance has been very low recently. Then suppose we send out a flight of N packets and our RTT is 100ms: t=0: send a flight of N packets t=100ms: receive an ACK for N-1 packets The response before df92c8394e6e that was: -> schedule a TLP for now + RTO_interval The response after df92c8394e6e is: -> schedule a TLP for t=0 + RTO_interval Since RTO_interval = srtt + RTT_variance, this means that we have scheduled a TLP timer at a point in the future that only accounts for RTT_variance. If the RTT_variance term is small, this means that the timer fires soon. Before df92c8394e6e this would not happen, because in that code, when we receive an ACK for a prefix of flight, we did: 1) Near the top of tcp_ack(), switch from TLP timer to RTO at write_queue_head->paket_tx_time + RTO_interval: if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); 2) In tcp_clean_rtx_queue(), update the RTO to now + RTO_interval: if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); 3) In tcp_ack() after tcp_fastretrans_alert() switch from RTO to TLP at now + RTO_interval: if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); In df92c8394e6e we removed that 3-phase dance, and instead directly set the TLP timer once: we set the TLP timer in cases like this to write_queue_head->packet_tx_time + RTO_interval. So if the RTT variance is small, then this means that this is setting the TLP timer to fire quite soon. This means if the ACK for the tail of the flight takes longer than an RTT to arrive (often due to delayed ACKs), then the TLP timer fires too quickly. Fixes: df92c8394e6e ("tcp: fix xmit timer to only be reset if data ACKed/SACKed") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 85ea578195d4..4e09398009c1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -539,7 +539,7 @@ void tcp_push_one(struct sock *, unsigned int mss_now); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); -bool tcp_schedule_loss_probe(struct sock *sk); +bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); -- cgit v1.2.3 From 0c19f846d582af919db66a5914a0189f9f92c936 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 21 Nov 2017 10:22:25 -0500 Subject: net: accept UFO datagrams from tuntap and packet Tuntap and similar devices can inject GSO packets. Accept type VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively. Processes are expected to use feature negotiation such as TUNSETOFFLOAD to detect supported offload types and refrain from injecting other packets. This process breaks down with live migration: guest kernels do not renegotiate flags, so destination hosts need to expose all features that the source host does. Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677. This patch introduces nearly(*) no new code to simplify verification. It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP insertion and software UFO segmentation. It does not reinstate protocol stack support, hardware offload (NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception of VIRTIO_NET_HDR_GSO_UDP packets in tuntap. To support SKB_GSO_UDP reappearing in the stack, also reinstate logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD by squashing in commit 939912216fa8 ("net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1 ("net: avoid skb_warn_bad_offload false positives on UFO"). (*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id, ipv6_proxy_select_ident is changed to return a __be32 and this is assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted at the end of the enum to minimize code churn. Tested Booted a v4.13 guest kernel with QEMU. On a host kernel before this patch `ethtool -k eth0` shows UFO disabled. After the patch, it is enabled, same as on a v4.13 host kernel. A UFO packet sent from the guest appears on the tap device: host: nc -l -p -u 8000 & tcpdump -n -i tap0 guest: dd if=/dev/zero of=payload.txt bs=1 count=2000 nc -u 192.16.1.1 8000 < payload.txt Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds, packets arriving fragmented: ./with_tap_pair.sh ./tap_send_ufo tap0 tap1 (from https://github.com/wdebruij/kerneltools/tree/master/tests) Changes v1 -> v2 - simplified set_offload change (review comment) - documented test procedure Link: http://lkml.kernel.org/r/ Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.") Reported-by: Michal Kubecek Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ec14f0d5a3a1..f73797e2fa60 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,6 +767,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); -- cgit v1.2.3 From 7b6ddeaf27eca72795ceeae2f0f347db1b5f9a30 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 21 Nov 2017 14:46:08 +0100 Subject: mac80211: use QoS NDP for AP probing When connected to a QoS/WMM AP, mac80211 should use a QoS NDP for probing it, instead of a regular non-QoS one, fix this. Change all the drivers to *not* allow QoS NDP for now, even though it looks like most of them should be OK with that. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cc9073e45be9..eec143cca1c0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4470,18 +4470,24 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @qos_ok: QoS NDP is acceptable to the caller, this should be set + * if at all possible * * Creates a Nullfunc template which can, for example, uploaded to * hardware. The template must be updated after association so that correct * BSSID and address is used. * + * If @qos_ndp is set and the association is to an AP with QoS/WMM, the + * returned packet will be QoS NDP. + * * Note: Caller (or hardware) is responsible for setting the * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields. * * Return: The nullfunc template. %NULL on error. */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + bool qos_ok); /** * ieee80211_probereq_get - retrieve a Probe Request template -- cgit v1.2.3 From ade994f4f6c8c3ef4c3bfc2d02166262fb9d089c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Jul 2017 00:01:49 -0400 Subject: net: annotate ->poll() instances Signed-off-by: Al Viro --- include/net/bluetooth/bluetooth.h | 2 +- include/net/inet_connection_sock.h | 2 +- include/net/iucv/af_iucv.h | 2 +- include/net/sctp/sctp.h | 2 +- include/net/sock.h | 2 +- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e89cff0c4c23..ec9d6bc65855 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -271,7 +271,7 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -uint bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0358745ea059..ec72cdb5bc39 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -305,7 +305,7 @@ void inet_csk_prepare_forced_close(struct sock *sk); /* * LISTEN is a special case for poll.. */ -static inline unsigned int inet_csk_listen_poll(const struct sock *sk) +static inline __poll_t inet_csk_listen_poll(const struct sock *sk) { return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0; diff --git a/include/net/iucv/af_iucv.h b/include/net/iucv/af_iucv.h index 070e93a17c59..f4c21b5a1242 100644 --- a/include/net/iucv/af_iucv.h +++ b/include/net/iucv/af_iucv.h @@ -153,7 +153,7 @@ struct iucv_sock_list { atomic_t autobind_name; }; -unsigned int iucv_sock_poll(struct file *file, struct socket *sock, +__poll_t iucv_sock_poll(struct file *file, struct socket *sock, poll_table *wait); void iucv_sock_link(struct iucv_sock_list *l, struct sock *s); void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *s); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 749a42882437..6b765de288d8 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -107,7 +107,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); void sctp_write_space(struct sock *sk); void sctp_data_ready(struct sock *sk); -unsigned int sctp_poll(struct file *file, struct socket *sock, +__poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); void sctp_copy_sock(struct sock *newsk, struct sock *sk, diff --git a/include/net/sock.h b/include/net/sock.h index 79e1a2c7912c..b33078333518 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1582,7 +1582,7 @@ int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, int, bool); int sock_no_getname(struct socket *, struct sockaddr *, int *, int); -unsigned int sock_no_poll(struct file *, struct socket *, +__poll_t sock_no_poll(struct file *, struct socket *, struct poll_table_struct *); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); diff --git a/include/net/tcp.h b/include/net/tcp.h index 4e09398009c1..a6343271c629 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -387,7 +387,7 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op); -unsigned int tcp_poll(struct file *file, struct socket *sock, +__poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); diff --git a/include/net/udp.h b/include/net/udp.h index 6c759c8594e2..850a8e581cce 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -275,7 +275,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); -unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait); +__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); -- cgit v1.2.3 From af2697a0273f7665429c47d71ab26f2412af924e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Nov 2017 20:16:07 +0800 Subject: sctp: force the params with right types for sctp csum apis Now sctp_csum_xxx doesn't really match the param types of these common csum apis. As sctp_csum_xxx is defined in sctp/checksum.h, many sparse errors occur when make C=2 not only with M=net/sctp but also with other modules that include this header file. This patch is to force them fit in csum apis with the right types. Fixes: e6d8b64b34aa ("net: sctp: fix and consolidate SCTP checksumming code") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/checksum.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 4a5b9a306c69..32ee65a30aff 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -48,31 +48,32 @@ static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) /* This uses the crypto implementation of crc32c, which is either * implemented w/ hardware support or resolves to __crc32c_le(). */ - return crc32c(sum, buff, len); + return (__force __wsum)crc32c((__force __u32)sum, buff, len); } static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, int offset, int len) { - return __crc32c_le_combine(csum, csum2, len); + return (__force __wsum)__crc32c_le_combine((__force __u32)csum, + (__force __u32)csum2, len); } static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = sctp_hdr(skb); - __le32 ret, old = sh->checksum; const struct skb_checksum_ops ops = { .update = sctp_csum_update, .combine = sctp_csum_combine, }; + __le32 old = sh->checksum; + __wsum new; sh->checksum = 0; - ret = cpu_to_le32(~__skb_checksum(skb, offset, skb->len - offset, - ~(__u32)0, &ops)); + new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, &ops); sh->checksum = old; - return ret; + return cpu_to_le32((__force __u32)new); } #endif /* __sctp_checksum_h__ */ -- cgit v1.2.3 From 1ba896f6f52bfafac6dec4ca583cdd9a073858e8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 26 Nov 2017 20:16:08 +0800 Subject: sctp: remove extern from stream sched Now each stream sched ops is defined in different .c file and added into the global ops in another .c file, it uses extern to make this work. However extern is not good coding style to get them in and even make C=2 reports errors for this. This patch adds sctp_sched_ops_xxx_init for each stream sched ops in their .c file, then get them into the global ops by calling them when initializing sctp module. Fixes: 637784ade221 ("sctp: introduce priority based stream scheduler") Fixes: ac1ed8b82cd6 ("sctp: introduce round robin stream scheduler") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 5 +++++ include/net/sctp/stream_sched.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 749a42882437..906a9c0efa71 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -194,6 +194,11 @@ void sctp_remaddr_proc_exit(struct net *net); */ int sctp_offload_init(void); +/* + * sctp/stream_sched.c + */ +void sctp_sched_ops_init(void); + /* * sctp/stream.c */ diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index c676550a4c7d..5c5da48f65e7 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -69,4 +69,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); +void sctp_sched_ops_register(enum sctp_sched_type sched, + struct sctp_sched_ops *sched_ops); +void sctp_sched_ops_prio_init(void); +void sctp_sched_ops_rr_init(void); + #endif /* __sctp_stream_sched_h__ */ -- cgit v1.2.3 From ca2c374a5c47492f7c221d9966a7b11d4c2383f2 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:39:59 -0500 Subject: net: dst->rt_next is unused. Delete it. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index b091fd536098..349374750ee7 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,6 @@ struct dst_entry { struct lwtunnel_state *lwtstate; union { struct dst_entry *next; - struct rtable __rcu *rt_next; struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; -- cgit v1.2.3 From fe736e778c604dee8f02d3381dfadba8d621605e Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:08 -0500 Subject: decnet: Move dn_next into decnet route structure. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dn_route.h | 1 + include/net/dst.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dn_route.h b/include/net/dn_route.h index 55df9939bca2..342d2503cba5 100644 --- a/include/net/dn_route.h +++ b/include/net/dn_route.h @@ -69,6 +69,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, */ struct dn_route { struct dst_entry dst; + struct dn_route __rcu *dn_next; struct neighbour *n; diff --git a/include/net/dst.h b/include/net/dst.h index 349374750ee7..cc7e8166be0d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -102,7 +102,6 @@ struct dst_entry { union { struct dst_entry *next; struct rt6_info __rcu *rt6_next; - struct dn_route __rcu *dn_next; }; }; -- cgit v1.2.3 From 071fb37ec43dcd88937a669c5f97bd37f7d29dea Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:15 -0500 Subject: ipv6: Move rt6_next from dst_entry into ipv6 route structure. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 1 - include/net/ip6_fib.h | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index cc7e8166be0d..acbb3fb89c4d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,6 @@ struct dst_entry { struct lwtunnel_state *lwtstate; union { struct dst_entry *next; - struct rt6_info __rcu *rt6_next; }; }; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 10c913816032..281a922f0c62 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -129,6 +129,7 @@ struct rt6_exception { struct rt6_info { struct dst_entry dst; + struct rt6_info __rcu *rt6_next; /* * Tail elements of dst_entry (__refcnt etc.) @@ -176,11 +177,11 @@ struct rt6_info { #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ - rt = rcu_dereference(rt->dst.rt6_next)) + rt = rcu_dereference(rt->rt6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ - rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + rt = rcu_dereference_protected(rt->rt6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { -- cgit v1.2.3 From b92cf4aab8e688b1bd501ac2ac4f1b5c99601e3b Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:22 -0500 Subject: net: Create and use new helper xfrm_dst_child(). Only IPSEC routes have a non-NULL dst->child pointer. And IPSEC routes are identified by a non-NULL dst->xfrm pointer. Signed-off-by: David S. Miller --- include/net/xfrm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dc28a98ce97c..4021b49a6ce3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -994,6 +994,15 @@ struct xfrm_dst { u32 path_cookie; }; +static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) +{ +#ifdef CONFIG_XFRM + if (dst->xfrm) + return dst->child; +#endif + return NULL; +} + #ifdef CONFIG_XFRM static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { -- cgit v1.2.3 From 45b018beddb631fb9a0ecbc3ba103521b03c4c80 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:28 -0500 Subject: ipsec: Create and use new helpers for dst child access. This will make a future change moving the dst->child pointer less invasive. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/xfrm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4021b49a6ce3..4c08eb0d46ce 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1004,6 +1004,11 @@ static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) } #ifdef CONFIG_XFRM +static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) +{ + xdst->u.dst.child = child; +} + static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) { xfrm_pols_put(xdst->pols, xdst->num_pols); -- cgit v1.2.3 From b6ca8bd5a9198c70c48297390723e4e56bd6e879 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:45:44 -0500 Subject: xfrm: Move child route linkage into xfrm_dst. XFRM bundle child chains look like this: xdst1 --> xdst2 --> xdst3 --> path_dst All of xdstN are xfrm_dst objects and xdst->u.dst.xfrm is non-NULL. The final child pointer in the chain, here called 'path_dst', is some other kind of route such as an ipv4 or ipv6 one. The xfrm output path pops routes, one at a time, via the child pointer, until we hit one which has a dst->xfrm pointer which is NULL. We can easily preserve the above mechanisms with child sitting only in the xfrm_dst structure. All children in the chain before we break out of the xfrm_output() loop have dst->xfrm non-NULL and are therefore xfrm_dst objects. Since we break out of the loop when we find dst->xfrm NULL, we will not try to dereference 'dst' as if it were an xfrm_dst. Signed-off-by: David S. Miller --- include/net/dst.h | 3 +-- include/net/xfrm.h | 15 ++++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index acbb3fb89c4d..cef46207408c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -35,7 +35,6 @@ struct sk_buff; struct dst_entry { struct net_device *dev; struct rcu_head rcu_head; - struct dst_entry *child; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -89,7 +88,7 @@ struct dst_entry { * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ - long __pad_to_align_refcnt[2]; + long __pad_to_align_refcnt[3]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4c08eb0d46ce..0009dab61528 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -968,7 +968,7 @@ static inline bool xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_c /* A struct encoding bundle of transformations to apply to some set of flow. * - * dst->child points to the next element of bundle. + * xdst->child points to the next element of bundle. * dst->xfrm points to an instanse of transformer. * * Due to unfortunate limitations of current routing cache, which we @@ -984,6 +984,7 @@ struct xfrm_dst { struct rt6_info rt6; } u; struct dst_entry *route; + struct dst_entry *child; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; @@ -997,8 +998,10 @@ struct xfrm_dst { static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM - if (dst->xfrm) - return dst->child; + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + return xdst->child; + } #endif return NULL; } @@ -1006,7 +1009,7 @@ static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) #ifdef CONFIG_XFRM static inline void xfrm_dst_set_child(struct xfrm_dst *xdst, struct dst_entry *child) { - xdst->u.dst.child = child; + xdst->child = child; } static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) @@ -1880,12 +1883,14 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; + struct xfrm_dst *xdst; if (!x || !x->type_offload) return false; + xdst = (struct xfrm_dst *) dst; if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) && - !dst->child->xfrm) + !xdst->child->xfrm) return true; return false; -- cgit v1.2.3 From 3a2232e92e87166a8a5113e918b8c7b7bdce4d83 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:40 -0500 Subject: ipv6: Move dst->from into struct rt6_info. The dst->from value is only used by ipv6 routes to track where a route "came from". Any time we clone or copy a core ipv6 route in the ipv6 routing tables, we have the copy/clone's ->from point to the base route. This is used to handle route expiration properly. Only ipv6 uses this mechanism, and only ipv6 code references it. So it is safe to move it into rt6_info. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 3 +-- include/net/ip6_fib.h | 9 ++++----- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index cef46207408c..13c839d8235a 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -39,7 +39,6 @@ struct dst_entry { unsigned long _metrics; unsigned long expires; struct dst_entry *path; - struct dst_entry *from; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else @@ -88,7 +87,7 @@ struct dst_entry { * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ - long __pad_to_align_refcnt[3]; + long __pad_to_align_refcnt[4]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 281a922f0c62..44d96a91e745 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -130,6 +130,7 @@ struct rt6_exception { struct rt6_info { struct dst_entry dst; struct rt6_info __rcu *rt6_next; + struct rt6_info *from; /* * Tail elements of dst_entry (__refcnt etc.) @@ -204,11 +205,9 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) { struct rt6_info *rt; - for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); - rt = (struct rt6_info *)rt->dst.from); + for (rt = rt0; rt && !(rt->rt6i_flags & RTF_EXPIRES); rt = rt->from); if (rt && rt != rt0) rt0->dst.expires = rt->dst.expires; - dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } @@ -243,8 +242,8 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) u32 cookie = 0; if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) - rt = (struct rt6_info *)(rt->dst.from); + (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) + rt = rt->from; rt6_get_cookie_safe(rt, &cookie); -- cgit v1.2.3 From 0f6c480f23f49b53644b383c5554e579498347f3 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:46 -0500 Subject: xfrm: Move dst->path into struct xfrm_dst The first member of an IPSEC route bundle chain sets it's dst->path to the underlying ipv4/ipv6 route that carries the bundle. Stated another way, if one were to follow the xfrm_dst->child chain of the bundle, the final non-NULL pointer would be the path and point to either an ipv4 or an ipv6 route. This is largely used to make sure that PMTU events propagate down to the correct ipv4 or ipv6 route. When we don't have the top of an IPSEC bundle 'dst->path == dst'. Move it down into xfrm_dst and key off of dst->xfrm. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 3 +-- include/net/xfrm.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 13c839d8235a..424bff66f16d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -38,7 +38,6 @@ struct dst_entry { struct dst_ops *ops; unsigned long _metrics; unsigned long expires; - struct dst_entry *path; #ifdef CONFIG_XFRM struct xfrm_state *xfrm; #else @@ -87,7 +86,7 @@ struct dst_entry { * Align __refcnt to a 64 bytes alignment * (L1_CACHE_SIZE would be too much) */ - long __pad_to_align_refcnt[4]; + long __pad_to_align_refcnt[5]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 0009dab61528..1ec0c4760646 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -985,6 +985,7 @@ struct xfrm_dst { } u; struct dst_entry *route; struct dst_entry *child; + struct dst_entry *path; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols, num_xfrms; u32 xfrm_genid; @@ -995,6 +996,18 @@ struct xfrm_dst { u32 path_cookie; }; +static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst) +{ +#ifdef CONFIG_XFRM + if (dst->xfrm) { + const struct xfrm_dst *xdst = (const struct xfrm_dst *) dst; + + return xdst->path; + } +#endif + return (struct dst_entry *) dst; +} + static inline struct dst_entry *xfrm_dst_child(const struct dst_entry *dst) { #ifdef CONFIG_XFRM @@ -1889,7 +1902,7 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) return false; xdst = (struct xfrm_dst *) dst; - if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) && + if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) return true; -- cgit v1.2.3 From 8b207e7374c244484d6cb0f0e80ae400ce0ec0cf Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:40:53 -0500 Subject: net: Rearrange dst_entry layout to avoid useless padding. We have padding to try and align the refcount on a separate cache line. But after several simplifications the padding has increased substantially. So now it's easy to change the layout to get rid of the padding entirely. We group the write-heavy __refcnt and __use with less often used items such as the rcu_head and the error code. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 424bff66f16d..2270513d0790 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -34,7 +34,6 @@ struct sk_buff; struct dst_entry { struct net_device *dev; - struct rcu_head rcu_head; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -56,8 +55,6 @@ struct dst_entry { #define DST_XFRM_QUEUE 0x0040 #define DST_METADATA 0x0080 - short error; - /* A non-zero value of dst->obsolete forces by-hand validation * of the route entry. Positive values are set by the generic * dst layer to indicate that the entry has been forcefully @@ -73,29 +70,25 @@ struct dst_entry { #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ - unsigned short __pad3; - -#ifdef CONFIG_IP_ROUTE_CLASSID - __u32 tclassid; -#else - __u32 __pad2; -#endif -#ifdef CONFIG_64BIT - /* - * Align __refcnt to a 64 bytes alignment - * (L1_CACHE_SIZE would be too much) - */ - long __pad_to_align_refcnt[5]; -#endif /* * __refcnt wants to be on a different cache line from * input/output/ops or performance tanks badly */ - atomic_t __refcnt; /* client references */ +#ifdef CONFIG_64BIT + atomic_t __refcnt; /* 64-bit offset 64 */ +#endif int __use; unsigned long lastuse; struct lwtunnel_state *lwtstate; + struct rcu_head rcu_head; + short error; + short __pad; + __u32 tclassid; +#ifndef CONFIG_64BIT + atomic_t __refcnt; /* 32-bit offset 64 */ +#endif + union { struct dst_entry *next; }; @@ -244,7 +237,7 @@ static inline void dst_hold(struct dst_entry *dst) { /* * If your kernel compilation stops here, please check - * __pad_to_align_refcnt declaration in struct dst_entry + * the placement of __refcnt in struct dst_entry */ BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); -- cgit v1.2.3 From 7149f813d12d92ba9abcf49026f7cebc3d55c426 Mon Sep 17 00:00:00 2001 From: David Miller Date: Tue, 28 Nov 2017 15:41:07 -0500 Subject: net: Remove dst->next There are no more users. Signed-off-by: David S. Miller Reviewed-by: Eric Dumazet --- include/net/dst.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 2270513d0790..33d2a5433924 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -88,10 +88,6 @@ struct dst_entry { #ifndef CONFIG_64BIT atomic_t __refcnt; /* 32-bit offset 64 */ #endif - - union { - struct dst_entry *next; - }; }; struct dst_metrics { -- cgit v1.2.3 From 90a6ec85351b31449c2c6b5406b5396ac96f191d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 29 Nov 2017 16:07:51 -0800 Subject: act_sample: get rid of tcf_sample_cleanup_rcu() Similar to commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu"), TC actions don't need to respect RCU grace period, because it is either just detached from tc filter (standalone case) or it is removed together with tc filter (bound case) in which case RCU grace period is already respected at filter layer. Fixes: 5c5670fae430 ("net/sched: Introduce sample tc action") Reported-by: Eric Dumazet Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: Yotam Gigi Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tc_act/tc_sample.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index 524cee4f4c81..01dbfea32672 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -14,7 +14,6 @@ struct tcf_sample { struct psample_group __rcu *psample_group; u32 psample_group_num; struct list_head tcfm_list; - struct rcu_head rcu; }; #define to_sample(a) ((struct tcf_sample *)a) -- cgit v1.2.3 From e5f612969c6f965e3bd1158598e0a3b1c4f389b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 25 Nov 2017 21:18:35 +0800 Subject: sctp: abandon the whole msg if one part of a fragmented message is abandoned As rfc3758#section-3.1 demands: A3) When a TSN is "abandoned", if it is part of a fragmented message, all other TSN's within that fragmented message MUST be abandoned at the same time. Besides, if it couldn't handle this, the rest frags would never get assembled in peer side. This patch supports it by adding abandoned flag in sctp_datamsg, when one chunk is being abandoned, set chunk->msg->abandoned as well. Next time when checking for abandoned, go checking chunk->msg->abandoned first. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 16f949eef52f..2f8f93da5dc2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -503,7 +503,8 @@ struct sctp_datamsg { /* Did the messenge fail to send? */ int send_error; u8 send_failed:1, - can_delay; /* should this message be Nagle delayed */ + can_delay:1, /* should this message be Nagle delayed */ + abandoned:1; /* should this message be abandoned */ }; struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, -- cgit v1.2.3 From a3222dc95ca751cdc5f6ac3c9b092b160b73ed9f Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 30 Nov 2017 11:51:27 -0800 Subject: ip_gre: Refector the erpsan tunnel code. Move two erspan functions to header file, erspan.h, so ipv6 erspan implementation can use it. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index ca94fc86865e..6e758d08c9ee 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -58,4 +58,55 @@ struct erspanhdr { struct erspan_metadata md; }; +static inline u8 tos_to_cos(u8 tos) +{ + u8 dscp, cos; + + dscp = tos >> 2; + cos = dscp >> 3; + return cos; +} + +static inline void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, + bool truncate, bool is_ipv4) +{ + struct ethhdr *eth = eth_hdr(skb); + enum erspan_encap_type enc_type; + struct erspanhdr *ershdr; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + u8 tos; + + tos = is_ipv4 ? ip_hdr(skb)->tos : + (ipv6_hdr(skb)->priority << 4) + + (ipv6_hdr(skb)->flow_lbl[0] >> 4); + + enc_type = ERSPAN_ENCAP_NOVLAN; + + /* If mirrored packet has vlan tag, extract tci and + * perserve vlan header in the mirrored frame. + */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + enc_type = ERSPAN_ENCAP_INFRAME; + } + + skb_push(skb, sizeof(*ershdr)); + ershdr = (struct erspanhdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr)); + + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION << VER_OFFSET)); + ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | + (enc_type << EN_OFFSET & EN_MASK) | + ((truncate << T_OFFSET) & T_MASK)); + ershdr->md.index = htonl(index & INDEX_MASK); +} + #endif -- cgit v1.2.3 From 5a963eb61b7c39e6c422b6e48619d19d04719358 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 30 Nov 2017 11:51:29 -0800 Subject: ip6_gre: Add ERSPAN native tunnel support The patch adds support for ERSPAN tunnel over ipv6. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index d66f70f63734..109a5a8877ef 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -36,6 +36,7 @@ struct __ip6_tnl_parm { __be32 o_key; __u32 fwmark; + __u32 index; /* ERSPAN type II index */ }; /* IPv6 tunnel */ -- cgit v1.2.3 From 80e023607982faa6245507c45acf93bb0feb0ded Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 30 Nov 2017 11:23:57 -0500 Subject: net: dsa: remove trans argument from vlan ops The DSA switch VLAN ops pass the switchdev_trans structure down to the drivers, but no one is using them and they aren't supposed to anyway. Remove the trans argument from VLAN prepare and add operations. At the same time, fix the following checkpatch warning: WARNING: line over 80 characters #74: FILE: drivers/net/dsa/dsa_loop.c:177: + const struct switchdev_obj_port_vlan *vlan) Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2a05738570d8..0c4fbb34379e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -412,12 +412,10 @@ struct dsa_switch_ops { */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering); - int (*port_vlan_prepare)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans); - void (*port_vlan_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_vlan *vlan, - struct switchdev_trans *trans); + int (*port_vlan_prepare)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan); + void (*port_vlan_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_vlan *vlan); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); /* -- cgit v1.2.3 From 3709aadc8375a1b0c42da5b12e38eddf8133dd4e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 30 Nov 2017 11:23:58 -0500 Subject: net: dsa: remove trans argument from mdb ops The DSA switch MDB ops pass the switchdev_trans structure down to the drivers, but no one is using them and they aren't supposed to anyway. Remove the trans argument from MDB prepare and add operations. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0c4fbb34379e..6700dff46a80 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -431,12 +431,10 @@ struct dsa_switch_ops { /* * Multicast database */ - int (*port_mdb_prepare)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans); - void (*port_mdb_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb, - struct switchdev_trans *trans); + int (*port_mdb_prepare)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb); + void (*port_mdb_add)(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); /* -- cgit v1.2.3 From 3b8fac5d905ee15346ba2de1e2427ef43fbbd880 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 30 Nov 2017 12:56:42 -0500 Subject: net: dsa: introduce dsa_towards_port helper Add a new helper returning the local port used to reach an arbitrary switch port in the fabric. Its only user at the moment is the dsa_upstream_port helper, which returns the local port reaching the dedicated CPU port, but it will be used in cross-chip FDB operations. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6700dff46a80..8198efcc8ced 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,20 +296,23 @@ static inline u32 dsa_user_ports(struct dsa_switch *ds) return mask; } +/* Return the local port used to reach an arbitrary switch port */ +static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, + int port) +{ + if (device == ds->index) + return port; + else + return ds->rtable[device]; +} + +/* Return the local port used to reach the dedicated CPU port */ static inline u8 dsa_upstream_port(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *cpu_dp = dst->cpu_dp; - /* - * If this is the root switch (i.e. the switch that connects - * to the CPU), return the cpu port number on this switch. - * Else return the (DSA) port number that connects to the - * switch that is one hop closer to the cpu. - */ - if (dst->cpu_dp->ds == ds) - return dst->cpu_dp->index; - else - return ds->rtable[dst->cpu_dp->ds->index]; + return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, -- cgit v1.2.3 From 76d013b20ba9a5f88eee7c90ac82cbc3ee64be18 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:29 -0800 Subject: inet: Add a count to struct inet_listen_hashbucket This patch adds a count to the 'struct inet_listen_hashbucket'. It counts how many sk is hashed to a bucket. It will be used to decide if the (to-be-added) portaddr listener's hashtable should be used during inet[6]_lookup_listener(). Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 2dbbbff5e1e3..4cce516c41ac 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -111,6 +111,7 @@ struct inet_bind_hashbucket { */ struct inet_listen_hashbucket { spinlock_t lock; + unsigned int count; struct hlist_head head; }; -- cgit v1.2.3 From f0b1e64c1331dd8a2f0c30fcd0838db6cb406098 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:30 -0800 Subject: udp: Move udp[46]_portaddr_hash() to net/ip[v6].h This patch moves the udp[46]_portaddr_hash() to net/ip[v6].h. The function name is renamed to ipv[46]_portaddr_hash(). It will be used by a later patch which adds a second listener hashtable hashed by the address and port. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 9 +++++++++ include/net/ipv6.h | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 9896f46cbbf1..fc9bf1b1fe2c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -26,12 +26,14 @@ #include #include #include +#include #include #include #include #include #include +#include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ @@ -521,6 +523,13 @@ static inline unsigned int ipv4_addr_hash(__be32 ip) return (__force unsigned int) ip; } +static inline u32 ipv4_portaddr_hash(const struct net *net, + __be32 saddr, + unsigned int port) +{ + return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; +} + bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f73797e2fa60..25be4715578c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -22,6 +22,7 @@ #include #include #include +#include #define SIN6_LEN_RFC2133 24 @@ -673,6 +674,22 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } +static inline u32 ipv6_portaddr_hash(const struct net *net, + const struct in6_addr *addr6, + unsigned int port) +{ + unsigned int hash, mix = net_hash_mix(net); + + if (ipv6_addr_any(addr6)) + hash = jhash_1word(0, mix); + else if (ipv6_addr_v4mapped(addr6)) + hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); + else + hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); + + return hash ^ port; +} + /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) -- cgit v1.2.3 From 61b7c691c7317529375f90f0a81a331990b1ec1b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 1 Dec 2017 12:52:31 -0800 Subject: inet: Add a 2nd listener hashtable (port+addr) The current listener hashtable is hashed by port only. When a process is listening at many IP addresses with the same port (e.g. [IP1]:443, [IP2]:443... [IPN]:443), the inet[6]_lookup_listener() performance is degraded to a link list. It is prone to syn attack. UDP had a similar issue and a second hashtable was added to resolve it. This patch adds a second hashtable for the listener's sockets. The second hashtable is hashed by port and address. It cannot reuse the existing skc_portaddr_node which is shared with skc_bind_node. TCP listener needs to use skc_bind_node. Instead, this patch adds a hlist_node 'icsk_listen_portaddr_node' to the inet_connection_sock which the listener (like TCP) also belongs to. The new portaddr hashtable may need two lookup (First by IP:PORT. Second by INADDR_ANY:PORT if the IP:PORT is a not found). Hence, it implements a similar cut off as UDP such that it will only consult the new portaddr hashtable if the current port-only hashtable has >10 sk in the link-list. lhash2 and lhash2_mask are added to 'struct inet_hashinfo'. I take this chance to plug a 4 bytes hole. It is done by first moving the existing bind_bucket_cachep up and then add the new (int lhash2_mask, *lhash2) after the existing bhash_size. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 ++ include/net/inet_hashtables.h | 28 ++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 0358745ea059..8e1bf9ae4a5e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -77,6 +77,7 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data + * @icsk_listen_portaddr_node hash to the portaddr listener hashtable * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -101,6 +102,7 @@ struct inet_connection_sock { const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void *icsk_ulp_data; + struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:6, icsk_ca_setsockopt:1, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4cce516c41ac..9141e95529e7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -133,12 +133,13 @@ struct inet_hashinfo { /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ + struct kmem_cache *bind_bucket_cachep; struct inet_bind_hashbucket *bhash; - unsigned int bhash_size; - /* 4 bytes hole on 64 bit */ - struct kmem_cache *bind_bucket_cachep; + /* The 2nd listener table hashed by local port and address */ + unsigned int lhash2_mask; + struct inet_listen_hashbucket *lhash2; /* All the above members are written once at bootup and * never written again _or_ are predominantly read-access. @@ -146,14 +147,25 @@ struct inet_hashinfo { * Now align to a new cache line as all the following members * might be often dirty. */ - /* All sockets in TCP_LISTEN state will be in here. This is the only - * table where wildcard'd TCP sockets can exist. Hash function here - * is just local port number. + /* All sockets in TCP_LISTEN state will be in listening_hash. + * This is the only table where wildcard'd TCP sockets can + * exist. listening_hash is only hashed by local port number. + * If lhash2 is initialized, the same socket will also be hashed + * to lhash2 by port and address. */ struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] ____cacheline_aligned_in_smp; }; +#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \ + hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node) + +static inline struct inet_listen_hashbucket * +inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) +{ + return &h->lhash2[hash & h->lhash2_mask]; +} + static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) @@ -209,6 +221,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); +void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, + unsigned long numentries, int scale, + unsigned long low_limit, + unsigned long high_limit); bool inet_ehash_insert(struct sock *sk, struct sock *osk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); -- cgit v1.2.3 From b4d1605a8ea608fd7dc45b926a05d75d340bde4b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 3 Dec 2017 09:33:00 -0800 Subject: tcp: use IPCB instead of TCP_SKB_CB in inet_exact_dif_match() After this fix : ("tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()"), socket lookups happen while skb->cb[] has not been mangled yet by TCP. Fixes: a04a480d4392 ("net: Require exact match for TCP socket lookups if dif is l3mdev") Signed-off-by: David Ahern Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4e09398009c1..6998707e81f3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -844,12 +844,11 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb) } #endif -/* TCP_SKB_CB reference means this can not be used from early demux */ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) return true; #endif return false; -- cgit v1.2.3 From e4202511480da5f8e6870d8f6ecbb821aeaa8caf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 2 Dec 2017 21:44:06 +0100 Subject: rtnetlink: get reference on module before invoking handlers Add yet another rtnl_register function. It will be used by modules that can be removed. The passed module struct is used to prevent module unload while a netlink dump is in progress or when a DOIT_UNLOCKED doit callback is called. Cc: Peter Zijlstra Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ead018744ff5..e326b3f9eb5f 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -17,6 +17,8 @@ int __rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); +int rtnl_register_module(struct module *owner, int protocol, int msgtype, + rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_unregister(int protocol, int msgtype); void rtnl_unregister_all(int protocol); -- cgit v1.2.3 From 16feebcf2350aa369001a529f50ce33f2472c01c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 2 Dec 2017 21:44:08 +0100 Subject: rtnetlink: remove __rtnl_register This removes __rtnl_register and switches callers to either rtnl_register or rtnl_register_module. Also, rtnl_register() will now print an error if memory allocation failed rather than panic the kernel. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/rtnetlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e326b3f9eb5f..14b6b3af8918 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -13,8 +13,6 @@ enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = 1, }; -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); void rtnl_register(int protocol, int msgtype, rtnl_doit_func, rtnl_dumpit_func, unsigned int flags); int rtnl_register_module(struct module *owner, int protocol, int msgtype, -- cgit v1.2.3 From a3fde2addd5f0218b64102005a237ef727b0dc30 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Dec 2017 19:19:18 +0100 Subject: rtnetlink: ipv6: convert remaining users to rtnl_register_module convert remaining users of rtnl_register to rtnl_register_module and un-export rtnl_register. Requested-by: David S. Miller Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index b623b65a79d1..c4185a7b0e90 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -180,7 +180,7 @@ static inline int addrconf_finite_timeout(unsigned long timeout) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); -void ipv6_addr_label_rtnl_register(void); +int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); -- cgit v1.2.3 From f19397a5c65665d66e3866b42056f1f58b7a366b Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Fri, 1 Dec 2017 10:15:04 -0800 Subject: bpf: Add access to snd_cwnd and others in sock_ops Adds read access to snd_cwnd and srtt_us fields of tcp_sock. Since these fields are only valid if the socket associated with the sock_ops program call is a full socket, the field is_fullsock is also added to the bpf_sock_ops struct. If the socket is not a full socket, reading these fields returns 0. Note that in most cases it will not be necessary to check is_fullsock to know if there is a full socket. The context of the call, as specified by the 'op' field, can sometimes determine whether there is a full socket. The struct bpf_sock_ops has the following fields added: __u32 is_fullsock; /* Some TCP fields are only valid if * there is a full socket. If not, the * fields read as zero. */ __u32 snd_cwnd; __u32 srtt_us; /* Averaged RTT << 3 in usecs */ There is a new macro, SOCK_OPS_GET_TCP32(NAME), to make it easier to add read access to more 32 bit tcp_sock fields. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 4e09398009c1..89a656077884 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2012,10 +2012,12 @@ static inline int tcp_call_bpf(struct sock *sk, int op) struct bpf_sock_ops_kern sock_ops; int ret; - if (sk_fullsock(sk)) + memset(&sock_ops, 0, sizeof(sock_ops)); + if (sk_fullsock(sk)) { + sock_ops.is_fullsock = 1; sock_owned_by_me(sk); + } - memset(&sock_ops, 0, sizeof(sock_ops)); sock_ops.sk = sk; sock_ops.op = op; -- cgit v1.2.3 From 5c472203421ab4f928aa1ae9e1dbcfdd80324148 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 4 Dec 2017 13:31:10 +0200 Subject: net_sched: red: Avoid devision by zero Do not allow delta value to be zero since it is used as a divisor. Fixes: 8af2a218de38 ("sch_red: Adaptative RED AQM") Signed-off-by: Nogah Frankel Signed-off-by: David S. Miller --- include/net/red.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/red.h b/include/net/red.h index 9a9347710701..5918f78d36a0 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -179,7 +179,7 @@ static inline void red_set_parms(struct red_parms *p, p->qth_max = qth_max << Wlog; p->Wlog = Wlog; p->Plog = Plog; - if (delta < 0) + if (delta <= 0) delta = 1; p->qth_delta = delta; if (!max_P) { -- cgit v1.2.3 From 8afa10cbe281b10371fee5a87ab266e48d71a7f9 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Mon, 4 Dec 2017 13:31:11 +0200 Subject: net_sched: red: Avoid illegal values Check the qmin & qmax values doesn't overflow for the given Wlog value. Check that qmin <= qmax. Fixes: a783474591f2 ("[PKT_SCHED]: Generic RED layer") Signed-off-by: Nogah Frankel Signed-off-by: David S. Miller --- include/net/red.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/red.h b/include/net/red.h index 5918f78d36a0..9665582c4687 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -168,6 +168,17 @@ static inline void red_set_vars(struct red_vars *v) v->qcount = -1; } +static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog) +{ + if (fls(qth_min) + Wlog > 32) + return false; + if (fls(qth_max) + Wlog > 32) + return false; + if (qth_max < qth_min) + return false; + return true; +} + static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab, u32 max_P) -- cgit v1.2.3 From efbf78973978b0d25af59bc26c8013a942af6e64 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 4 Dec 2017 10:48:18 -0800 Subject: net_sched: get rid of rcu_barrier() in tcf_block_put_ext() Both Eric and Paolo noticed the rcu_barrier() we use in tcf_block_put_ext() could be a performance bottleneck when we have a lot of tc classes. Paolo provided the following to demonstrate the issue: tc qdisc add dev lo root htb for I in `seq 1 1000`; do tc class add dev lo parent 1: classid 1:$I htb rate 100kbit tc qdisc add dev lo parent 1:$I handle $((I + 1)): htb for J in `seq 1 10`; do tc filter add dev lo parent $((I + 1)): u32 match ip src 1.1.1.$J done done time tc qdisc del dev root real 0m54.764s user 0m0.023s sys 0m0.000s The rcu_barrier() there is to ensure we free the block after all chains are gone, that is, to queue tcf_block_put_final() at the tail of workqueue. We can achieve this ordering requirement by refcnt'ing tcf block instead, that is, the tcf block is freed only when the last chain in this block is gone. This also simplifies the code. Paolo reported after this patch we get: real 0m0.017s user 0m0.000s sys 0m0.017s Tested-by: Paolo Abeni Cc: Eric Dumazet Cc: Jiri Pirko Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 65d0d25f2648..02e7ad8b8dad 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -278,7 +278,6 @@ struct tcf_block { struct net *net; struct Qdisc *q; struct list_head cb_list; - struct work_struct work; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.2.3 From 0ac4bd68ab50a9f0860b10caacc1285fda5da0ca Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 4 Dec 2017 18:39:59 -0500 Subject: net: sched: sch_api: fix code style issues This patch fix checkpatch issues for upcomming patches according to the sched api file. It changes checking on null pointer, remove unnecessary brackets, add variable names for parameters and adjust 80 char width. Cc: David Ahern Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/sch_generic.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 02e7ad8b8dad..7dd8b0b0d244 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -161,7 +161,8 @@ struct Qdisc_class_ops { void (*walk)(struct Qdisc *, struct qdisc_walker * arg); /* Filter manipulation */ - struct tcf_block * (*tcf_block)(struct Qdisc *, unsigned long); + struct tcf_block * (*tcf_block)(struct Qdisc *sch, + unsigned long arg); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); @@ -185,11 +186,12 @@ struct Qdisc_ops { struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); - int (*init)(struct Qdisc *, struct nlattr *arg); + int (*init)(struct Qdisc *sch, struct nlattr *arg); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); - int (*change)(struct Qdisc *, struct nlattr *arg); - void (*attach)(struct Qdisc *); + int (*change)(struct Qdisc *sch, + struct nlattr *arg); + void (*attach)(struct Qdisc *sch); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); -- cgit v1.2.3 From 07073c79bf878988d8d0da94869fa5f9d1aa5005 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Tue, 5 Dec 2017 15:34:13 -0500 Subject: net: dsa: return per-port upstream port The current dsa_upstream_port() helper still assumes a unique CPU port in the whole switch fabric. This is becoming wrong, as every port in the fabric has its dedicated CPU port, thus every port has an upstream port. Add a port argument to the dsa_upstream_port() helper and fetch its CPU port instead of the deprecated unique fabric CPU port. A CPU or unused port has no dedicated CPU port, so return itself in this case. At the same time, change the return value from u8 to unsigned int since there is no need to limit the size here. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8198efcc8ced..d29feccaefab 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -307,10 +307,13 @@ static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device, } /* Return the local port used to reach the dedicated CPU port */ -static inline u8 dsa_upstream_port(struct dsa_switch *ds) +static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) { - struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *cpu_dp = dst->cpu_dp; + const struct dsa_port *dp = dsa_to_port(ds, port); + const struct dsa_port *cpu_dp = dp->cpu_dp; + + if (!cpu_dp) + return port; return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } -- cgit v1.2.3 From d7efc6c11b277d9d80b99b1334a78bfe7d7edf10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Dec 2017 12:45:56 -0800 Subject: net: remove hlist_nulls_add_tail_rcu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Alexander Potapenko reported use of uninitialized memory [1] This happens when inserting a request socket into TCP ehash, in __sk_nulls_add_node_rcu(), since sk_reuseport is not initialized. Bug was added by commit d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets") Note that d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix") missed the opportunity to get rid of hlist_nulls_add_tail_rcu() : Both UDP sockets and TCP/DCCP listeners no longer use __sk_nulls_add_node_rcu() for their hash insertion. Since all other sockets have unique 4-tuple, the reuseport status has no special meaning, so we can always use hlist_nulls_add_head_rcu() for them and save few cycles/instructions. [1] ================================================================== BUG: KMSAN: use of uninitialized memory in inet_ehash_insert+0xd40/0x1050 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.13.0+ #3288 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace:    __dump_stack lib/dump_stack.c:16  dump_stack+0x185/0x1d0 lib/dump_stack.c:52  kmsan_report+0x13f/0x1c0 mm/kmsan/kmsan.c:1016  __msan_warning_32+0x69/0xb0 mm/kmsan/kmsan_instr.c:766  __sk_nulls_add_node_rcu ./include/net/sock.h:684  inet_ehash_insert+0xd40/0x1050 net/ipv4/inet_hashtables.c:413  reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:754  inet_csk_reqsk_queue_hash_add+0x1cc/0x300 net/ipv4/inet_connection_sock.c:765  tcp_conn_request+0x31e7/0x36f0 net/ipv4/tcp_input.c:6414  tcp_v4_conn_request+0x16d/0x220 net/ipv4/tcp_ipv4.c:1314  tcp_rcv_state_process+0x42a/0x7210 net/ipv4/tcp_input.c:5917  tcp_v4_do_rcv+0xa6a/0xcd0 net/ipv4/tcp_ipv4.c:1483  tcp_v4_rcv+0x3de0/0x4ab0 net/ipv4/tcp_ipv4.c:1763  ip_local_deliver_finish+0x6bb/0xcb0 net/ipv4/ip_input.c:216  NF_HOOK ./include/linux/netfilter.h:248  ip_local_deliver+0x3fa/0x480 net/ipv4/ip_input.c:257  dst_input ./include/net/dst.h:477  ip_rcv_finish+0x6fb/0x1540 net/ipv4/ip_input.c:397  NF_HOOK ./include/linux/netfilter.h:248  ip_rcv+0x10f6/0x15c0 net/ipv4/ip_input.c:488  __netif_receive_skb_core+0x36f6/0x3f60 net/core/dev.c:4298  __netif_receive_skb net/core/dev.c:4336  netif_receive_skb_internal+0x63c/0x19c0 net/core/dev.c:4497  napi_skb_finish net/core/dev.c:4858  napi_gro_receive+0x629/0xa50 net/core/dev.c:4889  e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4018  e1000_clean_rx_irq+0x1492/0x1d30 drivers/net/ethernet/intel/e1000/e1000_main.c:4474  e1000_clean+0x43aa/0x5970 drivers/net/ethernet/intel/e1000/e1000_main.c:3819  napi_poll net/core/dev.c:5500  net_rx_action+0x73c/0x1820 net/core/dev.c:5566  __do_softirq+0x4b4/0x8dd kernel/softirq.c:284  invoke_softirq kernel/softirq.c:364  irq_exit+0x203/0x240 kernel/softirq.c:405  exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:638  do_IRQ+0x15e/0x1a0 arch/x86/kernel/irq.c:263  common_interrupt+0x86/0x86 Fixes: d894ba18d4e4 ("soreuseport: fix ordering for mixed v4/v6 sockets") Fixes: d296ba60d8e2 ("soreuseport: Resolve merge conflict for v4/v6 ordering fix") Signed-off-by: Eric Dumazet Reported-by: Alexander Potapenko Acked-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/sock.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 79e1a2c7912c..9155da422692 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -685,11 +685,7 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) - hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); - else - hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) -- cgit v1.2.3 From 9a63b255dffd6de31fe47a80d16d26d0291d3714 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 5 Dec 2017 12:53:07 -0800 Subject: net_sched: remove unused parameter from act cleanup ops No one actually uses it. Cc: Jiri Pirko Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index fd08df74c466..02bf409140d0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -86,7 +86,7 @@ struct tc_action_ops { int (*act)(struct sk_buff *, const struct tc_action *, struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); - void (*cleanup)(struct tc_action *, int bind); + void (*cleanup)(struct tc_action *); int (*lookup)(struct net *, struct tc_action **, u32); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, -- cgit v1.2.3 From 9f8a739e72f1546fb0f8c518af1193522c45be12 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 5 Dec 2017 16:17:26 -0800 Subject: act_mirred: get rid of tcfm_ifindex from struct tcf_mirred tcfm_dev always points to the correct netdev and we already hold a refcnt, so no need to use tcfm_ifindex to lookup again. If we would support moving target netdev across netns, using pointer would be better than ifindex. This also fixes dumping obsolete ifindex, now after the target device is gone we just dump 0 as ifindex. Cc: Jiri Pirko Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 21d253c9a8c6..a2e9cbca5c9e 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,10 +8,8 @@ struct tcf_mirred { struct tc_action common; int tcfm_eaction; - int tcfm_ifindex; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; - struct net *net; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) @@ -34,9 +32,9 @@ static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a) return false; } -static inline int tcf_mirred_ifindex(const struct tc_action *a) +static inline struct net_device *tcf_mirred_dev(const struct tc_action *a) { - return to_mirred(a)->tcfm_ifindex; + return rtnl_dereference(to_mirred(a)->tcfm_dev); } #endif /* __NET_TC_MIR_H */ -- cgit v1.2.3 From 2a93c1a3651fb41b580676c849887b68af6da02b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 6 Dec 2017 15:03:33 -0800 Subject: net: dsa: Allow compiling out legacy support Introduce a configuration option: CONFIG_NET_DSA_LEGACY allowing to compile out support for the old platform device and Device Tree binding registration. Support for these configurations is scheduled to be removed in 4.17. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index d29feccaefab..6cb602dd970c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -321,12 +321,14 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { +#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) /* * Legacy probing. */ const char *(*probe)(struct device *dsa_dev, struct device *host_dev, int sw_addr, void **priv); +#endif enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port); @@ -474,11 +476,20 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; +#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) /* Legacy driver registration */ void register_switch_driver(struct dsa_switch_driver *type); void unregister_switch_driver(struct dsa_switch_driver *type); struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); +#else +static inline void register_switch_driver(struct dsa_switch_driver *type) { } +static inline void unregister_switch_driver(struct dsa_switch_driver *type) { } +static inline struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) +{ + return NULL; +} +#endif struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ -- cgit v1.2.3 From d4761754b4fb2ef8d9a1e9d121c4bec84e1fe292 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Thu, 7 Dec 2017 13:41:34 -0800 Subject: tcp: invalidate rate samples during SACK reneging Mark tcp_sock during a SACK reneging event and invalidate rate samples while marked. Such rate samples may overestimate bw by including packets that were SACKed before reneging. < ack 6001 win 10000 sack 7001:38001 < ack 7001 win 0 sack 8001:38001 // Reneg detected > seq 7001:8001 // RTO, SACK cleared. < ack 38001 win 10000 In above example the rate sample taken after the last ack will count 7001-38001 as delivered while the actual delivery rate likely could be much lower i.e. 7001-8001. This patch adds a new field tcp_sock.sack_reneg and marks it when we declare SACK reneging and entering TCP_CA_Loss, and unmarks it after the last rate sample was taken before moving back to TCP_CA_Open. This patch also invalidates rate samples taken while tcp_sock.is_sack_reneg is set. Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection") Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Acked-by: Eric Dumazet Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6998707e81f3..6da880d2f022 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1055,7 +1055,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct rate_sample *rs); + bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK -- cgit v1.2.3 From 6c148184b5c868ad2c8a5a4a777cd8097622368a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:06 -0800 Subject: net: sched: cleanup qdisc_run and __qdisc_run semantics Currently __qdisc_run calls qdisc_run_end() but does not call qdisc_run_begin(). This makes it hard to track pairs of qdisc_run_{begin,end} across function calls. To simplify reading these code paths this patch moves begin/end calls into qdisc_run(). Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index d1f413f06c72..4eea7198898e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -113,8 +113,10 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { - if (qdisc_run_begin(q)) + if (qdisc_run_begin(q)) { __qdisc_run(q); + qdisc_run_end(q); + } } static inline __be16 tc_skb_protocol(const struct sk_buff *skb) -- cgit v1.2.3 From 6b3ba9146fe64b9bebb6346c9dcfe3b4851de2d7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:25 -0800 Subject: net: sched: allow qdiscs to handle locking This patch adds a flag for queueing disciplines to indicate the stack does not need to use the qdisc lock to protect operations. This can be used to build lockless scheduling algorithms and improving performance. The flag is checked in the tx path and the qdisc lock is only taken if it is not set. For now use a conditional if statement. Later we could be more aggressive if it proves worthwhile and use a static key or wrap this in a likely(). Also the lockless case drops the TCQ_F_CAN_BYPASS logic. The reason for this is synchronizing a qlen counter across threads proves to cost more than doing the enqueue/dequeue operations when tested with pktgen. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7dd8b0b0d244..77791fa055de 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,6 +71,7 @@ struct Qdisc { * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ +#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; -- cgit v1.2.3 From 29b86cdac00a82f88b81c16659e64cc624550216 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:47 -0800 Subject: net: sched: remove remaining uses for qdisc_qlen in xmit path sch_direct_xmit() uses qdisc_qlen as a return value but all call sites of the routine only check if it is zero or not. Simplify the logic so that we don't need to return an actual queue length value. This introduces a case now where sch_direct_xmit would have returned a qlen of zero but now it returns true. However in this case all call sites of sch_direct_xmit will implement a dequeue() and get a null skb and abort. This trades tracking qlen in the hotpath for an extra dequeue operation. Overall this seems to be good for performance. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4eea7198898e..240469228851 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -105,9 +105,9 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); -int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock, bool validate); +bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); -- cgit v1.2.3 From 40bd036219dca86938fd5bd84b3fc19ffa812596 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:55:07 -0800 Subject: net: sched: provide per cpu qstat helpers The per cpu qstats support was added with per cpu bstat support which is currently used by the ingress qdisc. This patch adds a set of helpers needed to make other qdiscs that use qstats per cpu as well. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 77791fa055de..eff31d824861 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -632,12 +632,39 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, sch->qstats.backlog -= qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, + const struct sk_buff *skb) +{ + this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); +} + static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, + const struct sk_buff *skb) +{ + this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); +} + +static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) +{ + this_cpu_inc(sch->cpu_qstats->qlen); +} + +static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) +{ + this_cpu_dec(sch->cpu_qstats->qlen); +} + +static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) +{ + this_cpu_inc(sch->cpu_qstats->requeues); +} + static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; @@ -845,6 +872,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) qdisc_qstats_drop(sch); } +static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + __qdisc_drop(skb, to_free); + qdisc_qstats_cpu_drop(sch); + + return NET_XMIT_DROP; +} static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) -- cgit v1.2.3 From d59f5ffa59d80ff3a2f3b56a9acea4310974c6d1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:55:26 -0800 Subject: net: sched: a dflt qdisc may be used with per cpu stats Enable dflt qdisc support for per cpu stats before this patch a dflt qdisc was required to use the global statistics qstats and bstats. This adds a static flags field to qdisc_ops that is propagated into qdisc->flags in qdisc allocate call. This allows the allocation block to completely allocate the qdisc object so we don't have dangling allocations after qdisc init. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index eff31d824861..6fd9a4e70066 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -180,6 +180,7 @@ struct Qdisc_ops { const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; + unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, -- cgit v1.2.3 From a53851e2c3218aa30b77abd6e68cf1c371f15afe Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:55:45 -0800 Subject: net: sched: explicit locking in gso_cpu fallback This work is preparing the qdisc layer to support egress lockless qdiscs. If we are running the egress qdisc lockless in the case we overrun the netdev, for whatever reason, the netdev returns a busy error code and the skb is parked on the gso_skb pointer. With many cores all hitting this case at once its possible to have multiple sk_buffs here so we turn gso_skb into a queue. This should be the edge case and if we see this frequently then the netdev/qdisc layer needs to back off. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6fd9a4e70066..9b9e4feda127 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -88,7 +88,7 @@ struct Qdisc { /* * For performance sake on SMP, we put highly modified fields at the end */ - struct sk_buff *gso_skb ____cacheline_aligned_in_smp; + struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; struct gnet_stats_basic_packed bstats; seqcount_t running; @@ -796,26 +796,30 @@ static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) /* generic pseudo peek method for non-work-conserving qdisc */ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) { + struct sk_buff *skb = skb_peek(&sch->gso_skb); + /* we can reuse ->gso_skb because peek isn't called for root qdiscs */ - if (!sch->gso_skb) { - sch->gso_skb = sch->dequeue(sch); - if (sch->gso_skb) { + if (!skb) { + skb = sch->dequeue(sch); + + if (skb) { + __skb_queue_head(&sch->gso_skb, skb); /* it's still part of the queue */ - qdisc_qstats_backlog_inc(sch, sch->gso_skb); + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } } - return sch->gso_skb; + return skb; } /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { - struct sk_buff *skb = sch->gso_skb; + struct sk_buff *skb = skb_peek(&sch->gso_skb); if (skb) { - sch->gso_skb = NULL; + skb = __skb_dequeue(&sch->gso_skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } else { -- cgit v1.2.3 From 70e57d5e3f8ec7c482b92ef43e543d87134689ab Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:56:23 -0800 Subject: net: sched: use skb list for skb_bad_tx Similar to how gso is handled use skb list for skb_bad_tx this is required with lockless qdiscs because we may have multiple cores attempting to push skbs into skb_bad_tx concurrently Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9b9e4feda127..da2528036e2e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -95,7 +95,7 @@ struct Qdisc { struct gnet_stats_queue qstats; unsigned long state; struct Qdisc *next_sched; - struct sk_buff *skb_bad_txq; + struct sk_buff_head skb_bad_txq; int padded; refcount_t refcnt; -- cgit v1.2.3 From 7e66016f2c65bfc1181f42274fcb7f1183ab1bb5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:00 -0800 Subject: net: sched: helpers to sum qlen and qlen for per cpu logic Add qdisc qlen helper routines for lockless qdiscs to use. The qdisc qlen is no longer used in the hotpath but it is reported via stats query on the qdisc so it still needs to be tracked. This adds the per cpu operations needed along with a helper to return the summation of per cpu stats. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index da2528036e2e..8f8c0afe529b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -292,11 +292,31 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_cpu(const struct Qdisc *q) +{ + return this_cpu_ptr(q->cpu_qstats)->qlen; +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } +static inline int qdisc_qlen_sum(const struct Qdisc *q) +{ + __u32 qlen = 0; + int i; + + if (q->flags & TCQ_F_NOLOCK) { + for_each_possible_cpu(i) + qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; + } else { + qlen = q->q.qlen; + } + + return qlen; +} + static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; -- cgit v1.2.3 From b01ac095c740fc21f4bb21abe900b0f5b3042cf9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:20 -0800 Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq The sch_mq qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Also exports __gnet_stats_copy_queue() to use as a helper function. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 304f7aa9cc01..0304ba2ae353 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -49,6 +49,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d, int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu_q, + const struct gnet_stats_queue *q, __u32 qlen); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); -- cgit v1.2.3 From e2fb1b839208ad776c0ffbb55f17e6968389ce02 Mon Sep 17 00:00:00 2001 From: Yingying Tang Date: Tue, 24 Oct 2017 16:51:10 +0800 Subject: mac80211: enable TDLS peer buffer STA feature Allow drivers to set the buffer station extended capability for TDLS links, with a new hardware flag indicating this. Signed-off-by: Yingying Tang [change commit log/documentation wording] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index eec143cca1c0..2ee4af25256d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2056,6 +2056,9 @@ struct ieee80211_txq { * The stack will not do fragmentation. * The callback for @set_frag_threshold should be set as well. * + * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on + * TDLS links. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2098,6 +2101,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TX_FRAG_LIST, IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, + IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 9ae3b172e886c851d8cd6a88569025160b485e95 Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Sun, 29 Oct 2017 11:51:11 +0200 Subject: cfg80211: IBSS: Add support for static WEP in driver for IBSS Add support for drivers that implement static WEP internally for IBSS. Add the WEP keys to the IBSS params struct, that will allow the driver to use the keys in the join flow, and not only after the connection. Signed-off-by: Tova Mussai Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8b8118a7fadb..698cebf2de2a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2021,6 +2021,9 @@ struct cfg80211_disassoc_request { * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. + * @wep_keys: static WEP keys, if not NULL points to an array of + * CFG80211_MAX_WEP_KEYS WEP keys + * @wep_tx_key: key index (0..3) of the default TX static WEP key */ struct cfg80211_ibss_params { const u8 *ssid; @@ -2037,6 +2040,8 @@ struct cfg80211_ibss_params { int mcast_rate[NUM_NL80211_BANDS]; struct ieee80211_ht_cap ht_capa; struct ieee80211_ht_cap ht_capa_mask; + struct key_params *wep_keys; + int wep_tx_key; }; /** -- cgit v1.2.3 From 6c2fb1e6527b1092e09c786fee66b1759fa9d574 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Thu, 9 Nov 2017 14:46:30 +0300 Subject: cfg80211: cleanup signal strength units notation Both cfg80211_rx_mgmt and cfg80211_report_obss_beacon functions send reports to userspace using NL80211_ATTR_RX_SIGNAL_DBM attribute w/o any processing of their input signal values. Which means that in order to match userspace tools expectations, input signal values for those functions are supposed to be in dBm units. This patch cleans up comments, variable names, and trace reports for those functions, replacing confusing 'mBm' by 'dBm'. Signed-off-by: Sergey Matyukevich Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 698cebf2de2a..d7f8e7b96bcb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5581,7 +5581,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, * cfg80211_rx_mgmt - notification of received, unprocessed management frame * @wdev: wireless device receiving the frame * @freq: Frequency on which the frame was received in MHz - * @sig_dbm: signal strength in mBm, or 0 if unknown + * @sig_dbm: signal strength in dBm, or 0 if unknown * @buf: Management frame (header + body) * @len: length of the frame data * @flags: flags, as defined in enum nl80211_rxmgmt_flags @@ -5760,7 +5760,7 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, * @frame: the frame * @len: length of the frame * @freq: frequency the frame was received on - * @sig_dbm: signal strength in mBm, or 0 if unknown + * @sig_dbm: signal strength in dBm, or 0 if unknown * * Use this function to report to userspace when a beacon was * received. It is not useful to call this when there is no -- cgit v1.2.3 From 9de18d8186cb070d22ed67a3f75a2ef5fbf3ef6f Mon Sep 17 00:00:00 2001 From: David Spinadel Date: Fri, 1 Dec 2017 13:50:52 +0200 Subject: mac80211: Add MIC space only for TX key option Add a key flag to indicates that the device only needs MIC space and not a real MIC. In such cases, keep the MIC zeroed for ease of debug. Signed-off-by: David Spinadel Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2ee4af25256d..906e90223066 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1552,6 +1552,9 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * @IEEE80211_KEY_FLAG_RESERVE_TAILROOM: This flag should be set by the * driver for a key to indicate that sufficient tailroom must always * be reserved for ICV or MIC, even when HW encryption is enabled. + * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for + * a TKIP key if it only requires MIC space. Do not set together with + * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -1562,6 +1565,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_PUT_IV_SPACE = BIT(5), IEEE80211_KEY_FLAG_RX_MGMT = BIT(6), IEEE80211_KEY_FLAG_RESERVE_TAILROOM = BIT(7), + IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), }; /** @@ -1593,8 +1597,8 @@ struct ieee80211_key_conf { u8 icv_len; u8 iv_len; u8 hw_key_idx; - u8 flags; s8 keyidx; + u16 flags; u8 keylen; u8 key[0]; }; -- cgit v1.2.3 From e937b8da5a591f141fe41aa48a2e898df9888c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 31 Oct 2017 12:27:45 +0100 Subject: mac80211: Add TXQ scheduling API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds an API to mac80211 to handle scheduling of TXQs and changes the interface between driver and mac80211 for TXQ handling as follows: - The wake_tx_queue callback interface no longer includes the TXQ. Instead, the driver is expected to retrieve that from ieee80211_next_txq() - Two new mac80211 functions are added: ieee80211_next_txq() and ieee80211_schedule_txq(). The former returns the next TXQ that should be scheduled, and is how the driver gets a queue to pull packets from. The latter is called internally by mac80211 to start scheduling a queue, and the driver is supposed to call it to re-schedule the TXQ after it is finished pulling packets from it (unless the queue emptied). The ath9k and ath10k drivers are changed to use the new API. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/mac80211.h | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 906e90223066..45155803c875 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -105,9 +105,12 @@ * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * - * The driver can't access the queue directly. To dequeue a frame, it calls - * ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a queue, it - * calls the .wake_tx_queue driver op. + * The driver can't access the queue directly. To obtain the next queue to pull + * frames from, the driver calls ieee80211_next_txq(). To dequeue a frame from a + * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a + * queue, it calls the .wake_tx_queue driver op. The driver is expected to + * re-schedule the txq using ieee80211_schedule_txq() if it is still active + * after the driver has finished pulling packets from it. * * For AP powersave TIM handling, the driver only needs to indicate if it has * buffered packets in the driver specific data structures by calling @@ -3731,8 +3734,7 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_tdls_ch_sw_params *params); - void (*wake_tx_queue)(struct ieee80211_hw *hw, - struct ieee80211_txq *txq); + void (*wake_tx_queue)(struct ieee80211_hw *hw); void (*sync_rx_queues)(struct ieee80211_hw *hw); int (*start_nan)(struct ieee80211_hw *hw, @@ -5883,13 +5885,36 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); * ieee80211_tx_dequeue - dequeue a packet from a software tx queue * * @hw: pointer as obtained from ieee80211_alloc_hw() - * @txq: pointer obtained from station or virtual interface + * @txq: pointer obtained from ieee80211_next_txq() * * Returns the skb if successful, %NULL if no frame was available. */ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); +/** + * ieee80211_schedule_txq - add txq to scheduling loop + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @txq: pointer obtained from station or virtual interface + * + * Returns %true if the txq was actually added to the scheduling, + * %false otherwise. + */ +bool ieee80211_schedule_txq(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); + +/** + * ieee80211_next_txq - get next tx queue to pull packets from + * + * @hw: pointer as obtained from ieee80211_alloc_hw() + * + * Returns the next txq if successful, %NULL if no queue is eligible. If a txq + * is returned, it will have been removed from the scheduler queue and needs to + * be re-scheduled with ieee80211_schedule_txq() to continue to be active. + */ +struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw); + /** * ieee80211_txq_get_depth - get pending frame/byte count of given txq * -- cgit v1.2.3 From b0d52ad821843a6c5badebd80feef9f871904fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 31 Oct 2017 12:27:46 +0100 Subject: mac80211: Add airtime account and scheduling to TXQs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds airtime accounting and scheduling to the mac80211 TXQ scheduler. A new hardware flag, AIRTIME_ACCOUNTING, is added that drivers can set if they support reporting airtime usage of transmissions. When this flag is set, mac80211 will expect the actual airtime usage to be reported in the tx_time and rx_time fields of the respective status structs. When airtime information is present, mac80211 will schedule TXQs (through ieee80211_next_txq()) in a way that enforces airtime fairness between active stations. This scheduling works the same way as the ath9k in-driver airtime fairness scheduling. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/mac80211.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 45155803c875..531b526a10db 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1188,6 +1188,8 @@ enum mac80211_rx_encoding { * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) * @nss: number of streams (VHT and HE only) * @flag: %RX_FLAG_\* + * @airtime: Duration of frame in usec. See @IEEE80211_HW_AIRTIME_ACCOUNTING for + * how to use this. * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags @@ -1202,6 +1204,7 @@ struct ieee80211_rx_status { u32 device_timestamp; u32 ampdu_reference; u32 flag; + u16 airtime; u16 freq; u8 enc_flags; u8 encoding:2, bw:3; @@ -2066,6 +2069,26 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * + * @IEEE80211_HW_AIRTIME_ACCOUNTING: Hardware supports accounting the airtime + * usage of other stations and reports it in the @tx_time and/or @airtime + * fields of the TX/RX status structs. + * When setting this flag, the driver should ensure that the respective + * fields in the TX and RX status structs are always either zero or + * contains a valid duration for the frame in usec. The driver can choose + * to report either or both of TX and RX airtime, but it is recommended to + * report both. + * The reported airtime should as a minimum include all time that is spent + * transmitting to the remote station, including overhead and padding, but + * not including time spent waiting for a TXOP. If the time is not reported + * by the hardware it can in some cases be calculated from the rate and + * known frame composition. When possible, the time should include any + * failed transmission attempts. + * For aggregated frames, there are two possible strategies to report the + * airtime: Either include the airtime of the entire aggregate in the first + * (or last) frame and leave the others at zero. Alternatively, include the + * overhead of the full aggregate in the first or last frame and report the + * time of each frame + padding not including the full aggregate overhead. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2109,6 +2132,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, + IEEE80211_HW_AIRTIME_ACCOUNTING, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 97a6ec4ac021f7fbec05c15a3aa0c4aaf0461af5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 4 Dec 2017 10:31:41 -0800 Subject: rhashtable: Change rhashtable_walk_start to return void Most callers of rhashtable_walk_start don't care about a resize event which is indicated by a return value of -EAGAIN. So calls to rhashtable_walk_start are wrapped wih code to ignore -EAGAIN. Something like this is common: ret = rhashtable_walk_start(rhiter); if (ret && ret != -EAGAIN) goto out; Since zero and -EAGAIN are the only possible return values from the function this check is pointless. The condition never evaluates to true. This patch changes rhashtable_walk_start to return void. This simplifies code for the callers that ignore -EAGAIN. For the few cases where the caller cares about the resize event, particularly where the table can be walked in mulitple parts for netlink or seq file dump, the function rhashtable_walk_start_check has been added that returns -EAGAIN on a resize event. Signed-off-by: Tom Herbert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 906a9c0efa71..6f79415f6634 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -116,7 +116,7 @@ extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); -int sctp_transport_walk_start(struct rhashtable_iter *iter); +void sctp_transport_walk_start(struct rhashtable_iter *iter); void sctp_transport_walk_stop(struct rhashtable_iter *iter); struct sctp_transport *sctp_transport_get_next(struct net *net, struct rhashtable_iter *iter); -- cgit v1.2.3 From 772a58693fc3116d05b7969223a80a6376e639eb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:03:58 +0800 Subject: sctp: add stream interleave enable members and sockopt This patch adds intl_enable in asoc and netns, and strm_interleave in sctp_sock to indicate if stream interleave is enabled and supported. netns intl_enable would be set via procfs, but that is not added yet until all stream interleave codes are completely implemented; asoc intl_enable will be set when doing 4-shakehands. sp strm_interleave can be set by sockopt SCTP_INTERLEAVING_SUPPORTED which is also added in this patch. This socket option is defined in section 4.3.1 of RFC8260. Note that strm_interleave can only be set by sockopt when both netns intl_enable and sp frag_interleave are set. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 5 ++++- include/net/sctp/structs.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index ebc813277662..0db7fb3e4e15 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -122,9 +122,12 @@ struct netns_sctp { /* Flag to indicate if PR-CONFIG is enabled. */ int reconf_enable; - /* Flag to idicate if SCTP-AUTH is enabled */ + /* Flag to indicate if SCTP-AUTH is enabled */ int auth_enable; + /* Flag to indicate if stream interleave is enabled */ + int intl_enable; + /* * Policy to control SCTP IPv4 address scoping * 0 - Disable IPv4 address scoping diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 2f8f93da5dc2..7030cbe11f45 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -217,6 +217,7 @@ struct sctp_sock { disable_fragments:1, v4mapped:1, frag_interleave:1, + strm_interleave:1, recvrcvinfo:1, recvnxtinfo:1, data_ready_signalled:1; @@ -1940,6 +1941,7 @@ struct sctp_association { __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ force_delay:1, + intl_enable:1, prsctp_enable:1, reconf_enable:1; -- cgit v1.2.3 From ad05a7a05ede28ba6dd935d9e932264a22518b1f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:00 +0800 Subject: sctp: add basic structures and make chunk function for idata sctp_idatahdr and sctp_idata_chunk are used to define and parse I-DATA chunk format, and sctp_make_idata is a function to build the chunk. The I-DATA Chunk Format is defined in section 2.1 of RFC8260. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 2 ++ include/net/sctp/structs.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 70fb397f65b0..5389ae034cfa 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -197,6 +197,8 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk); +struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, + __u8 flags, int paylen, gfp_t gfp); struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, const __u8 flags, diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7030cbe11f45..7026a8039367 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -575,6 +575,7 @@ struct sctp_chunk { struct sctp_addiphdr *addip_hdr; struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_authhdr *auth_hdr; + struct sctp_idatahdr *idata_hdr; } subh; __u8 *chunk_end; -- cgit v1.2.3 From 0c3f6f655487d12c7a0c16914c98c599043e88d3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:01 +0800 Subject: sctp: implement make_datafrag for sctp_stream_interleave To avoid hundreds of checks for the different process on I-DATA chunk, struct sctp_stream_interleave is defined as a group of functions used to replace the codes in some place where it needs to do different job according to if the asoc intl_enabled is set. With these ops, it only needs to initialize asoc->stream.si with sctp_stream_interleave_0 for normal data if asoc intl_enable is 0, or sctp_stream_interleave_1 for idata if asoc intl_enable is set in sctp_stream_init. After that, the members in asoc->stream.si can be used directly in some special places without checking asoc intl_enable. make_datafrag is the first member for sctp_stream_interleave, it's used to make data or idata frags, called in sctp_datamsg_from_user. The old function sctp_make_datafrag_empty needs to be adjust some to fit in this ops. Note that as idata and data chunks have different length, it also defines data_chunk_len for sctp_stream_interleave to describe the chunk size. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 5 ++-- include/net/sctp/stream_interleave.h | 44 ++++++++++++++++++++++++++++++++++++ include/net/sctp/structs.h | 12 ++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 include/net/sctp/stream_interleave.h (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 5389ae034cfa..f950186aae34 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -199,10 +199,9 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); -struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, +struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, - int len, const __u8 flags, - __u16 ssn, gfp_t gfp); + int len, __u8 flags, gfp_t gfp); struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn); struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc); diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h new file mode 100644 index 000000000000..7b9fa8dbe620 --- /dev/null +++ b/include/net/sctp/stream_interleave.h @@ -0,0 +1,44 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * These are definitions used by the stream schedulers, defined in RFC + * draft ndata (https://tools.ietf.org/html/draft-ietf-tsvwg-sctp-ndata-11) + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresses: + * lksctp developers + * + * Written or modified by: + * Xin Long + */ + +#ifndef __sctp_stream_interleave_h__ +#define __sctp_stream_interleave_h__ + +struct sctp_stream_interleave { + __u16 data_chunk_len; + /* (I-)DATA process */ + struct sctp_chunk *(*make_datafrag)(const struct sctp_association *asoc, + const struct sctp_sndrcvinfo *sinfo, + int len, __u8 flags, gfp_t gfp); +}; + +void sctp_stream_interleave_init(struct sctp_stream *stream); + +#endif /* __sctp_stream_interleave_h__ */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7026a8039367..96cc898787f1 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -89,6 +89,7 @@ struct sctp_stream; #include #include #include +#include /* Structures useful for managing bind/connect. */ @@ -1389,11 +1390,22 @@ struct sctp_stream { struct sctp_stream_out_ext *rr_next; }; }; + struct sctp_stream_interleave *si; }; #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPEN 0x01 +static inline __u16 sctp_datachk_len(const struct sctp_stream *stream) +{ + return stream->si->data_chunk_len; +} + +static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream) +{ + return stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr); +} + /* SCTP_GET_ASSOC_STATS counters */ struct sctp_priv_assoc_stats { /* Maximum observed rto in the association during subsequent -- cgit v1.2.3 From 668c9beb9020d5834ee9e43c208190a07d2b1928 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:02 +0800 Subject: sctp: implement assign_number for sctp_stream_interleave assign_number is added as a member of sctp_stream_interleave, used to assign ssn for data or mid (message id) for idata, called in sctp_packet_append_data. sctp_chunk_assign_ssn is left as it is, and sctp_chunk_assign_mid is added for sctp_stream_interleave_1. This procedure is described in section 2.2.2 of RFC8260. All sizeof(struct sctp_data_chunk) in tx path is replaced with sctp_datachk_len, to make it right for idata as well. And also adjust sctp_chunk_is_data for SCTP_CID_I_DATA. After this patch, idata can be built and sent in tx path. Note that if sp strm_interleave is set, it has to wait_connect in sctp_sendmsg, as asoc intl_enable need to be known after 4 shake- hands, to decide if it should use data or idata later. data and idata can't be mixed to send in one asoc. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/constants.h | 9 +++++---- include/net/sctp/sctp.h | 4 ++-- include/net/sctp/sm.h | 2 +- include/net/sctp/stream_interleave.h | 1 + include/net/sctp/structs.h | 18 +++++++++++++++++- 5 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index deaafa9b09cb..20ff237c5eb2 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -145,12 +145,13 @@ SCTP_SUBTYPE_CONSTRUCTOR(OTHER, enum sctp_event_other, other) SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE, enum sctp_event_primitive, primitive) -#define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA) +#define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA || \ + a->chunk_hdr->type == SCTP_CID_I_DATA) /* Calculate the actual data size in a data chunk */ -#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end)\ - - (unsigned long)(c->chunk_hdr)\ - - sizeof(struct sctp_data_chunk))) +#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end) - \ + (unsigned long)(c->chunk_hdr) - \ + sctp_datachk_len(&c->asoc->stream))) /* Internal error codes */ enum sctp_ierror { diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 6f79415f6634..20c0c1be2ca7 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -444,13 +444,13 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) int frag = pmtu; frag -= sp->pf->af->net_header_len; - frag -= sizeof(struct sctphdr) + sizeof(struct sctp_data_chunk); + frag -= sizeof(struct sctphdr) + sctp_datachk_len(&asoc->stream); if (asoc->user_frag) frag = min_t(int, frag, asoc->user_frag); frag = SCTP_TRUNC4(min_t(int, frag, SCTP_MAX_CHUNK_LEN - - sizeof(struct sctp_data_chunk))); + sctp_datachk_len(&asoc->stream))); return frag; } diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index f950186aae34..ca1db8997e5d 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -343,7 +343,7 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) __u16 size; size = ntohs(chunk->chunk_hdr->length); - size -= sizeof(struct sctp_data_chunk); + size -= sctp_datahdr_len(&chunk->asoc->stream); return size; } diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 7b9fa8dbe620..99f399e7a871 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -37,6 +37,7 @@ struct sctp_stream_interleave { struct sctp_chunk *(*make_datafrag)(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp); + void (*assign_number)(struct sctp_chunk *chunk); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 96cc898787f1..fd93973bb667 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -399,6 +399,18 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); #define sctp_ssn_skip(stream, type, sid, ssn) \ ((stream)->type[sid].ssn = ssn + 1) +/* What is the current MID number for this stream? */ +#define sctp_mid_peek(stream, type, sid) \ + ((stream)->type[sid].mid) + +/* Return the next MID number for this stream. */ +#define sctp_mid_next(stream, type, sid) \ + ((stream)->type[sid].mid++) + +/* Skip over this mid and all below. */ +#define sctp_mid_skip(stream, type, sid, mid) \ + ((stream)->type[sid].mid = mid + 1) + /* * Pointers to address related SCTP functions. * (i.e. things that depend on the address family.) @@ -623,6 +635,7 @@ struct sctp_chunk { __u16 rtt_in_progress:1, /* This chunk used for RTT calc? */ has_tsn:1, /* Does this chunk have a TSN yet? */ has_ssn:1, /* Does this chunk have a SSN yet? */ +#define has_mid has_ssn singleton:1, /* Only chunk in the packet? */ end_of_packet:1, /* Last chunk in the packet? */ ecn_ce_done:1, /* Have we processed the ECN CE bit? */ @@ -1360,7 +1373,10 @@ struct sctp_stream_out_ext { }; struct sctp_stream_out { - __u16 ssn; + union { + __u32 mid; + __u16 ssn; + }; __u8 state; struct sctp_stream_out_ext *ext; }; -- cgit v1.2.3 From 9d4ceaf154a947e69648041bcb11a24a7a40c380 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:03 +0800 Subject: sctp: implement validate_data for sctp_stream_interleave validate_data is added as a member of sctp_stream_interleave, used to validate ssn/chunk type for data or mid (message id)/chunk type for idata, called in sctp_eat_data. If this check fails, an abort packet will be sent, as said in section 2.2.3 of RFC8260. It also adds the process for idata in rx path. As Marcelo pointed out, there's no need to add event table for idata, but just share chunk_event_table with data's. It would drop data chunk for idata and drop idata chunk for data by calling validate_data in sctp_eat_data. As last patch did, it also replaces sizeof(struct sctp_data_chunk) with sctp_datachk_len for rx path. After this patch, the idata can be accepted and delivered to ulp layer. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 6 ++++++ include/net/sctp/stream_interleave.h | 1 + include/net/sctp/structs.h | 6 +++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index ca1db8997e5d..0993b4953b3a 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -359,6 +359,12 @@ static inline __u16 sctp_data_size(struct sctp_chunk *chunk) typecheck(__u32, b) && \ ((__s32)((a) - (b)) <= 0)) +/* Compare two MIDs */ +#define MID_lt(a, b) \ + (typecheck(__u32, a) && \ + typecheck(__u32, b) && \ + ((__s32)((a) - (b)) < 0)) + /* Compare two SSNs */ #define SSN_lt(a,b) \ (typecheck(__u16, a) && \ diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 99f399e7a871..d8d1b51e1362 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -38,6 +38,7 @@ struct sctp_stream_interleave { const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp); void (*assign_number)(struct sctp_chunk *chunk); + bool (*validate_data)(struct sctp_chunk *chunk); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index fd93973bb667..be4cc73f3106 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1382,7 +1382,11 @@ struct sctp_stream_out { }; struct sctp_stream_in { - __u16 ssn; + union { + __u32 mid; + __u16 ssn; + }; + __u32 fsn; }; struct sctp_stream { -- cgit v1.2.3 From bd4d627dbd5adb8130d5c54a4135d89f45e41905 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:04 +0800 Subject: sctp: implement ulpevent_data for sctp_stream_interleave ulpevent_data is added as a member of sctp_stream_interleave, used to do the most process in ulpq, including to convert data or idata chunk to event, reasm them in reasm queue and put them in lobby queue in right order, and deliver them up to user sk rx queue. This procedure is described in section 2.2.3 of RFC8260. It adds most functions for idata here to do the similar process as the old functions for data. But since the details are very different between them, the old functions can not be reused for idata. event->ssn and event->ppid settings are moved to ulpevent_data from sctp_ulpevent_make_rcvmsg, so that sctp_ulpevent_make_rcvmsg could work for both data and idata. Note that mid is added in sctp_ulpevent for idata, __packed has to be used for defining sctp_ulpevent, or it would exceeds the skb cb that saves a sctp_ulpevent variable for ulp layer process. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 2 ++ include/net/sctp/structs.h | 3 +++ include/net/sctp/ulpevent.h | 20 +++++++++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index d8d1b51e1362..02f60f541f1e 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -39,6 +39,8 @@ struct sctp_stream_interleave { int len, __u8 flags, gfp_t gfp); void (*assign_number)(struct sctp_chunk *chunk); bool (*validate_data)(struct sctp_chunk *chunk); + int (*ulpevent_data)(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk, gfp_t gfp); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index be4cc73f3106..73b315de2fef 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -411,6 +411,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); #define sctp_mid_skip(stream, type, sid, mid) \ ((stream)->type[sid].mid = mid + 1) +#define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid]) + /* * Pointers to address related SCTP functions. * (i.e. things that depend on the address family.) @@ -1387,6 +1389,7 @@ struct sctp_stream_in { __u16 ssn; }; __u32 fsn; + char pd_mode; }; struct sctp_stream { diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 231dc42f1da6..ce4f2aa35d56 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -45,19 +45,29 @@ /* A structure to carry information to the ULP (e.g. Sockets API) */ /* Warning: This sits inside an skb.cb[] area. Be very careful of * growing this structure as it is at the maximum limit now. + * + * sctp_ulpevent is saved in sk->cb(48 bytes), whose last 4 bytes + * have been taken by sock_skb_cb, So here it has to use 'packed' + * to make sctp_ulpevent fit into the rest 44 bytes. */ struct sctp_ulpevent { struct sctp_association *asoc; struct sctp_chunk *chunk; unsigned int rmem_len; - __u32 ppid; + union { + __u32 mid; + __u16 ssn; + }; + union { + __u32 ppid; + __u32 fsn; + }; __u32 tsn; __u32 cumtsn; __u16 stream; - __u16 ssn; __u16 flags; __u16 msg_flags; -}; +} __packed; /* Retrieve the skb this event sits inside of. */ static inline struct sk_buff *sctp_event2skb(const struct sctp_ulpevent *ev) @@ -140,6 +150,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( const struct sctp_association *asoc, __u16 flags, __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp); +struct sctp_ulpevent *sctp_make_reassembled_event( + struct net *net, struct sk_buff_head *queue, + struct sk_buff *f_frag, struct sk_buff *l_frag); + void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *); void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, -- cgit v1.2.3 From 9162e0ed9e238c1f1d738cb36ee59d96b097f8e1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:05 +0800 Subject: sctp: implement enqueue_event for sctp_stream_interleave enqueue_event is added as a member of sctp_stream_interleave, used to enqueue either data, idata or notification events into user socket rx queue. It replaces sctp_ulpq_tail_event used in the other places with enqueue_event. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 02f60f541f1e..a0f61bc08ae8 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -41,6 +41,8 @@ struct sctp_stream_interleave { bool (*validate_data)(struct sctp_chunk *chunk); int (*ulpevent_data)(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp); + int (*enqueue_event)(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event); }; void sctp_stream_interleave_init(struct sctp_stream *stream); -- cgit v1.2.3 From 94014e8d871ae43d834828710c098518be44b5d9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:06 +0800 Subject: sctp: implement renege_events for sctp_stream_interleave renege_events is added as a member of sctp_stream_interleave, used to renege some old data or idata in reasm or lobby queue properly to free some memory for the new data when there's memory stress. It defines sctp_renege_events for idata, and leaves sctp_ulpq_renege as it is for data. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 2 ++ include/net/sctp/ulpqueue.h | 9 +++------ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index a0f61bc08ae8..16a71cb2b098 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -43,6 +43,8 @@ struct sctp_stream_interleave { struct sctp_chunk *chunk, gfp_t gfp); int (*enqueue_event)(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event); + void (*renege_events)(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk, gfp_t gfp); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index e0dce07b8794..eb98c7150a56 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -76,11 +76,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc); void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn); void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *, __u32); -#endif /* __sctp_ulpqueue_h__ */ - - - - - +__u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, + struct sk_buff_head *list, __u16 needed); +#endif /* __sctp_ulpqueue_h__ */ -- cgit v1.2.3 From be4e0ce10dc64b9a8aae42ec3dbd906022f91ec5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:07 +0800 Subject: sctp: implement start_pd for sctp_stream_interleave start_pd is added as a member of sctp_stream_interleave, used to do partial_delivery for data or idata when datalen >= asoc->rwnd in sctp_eat_data. The codes have been done in last patches, but they need to be extracted into start_pd, so that it could be used for SCTP_CMD_PART_DELIVER cmd as well. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 16a71cb2b098..317d9b3a5299 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -45,6 +45,7 @@ struct sctp_stream_interleave { struct sctp_ulpevent *event); void (*renege_events)(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp); + void (*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); }; void sctp_stream_interleave_init(struct sctp_stream *stream); -- cgit v1.2.3 From 65f5e357839e40817aead853d7a7f61ff828b52b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:08 +0800 Subject: sctp: implement abort_pd for sctp_stream_interleave abort_pd is added as a member of sctp_stream_interleave, used to abort partial delivery for data or idata, called in sctp_cmd_assoc_failed. Since stream interleave allows to do partial delivery for each stream at the same time, sctp_intl_abort_pd for idata would be very different from the old function sctp_ulpq_abort_pd for data. Note that sctp_ulpevent_make_pdapi will support per stream in this patch by adding pdapi_stream and pdapi_seq in sctp_pdapi_event, as described in section 6.1.7 of RFC6458. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 1 + include/net/sctp/ulpevent.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 317d9b3a5299..501b2be049a3 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -46,6 +46,7 @@ struct sctp_stream_interleave { void (*renege_events)(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp); void (*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); + void (*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index ce4f2aa35d56..51b4e0626c34 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -122,7 +122,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( struct sctp_ulpevent *sctp_ulpevent_make_pdapi( const struct sctp_association *asoc, - __u32 indication, gfp_t gfp); + __u32 indication, __u32 sid, __u32 seq, + __u32 flags, gfp_t gfp); struct sctp_ulpevent *sctp_ulpevent_make_adaptation_indication( const struct sctp_association *asoc, gfp_t gfp); -- cgit v1.2.3 From 132282386f5d0eff7a84a119599216b5f9e9bfc6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Dec 2017 21:04:09 +0800 Subject: sctp: add support for the process of unordered idata Unordered idata process is more complicated than unordered data: - It has to add mid into sctp_stream_out to save the next mid value, which is separated from ordered idata's. - To support pd for unordered idata, another mid and pd_mode need to be added to save the message id and pd state in sctp_stream_in. - To make unordered idata reasm easier, it adds a new event queue to save frags for idata. The patch mostly adds the samilar reasm functions for unordered idata as ordered idata's, and also adjusts some other codes on assign_mid, abort_pd and ulpevent_data for idata. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 14 +++++++++++++- include/net/sctp/ulpqueue.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 73b315de2fef..8ef638d966f1 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -413,6 +413,14 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); #define sctp_stream_in(asoc, sid) (&(asoc)->stream.in[sid]) +/* What is the current MID_uo number for this stream? */ +#define sctp_mid_uo_peek(stream, type, sid) \ + ((stream)->type[sid].mid_uo) + +/* Return the next MID_uo number for this stream. */ +#define sctp_mid_uo_next(stream, type, sid) \ + ((stream)->type[sid].mid_uo++) + /* * Pointers to address related SCTP functions. * (i.e. things that depend on the address family.) @@ -1379,8 +1387,9 @@ struct sctp_stream_out { __u32 mid; __u16 ssn; }; - __u8 state; + __u32 mid_uo; struct sctp_stream_out_ext *ext; + __u8 state; }; struct sctp_stream_in { @@ -1388,8 +1397,11 @@ struct sctp_stream_in { __u32 mid; __u16 ssn; }; + __u32 mid_uo; __u32 fsn; + __u32 fsn_uo; char pd_mode; + char pd_mode_uo; }; struct sctp_stream { diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index eb98c7150a56..bb0ecba3db2b 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -45,6 +45,7 @@ struct sctp_ulpq { char pd_mode; struct sctp_association *asoc; struct sk_buff_head reasm; + struct sk_buff_head reasm_uo; struct sk_buff_head lobby; }; -- cgit v1.2.3 From 200809716aed1cac586fcac4c0551a688439be1f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 10 Dec 2017 16:56:00 +0800 Subject: fou: fix some member types in guehdr guehdr struct is used to build or parse gue packets, which are always in big endian. It's better to define all guehdr members as __beXX types. Also, in validate_gue_flags it's not good to use a __be32 variable for both Standard flags(__be16) and Private flags (__be32), and pass it to other funcions. This patch could fix a bunch of sparse warnings from fou. Fixes: 5024c33ac354 ("gue: Add infrastructure for flags and options") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/gue.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/gue.h b/include/net/gue.h index 2fdb29ca74c2..fdad41469b65 100644 --- a/include/net/gue.h +++ b/include/net/gue.h @@ -44,10 +44,10 @@ struct guehdr { #else #error "Please fix " #endif - __u8 proto_ctype; - __u16 flags; + __u8 proto_ctype; + __be16 flags; }; - __u32 word; + __be32 word; }; }; @@ -84,11 +84,10 @@ static inline size_t guehdr_priv_flags_len(__be32 flags) * if there is an unknown standard or private flags, or the options length for * the flags exceeds the options length specific in hlen of the GUE header. */ -static inline int validate_gue_flags(struct guehdr *guehdr, - size_t optlen) +static inline int validate_gue_flags(struct guehdr *guehdr, size_t optlen) { + __be16 flags = guehdr->flags; size_t len; - __be32 flags = guehdr->flags; if (flags & ~GUE_FLAGS_ALL) return 1; @@ -101,12 +100,13 @@ static inline int validate_gue_flags(struct guehdr *guehdr, /* Private flags are last four bytes accounted in * guehdr_flags_len */ - flags = *(__be32 *)((void *)&guehdr[1] + len - GUE_LEN_PRIV); + __be32 pflags = *(__be32 *)((void *)&guehdr[1] + + len - GUE_LEN_PRIV); - if (flags & ~GUE_PFLAGS_ALL) + if (pflags & ~GUE_PFLAGS_ALL) return 1; - len += guehdr_priv_flags_len(flags); + len += guehdr_priv_flags_len(pflags); if (len > optlen) return 1; } -- cgit v1.2.3 From b5476022bbada3764609368f03329ca287528dc8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 07:17:39 -0800 Subject: ipv4: igmp: guard against silly MTU values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IPv4 stack reacts to changes to small MTU, by disabling itself under RTNL. But there is a window where threads not using RTNL can see a wrong device mtu. This can lead to surprises, in igmp code where it is assumed the mtu is suitable. Fix this by reading device mtu once and checking IPv4 minimal MTU. This patch adds missing IPV4_MIN_MTU define, to not abuse ETH_MIN_MTU anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 9896f46cbbf1..af8addbaa3c1 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -34,6 +34,7 @@ #include #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ +#define IPV4_MIN_MTU 68 /* RFC 791 */ struct sock; -- cgit v1.2.3 From 039af9c66b93154b493e3088a36b251b99c9b3c4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Dec 2017 15:35:03 -0800 Subject: net_sched: switch to exit_batch for action pernet ops Since we now hold RTNL lock in tc_action_net_exit(), it is good to batch them to speedup tc action dismantle. Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/act_api.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 02bf409140d0..6ed9692f20bd 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -120,12 +120,19 @@ int tc_action_net_init(struct tc_action_net *tn, void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo); -static inline void tc_action_net_exit(struct tc_action_net *tn) +static inline void tc_action_net_exit(struct list_head *net_list, + unsigned int id) { + struct net *net; + rtnl_lock(); - tcf_idrinfo_destroy(tn->ops, tn->idrinfo); + list_for_each_entry(net, net_list, exit_list) { + struct tc_action_net *tn = net_generic(net, id); + + tcf_idrinfo_destroy(tn->ops, tn->idrinfo); + kfree(tn->idrinfo); + } rtnl_unlock(); - kfree(tn->idrinfo); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, -- cgit v1.2.3 From ec94c2696f0bcd5ae92a553244e4ac30d2171a2d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Dec 2017 21:25:12 -0800 Subject: tcp/dccp: avoid one atomic operation for timewait hashdance First, rename __inet_twsk_hashdance() to inet_twsk_hashdance() Then, remove one inet_twsk_put() by setting tw_refcnt to 3 instead of 4, but adding a fat warning that we do not have the right to access tw anymore after inet_twsk_hashdance() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 1356fa6a7566..899495589a7e 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -93,8 +93,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state); -void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, - struct inet_hashinfo *hashinfo); +void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + struct inet_hashinfo *hashinfo); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm); -- cgit v1.2.3 From c9f1f58dc2eba550f208809d272bf0b14f41edba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Dec 2017 06:34:19 -0800 Subject: net: sk_pacing_shift_update() helper In commit 3a9b76fd0db9 ("tcp: allow drivers to tweak TSQ logic") I gave a code sample to set sk->sk_pacing_shift that was not complete. Better add a helper that can be used by drivers without worries, and maybe amended in the future. A wifi driver might use it from its ndo_start_xmit() Following call would setup TCP to allow up to ~8ms of queued data per flow. sk_pacing_shift_update(skb->sk, 7); Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 9155da422692..9a9047268d37 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2407,4 +2407,15 @@ static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) return *proto->sysctl_rmem; } +/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) + * Some wifi drivers need to tweak it to get more chunks. + * They can use this helper from their ndo_start_xmit() + */ +static inline void sk_pacing_shift_update(struct sock *sk, int val) +{ + if (!sk || !sk_fullsock(sk) || sk->sk_pacing_shift == val) + return; + sk->sk_pacing_shift = val; +} + #endif /* _SOCK_H */ -- cgit v1.2.3 From 7268586baa530312041e597b518b5c6a05110df1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 12 Dec 2017 13:10:40 -0800 Subject: tcp: pause Fast Open globally after third consecutive timeout Prior to this patch, active Fast Open is paused on a specific destination IP address if the previous connections to the IP address have experienced recurring timeouts . But recent experiments by Microsoft (https://goo.gl/cykmn7) and Mozilla browsers indicate the isssue is often caused by broken middle-boxes sitting close to the client. Therefore it is much better user experience if Fast Open is disabled out-right globally to avoid experiencing further timeouts on connections toward other destinations. This patch changes the destination-IP disablement to global disablement if a connection experiencing recurring timeouts or aborts due to timeout. Repeated incidents would still exponentially increase the pause time, starting from an hour. This is extremely conservative but an unfortunate compromise to minimize bad experience due to broken middle-boxes. Reported-by: Dragana Damjanovic Reported-by: Patrick McManus Signed-off-by: Yuchung Cheng Reviewed-by: Wei Wang Reviewed-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c3744e52cd1..6939e69d3c37 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1507,8 +1507,7 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, - struct tcp_fastopen_cookie *cookie, int *syn_loss, - unsigned long *last_syn_loss); + struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); @@ -1546,7 +1545,7 @@ extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); -void tcp_fastopen_active_timeout_reset(void); +void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. -- cgit v1.2.3 From 1d7e2ed22f8d9171fa8b629754022f22115b3f03 Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 13 Dec 2017 16:38:55 -0800 Subject: net: erspan: refactor existing erspan code The patch refactors the existing erspan implementation in order to support erspan version 2, which has additional metadata. So, in stead of having one 'struct erspanhdr' holding erspan version 1, breaks it into 'struct erspan_base_hdr' and 'struct erspan_metadata'. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index 6e758d08c9ee..70c40c7c75b2 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -15,7 +15,7 @@ * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The * other fields are set to zero, so only a sequence number follows. * - * ERSPAN Type II header (8 octets [42:49]) + * ERSPAN Version 1 (Type II) header (8 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ @@ -27,7 +27,7 @@ * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB */ -#define ERSPAN_VERSION 0x1 +#define ERSPAN_VERSION 0x1 /* ERSPAN type II */ #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff @@ -44,20 +44,29 @@ enum erspan_encap_type { ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag perserved in frame */ }; +#define ERSPAN_V1_MDSIZE 4 +#define ERSPAN_V2_MDSIZE 8 struct erspan_metadata { - __be32 index; /* type II */ + union { + __be32 index; /* Version 1 (type II)*/ + } u; }; -struct erspanhdr { +struct erspan_base_hdr { __be16 ver_vlan; #define VER_OFFSET 12 __be16 session_id; #define COS_OFFSET 13 #define EN_OFFSET 11 #define T_OFFSET 10 - struct erspan_metadata md; }; +static inline int erspan_hdr_len(int version) +{ + return sizeof(struct erspan_base_hdr) + + (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); +} + static inline u8 tos_to_cos(u8 tos) { u8 dscp, cos; @@ -73,7 +82,8 @@ static inline void erspan_build_header(struct sk_buff *skb, { struct ethhdr *eth = eth_hdr(skb); enum erspan_encap_type enc_type; - struct erspanhdr *ershdr; + struct erspan_base_hdr *ershdr; + struct erspan_metadata *ersmd; struct qtag_prefix { __be16 eth_type; __be16 tci; @@ -96,17 +106,21 @@ static inline void erspan_build_header(struct sk_buff *skb, enc_type = ERSPAN_ENCAP_INFRAME; } - skb_push(skb, sizeof(*ershdr)); - ershdr = (struct erspanhdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr)); + skb_push(skb, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); + ershdr = (struct erspan_base_hdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); + /* Build base header */ ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | (ERSPAN_VERSION << VER_OFFSET)); ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | (enc_type << EN_OFFSET & EN_MASK) | ((truncate << T_OFFSET) & T_MASK)); - ershdr->md.index = htonl(index & INDEX_MASK); + + /* Build metadata */ + ersmd = (struct erspan_metadata *)(ershdr + 1); + ersmd->u.index = htonl(index & INDEX_MASK); } #endif -- cgit v1.2.3 From f551c91de262ba36b20c3ac19538afb4f4507441 Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 13 Dec 2017 16:38:56 -0800 Subject: net: erspan: introduce erspan v2 for ip_gre The patch adds support for erspan version 2. Not all features are supported in this patch. The SGT (security group tag), GRA (timestamp granularity), FT (frame type) are set to fixed value. Only hardware ID and direction are configurable. Optional subheader is also not supported. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 120 ++++++++++++++++++++++++++++++++++++++++++++++- include/net/ip_tunnels.h | 5 +- 2 files changed, 122 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index 70c40c7c75b2..acdf6843095d 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -24,11 +24,29 @@ * | Reserved | Index | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * + * + * ERSPAN Version 2 (Type III) header (12 octets [42:49]) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Ver | VLAN | COS |BSO|T| Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Timestamp | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SGT |P| FT | Hw ID |D|Gra|O| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Platform Specific SubHeader (8 octets, optional) + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Platf ID | Platform Specific Info | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Platform Specific Info | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB */ #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ - #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff #define COS_MASK 0xe000 @@ -37,6 +55,28 @@ #define ID_MASK 0x03ff #define INDEX_MASK 0xfffff +#define ERSPAN_VERSION2 0x2 /* ERSPAN type III*/ +#define BSO_MASK EN_MASK +#define SGT_MASK 0xffff0000 +#define P_MASK 0x8000 +#define FT_MASK 0x7c00 +#define HWID_MASK 0x03f0 +#define DIR_MASK 0x0008 +#define GRA_MASK 0x0006 +#define O_MASK 0x0001 + +/* ERSPAN version 2 metadata header */ +struct erspan_md2 { + __be32 timestamp; + __be16 sgt; /* security group tag */ + __be16 flags; +#define P_OFFSET 15 +#define FT_OFFSET 10 +#define HWID_OFFSET 4 +#define DIR_OFFSET 3 +#define GRA_OFFSET 1 +}; + enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ @@ -48,8 +88,10 @@ enum erspan_encap_type { #define ERSPAN_V2_MDSIZE 8 struct erspan_metadata { union { - __be32 index; /* Version 1 (type II)*/ + __be32 index; /* Version 1 (type II)*/ + struct erspan_md2 md2; /* Version 2 (type III) */ } u; + int version; }; struct erspan_base_hdr { @@ -58,6 +100,7 @@ struct erspan_base_hdr { __be16 session_id; #define COS_OFFSET 13 #define EN_OFFSET 11 +#define BSO_OFFSET EN_OFFSET #define T_OFFSET 10 }; @@ -123,4 +166,77 @@ static inline void erspan_build_header(struct sk_buff *skb, ersmd->u.index = htonl(index & INDEX_MASK); } +/* ERSPAN GRA: timestamp granularity + * 00b --> granularity = 100 microseconds + * 01b --> granularity = 100 nanoseconds + * 10b --> granularity = IEEE 1588 + * Here we only support 100 microseconds. + */ +static inline __be32 erspan_get_timestamp(void) +{ + u64 h_usecs; + ktime_t kt; + + kt = ktime_get_real(); + h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC); + + /* ERSPAN base header only has 32-bit, + * so it wraps around 4 days. + */ + return htonl((u32)h_usecs); +} + +static inline void erspan_build_header_v2(struct sk_buff *skb, + __be32 id, u8 direction, u16 hwid, + bool truncate, bool is_ipv4) +{ + struct ethhdr *eth = eth_hdr(skb); + struct erspan_base_hdr *ershdr; + struct erspan_metadata *md; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + u16 session_id; + u8 gra = 0; /* 100 usec */ + u8 bso = 0; /* Bad/Short/Oversized */ + u8 sgt = 0; + u8 tos; + + tos = is_ipv4 ? ip_hdr(skb)->tos : + (ipv6_hdr(skb)->priority << 4) + + (ipv6_hdr(skb)->flow_lbl[0] >> 4); + + /* Unlike v1, v2 does not have En field, + * so only extract vlan tci field. + */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + } + + skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); + ershdr = (struct erspan_base_hdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); + + /* Build base header */ + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION2 << VER_OFFSET)); + session_id = (u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | + (bso << BSO_OFFSET & BSO_MASK) | + ((truncate << T_OFFSET) & T_MASK); + ershdr->session_id = htons(session_id); + + /* Build metadata */ + md = (struct erspan_metadata *)(ershdr + 1); + md->u.md2.timestamp = erspan_get_timestamp(); + md->u.md2.sgt = htons(sgt); + md->u.md2.flags = htons(((1 << P_OFFSET) & P_MASK) | + ((hwid << HWID_OFFSET) & HWID_MASK) | + ((direction << DIR_OFFSET) & DIR_MASK) | + ((gra << GRA_OFFSET) & GRA_MASK)); +} + #endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 24628f6b09bf..1f16773cfd76 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -116,8 +116,11 @@ struct ip_tunnel { u32 o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ - /* This field used only by ERSPAN */ + /* These four fields used only by ERSPAN */ u32 index; /* ERSPAN type II index */ + u8 erspan_ver; /* ERSPAN version */ + u8 dir; /* ERSPAN direction */ + u16 hwid; /* ERSPAN hardware ID */ struct dst_cache dst_cache; -- cgit v1.2.3 From 94d7d8f2928701ef9b82527f889e0220dba11fa2 Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 13 Dec 2017 16:38:57 -0800 Subject: ip6_gre: add erspan v2 support Similar to support for ipv4 erspan, this patch adds erspan v2 to ip6erspan tunnel. Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 109a5a8877ef..236e40ba06bf 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -37,6 +37,9 @@ struct __ip6_tnl_parm { __u32 fwmark; __u32 index; /* ERSPAN type II index */ + __u8 erspan_ver; /* ERSPAN version */ + __u8 dir; /* direction */ + __u16 hwid; /* hwid */ }; /* IPv6 tunnel */ -- cgit v1.2.3 From 7a4fa29106d9a38ef005f5ab15d493c259f269c0 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Thu, 14 Dec 2017 15:54:29 +0200 Subject: net: sched: Add TCA_HW_OFFLOAD Qdiscs can be offloaded to HW, but current implementation isn't uniform. Instead, qdiscs either pass information about offload status via their TCA_OPTIONS or omit it altogether. Introduce a new attribute - TCA_HW_OFFLOAD that would form a uniform uAPI for the offloading status of qdiscs. Signed-off-by: Yuval Mintz Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 65d0d25f2648..83a3e47d5845 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,6 +71,7 @@ struct Qdisc { * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ +#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; -- cgit v1.2.3 From 2d07a49aded490a0a4a2748e64030a0f59b6b8be Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Dec 2017 00:41:25 +0800 Subject: sctp: add basic structures and make chunk function for ifwdtsn sctp_ifwdtsn_skip, sctp_ifwdtsn_hdr and sctp_ifwdtsn_chunk are used to define and parse I-FWD TSN chunk format, and sctp_make_ifwdtsn is a function to build the chunk. The I-FORWARD-TSN Chunk Format is defined in section 2.3.1 of RFC8260. Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Signed-off-by: David S. Miller --- include/net/sctp/sm.h | 3 +++ include/net/sctp/structs.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 0993b4953b3a..2883c43c5258 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -199,6 +199,9 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const struct sctp_chunk *chunk); struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); +struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, + __u32 new_cum_tsn, size_t nstreams, + struct sctp_ifwdtsn_skip *skiplist); struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8ef638d966f1..a5c3cf41e693 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -599,6 +599,7 @@ struct sctp_chunk { struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_authhdr *auth_hdr; struct sctp_idatahdr *idata_hdr; + struct sctp_ifwdtsn_hdr *ifwdtsn_hdr; } subh; __u8 *chunk_end; -- cgit v1.2.3 From 8e0c3b73cec1b943affde91b3c412ad8266b4694 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Dec 2017 00:41:26 +0800 Subject: sctp: implement generate_ftsn for sctp_stream_interleave generate_ftsn is added as a member of sctp_stream_interleave, used to create fwdtsn or ifwdtsn chunk according to abandoned chunks, called in sctp_retransmit and sctp_outq_sack. sctp_generate_iftsn works for ifwdtsn, and sctp_generate_fwdtsn is still used for making fwdtsn. Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 2 ++ include/net/sctp/structs.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 501b2be049a3..66267dbcecba 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -47,6 +47,8 @@ struct sctp_stream_interleave { struct sctp_chunk *chunk, gfp_t gfp); void (*start_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); void (*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); + /* (I-)FORWARD-TSN process */ + void (*generate_ftsn)(struct sctp_outq *q, __u32 ctsn); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a5c3cf41e693..b7720d65a975 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1100,6 +1100,7 @@ void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len); +void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); /* Uncork and flush an outqueue. */ static inline void sctp_outq_cork(struct sctp_outq *q) { -- cgit v1.2.3 From 0fc2ea922c8ad5520c80f03facbf396c81dce802 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Dec 2017 00:41:27 +0800 Subject: sctp: implement validate_ftsn for sctp_stream_interleave validate_ftsn is added as a member of sctp_stream_interleave, used to validate ssn/chunk type for fwdtsn or mid (message id)/chunk type for ifwdtsn, called in sctp_sf_eat_fwd_tsn, just as validate_data. If this check fails, an abort packet will be sent, as said in section 2.3.1 of RFC8260. As ifwdtsn and fwdtsn chunks have different length, it also defines ftsn_chunk_len for sctp_stream_interleave to describe the chunk size. Then it replaces all sizeof(struct sctp_fwdtsn_chunk) with sctp_ftsnchk_len. It also adds the process for ifwdtsn in rx path. As Marcelo pointed out, there's no need to add event table for ifwdtsn, but just share prsctp_chunk_event_table with fwdtsn's. It would drop fwdtsn chunk for ifwdtsn and drop ifwdtsn chunk for fwdtsn by calling validate_ftsn in sctp_sf_eat_fwd_tsn. After this patch, the ifwdtsn can be accepted. Note that this patch also removes the sctp.intl_enable check for idata chunks in sctp_chunk_event_lookup, as it will do this check in validate_data later. Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 2 ++ include/net/sctp/structs.h | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 66267dbcecba..0db15b50c5e6 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -33,6 +33,7 @@ struct sctp_stream_interleave { __u16 data_chunk_len; + __u16 ftsn_chunk_len; /* (I-)DATA process */ struct sctp_chunk *(*make_datafrag)(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, @@ -49,6 +50,7 @@ struct sctp_stream_interleave { void (*abort_pd)(struct sctp_ulpq *ulpq, gfp_t gfp); /* (I-)FORWARD-TSN process */ void (*generate_ftsn)(struct sctp_outq *q, __u32 ctsn); + bool (*validate_ftsn)(struct sctp_chunk *chunk); }; void sctp_stream_interleave_init(struct sctp_stream *stream); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index b7720d65a975..8ac4d5cdbfed 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1443,6 +1443,16 @@ static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream) return stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr); } +static inline __u16 sctp_ftsnchk_len(const struct sctp_stream *stream) +{ + return stream->si->ftsn_chunk_len; +} + +static inline __u16 sctp_ftsnhdr_len(const struct sctp_stream *stream) +{ + return stream->si->ftsn_chunk_len - sizeof(struct sctp_chunkhdr); +} + /* SCTP_GET_ASSOC_STATS counters */ struct sctp_priv_assoc_stats { /* Maximum observed rto in the association during subsequent -- cgit v1.2.3 From 47b20a88566f89dd0cc80c46f59ce0a12259d404 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Dec 2017 00:41:28 +0800 Subject: sctp: implement report_ftsn for sctp_stream_interleave report_ftsn is added as a member of sctp_stream_interleave, used to skip tsn from tsnmap, remove old events from reasm or lobby queue, and abort pd for data or idata, called for SCTP_CMD_REPORT_FWDTSN cmd and asoc reset. sctp_report_iftsn works for ifwdtsn, and sctp_report_fwdtsn works for fwdtsn. Note that sctp_report_iftsn doesn't do asoc abort_pd, as stream abort_pd will be done when handling ifwdtsn. But when ftsn is equal with ftsn, which means asoc reset, asoc abort_pd has to be done. Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 0db15b50c5e6..0b55c70ac5af 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -51,6 +51,7 @@ struct sctp_stream_interleave { /* (I-)FORWARD-TSN process */ void (*generate_ftsn)(struct sctp_outq *q, __u32 ctsn); bool (*validate_ftsn)(struct sctp_chunk *chunk); + void (*report_ftsn)(struct sctp_ulpq *ulpq, __u32 ftsn); }; void sctp_stream_interleave_init(struct sctp_stream *stream); -- cgit v1.2.3 From de60fe9105431f504de9f8793b1da237a7d7f7ed Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Dec 2017 00:41:29 +0800 Subject: sctp: implement handle_ftsn for sctp_stream_interleave handle_ftsn is added as a member of sctp_stream_interleave, used to skip ssn for data or mid for idata, called for SCTP_CMD_PROCESS_FWDTSN cmd. sctp_handle_iftsn works for ifwdtsn, and sctp_handle_fwdtsn works for fwdtsn. Note that different from sctp_handle_fwdtsn, sctp_handle_iftsn could do stream abort pd. Signed-off-by: Xin Long Acked-by: Marcelo R. Leitner Signed-off-by: David S. Miller --- include/net/sctp/stream_interleave.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sctp/stream_interleave.h b/include/net/sctp/stream_interleave.h index 0b55c70ac5af..6657711c8bc4 100644 --- a/include/net/sctp/stream_interleave.h +++ b/include/net/sctp/stream_interleave.h @@ -52,6 +52,8 @@ struct sctp_stream_interleave { void (*generate_ftsn)(struct sctp_outq *q, __u32 ctsn); bool (*validate_ftsn)(struct sctp_chunk *chunk); void (*report_ftsn)(struct sctp_ulpq *ulpq, __u32 ftsn); + void (*handle_ftsn)(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk); }; void sctp_stream_interleave_init(struct sctp_stream *stream); -- cgit v1.2.3 From acf568ee859f098279eadf551612f103afdacb4e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 15 Dec 2017 16:40:44 +1100 Subject: xfrm: Reinject transport-mode packets through tasklet This is an old bugbear of mine: https://www.mail-archive.com/netdev@vger.kernel.org/msg03894.html By crafting special packets, it is possible to cause recursion in our kernel when processing transport-mode packets at levels that are only limited by packet size. The easiest one is with DNAT, but an even worse one is where UDP encapsulation is used in which case you just have to insert an UDP encapsulation header in between each level of recursion. This patch avoids this problem by reinjecting tranport-mode packets through a tasklet. Fixes: b05e106698d9 ("[IPV4/6]: Netfilter IPsec input hooks") Signed-off-by: Herbert Xu Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dc28a98ce97c..ae35991b5877 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1570,6 +1570,9 @@ int xfrm_init_state(struct xfrm_state *x); int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); +int xfrm_trans_queue(struct sk_buff *skb, + int (*finish)(struct net *, struct sock *, + struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); -- cgit v1.2.3 From 958a1b5a5ed02a768eb27760268251af93090caf Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 11 Dec 2017 15:37:49 -0700 Subject: nl80211: Remove obsolete kerneldoc line Commit ca986ad9bcd3 (nl80211: allow multiple active scheduled scan requests) removed WIPHY_FLAG_SUPPORTS_SCHED_SCAN but left the kerneldoc description in place, leading to this docs-build warning: ./include/net/cfg80211.h:3278: warning: Excess enum value 'WIPHY_FLAG_SUPPORTS_SCHED_SCAN' description in 'wiphy_flags' Remove the line and gain a bit of peace. Signed-off-by: Jonathan Corbet Acked-by: Arend van Spriel Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8b8118a7fadb..cb4d92b79cd9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3226,7 +3226,6 @@ struct cfg80211_ops { * @WIPHY_FLAG_IBSS_RSN: The device supports IBSS RSN. * @WIPHY_FLAG_MESH_AUTH: The device supports mesh authentication by routing * auth frames to userspace. See @NL80211_MESH_SETUP_USERSPACE_AUTH. - * @WIPHY_FLAG_SUPPORTS_SCHED_SCAN: The device supports scheduled scans. * @WIPHY_FLAG_SUPPORTS_FW_ROAM: The device supports roaming feature in the * firmware. * @WIPHY_FLAG_AP_UAPSD: The device supports uapsd on AP. -- cgit v1.2.3 From 0973dd45ecefd746569d414406f5733062fe2817 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Dec 2017 10:10:48 +0100 Subject: Revert "mac80211: Add airtime account and scheduling to TXQs" This reverts commit b0d52ad821843a6c5badebd80feef9f871904fa6. We need to revert the TXQ scheduling API due to conflicts with a new driver, and this depends on that API. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 531b526a10db..45155803c875 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1188,8 +1188,6 @@ enum mac80211_rx_encoding { * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) * @nss: number of streams (VHT and HE only) * @flag: %RX_FLAG_\* - * @airtime: Duration of frame in usec. See @IEEE80211_HW_AIRTIME_ACCOUNTING for - * how to use this. * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags @@ -1204,7 +1202,6 @@ struct ieee80211_rx_status { u32 device_timestamp; u32 ampdu_reference; u32 flag; - u16 airtime; u16 freq; u8 enc_flags; u8 encoding:2, bw:3; @@ -2069,26 +2066,6 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * - * @IEEE80211_HW_AIRTIME_ACCOUNTING: Hardware supports accounting the airtime - * usage of other stations and reports it in the @tx_time and/or @airtime - * fields of the TX/RX status structs. - * When setting this flag, the driver should ensure that the respective - * fields in the TX and RX status structs are always either zero or - * contains a valid duration for the frame in usec. The driver can choose - * to report either or both of TX and RX airtime, but it is recommended to - * report both. - * The reported airtime should as a minimum include all time that is spent - * transmitting to the remote station, including overhead and padding, but - * not including time spent waiting for a TXOP. If the time is not reported - * by the hardware it can in some cases be calculated from the rate and - * known frame composition. When possible, the time should include any - * failed transmission attempts. - * For aggregated frames, there are two possible strategies to report the - * airtime: Either include the airtime of the entire aggregate in the first - * (or last) frame and leave the others at zero. Alternatively, include the - * overhead of the full aggregate in the first or last frame and report the - * time of each frame + padding not including the full aggregate overhead. - * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2132,7 +2109,6 @@ enum ieee80211_hw_flags { IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, - IEEE80211_HW_AIRTIME_ACCOUNTING, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From e7881bd5942df7df2fc450fd2aaa753fc4c4e125 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Dec 2017 10:11:54 +0100 Subject: Revert "mac80211: Add TXQ scheduling API" This reverts commit e937b8da5a591f141fe41aa48a2e898df9888c95. Turns out that a new driver (mt76) is coming in through Kalle's tree, and will conflict with this. It also has some conflicting requirements, so we'll revisit this later. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 45155803c875..906e90223066 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -105,12 +105,9 @@ * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * - * The driver can't access the queue directly. To obtain the next queue to pull - * frames from, the driver calls ieee80211_next_txq(). To dequeue a frame from a - * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a - * queue, it calls the .wake_tx_queue driver op. The driver is expected to - * re-schedule the txq using ieee80211_schedule_txq() if it is still active - * after the driver has finished pulling packets from it. + * The driver can't access the queue directly. To dequeue a frame, it calls + * ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a queue, it + * calls the .wake_tx_queue driver op. * * For AP powersave TIM handling, the driver only needs to indicate if it has * buffered packets in the driver specific data structures by calling @@ -3734,7 +3731,8 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_tdls_ch_sw_params *params); - void (*wake_tx_queue)(struct ieee80211_hw *hw); + void (*wake_tx_queue)(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); void (*sync_rx_queues)(struct ieee80211_hw *hw); int (*start_nan)(struct ieee80211_hw *hw, @@ -5885,36 +5883,13 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid); * ieee80211_tx_dequeue - dequeue a packet from a software tx queue * * @hw: pointer as obtained from ieee80211_alloc_hw() - * @txq: pointer obtained from ieee80211_next_txq() + * @txq: pointer obtained from station or virtual interface * * Returns the skb if successful, %NULL if no frame was available. */ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, struct ieee80211_txq *txq); -/** - * ieee80211_schedule_txq - add txq to scheduling loop - * - * @hw: pointer as obtained from ieee80211_alloc_hw() - * @txq: pointer obtained from station or virtual interface - * - * Returns %true if the txq was actually added to the scheduling, - * %false otherwise. - */ -bool ieee80211_schedule_txq(struct ieee80211_hw *hw, - struct ieee80211_txq *txq); - -/** - * ieee80211_next_txq - get next tx queue to pull packets from - * - * @hw: pointer as obtained from ieee80211_alloc_hw() - * - * Returns the next txq if successful, %NULL if no queue is eligible. If a txq - * is returned, it will have been removed from the scheduler queue and needs to - * be re-scheduled with ieee80211_schedule_txq() to continue to be active. - */ -struct ieee80211_txq *ieee80211_next_txq(struct ieee80211_hw *hw); - /** * ieee80211_txq_get_depth - get pending frame/byte count of given txq * -- cgit v1.2.3 From 983dafaab799511e092ffd006f3a064b37ccbccf Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Wed, 13 Dec 2017 19:51:36 +0200 Subject: cfg80211: Scan results to also report the per chain signal strength This commit enhances the scan results to report the per chain signal strength based on the latest BSS update. This provides similar information to what is already available through STA information. Signed-off-by: Sunil Dutt Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d7f8e7b96bcb..3a4a1a903a4d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1773,6 +1773,8 @@ enum cfg80211_signal_type { * by %parent_bssid. * @parent_bssid: the BSS according to which %parent_tsf is set. This is set to * the BSS that requested the scan in which the beacon/probe was received. + * @chains: bitmask for filled values in @chain_signal. + * @chain_signal: per-chain signal strength of last received BSS in dBm. */ struct cfg80211_inform_bss { struct ieee80211_channel *chan; @@ -1781,6 +1783,8 @@ struct cfg80211_inform_bss { u64 boottime_ns; u64 parent_tsf; u8 parent_bssid[ETH_ALEN] __aligned(2); + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; }; /** @@ -1824,6 +1828,8 @@ struct cfg80211_bss_ies { * that holds the beacon data. @beacon_ies is still valid, of course, and * points to the same data as hidden_beacon_bss->beacon_ies in that case. * @signal: signal strength value (type depends on the wiphy's signal_type) + * @chains: bitmask for filled values in @chain_signal. + * @chain_signal: per-chain signal strength of last received BSS in dBm. * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { @@ -1842,6 +1848,8 @@ struct cfg80211_bss { u16 capability; u8 bssid[ETH_ALEN]; + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; u8 priv[0] __aligned(sizeof(void *)); }; -- cgit v1.2.3 From 08fc7f8140730d2f8499c91b5abad44581b74635 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 14 Dec 2017 05:51:57 -0800 Subject: sock: Change the netns_core member name. Change the member name will make the code more readable. This patch will be used in next patch. Signed-off-by: Martin Zhang Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- include/net/netns/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 0ad4d0c71228..45cfb5dc76c7 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -11,7 +11,7 @@ struct netns_core { int sysctl_somaxconn; - struct prot_inuse __percpu *inuse; + struct prot_inuse __percpu *prot_inuse; }; #endif -- cgit v1.2.3 From 648845ab7e200993dccd3948c719c858368c91e7 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 14 Dec 2017 05:51:58 -0800 Subject: sock: Move the socket inuse to namespace. In some case, we want to know how many sockets are in use in different _net_ namespaces. It's a key resource metric. This patch add a member in struct netns_core. This is a counter for socket-inuse in the _net_ namespace. The patch will add/sub counter in the sk_alloc, sk_clone_lock and __sk_free. This patch will not counter the socket created in kernel. It's not very useful for userspace to know how many kernel sockets we created. The main reasons for doing this are that: 1. When linux calls the 'do_exit' for process to exit, the functions 'exit_task_namespaces' and 'exit_task_work' will be called sequentially. 'exit_task_namespaces' may have destroyed the _net_ namespace, but 'sock_release' called in 'exit_task_work' may use the _net_ namespace if we counter the socket-inuse in sock_release. 2. socket and sock are in pair. More important, sock holds the _net_ namespace. We counter the socket-inuse in sock, for avoiding holding _net_ namespace again in socket. It's a easy way to maintain the code. Signed-off-by: Martin Zhang Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- include/net/netns/core.h | 3 +++ include/net/sock.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 45cfb5dc76c7..a5e8a66c57b4 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -11,6 +11,9 @@ struct netns_core { int sysctl_somaxconn; +#ifdef CONFIG_PROC_FS + int __percpu *sock_inuse; +#endif struct prot_inuse __percpu *prot_inuse; }; diff --git a/include/net/sock.h b/include/net/sock.h index 9a9047268d37..0a32f3ce381c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1262,6 +1262,7 @@ proto_memory_pressure(struct proto *prot) /* Called with local bh disabled */ void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); int sock_prot_inuse_get(struct net *net, struct proto *proto); +int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc) -- cgit v1.2.3 From 398b841e4ad69a822f615442b5ea4ca767330a3b Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 14 Dec 2017 05:51:59 -0800 Subject: sock: Hide unused variable when !CONFIG_PROC_FS. When CONFIG_PROC_FS is disabled, we will not use the prot_inuse counter. This adds an #ifdef to hide the variable definition in that case. This is not a bugfix. But we can save bytes when there are many network namespace. Cc: Pavel Emelyanov Signed-off-by: Martin Zhang Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- include/net/netns/core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index a5e8a66c57b4..36c2d998a43c 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,8 +13,8 @@ struct netns_core { #ifdef CONFIG_PROC_FS int __percpu *sock_inuse; -#endif struct prot_inuse __percpu *prot_inuse; +#endif }; #endif -- cgit v1.2.3 From 3dca3f38cfb8efb8571040568cac7d0025fa5bb1 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 20 Dec 2017 10:41:31 +0100 Subject: xfrm: Separate ESP handling from segmentation for GRO packets. We change the ESP GSO handlers to only segment the packets. The ESP handling and encryption is defered to validate_xmit_xfrm() where this is done for non GRO packets too. This makes the code more robust and prepares for asynchronous crypto handling. Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1ec0c4760646..df7f3d0ac4a1 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1888,7 +1888,7 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) void __net_init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD -int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features); +struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); @@ -1929,9 +1929,9 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) } } #else -static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) +static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) { - return 0; + return skb; } static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) -- cgit v1.2.3 From f53c723902d1ac5f0b0a11d7c9dcbff748dde74e Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 20 Dec 2017 10:41:36 +0100 Subject: net: Add asynchronous callbacks for xfrm on layer 2. This patch implements asynchronous crypto callbacks and a backlog handler that can be used when IPsec is done at layer 2 in the TX path. It also extends the skb validate functions so that we can update the driver transmit return codes based on async crypto operation or to indicate that we queued the packet in a backlog queue. Joint work with: Aviv Heller Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index df7f3d0ac4a1..2517c4f7781a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1051,6 +1051,7 @@ struct xfrm_offload { #define XFRM_GSO_SEGMENT 16 #define XFRM_GRO 32 #define XFRM_ESP_NO_TRAILER 64 +#define XFRM_DEV_RESUME 128 __u32 status; #define CRYPTO_SUCCESS 1 @@ -1874,21 +1875,28 @@ static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) { return skb->sp->xvec[skb->sp->len - 1]; } +#endif + static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) { +#ifdef CONFIG_XFRM struct sec_path *sp = skb->sp; if (!sp || !sp->olen || sp->len != sp->olen) return NULL; return &sp->ovec[sp->olen - 1]; -} +#else + return NULL; #endif +} void __net_init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD -struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features); +void xfrm_dev_resume(struct sk_buff *skb); +void xfrm_dev_backlog(struct softnet_data *sd); +struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); @@ -1929,7 +1937,15 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) } } #else -static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) +static inline void xfrm_dev_resume(struct sk_buff *skb) +{ +} + +static inline void xfrm_dev_backlog(struct softnet_data *sd) +{ +} + +static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { return skb; } -- cgit v1.2.3 From 2271d5190ec60b06921c2e4e184fd1f4fad4e634 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Wed, 20 Dec 2017 10:41:48 +0100 Subject: xfrm: Allow IPsec GSO with software crypto for local sockets. With support of async crypto operations in the GSO codepath we have everything in place to allow GSO for local sockets. This patch enables the GSO codepath. Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2517c4f7781a..357764a2bb4e 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1910,6 +1910,8 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) return false; xdst = (struct xfrm_dst *) dst; + if (!x->xso.offload_handle && !xdst->child->xfrm) + return true; if (x->xso.offload_handle && (x->xso.dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) return true; -- cgit v1.2.3 From 102740bd9436a3a6ba129af3a48271d794009fa5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 19 Dec 2017 13:32:13 -0800 Subject: cls_bpf: fix offload assumptions after callback conversion cls_bpf used to take care of tracking what offload state a filter is in, i.e. it would track if offload request succeeded or not. This information would then be used to issue correct requests to the driver, e.g. requests for statistics only on offloaded filters, removing only filters which were offloaded, using add instead of replace if previous filter was not added etc. This tracking of offload state no longer functions with the new callback infrastructure. There could be multiple entities trying to offload the same filter. Throw out all the tracking and corresponding commands and simply pass to the drivers both old and new bpf program. Drivers will have to deal with offload state tracking by themselves. Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload") Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0105445cab83..8e08b6da72f3 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -694,9 +694,7 @@ struct tc_cls_matchall_offload { }; enum tc_clsbpf_command { - TC_CLSBPF_ADD, - TC_CLSBPF_REPLACE, - TC_CLSBPF_DESTROY, + TC_CLSBPF_OFFLOAD, TC_CLSBPF_STATS, }; @@ -705,6 +703,7 @@ struct tc_cls_bpf_offload { enum tc_clsbpf_command command; struct tcf_exts *exts; struct bpf_prog *prog; + struct bpf_prog *oldprog; const char *name; bool exts_integrated; u32 gen_flags; -- cgit v1.2.3 From 563e0bb0dc74b3ca888e24f8c08f0239fe4016b0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 20 Dec 2017 11:12:51 +0800 Subject: net: tracepoint: replace tcp_set_state tracepoint with inet_sock_set_state tracepoint As sk_state is a common field for struct sock, so the state transition tracepoint should not be a TCP specific feature. Currently it traces all AF_INET state transition, so I rename this tracepoint to inet_sock_set_state tracepoint with some minor changes and move it into trace/events/sock.h. We dont need to create a file named trace/events/inet_sock.h for this one single tracepoint. Two helpers are introduced to trace sk_state transition - void inet_sk_state_store(struct sock *sk, int newstate); - void inet_sk_set_state(struct sock *sk, int state); As trace header should not be included in other header files, so they are defined in sock.c. The protocol such as SCTP maybe compiled as a ko, hence export inet_sk_set_state(). Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/net/inet_sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 39efb968b7a4..a3431a4ff9cc 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -290,6 +290,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #endif int inet_sk_rebuild_header(struct sock *sk); +void inet_sk_set_state(struct sock *sk, int state); +void inet_sk_state_store(struct sock *sk, int newstate); static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, -- cgit v1.2.3 From 986ffdfd08dbaae721e82720e6bfc2c307e732dd Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 20 Dec 2017 11:12:52 +0800 Subject: net: sock: replace sk_state_load with inet_sk_state_load and remove sk_state_store sk_state_load is only used by AF_INET/AF_INET6, so rename it to inet_sk_state_load and move it into inet_sock.h. sk_state_store is removed as it is not used any more. Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/net/inet_sock.h | 25 ++++++++++++++++++++++++- include/net/sock.h | 25 ------------------------- 2 files changed, 24 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a3431a4ff9cc..0a671c32d6b9 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -290,9 +290,32 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #endif int inet_sk_rebuild_header(struct sock *sk); -void inet_sk_set_state(struct sock *sk, int state); + +/** + * inet_sk_state_load - read sk->sk_state for lockless contexts + * @sk: socket pointer + * + * Paired with inet_sk_state_store(). Used in places we don't hold socket lock: + * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... + */ +static inline int inet_sk_state_load(const struct sock *sk) +{ + /* state change might impact lockless readers. */ + return smp_load_acquire(&sk->sk_state); +} + +/** + * inet_sk_state_store - update sk->sk_state + * @sk: socket pointer + * @newstate: new state + * + * Paired with inet_sk_state_load(). Should be used in contexts where + * state change might impact lockless readers. + */ void inet_sk_state_store(struct sock *sk, int newstate); +void inet_sk_set_state(struct sock *sk, int state); + static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, const __be32 faddr, diff --git a/include/net/sock.h b/include/net/sock.h index 0a32f3ce381c..6c1db823f8b9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2333,31 +2333,6 @@ static inline bool sk_listener(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } -/** - * sk_state_load - read sk->sk_state for lockless contexts - * @sk: socket pointer - * - * Paired with sk_state_store(). Used in places we do not hold socket lock : - * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... - */ -static inline int sk_state_load(const struct sock *sk) -{ - return smp_load_acquire(&sk->sk_state); -} - -/** - * sk_state_store - update sk->sk_state - * @sk: socket pointer - * @newstate: new state - * - * Paired with sk_state_load(). Should be used in contexts where - * state change might impact lockless readers. - */ -static inline void sk_state_store(struct sock *sk, int newstate) -{ - smp_store_release(&sk->sk_state, newstate); -} - void sock_enable_timestamp(struct sock *sk, int flag); int sock_get_timestamp(struct sock *, struct timeval __user *); int sock_get_timestampns(struct sock *, struct timespec __user *); -- cgit v1.2.3 From 7f05b467a735aba1476d9ae8e0ae9d9d8e60066c Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 19 Dec 2017 15:35:47 -0800 Subject: xfrm: check for xdo_dev_state_free The current XFRM code assumes that we've implemented the xdo_dev_state_free() callback, even if it is meaningless to the driver. This patch adds a check for it before calling, as done in other APIs, to prevent a NULL function pointer kernel crash. Signed-off-by: Shannon Nelson Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 357764a2bb4e..079ea9455bcd 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1933,7 +1933,8 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) struct net_device *dev = xso->dev; if (dev && dev->xfrmdev_ops) { - dev->xfrmdev_ops->xdo_dev_state_free(x); + if (dev->xfrmdev_ops->xdo_dev_state_free) + dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; dev_put(dev); } -- cgit v1.2.3 From e63d7dfd2df7aa204849599c6f378e627e926657 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:13 -0500 Subject: net: sched: sch: add extack for init callback This patch adds extack support for init callback to prepare per-qdisc specific changes for extack. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bc6b25faba99..4c5faa0ff47d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -189,7 +189,8 @@ struct Qdisc_ops { struct sk_buff * (*dequeue)(struct Qdisc *); struct sk_buff * (*peek)(struct Qdisc *); - int (*init)(struct Qdisc *sch, struct nlattr *arg); + int (*init)(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack); void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); int (*change)(struct Qdisc *sch, -- cgit v1.2.3 From 2030721cc0c39ff19df94a0df77b0401fdb71c1a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:14 -0500 Subject: net: sched: sch: add extack for change qdisc ops This patch adds extack support for change callback for qdisc ops structtur to prepare per-qdisc specific changes for extack. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 4c5faa0ff47d..e7a3e206b904 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -194,7 +194,8 @@ struct Qdisc_ops { void (*reset)(struct Qdisc *); void (*destroy)(struct Qdisc *); int (*change)(struct Qdisc *sch, - struct nlattr *arg); + struct nlattr *arg, + struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); int (*dump)(struct Qdisc *, struct sk_buff *); -- cgit v1.2.3 From 793d81d6a1965f1e1806ebc9aacc84a639b90282 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:15 -0500 Subject: net: sched: sch: add extack to change class This patch adds extack support for class change callback api. This prepares to handle extack support inside each specific class implementation. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e7a3e206b904..b4660a3ea99c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -158,7 +158,8 @@ struct Qdisc_class_ops { /* Class manipulation routines */ unsigned long (*find)(struct Qdisc *, u32 classid); int (*change)(struct Qdisc *, u32, u32, - struct nlattr **, unsigned long *); + struct nlattr **, unsigned long *, + struct netlink_ext_ack *); int (*delete)(struct Qdisc *, unsigned long); void (*walk)(struct Qdisc *, struct qdisc_walker * arg); -- cgit v1.2.3 From cbaacc4e8a394d63bcd707775ca5bb7a51aaabee Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:16 -0500 Subject: net: sched: sch: add extack for block callback This patch adds extack support for block callback to prepare per-qdisc specific changes for extack. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b4660a3ea99c..f65dd2837142 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -165,7 +165,8 @@ struct Qdisc_class_ops { /* Filter manipulation */ struct tcf_block * (*tcf_block)(struct Qdisc *sch, - unsigned long arg); + unsigned long arg, + struct netlink_ext_ack *extack); unsigned long (*bind_tcf)(struct Qdisc *, unsigned long, u32 classid); void (*unbind_tcf)(struct Qdisc *, unsigned long); -- cgit v1.2.3 From 653d6fd68d8e5b43d496ca8a1d38331d515a226b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:17 -0500 Subject: net: sched: sch: add extack for graft callback This patch adds extack support for graft callback to prepare per-qdisc specific changes for extack. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f65dd2837142..3baadac9e7a5 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -151,7 +151,8 @@ struct Qdisc_class_ops { /* Child qdisc manipulation */ struct netdev_queue * (*select_queue)(struct Qdisc *, struct tcmsg *); int (*graft)(struct Qdisc *, unsigned long cl, - struct Qdisc *, struct Qdisc **); + struct Qdisc *, struct Qdisc **, + struct netlink_ext_ack *extack); struct Qdisc * (*leaf)(struct Qdisc *, unsigned long cl); void (*qlen_notify)(struct Qdisc *, unsigned long); -- cgit v1.2.3 From e9bc3fa28bae7612f41e3538f241a2f87f629c94 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:18 -0500 Subject: net: sch: api: add extack support in qdisc_get_rtab This patch adds extack support for the function qdisc_get_rtab which is a common used function in the tc subsystem. Callers which are interested in the receiving error can assign extack to get a more detailed information why qdisc_get_rtab failed. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 240469228851..a4f21c0b4a43 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -101,7 +101,8 @@ void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, - struct nlattr *tab); + struct nlattr *tab, + struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); -- cgit v1.2.3 From 8d1a77f974ca61d39afa5bf0aeab210525d31475 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:19 -0500 Subject: net: sch: api: add extack support in tcf_block_get This patch adds extack support for the function tcf_block_get which is a common used function in the tc subsystem. Callers which are interested in the receiving error can assign extack to get a more detailed information why tcf_block_get failed. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0105445cab83..58bba9c769ea 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,9 +39,11 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q); + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct netlink_ext_ack *extack); int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, - struct tcf_block_ext_info *ei); + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack); void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); -- cgit v1.2.3 From d0bd684dddab51ed017ece0359f26b038ec31940 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:20 -0500 Subject: net: sch: api: add extack support in qdisc_alloc This patch adds extack support for the function qdisc_alloc which is a common used function in the tc subsystem. Callers which are interested in the receiving error can assign extack to get a more detailed information why qdisc_alloc failed. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3baadac9e7a5..faf6b2dbc1b2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -471,7 +471,8 @@ void qdisc_destroy(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n, unsigned int len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, - const struct Qdisc_ops *ops); + const struct Qdisc_ops *ops, + struct netlink_ext_ack *extack); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid); void __qdisc_calculate_pkt_len(struct sk_buff *skb, -- cgit v1.2.3 From a38a98821c939e67e5906bddbed1d15af5ca860d Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 20 Dec 2017 12:35:21 -0500 Subject: net: sch: api: add extack support in qdisc_create_dflt This patch adds extack support for the function qdisc_create_dflt which is a common used function in the tc subsystem. Callers which are interested in the receiving error can assign extack to get a more detailed information why qdisc_create_dflt failed. The function qdisc_create_dflt will also call an init callback which can fail by any per-qdisc specific handling. Cc: David Ahern Acked-by: Jamal Hadi Salim Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 3 ++- include/net/sch_generic.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index a4f21c0b4a43..e2c75f52557b 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -89,7 +89,8 @@ extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, - unsigned int limit); + unsigned int limit, + struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); int unregister_qdisc(struct Qdisc_ops *qops); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index faf6b2dbc1b2..ac029d5d88e4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -474,7 +474,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, - const struct Qdisc_ops *ops, u32 parentid); + const struct Qdisc_ops *ops, u32 parentid, + struct netlink_ext_ack *extack); void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab); int skb_do_redirect(struct sk_buff *); -- cgit v1.2.3 From 3c1490913f3bcf750261649e7ffd62390187058c Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Fri, 22 Dec 2017 15:52:05 +0000 Subject: net: sch: api: fix tcf_block_get The build of mips bcm47xx_defconfig is failing with the error: net/sched/sch_fq_codel.c: In function 'fq_codel_init': net/sched/sch_fq_codel.c:487:8: error: too many arguments to function 'tcf_block_get' While adding the extack support, the commit missed adding it in the headers when CONFIG_NET_CLS is not defined. Fixes: 8d1a77f974ca ("net: sch: api: add extack support in tcf_block_get") Signed-off-by: Sudip Mukherjee Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 31574c958673..5cd3cf51cb35 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -79,7 +79,8 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, #else static inline int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, + struct netlink_ext_ack *extack) { return 0; } -- cgit v1.2.3 From 602f7a2714a3b3aa4bec82ab0a86a9f5a2c4aa61 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 28 Dec 2017 11:00:43 -0800 Subject: sock: Add sock_owned_by_user_nocheck This allows checking socket lock ownership with producing lockdep warnings. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/sock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 9155da422692..7a7b14e9628a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1514,6 +1514,11 @@ static inline bool sock_owned_by_user(const struct sock *sk) return sk->sk_lock.owned; } +static inline bool sock_owned_by_user_nocheck(const struct sock *sk) +{ + return sk->sk_lock.owned; +} + /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { -- cgit v1.2.3 From f1c8d3720f2e6c8c2b209120678236debd0360e5 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 2 Jan 2018 14:05:19 -0800 Subject: vxlan: trivial indenting fix. Fix indentation of reserved_flags2 field in vxlanhdr_gpe. Fixes: e1e5314de08b ("vxlan: implement GPE") Signed-off-by: William Tu Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 13223396dc64..f96391e84a8a 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -146,7 +146,7 @@ struct vxlanhdr_gpe { np_applied:1, instance_applied:1, version:2, -reserved_flags2:2; + reserved_flags2:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags2:2, version:2, -- cgit v1.2.3 From ff847ee47be27621f978921919f035fcd87d6d08 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Sat, 3 Jun 2017 20:10:03 +0200 Subject: can: af_can: give struct holding the CAN per device receive lists a sensible name This patch adds a "can_" prefix to the "struct dev_rcv_lists" to better reflect the meaning and improbe code readability. The conversion is done with: sed -i \ -e "s/struct dev_rcv_lists/struct can_dev_rcv_lists/g" \ net/can/*.[ch] include/net/netns/can.h Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/net/netns/can.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/can.h b/include/net/netns/can.h index ecf238b8862c..ca9bd9fba5b5 100644 --- a/include/net/netns/can.h +++ b/include/net/netns/can.h @@ -8,7 +8,7 @@ #include -struct dev_rcv_lists; +struct can_dev_rcv_lists; struct s_stats; struct s_pstats; @@ -28,7 +28,7 @@ struct netns_can { #endif /* receive filters subscribed for 'all' CAN devices */ - struct dev_rcv_lists *can_rx_alldev_list; + struct can_dev_rcv_lists *can_rx_alldev_list; spinlock_t can_rcvlists_lock; struct timer_list can_stattimer;/* timer for statistics update */ struct s_stats *can_stats; /* packet statistics */ -- cgit v1.2.3 From 33c30a8b68cf837b2be3e885ae0a329b880d7666 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 3 Jan 2018 17:30:45 -0800 Subject: net: sched: fix tcf_block_get_ext() in case CONFIG_NET_CLS is not set The definition of functions tcf_block_get() and tcf_block_get_ext() depends of CONFIG_NET_CLS being set. When those functions gained extack support, only one version of the declaration of those functions was updated. Function tcf_block_get() was later fixed with commit 3c1490913f3b ("net: sch: api: fix tcf_block_get"). Change arguments of tcf_block_get_ext() for the case when CONFIG_NET_CLS is not set. Fixes: 8d1a77f974ca ("net: sch: api: add extack support in tcf_block_get") Signed-off-by: Quentin Monnet Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 5cd3cf51cb35..c4f4e46ea8d6 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -87,7 +87,8 @@ int tcf_block_get(struct tcf_block **p_block, static inline int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, - struct tcf_block_ext_info *ei) + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { return 0; } -- cgit v1.2.3 From aecd67b60722dd24353b0bc50e78a55b30707dcd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:25:13 +0100 Subject: xdp: base API for new XDP rx-queue info concept This patch only introduce the core data structures and API functions. All XDP enabled drivers must use the API before this info can used. There is a need for XDP to know more about the RX-queue a given XDP frames have arrived on. For both the XDP bpf-prog and kernel side. Instead of extending xdp_buff each time new info is needed, the patch creates a separate read-mostly struct xdp_rxq_info, that contains this info. We stress this data/cache-line is for read-only info. This is NOT for dynamic per packet info, use the data_meta for such use-cases. The performance advantage is this info can be setup at RX-ring init time, instead of updating N-members in xdp_buff. A possible (driver level) micro optimization is that xdp_buff->rxq assignment could be done once per XDP/NAPI loop. The extra pointer deref only happens for program needing access to this info (thus, no slowdown to existing use-cases). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 include/net/xdp.h (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h new file mode 100644 index 000000000000..86c41631a908 --- /dev/null +++ b/include/net/xdp.h @@ -0,0 +1,47 @@ +/* include/net/xdp.h + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#ifndef __LINUX_NET_XDP_H__ +#define __LINUX_NET_XDP_H__ + +/** + * DOC: XDP RX-queue information + * + * The XDP RX-queue info (xdp_rxq_info) is associated with the driver + * level RX-ring queues. It is information that is specific to how + * the driver have configured a given RX-ring queue. + * + * Each xdp_buff frame received in the driver carry a (pointer) + * reference to this xdp_rxq_info structure. This provides the XDP + * data-path read-access to RX-info for both kernel and bpf-side + * (limited subset). + * + * For now, direct access is only safe while running in NAPI/softirq + * context. Contents is read-mostly and must not be updated during + * driver NAPI/softirq poll. + * + * The driver usage API is a register and unregister API. + * + * The struct is not directly tied to the XDP prog. A new XDP prog + * can be attached as long as it doesn't change the underlying + * RX-ring. If the RX-ring does change significantly, the NIC driver + * naturally need to stop the RX-ring before purging and reallocating + * memory. In that process the driver MUST call unregistor (which + * also apply for driver shutdown and unload). The register API is + * also mandatory during RX-ring setup. + */ + +struct xdp_rxq_info { + struct net_device *dev; + u32 queue_index; + u32 reg_state; +} ____cacheline_aligned; /* perf critical, avoid false-sharing */ + +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index); +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); + +#endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From c0124f327e5cabd844a10d7e1fc5aa2a81e796a9 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:25:34 +0100 Subject: xdp/qede: setup xdp_rxq_info and intro xdp_rxq_info_is_reg The driver code qede_free_fp_array() depend on kfree() can be called with a NULL pointer. This stems from the qede_alloc_fp_array() function which either (kz)alloc memory for fp->txq or fp->rxq. This also simplifies error handling code in case of memory allocation failures, but xdp_rxq_info_unreg need to know the difference. Introduce xdp_rxq_info_is_reg() to handle if a memory allocation fails and detect this is the failure path by seeing that xdp_rxq_info was not registred yet, which first happens after successful alloaction in qede_init_fp(). Driver hook points for xdp_rxq_info: * reg : qede_init_fp * unreg: qede_free_fp_array Tested on actual hardware with samples/bpf program. V2: Driver have no proper error path for failed XDP RX-queue info reg, as qede_init_fp() is a void function. Cc: everest-linux-l2@cavium.com Cc: Ariel Elior Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 86c41631a908..b2362ddfa694 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -43,5 +43,6 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); #endif /* __LINUX_NET_XDP_H__ */ -- cgit v1.2.3 From 2127d95aef6c795c3bd8b805722c5c46e8fe45dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:03 +0200 Subject: ipv6: Clear nexthop flags upon netdev up Previous patch marked nexthops with the 'dead' and 'linkdown' flags. Clear these flags when the netdev comes back up. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 18e442ea93d8..caad39198c2a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -169,6 +169,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { -- cgit v1.2.3 From 4c981e28d373e391b76577635e7e216976b71c57 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:04 +0200 Subject: ipv6: Prepare to handle multiple netdev events To make IPv6 more in line with IPv4 we need to be able to respond differently to different netdev events. For example, when a netdev is unregistered all the routes using it as their nexthop device should be flushed, whereas when the netdev's carrier changes only the 'linkdown' flag should be toggled. Currently, this is not possible, as the function that traverses the routing tables is not aware of the triggering event. Propagate the triggering event down, so that it could be used in later patches. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index caad39198c2a..6a2f80cbdf65 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -165,11 +165,11 @@ struct rt6_rtnl_dump_arg { }; int rt6_dump_route(struct rt6_info *rt, void *p_arg); -void rt6_ifdown(struct net *net, struct net_device *dev); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); +void rt6_disable_ip(struct net_device *dev, unsigned long event); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { -- cgit v1.2.3 From 27c6fa73f93b81671a77bdaa242473c3bda0ac4a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:05 +0200 Subject: ipv6: Set nexthop flags upon carrier change Similar to IPv4, when the carrier of a netdev changes we should toggle the 'linkdown' flag on all the nexthops using it as their nexthop device. This will later allow us to test for the presence of this flag during route lookup and dump. Up until commit 4832c30d5458 ("net: ipv6: put host and anycast routes on device with address") host and anycast routes used the loopback netdev as their nexthop device and thus were not marked with the 'linkdown' flag. The patch preserves this behavior and allows one to ping the local address even when the nexthop device does not have a carrier and the 'ignore_routes_with_linkdown' sysctl is set. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6a2f80cbdf65..34cd3b0c6ded 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -170,6 +170,7 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); +void rt6_sync_down_dev(struct net_device *dev, unsigned long event); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { -- cgit v1.2.3 From a2c554d3f8f65dd9c46a55811efad8156e328f85 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:11 +0200 Subject: ipv6: Add explicit flush indication to routes When routes that are a part of a multipath route are evaluated by fib6_ifdown() in response to NETDEV_DOWN and NETDEV_UNREGISTER events the state of their sibling routes is not considered. This will change in subsequent patches in order to align IPv6 with IPv4's behavior. For example, when the last sibling in a multipath route becomes dead, the entire multipath route needs to be removed. To prevent the tree walker from re-evaluating all the sibling routes each time, we can simply evaluate them once - when the first sibling is traversed. If we determine the entire multipath route needs to be removed, then the 'should_flush' bit is set in all the siblings, which will cause the walker to flush them when it traverses them. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 44d96a91e745..affea1aa6ae4 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -173,7 +173,8 @@ struct rt6_info { unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 exception_bucket_flushed:1, - unused:7; + should_flush:1, + unused:6; }; #define for_each_fib6_node_rt_rcu(fn) \ -- cgit v1.2.3 From 4a8e56ee2c8551e674f69ba007aabede8f0b88d9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 7 Jan 2018 12:45:13 +0200 Subject: ipv6: Export sernum update function We are going to allow dead routes to stay in the FIB tree (e.g., when they are part of a multipath route, directly connected route with no carrier) and revive them when their nexthop device gains carrier or when it is put administratively up. This is equivalent to the addition of the route to the FIB tree and we should therefore take care of updating the sernum of all the parent nodes of the node where the route is stored. Otherwise, we risk sockets caching and using sub-optimal dst entries. Export the function that performs the above, so that it could be invoked from fib6_ifup() later on. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index affea1aa6ae4..ddf53dd1e948 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -405,6 +405,7 @@ unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); void fib6_update_sernum(struct rt6_info *rt); +void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt); #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); -- cgit v1.2.3 From 39215846740a9f29ac7dac276f9df98135f39bb0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Nov 2017 07:20:07 +0100 Subject: netfilter: conntrack: remove nlattr_size pointer from l4proto trackers similar to previous commit, but instead compute this at compile time and turn nlattr_size into an u16. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 7ef56c13698a..0e5618ec8b9d 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -27,6 +27,9 @@ struct nf_conntrack_l4proto { /* Resolve clashes on insertion races. */ bool allow_clash; + /* protoinfo nlattr size, closes a hole */ + u16 nlattr_size; + /* Try to fill in the third arg: dataoff is offset past network protocol hdr. Return true if possible. */ bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff, @@ -66,8 +69,6 @@ struct nf_conntrack_l4proto { /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct); - /* Calculate protoinfo nlattr size */ - int (*nlattr_size)(void); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); @@ -80,8 +81,6 @@ struct nf_conntrack_l4proto { struct nf_conntrack_tuple *t); const struct nla_policy *nla_policy; - size_t nla_size; - #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) struct { int (*nlattr_to_obj)(struct nlattr *tb[], -- cgit v1.2.3 From cd9ceafc0a761a15b4cbfe6c0024edf88f861d66 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Nov 2017 07:20:08 +0100 Subject: netfilter: conntrack: constify list of builtin trackers Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_l4proto.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 0e5618ec8b9d..7fbb8f64a96e 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -125,18 +125,18 @@ int nf_ct_l4proto_pernet_register_one(struct net *net, void nf_ct_l4proto_pernet_unregister_one(struct net *net, const struct nf_conntrack_l4proto *proto); int nf_ct_l4proto_pernet_register(struct net *net, - struct nf_conntrack_l4proto *const proto[], + const struct nf_conntrack_l4proto *const proto[], unsigned int num_proto); void nf_ct_l4proto_pernet_unregister(struct net *net, - struct nf_conntrack_l4proto *const proto[], + const struct nf_conntrack_l4proto *const proto[], unsigned int num_proto); /* Protocol global registration. */ -int nf_ct_l4proto_register_one(struct nf_conntrack_l4proto *proto); +int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *proto); void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto); -int nf_ct_l4proto_register(struct nf_conntrack_l4proto *proto[], +int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const proto[], unsigned int num_proto); -void nf_ct_l4proto_unregister(struct nf_conntrack_l4proto *proto[], +void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const proto[], unsigned int num_proto); /* Generic netlink helpers */ -- cgit v1.2.3 From 9dae47aba0a055f761176d9297371d5bb24289ec Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 7 Nov 2017 07:20:09 +0100 Subject: netfilter: conntrack: l4 protocol trackers can be const previous patches removed all writes to these structs so we can now mark them as const. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 12 ++++++------ include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 12 ++++++------ include/net/netfilter/nf_conntrack_l4proto.h | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 4ed1040bbe4a..73f825732326 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -13,17 +13,17 @@ const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; #ifdef CONFIG_NF_CT_PROTO_DCCP -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4; #endif #ifdef CONFIG_NF_CT_PROTO_SCTP -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4; #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4; #endif int nf_conntrack_ipv4_compat_init(void); diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index 9cd55be95853..effa8dfba68c 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -4,17 +4,17 @@ extern const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; #ifdef CONFIG_NF_CT_PROTO_DCCP -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6; #endif #ifdef CONFIG_NF_CT_PROTO_SCTP -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6; #endif #ifdef CONFIG_NF_CT_PROTO_UDPLITE -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6; #endif #include diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 7fbb8f64a96e..a7220eef9aee 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -108,7 +108,7 @@ struct nf_conntrack_l4proto { }; /* Existing built-in generic protocol */ -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; +extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO 256 -- cgit v1.2.3 From 6b3d933000cbe539e5b234d639b083da60bb275c Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Mon, 13 Nov 2017 22:58:18 +0800 Subject: netfilter: ipvs: Remove useless ipvsh param of frag_safe_skb_hp The param of frag_safe_skb_hp, ipvsh, isn't used now. So remove it and update the callers' codes too. Signed-off-by: Gao Feng Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff68cf288f9b..eb0bec043c96 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -69,8 +69,7 @@ struct ip_vs_iphdr { }; static inline void *frag_safe_skb_hp(const struct sk_buff *skb, int offset, - int len, void *buffer, - const struct ip_vs_iphdr *ipvsh) + int len, void *buffer) { return skb_header_pointer(skb, offset, len, buffer); } -- cgit v1.2.3 From 26888dfd7e7454686b8d3ea9ba5045d5f236e4d7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 1 Dec 2017 00:21:03 +0100 Subject: netfilter: core: remove synchronize_net call if nfqueue is used since commit 960632ece6949b ("netfilter: convert hook list to an array") nfqueue no longer stores a pointer to the hook that caused the packet to be queued. Therefore no extra synchronize_net() call is needed after dropping the packets enqueued by the old rule blob. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 814058d0f167..a50a69f5334c 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -25,7 +25,7 @@ struct nf_queue_entry { struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); - unsigned int (*nf_hook_drop)(struct net *net); + void (*nf_hook_drop)(struct net *net); }; void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh); -- cgit v1.2.3 From b0f38338aef2dae5ade3c16acf713737e3b15a73 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 3 Dec 2017 00:58:47 +0100 Subject: netfilter: reduce size of hook entry point locations struct net contains: struct nf_hook_entries __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; which store the hook entry point locations for the various protocol families and the hooks. Using array results in compact c code when doing accesses, i.e. x = rcu_dereference(net->nf.hooks[pf][hook]); but its also wasting a lot of memory, as most families are not used. So split the array into those families that are used, which are only 5 (instead of 13). In most cases, the 'pf' argument is constant, i.e. gcc removes switch statement. struct net before: /* size: 5184, cachelines: 81, members: 46 */ after: /* size: 4672, cachelines: 73, members: 46 */ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/netfilter.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index cc00af2ac2d7..b39c563c2fce 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -17,7 +17,11 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif - struct nf_hook_entries __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks_ipv4[NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks_ipv6[NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks_arp[NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks_bridge[NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks_decnet[NF_MAX_HOOKS]; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) bool defrag_ipv4; #endif -- cgit v1.2.3 From ef57170bbfdd6958281011332b1fd237712f69f0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Dec 2017 16:28:24 +0100 Subject: netfilter: reduce hook array sizes to what is needed Not all families share the same hook count, adjust sizes to what is needed. struct net before: /* size: 6592, cachelines: 103, members: 46 */ after: /* size: 5952, cachelines: 93, members: 46 */ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/netfilter.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index b39c563c2fce..8f756a4b9205 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -17,11 +17,11 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif - struct nf_hook_entries __rcu *hooks_ipv4[NF_MAX_HOOKS]; - struct nf_hook_entries __rcu *hooks_ipv6[NF_MAX_HOOKS]; - struct nf_hook_entries __rcu *hooks_arp[NF_MAX_HOOKS]; - struct nf_hook_entries __rcu *hooks_bridge[NF_MAX_HOOKS]; - struct nf_hook_entries __rcu *hooks_decnet[NF_MAX_HOOKS]; + struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS]; + struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; + struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS]; + struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; + struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) bool defrag_ipv4; #endif -- cgit v1.2.3 From bb4badf3a3dc81190f7c1c1fa063cdefb18df45f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Dec 2017 16:28:25 +0100 Subject: netfilter: don't allocate space for decnet hooks unless needed no need to define hook points if the family isn't supported. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/netfilter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 8f756a4b9205..432609fd9899 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -21,7 +21,9 @@ struct netns_nf { struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; +#if IS_ENABLED(CONFIG_DECNET) struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; +#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) bool defrag_ipv4; #endif -- cgit v1.2.3 From 2a95183a5e0375df756efb2ca37602d71e8455f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 7 Dec 2017 16:28:26 +0100 Subject: netfilter: don't allocate space for arp/bridge hooks unless needed no need to define hook points if the family isn't supported. Because we need these hooks for either nftables, arp/ebtables or the 'call-iptables' hack we have in the bridge layer add two new dependencies, NETFILTER_FAMILY_{ARP,BRIDGE}, and have the users select them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/netfilter.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/net') diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 432609fd9899..ca043342c0eb 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -19,8 +19,12 @@ struct netns_nf { #endif struct nf_hook_entries __rcu *hooks_ipv4[NF_INET_NUMHOOKS]; struct nf_hook_entries __rcu *hooks_ipv6[NF_INET_NUMHOOKS]; +#ifdef CONFIG_NETFILTER_FAMILY_ARP struct nf_hook_entries __rcu *hooks_arp[NF_ARP_NUMHOOKS]; +#endif +#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; +#endif #if IS_ENABLED(CONFIG_DECNET) struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; #endif -- cgit v1.2.3 From 7a4473a31a6974c0fbf9afe80ef16ac5bc67cf79 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 10 Dec 2017 01:43:14 +0100 Subject: netfilter: nf_tables: explicit nft_set_pktinfo() call from hook path Instead of calling this function from the family specific variant, this reduces the code size in the fast path for the netdev, bridge and inet families. After this change, we must call nft_set_pktinfo() upfront from the chain hook indirection. Before: text data bss dec hex filename 2145 208 0 2353 931 net/netfilter/nf_tables_netdev.o After: text data bss dec hex filename 2125 208 0 2333 91d net/netfilter/nf_tables_netdev.o Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 12 ++---------- include/net/netfilter/nf_tables_ipv4.h | 25 ++++++++----------------- include/net/netfilter/nf_tables_ipv6.h | 27 +++++++++------------------ 3 files changed, 19 insertions(+), 45 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fecc6112c768..f6e4325b3306 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -54,8 +54,8 @@ static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, pkt->xt.state = state; } -static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt, - struct sk_buff *skb) +static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt, + struct sk_buff *skb) { pkt->tprot_set = false; pkt->tprot = 0; @@ -63,14 +63,6 @@ static inline void nft_set_pktinfo_proto_unspec(struct nft_pktinfo *pkt, pkt->xt.fragoff = 0; } -static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - nft_set_pktinfo(pkt, skb, state); - nft_set_pktinfo_proto_unspec(pkt, skb); -} - /** * struct nft_verdict - nf_tables verdict * diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index f0896ba456c4..b2deeb2755a4 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -5,15 +5,11 @@ #include #include -static inline void -nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) +static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, + struct sk_buff *skb) { struct iphdr *ip; - nft_set_pktinfo(pkt, skb, state); - ip = ip_hdr(pkt->skb); pkt->tprot_set = true; pkt->tprot = ip->protocol; @@ -21,10 +17,8 @@ nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt, pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET; } -static inline int -__nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) +static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb) { struct iphdr *iph, _iph; u32 len, thoff; @@ -52,14 +46,11 @@ __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, return 0; } -static inline void -nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) +static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb) { - nft_set_pktinfo(pkt, skb, state); - if (__nft_set_pktinfo_ipv4_validate(pkt, skb, state) < 0) - nft_set_pktinfo_proto_unspec(pkt, skb); + if (__nft_set_pktinfo_ipv4_validate(pkt, skb) < 0) + nft_set_pktinfo_unspec(pkt, skb); } extern struct nft_af_info nft_af_ipv4; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index b8065b72f56e..1890c5bc3c3c 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -5,20 +5,16 @@ #include #include -static inline void -nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) +static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, + struct sk_buff *skb) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; - nft_set_pktinfo(pkt, skb, state); - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0) { - nft_set_pktinfo_proto_unspec(pkt, skb); + nft_set_pktinfo_unspec(pkt, skb); return; } @@ -28,10 +24,8 @@ nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt, pkt->xt.fragoff = frag_off; } -static inline int -__nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) +static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; @@ -68,14 +62,11 @@ __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, #endif } -static inline void -nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) +static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, + struct sk_buff *skb) { - nft_set_pktinfo(pkt, skb, state); - if (__nft_set_pktinfo_ipv6_validate(pkt, skb, state) < 0) - nft_set_pktinfo_proto_unspec(pkt, skb); + if (__nft_set_pktinfo_ipv6_validate(pkt, skb) < 0) + nft_set_pktinfo_unspec(pkt, skb); } extern struct nft_af_info nft_af_ipv6; -- cgit v1.2.3 From 408070d6ee3490da63430bc8ce13348cf2eb47ea Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 24 Nov 2017 13:39:57 +0100 Subject: netfilter: nf_tables: add nft_set_is_anonymous() helper Add helper function to test for the NFT_SET_ANONYMOUS flag. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6e4325b3306..169b562df226 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -416,6 +416,11 @@ struct nft_set { __attribute__((aligned(__alignof__(u64)))); }; +static inline bool nft_set_is_anonymous(const struct nft_set *set) +{ + return set->flags & NFT_SET_ANONYMOUS; +} + static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; -- cgit v1.2.3 From 12355d3670dac0dde5aae3deefb59f8cc0a9ed2a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 9 Dec 2017 15:36:24 +0100 Subject: netfilter: nf_tables_inet: don't use multihook infrastructure anymore Use new native NFPROTO_INET support in netfilter core, this gets rid of ad-hoc code in the nf_tables API codebase. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_ipv4.h | 2 -- include/net/netfilter/nf_tables_ipv6.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index b2deeb2755a4..ed7b511f0a59 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -53,6 +53,4 @@ static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, nft_set_pktinfo_unspec(pkt, skb); } -extern struct nft_af_info nft_af_ipv4; - #endif diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index 1890c5bc3c3c..dabe6fdb553a 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -69,6 +69,4 @@ static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, nft_set_pktinfo_unspec(pkt, skb); } -extern struct nft_af_info nft_af_ipv6; - #endif -- cgit v1.2.3 From c974a3a36468d1947c96f0c694c8a1b2e7810043 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 9 Dec 2017 15:40:25 +0100 Subject: netfilter: nf_tables: remove multihook chains and families Since NFPROTO_INET is handled from the core, we don't need to maintain extra infrastructure in nf_tables to handle the double hook registration, one for IPv4 and another for IPv6. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 169b562df226..a3560fd55f99 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -902,8 +902,6 @@ struct nft_stats { struct u64_stats_sync syncp; }; -#define NFT_HOOK_OPS_MAX 2 - /** * struct nft_base_chain - nf_tables base chain * @@ -915,7 +913,7 @@ struct nft_stats { * @dev_name: device name that this base chain is attached to (if any) */ struct nft_base_chain { - struct nf_hook_ops ops[NFT_HOOK_OPS_MAX]; + struct nf_hook_ops ops; const struct nf_chain_type *type; u8 policy; u8 flags; @@ -976,8 +974,6 @@ enum nft_af_flags { * @owner: module owner * @tables: used internally * @flags: family flags - * @nops: number of hook ops in this family - * @hook_ops_init: initialization function for chain hook ops * @hooks: hookfn overrides for packet validation */ struct nft_af_info { @@ -987,9 +983,6 @@ struct nft_af_info { struct module *owner; struct list_head tables; u32 flags; - unsigned int nops; - void (*hook_ops_init)(struct nf_hook_ops *, - unsigned int); nf_hookfn *hooks[NF_MAX_HOOKS]; }; -- cgit v1.2.3 From c2f9eafee9aaeedaad9eadbf47913f4681d723df Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 9 Dec 2017 15:43:17 +0100 Subject: netfilter: nf_tables: remove hooks from family definition They don't belong to the family definition, move them to the filter chain type definition instead. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a3560fd55f99..e040b6151acc 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -880,7 +880,7 @@ enum nft_chain_type { * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks - * @hooks: hookfn overrides + * @hooks: array of hook functions */ struct nf_chain_type { const char *name; @@ -974,7 +974,6 @@ enum nft_af_flags { * @owner: module owner * @tables: used internally * @flags: family flags - * @hooks: hookfn overrides for packet validation */ struct nft_af_info { struct list_head list; @@ -983,7 +982,6 @@ struct nft_af_info { struct module *owner; struct list_head tables; u32 flags; - nf_hookfn *hooks[NF_MAX_HOOKS]; }; int nft_register_afinfo(struct net *, struct nft_af_info *); -- cgit v1.2.3 From 625c556118f3c2fd28bb8ef6da18c53bd4037be4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 9 Dec 2017 21:01:08 +0100 Subject: netfilter: connlimit: split xt_connlimit into front and backend This allows to reuse xt_connlimit infrastructure from nf_tables. The upcoming nf_tables frontend can just pass in an nftables register as input key, this allows limiting by any nft-supported key, including concatenations. For xt_connlimit, pass in the zone and the ip/ipv6 address. With help from Yi-Hung Wei. Signed-off-by: Florian Westphal Acked-by: Yi-Hung Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/net/netfilter/nf_conntrack_count.h (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h new file mode 100644 index 000000000000..adf8db44cf86 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_count.h @@ -0,0 +1,17 @@ +#ifndef _NF_CONNTRACK_COUNT_H +#define _NF_CONNTRACK_COUNT_H + +struct nf_conncount_data; + +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, + unsigned int keylen); +void nf_conncount_destroy(struct net *net, unsigned int family, + struct nf_conncount_data *data); + +unsigned int nf_conncount_count(struct net *net, + struct nf_conncount_data *data, + const u32 *key, + unsigned int family, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); +#endif -- cgit v1.2.3 From 0befd061af59c4ba426588930f09eb9ea2475534 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 2 Jan 2018 12:50:12 +0100 Subject: netfilter: nf_tables: remove nft_dereference() This macro is unnecessary, it just hides details for one single caller. nfnl_dereference() is just enough. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e040b6151acc..e3ec02fd0f67 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1113,9 +1113,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, void nft_trace_notify(struct nft_traceinfo *info); -#define nft_dereference(p) \ - nfnl_dereference(p, NFNL_SUBSYS_NFTABLES) - #define MODULE_ALIAS_NFT_FAMILY(family) \ MODULE_ALIAS("nft-afinfo-" __stringify(family)) -- cgit v1.2.3 From 3b49e2e94e6ebb8b23d0955d9e898254455734f8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:04:07 +0100 Subject: netfilter: nf_tables: add flow table netlink frontend This patch introduces a netlink control plane to create, delete and dump flow tables. Flow tables are identified by name, this name is used from rules to refer to an specific flow table. Flow tables use the rhashtable class and a generic garbage collector to remove expired entries. This also adds the infrastructure to add different flow table types, so we can add one for each layer 3 protocol family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 23 +++++++++++++++++ include/net/netfilter/nf_tables.h | 48 +++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 include/net/netfilter/nf_flow_table.h (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h new file mode 100644 index 000000000000..3a0779589281 --- /dev/null +++ b/include/net/netfilter/nf_flow_table.h @@ -0,0 +1,23 @@ +#ifndef _NF_FLOW_TABLE_H +#define _NF_FLOW_TABLE_H + +#include + +struct nf_flowtable; + +struct nf_flowtable_type { + struct list_head list; + int family; + void (*gc)(struct work_struct *work); + const struct rhashtable_params *params; + nf_hookfn *hook; + struct module *owner; +}; + +struct nf_flowtable { + struct rhashtable rhashtable; + const struct nf_flowtable_type *type; + struct delayed_work gc_work; +}; + +#endif /* _FLOW_OFFLOAD_H */ diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e3ec02fd0f67..dd238950df81 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #define NFT_JUMP_STACK_SIZE 16 @@ -943,6 +944,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @chains: chains in the table * @sets: sets in the table * @objects: stateful objects in the table + * @flowtables: flow tables in the table * @hgenerator: handle generator state * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) @@ -954,6 +956,7 @@ struct nft_table { struct list_head chains; struct list_head sets; struct list_head objects; + struct list_head flowtables; u64 hgenerator; u32 use; u16 flags:14, @@ -1084,6 +1087,44 @@ struct nft_object_ops { int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); +/** + * struct nft_flowtable - nf_tables flow table + * + * @list: flow table list node in table list + * @table: the table the flow table is contained in + * @name: name of this flow table + * @hooknum: hook number + * @priority: hook priority + * @ops_len: number of hooks in array + * @genmask: generation mask + * @use: number of references to this flow table + * @data: rhashtable and garbage collector + * @ops: array of hooks + */ +struct nft_flowtable { + struct list_head list; + struct nft_table *table; + char *name; + int hooknum; + int priority; + int ops_len; + u32 genmask:2, + use:30; + /* runtime data below here */ + struct nf_hook_ops *ops ____cacheline_aligned; + struct nf_flowtable data; +}; + +struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask); +void nft_flow_table_iterate(struct net *net, + void (*iter)(struct nf_flowtable *flowtable, void *data), + void *data); + +void nft_register_flowtable_type(struct nf_flowtable_type *type); +void nft_unregister_flowtable_type(struct nf_flowtable_type *type); + /** * struct nft_traceinfo - nft tracing information and state * @@ -1317,4 +1358,11 @@ struct nft_trans_obj { #define nft_trans_obj(trans) \ (((struct nft_trans_obj *)trans->data)->obj) +struct nft_trans_flowtable { + struct nft_flowtable *flowtable; +}; + +#define nft_trans_flowtable(trans) \ + (((struct nft_trans_flowtable *)trans->data)->flowtable) + #endif /* _NET_NF_TABLES_H */ -- cgit v1.2.3 From ac2a66665e231847cab11b8c8e844ce43207dd2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:04:11 +0100 Subject: netfilter: add generic flow table infrastructure This patch defines the API to interact with flow tables, this allows to add, delete and lookup for entries in the flow table. This also adds the generic garbage code that removes entries that have expired, ie. no traffic has been seen for a while. Users of the flow table infrastructure can delete entries via flow_offload_dead(), which sets the dying bit, this signals the garbage collector to release an entry from user context. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 94 +++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 3a0779589281..161f71ca78a0 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -1,7 +1,12 @@ #ifndef _NF_FLOW_TABLE_H #define _NF_FLOW_TABLE_H +#include +#include +#include #include +#include +#include struct nf_flowtable; @@ -20,4 +25,93 @@ struct nf_flowtable { struct delayed_work gc_work; }; +enum flow_offload_tuple_dir { + FLOW_OFFLOAD_DIR_ORIGINAL, + FLOW_OFFLOAD_DIR_REPLY, + __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY, +}; +#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1) + +struct flow_offload_tuple { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + struct { + __be16 src_port; + __be16 dst_port; + }; + + int iifidx; + + u8 l3proto; + u8 l4proto; + u8 dir; + + int oifidx; + + struct dst_entry *dst_cache; +}; + +struct flow_offload_tuple_rhash { + struct rhash_head node; + struct flow_offload_tuple tuple; +}; + +#define FLOW_OFFLOAD_SNAT 0x1 +#define FLOW_OFFLOAD_DNAT 0x2 +#define FLOW_OFFLOAD_DYING 0x4 + +struct flow_offload { + struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX]; + u32 flags; + union { + /* Your private driver data here. */ + u32 timeout; + }; +}; + +#define NF_FLOW_TIMEOUT (30 * HZ) + +struct nf_flow_route { + struct { + struct dst_entry *dst; + int ifindex; + } tuple[FLOW_OFFLOAD_DIR_MAX]; +}; + +struct flow_offload *flow_offload_alloc(struct nf_conn *ct, + struct nf_flow_route *route); +void flow_offload_free(struct flow_offload *flow); + +int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); +void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow); +struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple); +int nf_flow_table_iterate(struct nf_flowtable *flow_table, + void (*iter)(struct flow_offload *flow, void *data), + void *data); +void nf_flow_offload_work_gc(struct work_struct *work); +extern const struct rhashtable_params nf_flow_offload_rhash_params; + +void flow_offload_dead(struct flow_offload *flow); + +int nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); +int nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); + +struct flow_ports { + __be16 source, dest; +}; + +#define MODULE_ALIAS_NF_FLOWTABLE(family) \ + MODULE_ALIAS("nf-flowtable-" __stringify(family)) + #endif /* _FLOW_OFFLOAD_H */ -- cgit v1.2.3 From 0995210753a26c4fa1a3d8c63cc230e22a8537cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:04:19 +0100 Subject: netfilter: flow table support for IPv6 This patch adds the IPv6 flow table type, that implements the datapath flow table to forward IPv6 traffic. This patch exports ip6_dst_mtu_forward() that is required to check for mtu to pass up packets that need PMTUD handling to the classic forwarding path. Signed-off-by: Pablo Neira Ayuso --- include/net/ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 25be4715578c..9dc1230d789c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -969,6 +969,8 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } +unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst); + int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, -- cgit v1.2.3 From 7c23b629a8085b11daccd68c62b5116ff498f84a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 7 Jan 2018 01:04:22 +0100 Subject: netfilter: flow table support for the mixed IPv4/IPv6 family This patch adds the IPv6 flow table type, that implements the datapath flow table to forward IPv6 traffic. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 161f71ca78a0..b22b22082733 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -111,6 +111,11 @@ struct flow_ports { __be16 source, dest; }; +unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); +unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state); + #define MODULE_ALIAS_NF_FLOWTABLE(family) \ MODULE_ALIAS("nf-flowtable-" __stringify(family)) -- cgit v1.2.3 From 54dc3e3324829d346c959ff774626d9c6c9a65b5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 4 Jan 2018 14:03:54 -0800 Subject: net: ipv6: Allow connect to linklocal address from socket bound to vrf Allow a process bound to a VRF to connect to a linklocal address. Currently, this fails because of a mismatch between the scope of the linklocal address and the sk_bound_dev_if inherited by the VRF binding: $ ssh -6 fe80::70b8:cff:fedd:ead8%eth1 ssh: connect to host fe80::70b8:cff:fedd:ead8%eth1 port 22: Invalid argument Relax the scope check to allow the socket to be bound to the same L3 device as the scope id. This makes ipv6 linklocal consistent with other relaxed checks enabled by commits 1ff23beebdd3 ("net: l3mdev: Allow send on enslaved interface") and 7bb387c5ab12a ("net: Allow IP_MULTICAST_IF to set index to L3 slave"). Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/sock.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 66fd3951e6f3..73b7830b0bb8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -72,6 +72,7 @@ #include #include #include +#include /* * This structure really needs to be cleaned up. @@ -2399,4 +2400,23 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val) sk->sk_pacing_shift = val; } +/* if a socket is bound to a device, check that the given device + * index is either the same or that the socket is bound to an L3 + * master device and the given device index is also enslaved to + * that L3 master + */ +static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) +{ + int mdif; + + if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif) + return true; + + mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); + if (mdif && mdif == sk->sk_bound_dev_if) + return true; + + return false; +} + #endif /* _SOCK_H */ -- cgit v1.2.3 From b6c5734db07079c9410147b32407f2366d584e6c Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 5 Jan 2018 11:17:18 -0200 Subject: sctp: fix the handling of ICMP Frag Needed for too small MTUs syzbot reported a hang involving SCTP, on which it kept flooding dmesg with the message: [ 246.742374] sctp: sctp_transport_update_pmtu: Reported pmtu 508 too low, using default minimum of 512 That happened because whenever SCTP hits an ICMP Frag Needed, it tries to adjust to the new MTU and triggers an immediate retransmission. But it didn't consider the fact that MTUs smaller than the SCTP minimum MTU allowed (512) would not cause the PMTU to change, and issued the retransmission anyway (thus leading to another ICMP Frag Needed, and so on). As IPv4 (ip_rt_min_pmtu=556) and IPv6 (IPV6_MIN_MTU=1280) minimum MTU are higher than that, sctp_transport_update_pmtu() is changed to re-fetch the PMTU that got set after our request, and with that, detect if there was an actual change or not. The fix, thus, skips the immediate retransmission if the received ICMP resulted in no change, in the hope that SCTP will select another path. Note: The value being used for the minimum MTU (512, SCTP_DEFAULT_MINSEGMENT) is not right and instead it should be (576, SCTP_MIN_PMTU), but such change belongs to another patch. Changes from v1: - do not disable PMTU discovery, in the light of commit 06ad391919b2 ("[SCTP] Don't disable PMTU discovery when mtu is small") and as suggested by Xin Long. - changed the way to break the rtx loop by detecting if the icmp resulted in a change or not Changes from v2: none See-also: https://lkml.org/lkml/2017/12/22/811 Reported-by: syzbot Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 2f8f93da5dc2..9a5ccf03a59b 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -966,7 +966,7 @@ void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); void sctp_transport_reset(struct sctp_transport *t); -void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); +bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); -- cgit v1.2.3 From fe19c04ca13737a48277fad28d912efbd72c1772 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2017 13:53:45 +0100 Subject: netfilter: nf_tables: remove nhooks field from struct nft_af_info We already validate the hook through bitmask, so this check is superfluous. When removing this, this patch is also fixing a bug in the new flowtable codebase, since ctx->afi points to the table family instead of the netdev family which is where the flowtable is really hooked in. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index dd238950df81..536aaec96a07 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -973,7 +973,6 @@ enum nft_af_flags { * * @list: used internally * @family: address family - * @nhooks: number of hooks in this family * @owner: module owner * @tables: used internally * @flags: family flags @@ -981,7 +980,6 @@ enum nft_af_flags { struct nft_af_info { struct list_head list; int family; - unsigned int nhooks; struct module *owner; struct list_head tables; u32 flags; -- cgit v1.2.3 From e7bb5c714020a2dce85b12766899f528883585ac Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Dec 2017 14:07:52 +0100 Subject: netfilter: nf_tables: remove flag field from struct nft_af_info Replace it by a direct check for the netdev protocol family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 536aaec96a07..9a85893a5e30 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -964,10 +964,6 @@ struct nft_table { char *name; }; -enum nft_af_flags { - NFT_AF_NEEDS_DEV = (1 << 0), -}; - /** * struct nft_af_info - nf_tables address family info * @@ -975,14 +971,12 @@ enum nft_af_flags { * @family: address family * @owner: module owner * @tables: used internally - * @flags: family flags */ struct nft_af_info { struct list_head list; int family; struct module *owner; struct list_head tables; - u32 flags; }; int nft_register_afinfo(struct net *, struct nft_af_info *); -- cgit v1.2.3 From 36596dadf54a920d26286cf9f421fb4ef648b51f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Jan 2018 02:38:03 +0100 Subject: netfilter: nf_tables: add single table list for all families Place all existing user defined tables in struct net *, instead of having one list per family. This saves us from one level of indentation in netlink dump functions. Place pointer to struct nft_af_info in struct nft_table temporarily, as we still need this to put back reference module reference counter on table removal. This patch comes in preparation for the removal of struct nft_af_info. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++---- include/net/netns/nftables.h | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9a85893a5e30..c55e836e6a2f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -143,22 +143,22 @@ static inline void nft_data_debug(const struct nft_data *data) * struct nft_ctx - nf_tables rule/set context * * @net: net namespace - * @afi: address family info * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number + * @family: protocol family * @report: notify via unicast netlink message */ struct nft_ctx { struct net *net; - struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; + u8 family; bool report; }; @@ -949,6 +949,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask + * @afinfo: address family info * @name: name of the table */ struct nft_table { @@ -961,6 +962,7 @@ struct nft_table { u32 use; u16 flags:14, genmask:2; + struct nft_af_info *afi; char *name; }; @@ -970,13 +972,11 @@ struct nft_table { * @list: used internally * @family: address family * @owner: module owner - * @tables: used internally */ struct nft_af_info { struct list_head list; int family; struct module *owner; - struct list_head tables; }; int nft_register_afinfo(struct net *, struct nft_af_info *); diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 4109b5f3010f..7f86a63ac21f 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -8,6 +8,7 @@ struct nft_af_info; struct netns_nftables { struct list_head af_info; + struct list_head tables; struct list_head commit_list; struct nft_af_info *ipv4; struct nft_af_info *ipv6; -- cgit v1.2.3 From dd4cbef7235154f163501ffbf396c0dadd830c9c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Jan 2018 02:42:11 +0100 Subject: netfilter: nf_tables: get rid of pernet families Now that we have a single table list for each netns, we can get rid of one pointer per family and the global afinfo list, thus, shrinking struct netns for nftables that now becomes 64 bytes smaller. And call __nft_release_afinfo() from __net_exit path accordingly to release netnamespace objects on removal. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/net/netns/nftables.h | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c55e836e6a2f..12f83d223caa 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -979,8 +979,8 @@ struct nft_af_info { struct module *owner; }; -int nft_register_afinfo(struct net *, struct nft_af_info *); -void nft_unregister_afinfo(struct net *, struct nft_af_info *); +int nft_register_afinfo(struct nft_af_info *); +void nft_unregister_afinfo(struct nft_af_info *); int nft_register_chain_type(const struct nf_chain_type *); void nft_unregister_chain_type(const struct nf_chain_type *); diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 7f86a63ac21f..48134353411d 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -7,15 +7,8 @@ struct nft_af_info; struct netns_nftables { - struct list_head af_info; struct list_head tables; struct list_head commit_list; - struct nft_af_info *ipv4; - struct nft_af_info *ipv6; - struct nft_af_info *inet; - struct nft_af_info *arp; - struct nft_af_info *bridge; - struct nft_af_info *netdev; unsigned int base_seq; u8 gencursor; }; -- cgit v1.2.3 From 98319cb9089844d76e65a6cce5bfbd165e698735 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Jan 2018 02:48:47 +0100 Subject: netfilter: nf_tables: get rid of struct nft_af_info abstraction Remove the infrastructure to register/unregister nft_af_info structure, this structure stores no useful information anymore. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 12f83d223caa..4aca413367ee 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -960,28 +960,12 @@ struct nft_table { struct list_head flowtables; u64 hgenerator; u32 use; - u16 flags:14, + u16 family:6, + flags:8, genmask:2; - struct nft_af_info *afi; char *name; }; -/** - * struct nft_af_info - nf_tables address family info - * - * @list: used internally - * @family: address family - * @owner: module owner - */ -struct nft_af_info { - struct list_head list; - int family; - struct module *owner; -}; - -int nft_register_afinfo(struct nft_af_info *); -void nft_unregister_afinfo(struct nft_af_info *); - int nft_register_chain_type(const struct nf_chain_type *); void nft_unregister_chain_type(const struct nf_chain_type *); @@ -1146,9 +1130,6 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, void nft_trace_notify(struct nft_traceinfo *info); -#define MODULE_ALIAS_NFT_FAMILY(family) \ - MODULE_ALIAS("nft-afinfo-" __stringify(family)) - #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) -- cgit v1.2.3 From d7dedee184e775f77d321cfa1c660a7680cf6588 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:25 +0200 Subject: ipv6: Calculate hash thresholds for IPv6 nexthops Before we convert IPv6 to use hash-threshold instead of modulo-N, we first need each nexthop to store its region boundary in the hash function's output space. The boundary is calculated by dividing the output space equally between the different active nexthops. That is, nexthops that are not dead or linkdown. The boundaries are rebalanced whenever a nexthop is added or removed to a multipath route and whenever a nexthop becomes active or inactive. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/ip6_route.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ddf53dd1e948..97cd05d87780 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -149,6 +149,7 @@ struct rt6_info { */ struct list_head rt6i_siblings; unsigned int rt6i_nsiblings; + atomic_t rt6i_nh_upper_bound; atomic_t rt6i_ref; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 34cd3b0c6ded..27d23a65f3cd 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, @@ -171,6 +177,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); +void rt6_multipath_rebalance(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { -- cgit v1.2.3 From 398958ae48f44bb036d0fa9829cd489270bf1fc2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 9 Jan 2018 16:40:28 +0200 Subject: ipv6: Add support for non-equal-cost multipath The use of hash-threshold instead of modulo-N makes it trivial to add support for non-equal-cost multipath. Instead of dividing the multipath hash function's output space equally between the nexthops, each nexthop is assigned a region size which is proportional to its weight. Signed-off-by: Ido Schimmel Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 97cd05d87780..34ec321d6a03 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -171,6 +171,7 @@ struct rt6_info { u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ + int rt6i_nh_weight; unsigned short rt6i_nfheader_len; u8 rt6i_protocol; u8 exception_bucket_flushed:1, -- cgit v1.2.3 From f34b4aac46b2c7b76d3313c94efe02a64ac8f24a Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 10 Jan 2018 14:59:58 +0100 Subject: net: sch: red: Change the name of the stats struct to be generic Change the name of the stats struct to be generic, so it could be used for other qdisc offload, that will be added in the next patches. Signed-off-by: Nogah Frankel Reviewed-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c4f4e46ea8d6..0d1343cba84c 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -731,6 +731,11 @@ struct tc_cookie { u32 len; }; +struct tc_qopt_offload_stats { + struct gnet_stats_basic_packed *bstats; + struct gnet_stats_queue *qstats; +}; + enum tc_red_command { TC_RED_REPLACE, TC_RED_DESTROY, @@ -744,10 +749,6 @@ struct tc_red_qopt_offload_params { u32 probability; bool is_ecn; }; -struct tc_red_qopt_offload_stats { - struct gnet_stats_basic_packed *bstats; - struct gnet_stats_queue *qstats; -}; struct tc_red_qopt_offload { enum tc_red_command command; @@ -755,7 +756,7 @@ struct tc_red_qopt_offload { u32 parent; union { struct tc_red_qopt_offload_params set; - struct tc_red_qopt_offload_stats stats; + struct tc_qopt_offload_stats stats; struct red_stats *xstats; }; }; -- cgit v1.2.3 From 7fdb61b44c0c95d00f6c856d9fb61a9f647bc85f Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Sun, 14 Jan 2018 12:33:15 +0100 Subject: net: sch: prio: Add offload ability to PRIO qdisc Add the ability to offload PRIO qdisc by using ndo_setup_tc. There are three commands for PRIO offloading: * TC_PRIO_REPLACE: handles set and tune * TC_PRIO_DESTROY: handles qdisc destroy * TC_PRIO_STATS: updates the qdiscs counters (given as reference) Like RED qdisc, the indication of whether PRIO is being offloaded is being set and updated as part of the dump function. It is so because the driver could decide to offload or not based on the qdisc parent, which could change without notifying the qdisc. Signed-off-by: Nogah Frankel Reviewed-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 0d1343cba84c..9c341f003091 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -761,4 +761,29 @@ struct tc_red_qopt_offload { }; }; +enum tc_prio_command { + TC_PRIO_REPLACE, + TC_PRIO_DESTROY, + TC_PRIO_STATS, +}; + +struct tc_prio_qopt_offload_params { + int bands; + u8 priomap[TC_PRIO_MAX + 1]; + /* In case that a prio qdisc is offloaded and now is changed to a + * non-offloadedable config, it needs to update the backlog & qlen + * values to negate the HW backlog & qlen values (and only them). + */ + struct gnet_stats_queue *qstats; +}; + +struct tc_prio_qopt_offload { + enum tc_prio_command command; + u32 handle; + u32 parent; + union { + struct tc_prio_qopt_offload_params replace_params; + struct tc_qopt_offload_stats stats; + }; +}; #endif -- cgit v1.2.3 From 51a1aaa631c90223888d8beac4d649dc11d2ca55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Jan 2018 09:32:36 +0100 Subject: mac80211_hwsim: validate number of different channels When creating a new radio on the fly, hwsim allows this to be done with an arbitrary number of channels, but cfg80211 only supports a limited number of simultaneous channels, leading to a warning. Fix this by validating the number - this requires moving the define for the maximum out to a visible header file. Reported-by: syzbot+8dd9051ff19940290931@syzkaller.appspotmail.com Fixes: b59ec8dd4394 ("mac80211_hwsim: fix number of channels in interface combinations") Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb4d92b79cd9..fb94a8bd8ab5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -815,6 +815,8 @@ struct cfg80211_csa_settings { u8 count; }; +#define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 + /** * struct iface_combination_params - input parameters for interface combinations * -- cgit v1.2.3 From 30be8f8dba1bd2aff73e8447d59228471233a3d4 Mon Sep 17 00:00:00 2001 From: "r.hering@avm.de" Date: Fri, 12 Jan 2018 15:42:06 +0100 Subject: net/tls: Fix inverted error codes to avoid endless loop sendfile() calls can hang endless with using Kernel TLS if a socket error occurs. Socket error codes must be inverted by Kernel TLS before returning because they are stored with positive sign. If returned non-inverted they are interpreted as number of bytes sent, causing endless looping of the splice mechanic behind sendfile(). Signed-off-by: Robert Hering Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 936cfc5cab7d..9185e53a743c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -170,7 +170,7 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) static inline void tls_err_abort(struct sock *sk) { - sk->sk_err = -EBADMSG; + sk->sk_err = EBADMSG; sk->sk_error_report(sk); } -- cgit v1.2.3 From 273c28bc57ca9672f7b70bed764ecdfb964930c8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 Jan 2018 18:28:31 +0300 Subject: net: Convert atomic_t net::count to refcount_t Since net could be obtained from RCU lists, and there is a race with net destruction, the patch converts net::count to refcount_t. This provides sanity checks for the cases of incrementing counter of already dead net, when maybe_get_net() has to used instead of get_net(). Drivers: allyesconfig and allmodconfig are OK. Suggested-by: Eric Dumazet Signed-off-by: Kirill Tkhai Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_namespace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 10f99dafd5ac..f8a84a2c2341 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -51,7 +51,7 @@ struct net { refcount_t passive; /* To decided when the network * namespace should be freed. */ - atomic_t count; /* To decided when the network + refcount_t count; /* To decided when the network * namespace should be shut down. */ spinlock_t rules_mod_lock; @@ -195,7 +195,7 @@ void __put_net(struct net *net); static inline struct net *get_net(struct net *net) { - atomic_inc(&net->count); + refcount_inc(&net->count); return net; } @@ -206,14 +206,14 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!atomic_inc_not_zero(&net->count)) + if (!refcount_inc_not_zero(&net->count)) net = NULL; return net; } static inline void put_net(struct net *net) { - if (atomic_dec_and_test(&net->count)) + if (refcount_dec_and_test(&net->count)) __put_net(net); } -- cgit v1.2.3 From cd9ff4de0107c65d69d02253bb25d6db93c3dbc1 Mon Sep 17 00:00:00 2001 From: Jim Westfall Date: Sun, 14 Jan 2018 04:18:51 -0800 Subject: ipv4: Make neigh lookup keys for loopback/point-to-point devices be INADDR_ANY Map all lookup neigh keys to INADDR_ANY for loopback/point-to-point devices to avoid making an entry for every remote ip the device needs to talk to. This used the be the old behavior but became broken in a263b3093641f (ipv4: Make neigh lookups directly in output packet path) and later removed in 0bb4087cbec0 (ipv4: Fix neigh lookup keying over loopback/point-to-point devices) because it was broken. Signed-off-by: Jim Westfall Signed-off-by: David S. Miller --- include/net/arp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/arp.h b/include/net/arp.h index dc8cd47f883b..977aabfcdc03 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -20,6 +20,9 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) + key = INADDR_ANY; + return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } -- cgit v1.2.3 From 30c2c9f158f6c9cef41e916d1c7c11097df4befb Mon Sep 17 00:00:00 2001 From: David Windsor Date: Sat, 10 Jun 2017 22:50:42 -0400 Subject: net: Define usercopy region in struct proto slab cache In support of usercopy hardening, this patch defines a region in the struct proto slab cache in which userspace copy operations are allowed. Some protocols need to copy objects to/from userspace, and they can declare the region via their proto structure with the new usersize and useroffset fields. Initially, if no region is specified (usersize == 0), the entire field is marked as whitelisted. This allows protocols to be whitelisted in subsequent patches. Once all protocols have been annotated, the full-whitelist default can be removed. This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor [kees: adjust commit log, split off per-proto patches] [kees: add logic for by-default full-whitelist] Cc: "David S. Miller" Cc: Eric Dumazet Cc: Paolo Abeni Cc: David Howells Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 79e1a2c7912c..b77a710ee831 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1112,6 +1112,8 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; slab_flags_t slab_flags; + size_t useroffset; /* Usercopy region offset */ + size_t usersize; /* Usercopy region size */ struct percpu_counter *orphan_count; -- cgit v1.2.3 From ab9ee8e38b292f9a6698a4fedbb6ff8d08ce2012 Mon Sep 17 00:00:00 2001 From: David Windsor Date: Thu, 24 Aug 2017 16:57:57 -0700 Subject: sctp: Define usercopy region in SCTP proto slab cache The SCTP socket event notification subscription information need to be copied to/from userspace. In support of usercopy hardening, this patch defines a region in the struct proto slab cache in which userspace copy operations are allowed. Additionally moves the usercopy fields to be adjacent for the region to cover both. example usage trace: net/sctp/socket.c: sctp_getsockopt_events(...): ... copy_to_user(..., &sctp_sk(sk)->subscribe, len) sctp_setsockopt_events(...): ... copy_from_user(&sctp_sk(sk)->subscribe, ..., optlen) sctp_getsockopt_initmsg(...): ... copy_to_user(..., &sctp_sk(sk)->initmsg, len) This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor [kees: split from network patch, move struct members adjacent] [kees: add SCTPv6 struct whitelist, provide usage trace] Cc: Vlad Yasevich Cc: Neil Horman Cc: "David S. Miller" Cc: linux-sctp@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- include/net/sctp/structs.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 16f949eef52f..6168e3449131 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -202,12 +202,17 @@ struct sctp_sock { /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; - struct sctp_initmsg initmsg; struct sctp_rtoinfo rtoinfo; struct sctp_paddrparams paddrparam; - struct sctp_event_subscribe subscribe; struct sctp_assocparams assocparams; + /* + * These two structures must be grouped together for the usercopy + * whitelist region. + */ + struct sctp_event_subscribe subscribe; + struct sctp_initmsg initmsg; + int user_frag; __u32 autoclose; -- cgit v1.2.3 From 2406e7e546b223e8cf42c44ac7352d4d1fd1dbcd Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:02 +0100 Subject: devlink: Add per devlink instance lock This is a preparation before introducing resources and hot reload support. Currently there are two global lock where one protects all devlink access, and the second one protects devlink port access. This patch adds per devlink instance lock which protects the internal members which are the sb/dpipe/ resource/ports. By introducing this lock the global devlink port lock can be discarded. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index b9654e133599..4d2c6fc94837 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -30,6 +30,7 @@ struct devlink { const struct devlink_ops *ops; struct device *dev; possible_net_t _net; + struct mutex lock; char priv[0] __aligned(NETDEV_ALIGN); }; -- cgit v1.2.3 From d9f9b9a4d05fab693fd23a9ecaa330e03ebe2c31 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:03 +0100 Subject: devlink: Add support for resource abstraction Add support for hardware resource abstraction over devlink. Each resource is identified via id, furthermore it contains information regarding its size and its related sub resources. Each resource can also provide its current occupancy. In some cases the sizes of some resources can be changed, yet for those changes to take place a hot driver reload may be needed. The reload capability will be introduced in the next patch. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4d2c6fc94837..ceb1895d119b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -26,6 +26,7 @@ struct devlink { struct list_head port_list; struct list_head sb_list; struct list_head dpipe_table_list; + struct list_head resource_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -224,6 +225,61 @@ struct devlink_dpipe_headers { unsigned int headers_count; }; +/** + * struct devlink_resource_ops - resource ops + * @occ_get: get the occupied size + * @size_validate: validate the size of the resource before update, reload + * is needed for changes to take place + */ +struct devlink_resource_ops { + u64 (*occ_get)(struct devlink *devlink); + int (*size_validate)(struct devlink *devlink, u64 size, + struct netlink_ext_ack *extack); +}; + +/** + * struct devlink_resource_size_params - resource's size parameters + * @size_min: minimum size which can be set + * @size_max: maximum size which can be set + * @size_granularity: size granularity + * @size_unit: resource's basic unit + */ +struct devlink_resource_size_params { + u64 size_min; + u64 size_max; + u64 size_granularity; + enum devlink_resource_unit unit; +}; + +/** + * struct devlink_resource - devlink resource + * @name: name of the resource + * @id: id, per devlink instance + * @size: size of the resource + * @size_new: updated size of the resource, reload is needed + * @size_valid: valid in case the total size of the resource is valid + * including its children + * @parent: parent resource + * @size_params: size parameters + * @list: parent list + * @resource_list: list of child resources + * @resource_ops: resource ops + */ +struct devlink_resource { + const char *name; + u64 id; + u64 size; + u64 size_new; + bool size_valid; + struct devlink_resource *parent; + struct devlink_resource_size_params *size_params; + struct list_head list; + struct list_head resource_list; + const struct devlink_resource_ops *resource_ops; +}; + +#define DEVLINK_RESOURCE_ID_PARENT_TOP 0 + struct devlink_ops { int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); @@ -333,6 +389,20 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; extern struct devlink_dpipe_header devlink_dpipe_header_ipv6; +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops); +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource); +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size); + #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -469,6 +539,32 @@ devlink_dpipe_match_put(struct sk_buff *skb, return 0; } +static inline int +devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + return 0; +} + +static inline void +devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ +} + +static inline int +devlink_resource_size_get(struct devlink *devlink, u64 resource_id, + u64 *p_resource_size) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 2d8dc5bbf4e7603747875eb5cadcd67c1fa8b1bb Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:04 +0100 Subject: devlink: Add support for reload Add support for performing driver hot reload. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index ceb1895d119b..c698883fb0bb 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -281,6 +281,7 @@ struct devlink_resource { #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 struct devlink_ops { + int (*reload)(struct devlink *devlink); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, -- cgit v1.2.3 From 56dc7cd0a87a1ff4f49ee1e67bd88e768385d51a Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 15 Jan 2018 08:59:05 +0100 Subject: devlink: Add relation between dpipe and resource The hardware processes which are modeled via dpipe commonly use some internal hardware resources. Such relation can improve the understanding of hardware limitations. The number of resource's unit consumed per table's entry are also provided for each table. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index c698883fb0bb..6545b03e97f7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -183,6 +183,9 @@ struct devlink_dpipe_table_ops; * @counters_enabled: indicates if counters are active * @counter_control_extern: indicates if counter control is in dpipe or * external tool + * @resource_valid: Indicate that the resource id is valid + * @resource_id: relative resource this table is related to + * @resource_units: number of resource's unit consumed per table's entry * @table_ops: table operations * @rcu: rcu */ @@ -192,6 +195,9 @@ struct devlink_dpipe_table { const char *name; bool counters_enabled; bool counter_control_extern; + bool resource_valid; + u64 resource_id; + u64 resource_units; struct devlink_dpipe_table_ops *table_ops; struct rcu_head rcu; }; @@ -403,6 +409,9 @@ void devlink_resources_unregister(struct devlink *devlink, int devlink_resource_size_get(struct devlink *devlink, u64 resource_id, u64 *p_resource_size); +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units); #else @@ -566,6 +575,14 @@ devlink_resource_size_get(struct devlink *devlink, u64 resource_id, return -EOPNOTSUPP; } +static inline int +devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 868717ae73c5ad297b91fb52db1396f2336609a8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 15 Jan 2018 12:13:32 -0800 Subject: net: remove prototype of qdisc_lookup_class() Looks like qdisc_lookup_class() never existed in the tree in the git era. Remove the prototype from the header. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index e2c75f52557b..815b92a23936 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -100,7 +100,6 @@ int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); -struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 81d947e2b8dd2394586c3eaffdd2357797d3bf59 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 15 Jan 2018 23:12:09 +0100 Subject: net, sched: fix panic when updating miniq {b,q}stats While working on fixing another bug, I ran into the following panic on arm64 by simply attaching clsact qdisc, adding a filter and running traffic on ingress to it: [...] [ 178.188591] Unable to handle kernel read from unreadable memory at virtual address 810fb501f000 [ 178.197314] Mem abort info: [ 178.200121] ESR = 0x96000004 [ 178.203168] Exception class = DABT (current EL), IL = 32 bits [ 178.209095] SET = 0, FnV = 0 [ 178.212157] EA = 0, S1PTW = 0 [ 178.215288] Data abort info: [ 178.218175] ISV = 0, ISS = 0x00000004 [ 178.222019] CM = 0, WnR = 0 [ 178.224997] user pgtable: 4k pages, 48-bit VAs, pgd = 0000000023cb3f33 [ 178.231531] [0000810fb501f000] *pgd=0000000000000000 [ 178.236508] Internal error: Oops: 96000004 [#1] SMP [...] [ 178.311855] CPU: 73 PID: 2497 Comm: ping Tainted: G W 4.15.0-rc7+ #5 [ 178.319413] Hardware name: FOXCONN R2-1221R-A4/C2U4N_MB, BIOS G31FB18A 03/31/2017 [ 178.326887] pstate: 60400005 (nZCv daif +PAN -UAO) [ 178.331685] pc : __netif_receive_skb_core+0x49c/0xac8 [ 178.336728] lr : __netif_receive_skb+0x28/0x78 [ 178.341161] sp : ffff00002344b750 [ 178.344465] x29: ffff00002344b750 x28: ffff810fbdfd0580 [ 178.349769] x27: 0000000000000000 x26: ffff000009378000 [...] [ 178.418715] x1 : 0000000000000054 x0 : 0000000000000000 [ 178.424020] Process ping (pid: 2497, stack limit = 0x000000009f0a3ff4) [ 178.430537] Call trace: [ 178.432976] __netif_receive_skb_core+0x49c/0xac8 [ 178.437670] __netif_receive_skb+0x28/0x78 [ 178.441757] process_backlog+0x9c/0x160 [ 178.445584] net_rx_action+0x2f8/0x3f0 [...] Reason is that sch_ingress and sch_clsact are doing mini_qdisc_pair_init() which sets up miniq pointers to cpu_{b,q}stats from the underlying qdisc. Problem is that this cannot work since they are actually set up right after the qdisc ->init() callback in qdisc_create(), so first packet going into sch_handle_ingress() tries to call mini_qdisc_bstats_cpu_update() and we therefore panic. In order to fix this, allocation of {b,q}stats needs to happen before we call into ->init(). In net-next, there's already such option through commit d59f5ffa59d8 ("net: sched: a dflt qdisc may be used with per cpu stats"). However, the bug needs to be fixed in net still for 4.15. Thus, include these bits to reduce any merge churn and reuse the static_flags field to set TCQ_F_CPUSTATS, and remove the allocation from qdisc_create() since there is no other user left. Prashant Bhole ran into the same issue but for net-next, thus adding him below as well as co-author. Same issue was also reported by Sandipan Das when using bcc. Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath") Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2018-January/001190.html Reported-by: Sandipan Das Co-authored-by: Prashant Bhole Co-authored-by: John Fastabend Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 83a3e47d5845..becf86aa4ac6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -179,6 +179,7 @@ struct Qdisc_ops { const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; + unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -444,6 +445,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n, unsigned int len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops); +void qdisc_free(struct Qdisc *qdisc); struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, u32 parentid); void __qdisc_calculate_pkt_len(struct sk_buff *skb, -- cgit v1.2.3 From 416ef9b15c688b91edbf654ebe7bc349c9151147 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 14 Jan 2018 20:01:26 -0800 Subject: net: sched: red: don't reset the backlog on every stat dump Commit 0dfb33a0d7e2 ("sch_red: report backlog information") copied child's backlog into RED's backlog. Back then RED did not maintain its own backlog counts. This has changed after commit 2ccccf5fb43f ("net_sched: update hierarchical backlog too") and commit d7f4f332f082 ("sch_red: update backlog as well"). Copying is no longer necessary. Tested: $ tc -s qdisc show dev veth0 qdisc red 1: root refcnt 2 limit 400000b min 30000b max 30000b ecn Sent 20942 bytes 221 pkt (dropped 0, overlimits 0 requeues 0) backlog 1260b 14p requeues 14 marked 0 early 0 pdrop 0 other 0 qdisc tbf 2: parent 1: rate 1Kbit burst 15000b lat 3585.0s Sent 20942 bytes 221 pkt (dropped 0, overlimits 138 requeues 0) backlog 1260b 14p requeues 14 Recently RED offload was added. We need to make sure drivers don't depend on resetting the stats. This means backlog should be treated like any other statistic: total_stat = new_hw_stat - prev_hw_stat; Adjust mlxsw. Signed-off-by: Jakub Kicinski Acked-by: Nogah Frankel Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 9c341f003091..cc23c041a6d7 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -748,6 +748,7 @@ struct tc_red_qopt_offload_params { u32 max; u32 probability; bool is_ecn; + struct gnet_stats_queue *qstats; }; struct tc_red_qopt_offload { -- cgit v1.2.3 From a9b19443edbaac97c5c094f3cc903c1f1548b3f5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:45 +0100 Subject: net: sched: introduce support for multiple filter chain pointers registration So far, there was possible only to register a single filter chain pointer to block->chain[0]. However, when the blocks will get shareable, we need to allow multiple filter chain pointers registration. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bd9125b0481f..17d8cfd0efda 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -275,8 +275,7 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { struct tcf_proto __rcu *filter_chain; - tcf_chain_head_change_t *chain_head_change; - void *chain_head_change_priv; + struct list_head filter_chain_list; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ -- cgit v1.2.3 From 4861738775d70e0165d04fe014f32b41bcb5414a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:46 +0100 Subject: net: sched: introduce shared filter blocks infrastructure Allow qdiscs to share filter blocks among them. Each qdisc type has to use block get/put extended modifications that enable sharing. Shared blocks are tracked within each net namespace and identified by u32 index. This index is passed from user during the qdisc creation. If user passes index that is not used by any other qdisc, new block is created. If user passes index that is already used, the existing block will be re-used. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 7 +++++++ include/net/sch_generic.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index cc23c041a6d7..c40d60e6a883 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -29,6 +29,7 @@ struct tcf_block_ext_info { enum tcf_block_binder_type binder_type; tcf_chain_head_change_t *chain_head_change; void *chain_head_change_priv; + u32 block_index; }; struct tcf_block_cb; @@ -48,8 +49,14 @@ void tcf_block_put(struct tcf_block *block); void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, struct tcf_block_ext_info *ei); +static inline bool tcf_block_shared(struct tcf_block *block) +{ + return block->index; +} + static inline struct Qdisc *tcf_block_q(struct tcf_block *block) { + WARN_ON(tcf_block_shared(block)); return block->q; } diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 17d8cfd0efda..cc0c1e4711dc 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -284,6 +284,8 @@ struct tcf_chain { struct tcf_block { struct list_head chain_list; + u32 index; /* block index for shared blocks */ + unsigned int refcnt; struct net *net; struct Qdisc *q; struct list_head cb_list; -- cgit v1.2.3 From f36fe1c498c8959812415c57b683abaa4527dec5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:48 +0100 Subject: net: sched: introduce block mechanism to handle netif_keep_dst calls Couple of classifiers call netif_keep_dst directly on q->dev. That is not possible to do directly for shared blocke where multiple qdiscs are owning the block. So introduce a infrastructure to keep track of the block owners in list and use this list to implement block variant of netif_keep_dst. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + include/net/sch_generic.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c40d60e6a883..789d818c4a61 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,6 +39,7 @@ bool tcf_queue_work(struct work_struct *work); struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); void tcf_chain_put(struct tcf_chain *chain); +void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, struct netlink_ext_ack *extack); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cc0c1e4711dc..f655e66ce742 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -289,6 +289,8 @@ struct tcf_block { struct net *net; struct Qdisc *q; struct list_head cb_list; + struct list_head owner_list; + bool keep_dst; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) -- cgit v1.2.3 From edf6711c9840fd92e0047f98c411c94114168f19 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:49 +0100 Subject: net: sched: remove classid and q fields from tcf_proto Both are no longer used, so remove them. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f655e66ce742..54b9a1ca26bd 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -255,8 +255,6 @@ struct tcf_proto { /* All the rest */ u32 prio; - u32 classid; - struct Qdisc *q; void *data; const struct tcf_proto_ops *ops; struct tcf_chain *chain; -- cgit v1.2.3 From caa7260156eb3a1496348a2c69fa68e85183d5d7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:50 +0100 Subject: net: sched: keep track of offloaded filters and check tc offload feature During block bind, we need to check tc offload feature. If it is disabled yet still the block contains offloaded filters, forbid the bind. Also forbid to register callback for a block that already contains offloaded filters, as the play back is not supported now. For keeping track of offloaded filters there is a new counter introduced, alongside with couple of helpers called from cls_* code. These helpers set and clear TCA_CLS_FLAGS_IN_HW flag. Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/sch_generic.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 54b9a1ca26bd..bf5cc0a1d0f6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -289,8 +289,26 @@ struct tcf_block { struct list_head cb_list; struct list_head owner_list; bool keep_dst; + unsigned int offloadcnt; /* Number of oddloaded filters */ + unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ }; +static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) +{ + if (*flags & TCA_CLS_FLAGS_IN_HW) + return; + *flags |= TCA_CLS_FLAGS_IN_HW; + block->offloadcnt++; +} + +static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) +{ + if (!(*flags & TCA_CLS_FLAGS_IN_HW)) + return; + *flags &= ~TCA_CLS_FLAGS_IN_HW; + block->offloadcnt--; +} + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; -- cgit v1.2.3 From d47a6b0e7c492a4ba4524d557db388e34fd0a47a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 17 Jan 2018 11:46:52 +0100 Subject: net: sched: introduce ingress/egress block index attributes for qdisc Introduce two new attributes to be used for qdisc creation and dumping. One for ingress block, one for egress block. Introduce a set of ops that qdisc which supports block sharing would implement. Passing block indexes in qdisc change is not supported yet and it is checked and forbidded. In future, these attributes are to be reused for specifying block indexes for classes as well. As of this moment however, it is not supported so a check is in place to forbid it. Suggested-by: Roopa Prabhu Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Acked-by: David Ahern Signed-off-by: David S. Miller --- include/net/sch_generic.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index bf5cc0a1d0f6..cfc19d0ba2ad 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -204,6 +204,13 @@ struct Qdisc_ops { int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); + void (*ingress_block_set)(struct Qdisc *sch, + u32 block_index); + void (*egress_block_set)(struct Qdisc *sch, + u32 block_index); + u32 (*ingress_block_get)(struct Qdisc *sch); + u32 (*egress_block_get)(struct Qdisc *sch); + struct module *owner; }; -- cgit v1.2.3 From 5ef7e0ba1039d65c4eefde92056769548b214273 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 16 Jan 2018 15:03:32 +0000 Subject: vxlan: Fix trailing semicolon The trailing semicolon is an empty statement that does no operation. It is completely stripped out by the compiler. Removing it since it doesn't do anything. Fixes: 5f35227ea34b ("net: Generalize ndo_gso_check to ndo_features_check") Signed-off-by: Luis de Bethencourt Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/vxlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index f96391e84a8a..ad73d8b3fcc2 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -301,7 +301,7 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, l4_hdr = ipv6_hdr(skb)->nexthdr; break; default: - return features;; + return features; } if ((l4_hdr == IPPROTO_UDP) && -- cgit v1.2.3 From 50bd870a9e5cca9fcf5fb4c130c373643d7d9906 Mon Sep 17 00:00:00 2001 From: Yossef Efraim Date: Sun, 14 Jan 2018 11:39:10 +0200 Subject: xfrm: Add ESN support for IPSec HW offload This patch adds ESN support to IPsec device offload. Adding new xfrm device operation to synchronize device ESN. Signed-off-by: Yossef Efraim Signed-off-by: Shannon Nelson Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2e6d4fe6b0ba..7d2077665c0b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1904,6 +1904,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn) + xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x); +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { struct xfrm_state *x = dst->xfrm; @@ -1974,6 +1982,10 @@ static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x return false; } +static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) +{ +} + static inline bool xfrm_dst_offload_ok(struct dst_entry *dst) { return false; -- cgit v1.2.3 From 05b93801a23c21a6f355f4c492c51715d6ccc96d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 17 Jan 2018 07:14:14 -0800 Subject: lockdep: Convert some users to const These users of lockdep_is_held() either wanted lockdep_is_held to take a const pointer, or would benefit from providing a const pointer. Signed-off-by: Matthew Wilcox Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: "David S. Miller" Link: https://lkml.kernel.org/r/20180117151414.23686-4-willy@infradead.org --- include/net/sock.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 7a7b14e9628a..c4a424fe6fdd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1445,10 +1445,8 @@ do { \ } while (0) #ifdef CONFIG_LOCKDEP -static inline bool lockdep_sock_is_held(const struct sock *csk) +static inline bool lockdep_sock_is_held(const struct sock *sk) { - struct sock *sk = (struct sock *)csk; - return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } -- cgit v1.2.3 From 3ecbfd65f50e5ff9c538c1bfa3356ef52cc66586 Mon Sep 17 00:00:00 2001 From: Harsha Sharma Date: Wed, 27 Dec 2017 00:59:00 +0530 Subject: netfilter: nf_tables: allocate handle and delete objects via handle This patch allows deletion of objects via unique handle which can be listed via '-a' option. Signed-off-by: Harsha Sharma Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4aca413367ee..663b015dace5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -374,6 +374,7 @@ void nft_unregister_set(struct nft_set_type *type); * @list: table set list node * @bindings: list of set bindings * @name: name of the set + * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) @@ -396,6 +397,7 @@ struct nft_set { struct list_head list; struct list_head bindings; char *name; + u64 handle; u32 ktype; u32 dtype; u32 objtype; @@ -946,6 +948,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state + * @handle: table handle * @use: number of chain references to this table * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask @@ -959,6 +962,7 @@ struct nft_table { struct list_head objects; struct list_head flowtables; u64 hgenerator; + u64 handle; u32 use; u16 family:6, flags:8, @@ -983,9 +987,9 @@ int nft_verdict_dump(struct sk_buff *skb, int type, * @name: name of this stateful object * @genmask: generation mask * @use: number of references to this stateful object - * @data: object data, layout depends on type + * @handle: unique object handle * @ops: object operations - * @data: pointer to object data + * @data: object data, layout depends on type */ struct nft_object { struct list_head list; @@ -993,6 +997,7 @@ struct nft_object { struct nft_table *table; u32 genmask:2, use:30; + u64 handle; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] @@ -1074,6 +1079,7 @@ void nft_unregister_obj(struct nft_object_type *obj_type); * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table + * @handle: unique object handle * @data: rhashtable and garbage collector * @ops: array of hooks */ @@ -1086,6 +1092,7 @@ struct nft_flowtable { int ops_len; u32 genmask:2, use:30; + u64 handle; /* runtime data below here */ struct nf_hook_ops *ops ____cacheline_aligned; struct nf_flowtable data; -- cgit v1.2.3 From ce6289661b14a8b391d90db918c91b6d6da6540a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jan 2018 17:34:00 +0100 Subject: caif: reduce stack size with KASAN When CONFIG_KASAN is set, we can use relatively large amounts of kernel stack space: net/caif/cfctrl.c:555:1: warning: the frame size of 1600 bytes is larger than 1280 bytes [-Wframe-larger-than=] This adds convenience wrappers around cfpkt_extr_head(), which is responsible for most of the stack growth. With those wrapper functions, gcc apparently starts reusing the stack slots for each instance, thus avoiding the problem. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/net/caif/cfpkt.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/net') diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h index fe328c52c46b..801489bb14c3 100644 --- a/include/net/caif/cfpkt.h +++ b/include/net/caif/cfpkt.h @@ -32,6 +32,33 @@ void cfpkt_destroy(struct cfpkt *pkt); */ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len); +static inline u8 cfpkt_extr_head_u8(struct cfpkt *pkt) +{ + u8 tmp; + + cfpkt_extr_head(pkt, &tmp, 1); + + return tmp; +} + +static inline u16 cfpkt_extr_head_u16(struct cfpkt *pkt) +{ + __le16 tmp; + + cfpkt_extr_head(pkt, &tmp, 2); + + return le16_to_cpu(tmp); +} + +static inline u32 cfpkt_extr_head_u32(struct cfpkt *pkt) +{ + __le32 tmp; + + cfpkt_extr_head(pkt, &tmp, 4); + + return le32_to_cpu(tmp); +} + /* * Peek header from packet. * Reads data from packet without changing packet. -- cgit v1.2.3 From e42866031ff03c89a5bdd2056c76dd6cb41c3d35 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 17 Jan 2018 12:11:01 -0800 Subject: tcp: avoid min RTT bloat by skipping RTT from delayed-ACK in BBR A persistent connection may send tiny amount of data (e.g. health-check) for a long period of time. BBR's windowed min RTT filter may only see RTT samples from delayed ACKs causing BBR to grossly over-estimate the path delay depending how much the ACK was delayed at the receiver. This patch skips RTT samples that are likely coming from delayed ACKs. Note that it is possible the sender never obtains a valid measure to set the min RTT. In this case BBR will continue to set cwnd to initial window which seems fine because the connection is thin stream. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Priyaranjan Jha Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6939e69d3c37..5a1d26a18599 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -953,6 +953,7 @@ struct rate_sample { u32 prior_in_flight; /* in flight before this ACK */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { -- cgit v1.2.3 From 8865fdd4e1538a775c5ac2157fb8eb45bee9dc18 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 18 Jan 2018 11:20:49 -0500 Subject: net: sched: cls: fix code style issues This patch changes some code style issues pointed out by checkpatch inside the TC cls subsystem. Signed-off-by: Alexander Aring Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cfc19d0ba2ad..c90f5fe6bed9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -240,7 +240,8 @@ struct tcf_proto_ops { struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, void **, bool); - int (*delete)(struct tcf_proto*, void *, bool*); + int (*delete)(struct tcf_proto *tp, void *arg, + bool *last); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); void (*bind_class)(void *, u32, unsigned long); -- cgit v1.2.3 From 7306db38a67cf6b8e1ca354b1d0c0117b7b880d5 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 18 Jan 2018 11:20:51 -0500 Subject: net: sched: cls: add extack support for change callback This patch adds extack support for classifier change callback api. This prepares to handle extack support inside each specific classifier implementation. Cc: David Ahern Signed-off-by: Alexander Aring Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c90f5fe6bed9..ee398bcd46e7 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -239,7 +239,8 @@ struct tcf_proto_ops { int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, - void **, bool); + void **, bool, + struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, bool *last); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); -- cgit v1.2.3 From 50a561900e66a03f5127edac57487079bc0b8201 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 18 Jan 2018 11:20:52 -0500 Subject: net: sched: cls: add extack support for tcf_exts_validate The tcf_exts_validate function calls the act api change callback. For preparing extack support for act api, this patch adds the extack as parameter for this function which is common used in cls implementations. Furthermore the tcf_exts_validate will call action init callback which prepares the TC action subsystem for extack support. Cc: David Ahern Signed-off-by: Alexander Aring Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 789d818c4a61..6dd009e10e5d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -376,7 +376,8 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts, int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *rate_tlv, - struct tcf_exts *exts, bool ovr); + struct tcf_exts *exts, bool ovr, + struct netlink_ext_ack *extack); void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); -- cgit v1.2.3 From 571acf2106963d6c1c0ce1ed13e711bd296b2d25 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 18 Jan 2018 11:20:53 -0500 Subject: net: sched: cls: add extack support for delete callback This patch adds extack support for classifier delete callback api. This prepares to handle extack support inside each specific classifier implementation. Cc: David Ahern Signed-off-by: Alexander Aring Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ee398bcd46e7..cd1be1f25c36 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -242,7 +242,8 @@ struct tcf_proto_ops { void **, bool, struct netlink_ext_ack *); int (*delete)(struct tcf_proto *tp, void *arg, - bool *last); + bool *last, + struct netlink_ext_ack *); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); void (*bind_class)(void *, u32, unsigned long); -- cgit v1.2.3 From 1057c55f6b6cdc4fa3e8e29cfb9061c211e58395 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 18 Jan 2018 11:20:54 -0500 Subject: net: sched: cls: add extack support for tcf_change_indev This patch adds extack handling for the tcf_change_indev function which is common used by TC classifier implementations. Cc: David Ahern Signed-off-by: Alexander Aring Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6dd009e10e5d..2e4b8e436d25 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -557,13 +557,16 @@ static inline int tcf_valid_offset(const struct sk_buff *skb, #include static inline int -tcf_change_indev(struct net *net, struct nlattr *indev_tlv) +tcf_change_indev(struct net *net, struct nlattr *indev_tlv, + struct netlink_ext_ack *extack) { char indev[IFNAMSIZ]; struct net_device *dev; - if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ) + if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "Interface name too long"); return -EINVAL; + } dev = __dev_get_by_name(net, indev); if (!dev) return -ENODEV; -- cgit v1.2.3 From 8f0b425a712b82732127ff7880f92504f20fcc11 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 19 Jan 2018 17:44:47 -0800 Subject: net: sched: add extack support for offload via tc_cls_common_offload Add extack support for hardware offload of classifiers. In order to achieve this, a pointer to a struct netlink_ext_ack is added to the struct tc_cls_common_offload that is passed to the callback for setting up the classifier. Function tc_cls_common_offload_init() is updated to support initialization of this new attribute. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2e4b8e436d25..f497f622580b 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -602,15 +602,18 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + struct netlink_ext_ack *extack; }; static inline void tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, - const struct tcf_proto *tp) + const struct tcf_proto *tp, + struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; + cls_common->extack = extack; } struct tc_cls_u32_knode { -- cgit v1.2.3 From f9eda14f039294298643d034bc5c06340fe16cc0 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 19 Jan 2018 17:44:48 -0800 Subject: net: sched: create tc_can_offload_extack() wrapper Create a wrapper around tc_can_offload() that takes an additional extack pointer argument in order to output an error message if TC offload is disabled on the device. In this way, the error message is handled by the core and can be the same for all drivers. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index f497f622580b..2f8f16a4d88e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -656,6 +656,17 @@ static inline bool tc_can_offload(const struct net_device *dev) return dev->features & NETIF_F_HW_TC; } +static inline bool tc_can_offload_extack(const struct net_device *dev, + struct netlink_ext_ack *extack) +{ + bool can = tc_can_offload(dev); + + if (!can) + NL_SET_ERR_MSG(extack, "TC offload is disabled on net device"); + + return can; +} + static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; -- cgit v1.2.3 From 9c5f69bbd75a7db80578782b037629c5f1e59dce Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 22 Jan 2018 18:14:32 +0100 Subject: net/sched: act_csum: don't use spinlock in the fast path use RCU instead of spin_{,unlock}_bh() to protect concurrent read/write on act_csum configuration, to reduce the effects of contention in the data path when multiple readers are present. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_csum.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 781f3433a0be..9470fd7e4350 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -6,10 +6,16 @@ #include #include +struct tcf_csum_params { + int action; + u32 update_flags; + struct rcu_head rcu; +}; + struct tcf_csum { struct tc_action common; - u32 update_flags; + struct tcf_csum_params __rcu *params; }; #define to_tcf_csum(a) ((struct tcf_csum *)a) @@ -24,7 +30,13 @@ static inline bool is_tcf_csum(const struct tc_action *a) static inline u32 tcf_csum_update_flags(const struct tc_action *a) { - return to_tcf_csum(a)->update_flags; + u32 update_flags; + + rcu_read_lock(); + update_flags = rcu_dereference(to_tcf_csum(a)->params)->update_flags; + rcu_read_unlock(); + + return update_flags; } #endif /* __NET_TC_CSUM_H */ -- cgit v1.2.3 From e9191ffb65d8e159680ce0ad2224e1acbde6985c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 22 Jan 2018 20:06:42 +0000 Subject: ipv6: Fix getsockopt() for sockets with default IPV6_AUTOFLOWLABEL Commit 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl setting") removed the initialisation of ipv6_pinfo::autoflowlabel and added a second flag to indicate whether this field or the net namespace default should be used. The getsockopt() handling for this case was not updated, so it currently returns 0 for all sockets for which IPV6_AUTOFLOWLABEL is not explicitly enabled. Fix it to return the effective value, whether that has been set at the socket or net namespace level. Fixes: 513674b5a2c9 ("net: reevalulate autoflowlabel setting after sysctl ...") Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f73797e2fa60..221238254eb7 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -331,6 +331,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); +bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { -- cgit v1.2.3 From d3303a65a00c94372ddab831570647488e6c06e2 Mon Sep 17 00:00:00 2001 From: Wolfgang Bumiller Date: Thu, 18 Jan 2018 11:32:36 +0100 Subject: net: sched: fix TCF_LAYER_LINK case in tcf_get_base_ptr TCF_LAYER_LINK and TCF_LAYER_NETWORK returned the same pointer as skb->data points to the network header. Use skb_mac_header instead. Signed-off-by: Wolfgang Bumiller Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 8e08b6da72f3..753ac9361154 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -522,7 +522,7 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) { switch (layer) { case TCF_LAYER_LINK: - return skb->data; + return skb_mac_header(skb); case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: -- cgit v1.2.3 From 715df5ecab0f22685930cb8bb0cc70ed8fb9279e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jan 2018 12:54:13 -0800 Subject: net: sched: propagate extack to cls->destroy callbacks Propagate extack to cls->destroy callbacks when called from non-error paths. On error paths pass NULL to avoid overwriting the failure message. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/sch_generic.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cd1be1f25c36..eac43e8ca96d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -233,7 +233,8 @@ struct tcf_proto_ops { const struct tcf_proto *, struct tcf_result *); int (*init)(struct tcf_proto*); - void (*destroy)(struct tcf_proto*); + void (*destroy)(struct tcf_proto *tp, + struct netlink_ext_ack *extack); void* (*get)(struct tcf_proto*, u32 handle); int (*change)(struct net *net, struct sk_buff *, -- cgit v1.2.3 From 34832e1c701553ed3eeefe5413fa93d185cff7f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jan 2018 12:54:14 -0800 Subject: net: sched: prepare for reimplementation of tc_cls_common_offload_init() Rename the tc_cls_common_offload_init() helper function to tc_cls_common_offload_init_deprecated() and add a new implementation which also takes flags argument. We will only set extack if flags indicate that offload is forced (skip_sw) otherwise driver errors should be ignored, as they don't influence the overall filter installation. Note that we need the tc_skip_hw() helper for new version, therefore it is added later in the file. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2f8f16a4d88e..08815fe9314d 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -606,9 +606,9 @@ struct tc_cls_common_offload { }; static inline void -tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, - const struct tcf_proto *tp, - struct netlink_ext_ack *extack) +tc_cls_common_offload_init_deprecated(struct tc_cls_common_offload *cls_common, + const struct tcf_proto *tp, + struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; @@ -694,6 +694,18 @@ static inline bool tc_in_hw(u32 flags) return (flags & TCA_CLS_FLAGS_IN_HW) ? true : false; } +static inline void +tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, + const struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) +{ + cls_common->chain_index = tp->chain->index; + cls_common->protocol = tp->protocol; + cls_common->prio = tp->prio; + if (tc_skip_sw(flags)) + cls_common->extack = extack; +} + enum tc_fl_command { TC_CLSFLOWER_REPLACE, TC_CLSFLOWER_DESTROY, -- cgit v1.2.3 From f558fdea03bf70f23a3ac63d8c7cdd9755797f80 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jan 2018 12:54:15 -0800 Subject: cls_bpf: remove gen_flags from bpf_offload cls_bpf now guarantees that only device-bound programs are allowed with skip_sw. The drivers no longer pay attention to flags on filter load, therefore the bpf_offload member can be removed. If flags are needed again they should probably be added to struct tc_cls_common_offload instead. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 08815fe9314d..85cee929b9ce 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -748,7 +748,6 @@ struct tc_cls_bpf_offload { struct bpf_prog *oldprog; const char *name; bool exts_integrated; - u32 gen_flags; }; struct tc_mqprio_qopt_offload { -- cgit v1.2.3 From c846adb6be17d77395c579a61c44d2863e6f9762 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jan 2018 12:54:24 -0800 Subject: net: sched: remove tc_cls_common_offload_init_deprecated() All users are now converted to tc_cls_common_offload_init(). Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 85cee929b9ce..1a41513cec7f 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -605,17 +605,6 @@ struct tc_cls_common_offload { struct netlink_ext_ack *extack; }; -static inline void -tc_cls_common_offload_init_deprecated(struct tc_cls_common_offload *cls_common, - const struct tcf_proto *tp, - struct netlink_ext_ack *extack) -{ - cls_common->chain_index = tp->chain->index; - cls_common->protocol = tp->protocol; - cls_common->prio = tp->prio; - cls_common->extack = extack; -} - struct tc_cls_u32_knode { struct tcf_exts *exts; struct tc_u32_sel *sel; -- cgit v1.2.3 From b423d13c08a656c719fa56324a8f4279c835d90c Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 23 Jan 2018 17:01:29 -0800 Subject: net: erspan: fix use-after-free When building the erspan header for either v1 or v2, the eth_hdr() does not point to the right inner packet's eth_hdr, causing kasan report use-after-free and slab-out-of-bouds read. The patch fixes the following syzkaller issues: [1] BUG: KASAN: slab-out-of-bounds in erspan_xmit+0x22d4/0x2430 net/ipv4/ip_gre.c:735 [2] BUG: KASAN: slab-out-of-bounds in erspan_build_header+0x3bf/0x3d0 net/ipv4/ip_gre.c:698 [3] BUG: KASAN: use-after-free in erspan_xmit+0x22d4/0x2430 net/ipv4/ip_gre.c:735 [4] BUG: KASAN: use-after-free in erspan_build_header+0x3bf/0x3d0 net/ipv4/ip_gre.c:698 [2] CPU: 0 PID: 3654 Comm: syzkaller377964 Not tainted 4.15.0-rc9+ #185 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x25b/0x340 mm/kasan/report.c:409 __asan_report_load_n_noabort+0xf/0x20 mm/kasan/report.c:440 erspan_build_header+0x3bf/0x3d0 net/ipv4/ip_gre.c:698 erspan_xmit+0x3b8/0x13b0 net/ipv4/ip_gre.c:740 __netdev_start_xmit include/linux/netdevice.h:4042 [inline] netdev_start_xmit include/linux/netdevice.h:4051 [inline] packet_direct_xmit+0x315/0x6b0 net/packet/af_packet.c:266 packet_snd net/packet/af_packet.c:2943 [inline] packet_sendmsg+0x3aed/0x60b0 net/packet/af_packet.c:2968 sock_sendmsg_nosec net/socket.c:638 [inline] sock_sendmsg+0xca/0x110 net/socket.c:648 SYSC_sendto+0x361/0x5c0 net/socket.c:1729 SyS_sendto+0x40/0x50 net/socket.c:1697 do_syscall_32_irqs_on arch/x86/entry/common.c:327 [inline] do_fast_syscall_32+0x3ee/0xf9d arch/x86/entry/common.c:389 entry_SYSENTER_compat+0x54/0x63 arch/x86/entry/entry_64_compat.S:129 RIP: 0023:0xf7fcfc79 RSP: 002b:00000000ffc6976c EFLAGS: 00000286 ORIG_RAX: 0000000000000171 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 0000000020011000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000020008000 RBP: 000000000000001c R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: f551c91de262 ("net: erspan: introduce erspan v2 for ip_gre") Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN") Reported-by: syzbot+9723f2d288e49b492cf0@syzkaller.appspotmail.com Reported-by: syzbot+f0ddeb2b032a8e1d9098@syzkaller.appspotmail.com Reported-by: syzbot+f14b3703cd8d7670203f@syzkaller.appspotmail.com Reported-by: syzbot+eefa384efad8d7997f20@syzkaller.appspotmail.com Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index acdf6843095d..712ea1b1f4db 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -123,7 +123,7 @@ static inline void erspan_build_header(struct sk_buff *skb, __be32 id, u32 index, bool truncate, bool is_ipv4) { - struct ethhdr *eth = eth_hdr(skb); + struct ethhdr *eth = (struct ethhdr *)skb->data; enum erspan_encap_type enc_type; struct erspan_base_hdr *ershdr; struct erspan_metadata *ersmd; @@ -190,7 +190,7 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, __be32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) { - struct ethhdr *eth = eth_hdr(skb); + struct ethhdr *eth = (struct ethhdr *)skb->data; struct erspan_base_hdr *ershdr; struct erspan_metadata *md; struct qtag_prefix { -- cgit v1.2.3 From ca25c30040f93c127ff1651aa636c0174f1e0cdb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Jul 2017 08:03:10 -0400 Subject: ip_rt_ioctl(): take copyin to caller Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- include/net/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index d538e6db1afe..1eb9ce470e25 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -217,7 +217,7 @@ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); -int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); +int ip_rt_ioctl(struct net *, unsigned int cmd, struct rtentry *rt); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, -- cgit v1.2.3 From b1b0c245067268043e0e832432f3d537a5cae33b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 1 Oct 2017 20:13:08 -0400 Subject: lift handling of SIOCIW... out of dev_ioctl() Signed-off-by: Al Viro --- include/net/wext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/wext.h b/include/net/wext.h index e51f067fdb3a..aa192a670304 100644 --- a/include/net/wext.h +++ b/include/net/wext.h @@ -7,7 +7,7 @@ struct net; #ifdef CONFIG_WEXT_CORE -int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd, +int wext_handle_ioctl(struct net *net, unsigned int cmd, void __user *arg); int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, unsigned long arg); @@ -15,7 +15,7 @@ int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, struct iw_statistics *get_wireless_stats(struct net_device *dev); int call_commit_handler(struct net_device *dev); #else -static inline int wext_handle_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd, +static inline int wext_handle_ioctl(struct net *net, unsigned int cmd, void __user *arg) { return -EINVAL; -- cgit v1.2.3 From 4ee806d51176ba7b8ff1efd81f271d7252e03a1d Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 18 Jan 2018 16:14:26 -0500 Subject: net: tcp: close sock if net namespace is exiting When a tcp socket is closed, if it detects that its net namespace is exiting, close immediately and do not wait for FIN sequence. For normal sockets, a reference is taken to their net namespace, so it will never exit while the socket is open. However, kernel sockets do not take a reference to their net namespace, so it may begin exiting while the kernel socket is still open. In this case if the kernel socket is a tcp socket, it will stay open trying to complete its close sequence. The sock's dst(s) hold a reference to their interface, which are all transferred to the namespace's loopback interface when the real interfaces are taken down. When the namespace tries to take down its loopback interface, it hangs waiting for all references to the loopback interface to release, which results in messages like: unregister_netdevice: waiting for lo to become free. Usage count = 1 These messages continue until the socket finally times out and closes. Since the net namespace cleanup holds the net_mutex while calling its registered pernet callbacks, any new net namespace initialization is blocked until the current net namespace finishes exiting. After this change, the tcp socket notices the exiting net namespace, and closes immediately, releasing its dst(s) and their reference to the loopback interface, which lets the net namespace continue exiting. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811 Signed-off-by: Dan Streetman Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 10f99dafd5ac..049008493faf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -223,6 +223,11 @@ int net_eq(const struct net *net1, const struct net *net2) return net1 == net2; } +static inline int check_net(const struct net *net) +{ + return atomic_read(&net->count) != 0; +} + void net_drop_ns(void *); #else @@ -247,6 +252,11 @@ int net_eq(const struct net *net1, const struct net *net2) return 1; } +static inline int check_net(const struct net *net) +{ + return 1; +} + #define net_drop_ns NULL #endif -- cgit v1.2.3 From f15ca723c1ebe6c1a06bc95fda6b62cd87b44559 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Jan 2018 19:03:03 +0100 Subject: net: don't call update_pmtu unconditionally Some dst_ops (e.g. md_dst_ops)) doesn't set this handler. It may result to: "BUG: unable to handle kernel NULL pointer dereference at (null)" Let's add a helper to check if update_pmtu is available before calling it. Fixes: 52a589d51f10 ("geneve: update skb dst pmtu on tx path") Fixes: a93bf0ff4490 ("vxlan: update skb dst pmtu on tx path") CC: Roman Kapl CC: Xin Long Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/dst.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index b091fd536098..d49d607dd2b3 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -521,4 +521,12 @@ static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst) } #endif +static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu) +{ + struct dst_entry *dst = skb_dst(skb); + + if (dst && dst->ops->update_pmtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu); +} + #endif /* _NET_DST_H */ -- cgit v1.2.3 From b73042b8a28e2603ac178295ab96c876ba5a97a1 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:08 -0800 Subject: bpf: Add write access to tcp_sock and sock fields This patch adds a macro, SOCK_OPS_SET_FIELD, for writing to struct tcp_sock or struct sock fields. This required adding a new field "temp" to struct bpf_sock_ops_kern for temporary storage that is used by sock_ops_convert_ctx_access. It is used to store and recover the contents of a register, so the register can be used to store the address of the sk. Since we cannot overwrite the dst_reg because it contains the pointer to ctx, nor the src_reg since it contains the value we want to store, we need an extra register to contain the address of the sk. Also adds the macro SOCK_OPS_GET_OR_SET_FIELD that calls one of the GET or SET macros depending on the value of the TYPE field. Signed-off-by: Lawrence Brakmo Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5a1d26a18599..6092eaff61cf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2011,7 +2011,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op) struct bpf_sock_ops_kern sock_ops; int ret; - memset(&sock_ops, 0, sizeof(sock_ops)); + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); -- cgit v1.2.3 From de525be2ca2734865d29c4b67ddd29913b214906 Mon Sep 17 00:00:00 2001 From: Lawrence Brakmo Date: Thu, 25 Jan 2018 16:14:09 -0800 Subject: bpf: Support passing args to sock_ops bpf function Adds support for passing up to 4 arguments to sock_ops bpf functions. It reusues the reply union, so the bpf_sock_ops structures are not increased in size. Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6092eaff61cf..093e967a2960 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2006,7 +2006,7 @@ void tcp_cleanup_ulp(struct sock *sk); * program loaded). */ #ifdef CONFIG_BPF -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; @@ -2019,6 +2019,8 @@ static inline int tcp_call_bpf(struct sock *sk, int op) sock_ops.sk = sk; sock_ops.op = op; + if (nargs > 0) + memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) @@ -2027,18 +2029,46 @@ static inline int tcp_call_bpf(struct sock *sk, int op) ret = -1; return ret; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + u32 args[2] = {arg1, arg2}; + + return tcp_call_bpf(sk, op, 2, args); +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + u32 args[3] = {arg1, arg2, arg3}; + + return tcp_call_bpf(sk, op, 3, args); +} + #else -static inline int tcp_call_bpf(struct sock *sk, int op) +static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } + +static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) +{ + return -EPERM; +} + +static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, + u32 arg3) +{ + return -EPERM; +} + #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; - timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT); + timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; @@ -2049,7 +2079,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; - rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT); + rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; @@ -2058,7 +2088,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk) static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { - return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1); + return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } #if IS_ENABLED(CONFIG_SMC) -- cgit v1.2.3 From 878db9f0f26df7674f6fc46f1c1c9390c99880ec Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Jan 2018 14:00:43 -0800 Subject: pkt_cls: add new tc cls helper to check offload flag and chain index Very few (mlxsw) upstream drivers seem to allow offload of chains other than 0. Save driver developers typing and add a helper for checking both if ethtool's TC offload flag is on and if chain is 0. This helper will set the extack appropriately in both error cases. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index fa2f6fb14093..87406252f0a3 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -656,6 +656,20 @@ static inline bool tc_can_offload_extack(const struct net_device *dev, return can; } +static inline bool +tc_cls_can_offload_and_chain0(const struct net_device *dev, + struct tc_cls_common_offload *common) +{ + if (!tc_can_offload_extack(dev, common->extack)) + return false; + if (common->chain_index) { + NL_SET_ERR_MSG(common->extack, + "Driver supports only offload of chain 0"); + return false; + } + return true; +} + static inline bool tc_skip_hw(u32 flags) { return (flags & TCA_CLS_FLAGS_SKIP_HW) ? true : false; -- cgit v1.2.3 From c69de58ba84f480879de64571d9dae5102d10ed6 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 25 Jan 2018 13:20:09 -0800 Subject: net: erspan: use bitfield instead of mask and offset Originally the erspan fields are defined as a group into a __be16 field, and use mask and offset to access each field. This is more costly due to calling ntohs/htons. The patch changes it to use bitfields. Signed-off-by: William Tu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/erspan.h | 127 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 94 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index 712ea1b1f4db..6d30fe898286 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -65,16 +65,30 @@ #define GRA_MASK 0x0006 #define O_MASK 0x0001 +#define HWID_OFFSET 4 +#define DIR_OFFSET 3 + /* ERSPAN version 2 metadata header */ struct erspan_md2 { __be32 timestamp; __be16 sgt; /* security group tag */ - __be16 flags; -#define P_OFFSET 15 -#define FT_OFFSET 10 -#define HWID_OFFSET 4 -#define DIR_OFFSET 3 -#define GRA_OFFSET 1 +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hwid_upper:2, + ft:5, + p:1; + __u8 o:1, + gra:2, + dir:1, + hwid:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 p:1, + ft:5, + hwid_upper:2; + __u8 hwid:4, + dir:1, + gra:2, + o:1; +#endif }; enum erspan_encap_type { @@ -95,15 +109,62 @@ struct erspan_metadata { }; struct erspan_base_hdr { - __be16 ver_vlan; -#define VER_OFFSET 12 - __be16 session_id; -#define COS_OFFSET 13 -#define EN_OFFSET 11 -#define BSO_OFFSET EN_OFFSET -#define T_OFFSET 10 +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 vlan_upper:4, + ver:4; + __u8 vlan:8; + __u8 session_id_upper:2, + t:1, + en:2, + cos:3; + __u8 session_id:8; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 ver: 4, + vlan_upper:4; + __u8 vlan:8; + __u8 cos:3, + en:2, + t:1, + session_id_upper:2; + __u8 session_id:8; +#else +#error "Please fix " +#endif }; +static inline void set_session_id(struct erspan_base_hdr *ershdr, u16 id) +{ + ershdr->session_id = id & 0xff; + ershdr->session_id_upper = (id >> 8) & 0x3; +} + +static inline u16 get_session_id(const struct erspan_base_hdr *ershdr) +{ + return (ershdr->session_id_upper << 8) + ershdr->session_id; +} + +static inline void set_vlan(struct erspan_base_hdr *ershdr, u16 vlan) +{ + ershdr->vlan = vlan & 0xff; + ershdr->vlan_upper = (vlan >> 8) & 0xf; +} + +static inline u16 get_vlan(const struct erspan_base_hdr *ershdr) +{ + return (ershdr->vlan_upper << 8) + ershdr->vlan; +} + +static inline void set_hwid(struct erspan_md2 *md2, u8 hwid) +{ + md2->hwid = hwid & 0xf; + md2->hwid_upper = (hwid >> 4) & 0x3; +} + +static inline u8 get_hwid(const struct erspan_md2 *md2) +{ + return (md2->hwid_upper << 4) + md2->hwid; +} + static inline int erspan_hdr_len(int version) { return sizeof(struct erspan_base_hdr) + @@ -120,7 +181,7 @@ static inline u8 tos_to_cos(u8 tos) } static inline void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, + u32 id, u32 index, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; @@ -154,12 +215,12 @@ static inline void erspan_build_header(struct sk_buff *skb, memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); /* Build base header */ - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION << VER_OFFSET)); - ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | - (enc_type << EN_OFFSET & EN_MASK) | - ((truncate << T_OFFSET) & T_MASK)); + ershdr->ver = ERSPAN_VERSION; + ershdr->cos = tos_to_cos(tos); + ershdr->en = enc_type; + ershdr->t = truncate; + set_vlan(ershdr, vlan_tci); + set_session_id(ershdr, id); /* Build metadata */ ersmd = (struct erspan_metadata *)(ershdr + 1); @@ -187,7 +248,7 @@ static inline __be32 erspan_get_timestamp(void) } static inline void erspan_build_header_v2(struct sk_buff *skb, - __be32 id, u8 direction, u16 hwid, + u32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; @@ -198,7 +259,6 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, __be16 tci; } *qp; u16 vlan_tci = 0; - u16 session_id; u8 gra = 0; /* 100 usec */ u8 bso = 0; /* Bad/Short/Oversized */ u8 sgt = 0; @@ -221,22 +281,23 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); /* Build base header */ - ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | - (ERSPAN_VERSION2 << VER_OFFSET)); - session_id = (u16)(ntohl(id) & ID_MASK) | - ((tos_to_cos(tos) << COS_OFFSET) & COS_MASK) | - (bso << BSO_OFFSET & BSO_MASK) | - ((truncate << T_OFFSET) & T_MASK); - ershdr->session_id = htons(session_id); + ershdr->ver = ERSPAN_VERSION2; + ershdr->cos = tos_to_cos(tos); + ershdr->en = bso; + ershdr->t = truncate; + set_vlan(ershdr, vlan_tci); + set_session_id(ershdr, id); /* Build metadata */ md = (struct erspan_metadata *)(ershdr + 1); md->u.md2.timestamp = erspan_get_timestamp(); md->u.md2.sgt = htons(sgt); - md->u.md2.flags = htons(((1 << P_OFFSET) & P_MASK) | - ((hwid << HWID_OFFSET) & HWID_MASK) | - ((direction << DIR_OFFSET) & DIR_MASK) | - ((gra << GRA_OFFSET) & GRA_MASK)); + md->u.md2.p = 1; + md->u.md2.ft = 0; + md->u.md2.dir = direction; + md->u.md2.gra = gra; + md->u.md2.o = 0; + set_hwid(&md->u.md2, hwid); } #endif -- cgit v1.2.3 From d350a823020e71e20a10d1dfa44f1d1d653b0334 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 25 Jan 2018 13:20:10 -0800 Subject: net: erspan: create erspan metadata uapi header The patch adds a new uapi header file, erspan.h, and moves the 'struct erspan_metadata' from internal erspan.h to it. Signed-off-by: William Tu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/erspan.h | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index 6d30fe898286..5daa4866412b 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -46,6 +46,8 @@ * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB */ +#include + #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff @@ -68,29 +70,6 @@ #define HWID_OFFSET 4 #define DIR_OFFSET 3 -/* ERSPAN version 2 metadata header */ -struct erspan_md2 { - __be32 timestamp; - __be16 sgt; /* security group tag */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 hwid_upper:2, - ft:5, - p:1; - __u8 o:1, - gra:2, - dir:1, - hwid:4; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 p:1, - ft:5, - hwid_upper:2; - __u8 hwid:4, - dir:1, - gra:2, - o:1; -#endif -}; - enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ @@ -100,13 +79,6 @@ enum erspan_encap_type { #define ERSPAN_V1_MDSIZE 4 #define ERSPAN_V2_MDSIZE 8 -struct erspan_metadata { - union { - __be32 index; /* Version 1 (type II)*/ - struct erspan_md2 md2; /* Version 2 (type III) */ - } u; - int version; -}; struct erspan_base_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) -- cgit v1.2.3 From 48bfd55e7e4149a304e89c1999436cf52d094a27 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 25 Jan 2018 18:26:23 -0800 Subject: net_sched: plug in qdisc ops change_tx_queue_len Introduce a new qdisc ops ->change_tx_queue_len() so that each qdisc could decide how to implement this if it wants. Previously we simply read dev->tx_queue_len, after pfifo_fast switches to skb array, we need this API to resize the skb array when we change dev->tx_queue_len. To avoid handling race conditions with TX BH, we need to deactivate all TX queues before change the value and bring them back after we are done, this also makes implementation easier. Cc: John Fastabend Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index eac43e8ca96d..e2ab13687fb9 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -200,6 +200,7 @@ struct Qdisc_ops { struct nlattr *arg, struct netlink_ext_ack *extack); void (*attach)(struct Qdisc *sch); + int (*change_tx_queue_len)(struct Qdisc *, unsigned int); int (*dump)(struct Qdisc *, struct sk_buff *); int (*dump_stats)(struct Qdisc *, struct gnet_dump *); @@ -489,6 +490,7 @@ void qdisc_class_hash_remove(struct Qdisc_class_hash *, void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *); void qdisc_class_hash_destroy(struct Qdisc_class_hash *); +int dev_qdisc_change_tx_queue_len(struct net_device *dev); void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); -- cgit v1.2.3 From a54667f6728c2714a400f3c884727da74b6d1717 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 31 Jan 2018 21:34:37 +0530 Subject: tls: Add support for encryption using async offload accelerator Async crypto accelerators (e.g. drivers/crypto/caam) support offloading GCM operation. If they are enabled, crypto_aead_encrypt() return error code -EINPROGRESS. In this case tls_do_encryption() needs to wait on a completion till the time the response for crypto offload request is received. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tls.h b/include/net/tls.h index 9185e53a743c..4913430ab807 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -57,6 +58,7 @@ struct tls_sw_context { struct crypto_aead *aead_send; + struct crypto_wait async_wait; /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; -- cgit v1.2.3 From b11a632c442eef34a0afeba61fab923241f317e9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 5 Feb 2018 10:17:43 -0800 Subject: net: add a UID to use for ULP socket assignment Create a UID field and enum that can be used to assign ULPs to sockets. This saves a set of string comparisons if the ULP id is known. For sockmap, which is added in the next patches, a ULP is used to hook into TCP sockets close state. In this case the ULP being added is done at map insert time and the ULP is known and done on the kernel side. In this case the named lookup is not needed. Because we don't want to expose psock internals to user space socket options a user visible flag is also added. For TLS this is set for BPF it will be cleared. Alos remove pr_notice, user gets an error code back and should check that rather than rely on logs. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 58278669cc55..a58292d31e12 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1983,6 +1983,10 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) +enum { + TCP_ULP_TLS, +}; + struct tcp_ulp_ops { struct list_head list; @@ -1991,7 +1995,9 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); + int uid; char name[TCP_ULP_NAME_MAX]; + bool user_visible; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); -- cgit v1.2.3 From 1aa12bdf1bfb95db7e75bfecf0e39a65f4e8fbf8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 5 Feb 2018 10:17:49 -0800 Subject: bpf: sockmap, add sock close() hook to remove socks The selftests test_maps program was leaving dangling BPF sockmap programs around because not all psock elements were removed from the map. The elements in turn hold a reference on the BPF program they are attached to causing BPF programs to stay open even after test_maps has completed. The original intent was that sk_state_change() would be called when TCP socks went through TCP_CLOSE state. However, because socks may be in SOCK_DEAD state or the sock may be a listening socket the event is not always triggered. To resolve this use the ULP infrastructure and register our own proto close() handler. This fixes the above case. Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Reported-by: Prashant Bhole Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index a58292d31e12..e3fc667f9ac2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1985,6 +1985,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); enum { TCP_ULP_TLS, + TCP_ULP_BPF, }; struct tcp_ulp_ops { @@ -2003,6 +2004,7 @@ struct tcp_ulp_ops { int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); +int tcp_set_ulp_id(struct sock *sk, const int ulp); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); -- cgit v1.2.3 From 3df1928302950dfa728ab2eade28eea0da291567 Mon Sep 17 00:00:00 2001 From: William Tu Date: Mon, 5 Feb 2018 13:35:34 -0800 Subject: net: erspan: fix metadata extraction Commit d350a823020e ("net: erspan: create erspan metadata uapi header") moves the erspan 'version' in front of the 'struct erspan_md2' for later extensibility reason. This breaks the existing erspan metadata extraction code because the erspan_md2 then has a 4-byte offset to between the erspan_metadata and erspan_base_hdr. This patch fixes it. Fixes: 1a66a836da63 ("gre: add collect_md mode to ERSPAN tunnel") Fixes: ef7baf5e083c ("ip6_gre: add ip6 erspan collect_md mode") Fixes: 1d7e2ed22f8d ("net: erspan: refactor existing erspan code") Signed-off-by: William Tu Signed-off-by: David S. Miller --- include/net/erspan.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/erspan.h b/include/net/erspan.h index 5daa4866412b..d044aa60cc76 100644 --- a/include/net/erspan.h +++ b/include/net/erspan.h @@ -159,13 +159,13 @@ static inline void erspan_build_header(struct sk_buff *skb, struct ethhdr *eth = (struct ethhdr *)skb->data; enum erspan_encap_type enc_type; struct erspan_base_hdr *ershdr; - struct erspan_metadata *ersmd; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 tos; + __be32 *idx; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + @@ -195,8 +195,8 @@ static inline void erspan_build_header(struct sk_buff *skb, set_session_id(ershdr, id); /* Build metadata */ - ersmd = (struct erspan_metadata *)(ershdr + 1); - ersmd->u.index = htonl(index & INDEX_MASK); + idx = (__be32 *)(ershdr + 1); + *idx = htonl(index & INDEX_MASK); } /* ERSPAN GRA: timestamp granularity @@ -225,7 +225,7 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, { struct ethhdr *eth = (struct ethhdr *)skb->data; struct erspan_base_hdr *ershdr; - struct erspan_metadata *md; + struct erspan_md2 *md2; struct qtag_prefix { __be16 eth_type; __be16 tci; @@ -261,15 +261,15 @@ static inline void erspan_build_header_v2(struct sk_buff *skb, set_session_id(ershdr, id); /* Build metadata */ - md = (struct erspan_metadata *)(ershdr + 1); - md->u.md2.timestamp = erspan_get_timestamp(); - md->u.md2.sgt = htons(sgt); - md->u.md2.p = 1; - md->u.md2.ft = 0; - md->u.md2.dir = direction; - md->u.md2.gra = gra; - md->u.md2.o = 0; - set_hwid(&md->u.md2, hwid); + md2 = (struct erspan_md2 *)(ershdr + 1); + md2->timestamp = erspan_get_timestamp(); + md2->sgt = htons(sgt); + md2->p = 1; + md2->ft = 0; + md2->dir = direction; + md2->gra = gra; + md2->o = 0; + set_hwid(md2, hwid); } #endif -- cgit v1.2.3 From c0ea1bcb39352b57ac5c4b6da8acd65bddeee2c5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Feb 2018 13:22:44 +0100 Subject: netfilter: nft_flow_offload: move flowtable cleanup routines to nf_flow_table Move the flowtable cleanup routines to nf_flow_table and expose the nf_flow_table_cleanup() helper function. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b22b22082733..ed49cd169ecf 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -95,6 +95,9 @@ struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_t int nf_flow_table_iterate(struct nf_flowtable *flow_table, void (*iter)(struct flow_offload *flow, void *data), void *data); + +void nf_flow_table_cleanup(struct net *net, struct net_device *dev); + void nf_flow_offload_work_gc(struct work_struct *work); extern const struct rhashtable_params nf_flow_offload_rhash_params; -- cgit v1.2.3 From b408c5b04f82fe4e20bceb8e4f219453d4f21f02 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Feb 2018 13:22:47 +0100 Subject: netfilter: nf_tables: fix flowtable free Every flow_offload entry is added into the table twice. Because of this, rhashtable_free_and_destroy can't be used, since it would call kfree for each flow_offload object twice. This patch cleans up the flowtable via nf_flow_table_iterate() to schedule removal of entries by setting on the dying bit, then there is an explicitly invocation of the garbage collector to release resources. Based on patch from Felix Fietkau. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ed49cd169ecf..020ae903066f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -14,6 +14,7 @@ struct nf_flowtable_type { struct list_head list; int family; void (*gc)(struct work_struct *work); + void (*free)(struct nf_flowtable *ft); const struct rhashtable_params *params; nf_hookfn *hook; struct module *owner; @@ -98,6 +99,7 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table, void nf_flow_table_cleanup(struct net *net, struct net_device *dev); +void nf_flow_table_free(struct nf_flowtable *flow_table); void nf_flow_offload_work_gc(struct work_struct *work); extern const struct rhashtable_params nf_flow_offload_rhash_params; -- cgit v1.2.3 From d8ed9600581d40d818ae417b3086a333841b0559 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 7 Feb 2018 11:50:41 +0900 Subject: netfilter: remove useless prototype prototype nf_ct_nat_offset is not used anymore. Signed-off-by: Taehee Yoo --- include/net/netfilter/nf_conntrack.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index f5223bf2c420..062dc19b5840 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -213,11 +213,6 @@ static inline bool nf_ct_kill(struct nf_conn *ct) return nf_ct_delete(ct, 0, 0); } -/* These are for NAT. Icky. */ -extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, - enum ip_conntrack_dir dir, - u32 seq); - /* Set all unconfirmed conntrack as dying */ void nf_ct_unconfirmed_destroy(struct net *); -- cgit v1.2.3 From 0ff90b6c20340e57616a51ae1a1bf18156d6638a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 7 Feb 2018 09:49:02 +0100 Subject: netfilter: nf_flow_offload: fix use-after-free and a resource leak flow_offload_del frees the flow, so all associated resource must be freed before. Since the ct entry in struct flow_offload_entry was allocated by flow_offload_alloc, it should be freed by flow_offload_free to take care of the error handling path when flow_offload_add fails. While at it, make flow_offload_del static, since it should never be called directly, only from the gc step Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 020ae903066f..833752dd0c58 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -90,7 +90,6 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct, void flow_offload_free(struct flow_offload *flow); int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); -void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow); struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple); int nf_flow_table_iterate(struct nf_flowtable *flow_table, -- cgit v1.2.3 From a9a08845e9acbd224e4ee466f5c1275ed50054e8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 11 Feb 2018 14:34:03 -0800 Subject: vfs: do bulk POLL* -> EPOLL* replacement This is the mindless scripted replacement of kernel use of POLL* variables as described by Al, done by this script: for V in IN OUT PRI ERR RDNORM RDBAND WRNORM WRBAND HUP RDHUP NVAL MSG; do L=`git grep -l -w POLL$V | grep -v '^t' | grep -v /um/ | grep -v '^sa' | grep -v '/poll.h$'|grep -v '^D'` for f in $L; do sed -i "-es/^\([^\"]*\)\(\\)/\\1E\\2/" $f; done done with de-mangling cleanups yet to come. NOTE! On almost all architectures, the EPOLL* constants have the same values as the POLL* constants do. But they keyword here is "almost". For various bad reasons they aren't the same, and epoll() doesn't actually work quite correctly in some cases due to this on Sparc et al. The next patch from Al will sort out the final differences, and we should be all done. Scripted-by: Al Viro Signed-off-by: Linus Torvalds --- include/net/inet_connection_sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 6692d67e9245..c1a93ce35e62 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -310,7 +310,7 @@ void inet_csk_prepare_forced_close(struct sock *sk); static inline __poll_t inet_csk_listen_poll(const struct sock *sk) { return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? - (POLLIN | POLLRDNORM) : 0; + (EPOLLIN | EPOLLRDNORM) : 0; } int inet_csk_listen_start(struct sock *sk, int backlog); -- cgit v1.2.3 From 15f35d49c93f4fa9875235e7bf3e3783d2dd7a1b Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Thu, 15 Feb 2018 20:18:43 +0300 Subject: udplite: fix partial checksum initialization Since UDP-Lite is always using checksum, the following path is triggered when calculating pseudo header for it: udp4_csum_init() or udp6_csum_init() skb_checksum_init_zero_check() __skb_checksum_validate_complete() The problem can appear if skb->len is less than CHECKSUM_BREAK. In this particular case __skb_checksum_validate_complete() also invokes __skb_checksum_complete(skb). If UDP-Lite is using partial checksum that covers only part of a packet, the function will return bad checksum and the packet will be dropped. It can be fixed if we skip skb_checksum_init_zero_check() and only set the required pseudo header checksum for UDP-Lite with partial checksum before udp4_csum_init()/udp6_csum_init() functions return. Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4") Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- include/net/udplite.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/net') diff --git a/include/net/udplite.h b/include/net/udplite.h index 81bdbf97319b..9185e45b997f 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -64,6 +64,7 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) UDP_SKB_CB(skb)->cscov = cscov; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; + skb->csum_valid = 0; } return 0; -- cgit v1.2.3 From 651b9920d7a694ffb1f885aef2bbb068a25d9d66 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 10 Feb 2018 13:20:34 +0100 Subject: mac80211: round IEEE80211_TX_STATUS_HEADROOM up to multiple of 4 This ensures that mac80211 allocated management frames are properly aligned, which makes copying them more efficient. For instance, mt76 uses iowrite32_copy to copy beacon frames to beacon template memory on the chip. Misaligned 32-bit accesses cause CPU exceptions on MIPS and should be avoided. Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index eec143cca1c0..c9077a832977 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4141,7 +4141,7 @@ void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *sta, u8 tid); * The TX headroom reserved by mac80211 for its own tx_status functions. * This is enough for the radiotap header. */ -#define IEEE80211_TX_STATUS_HEADROOM 14 +#define IEEE80211_TX_STATUS_HEADROOM ALIGN(14, 4) /** * ieee80211_sta_set_buffered - inform mac80211 about driver-buffered frames -- cgit v1.2.3 From 657308f73e674e86b60509a430a46e569bf02846 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 22 Feb 2018 20:55:28 +0100 Subject: regulatory: add NUL to request alpha2 Similar to the ancient commit a5fe8e7695dc ("regulatory: add NUL to alpha2"), add another byte to alpha2 in the request struct so that when we use nla_put_string(), we don't overrun anything. Fixes: 73d54c9e74c4 ("cfg80211: add regulatory netlink multicast group") Reported-by: Kees Cook Signed-off-by: Johannes Berg --- include/net/regulatory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/regulatory.h b/include/net/regulatory.h index ebc5a2ed8631..f83cacce3308 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -78,7 +78,7 @@ struct regulatory_request { int wiphy_idx; enum nl80211_reg_initiator initiator; enum nl80211_user_reg_hint_type user_reg_hint_type; - char alpha2[2]; + char alpha2[3]; enum nl80211_dfs_regions dfs_region; bool intersect; bool processed; -- cgit v1.2.3